def __str__(self):
    import encoding  # import late to avoid circular imports
    code = encoding.encoding(self)
    types = code["types"]
    args = dict((a.name, pretty_print(a, types.get(a.name, "")))
                for a in self.args)
    return code["format"].format(**args)
class ScoringService(object):
    env = {
        'GRAPH_BUCKET': kg_path,
        'KG_DBPEDIA_KEY': dbpedia_key,
        'KG_ENTITY_KEY': entity_key,
        'KG_RELATION_KEY': relation_key,
        'KG_ENTITY_INDUSTRY_KEY': entity_industry_key,
        'KG_VOCAB_KEY': vocab_key,
        'DATA_INPUT_KEY': data_input_key,
        'TRAIN_OUTPUT_KEY': train_output_key
    }
    graph = kg.Kg(env)
    # Where we keep the model when it's loaded
    model = encoding.encoding(graph, env)

    @classmethod
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded."""
        # Note: cls.model is initialized at class-definition time above, so this
        # branch is normally dead; it is kept from the original lazy-loading stub.
        if cls.model is None:
            # import kg
            # import encoding
            cls.model = model
            # with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'r') as inp:
            #     cls.model = pickle.load(inp)
        return cls.model

    @classmethod
    def predict(cls, input):
        """For the input, do the predictions and return them.

        Args:
            input (a pandas dataframe): The data on which to do the predictions.
                There will be one prediction per row in the dataframe.
        """
        clf = cls.get_model()
        return clf[input]
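A minimal usage sketch for the service above, assuming the module-level kg_path and key variables are already configured; the DataFrame name and its columns are hypothetical placeholders, not from the source:

import pandas as pd

# Hypothetical input; the real column set depends on what encoding.encoding expects.
df = pd.DataFrame({"entity": ["acme"], "relation": ["industry"]})
predictions = ScoringService.predict(df)  # one prediction per row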
def _encoding(self):
    """helper method to lookup the encoding in the font"""
    c = cursor(self.data1, "/Encoding")
    token1 = c.gettoken()
    token2 = c.gettoken()
    if token1 == "StandardEncoding" and token2 == "def":
        self.encoding = encoding.adobestandardencoding
    else:
        encvector = [None] * 256
        # skip forward to the first "dup" that starts the encoding entries
        while 1:
            self.encodingstart = c.pos
            if c.gettoken() == "dup":
                break
        while 1:
            i = c.getint()
            glyph = c.gettoken()
            if 0 <= i < 256:
                encvector[i] = glyph[1:]  # strip the leading "/" of the glyph name
            token = c.gettoken()
            assert token == "put"
            self.encodingend = c.pos
            token = c.gettoken()
            if token == "readonly" or token == "def":
                break
            assert token == "dup"
        self.encoding = encoding.encoding(encvector)
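For orientation, the PostScript fragment this parser walks looks roughly like the following (a hedged reconstruction of typical Type 1 /Encoding syntax, not taken from the source):

# /Encoding 256 array
# 0 1 255 {1 index exch /.notdef put} for
# dup 32 /space put
# dup 65 /A put
# ...
# readonly def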
class ScoringService(object):
    import kg
    import encoding

    graph = kg.Kg(kg_folder=kg_path)
    # Where we keep the model when it's loaded
    model = encoding.encoding(graph)

    @classmethod
    def get_model(cls):
        """Get the model object for this instance, loading it if it's not already loaded."""
        if cls.model is None:
            # import kg
            # import encoding
            cls.model = model
            # with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'r') as inp:
            #     cls.model = pickle.load(inp)
        return cls.model

    @classmethod
    def predict(cls, input):
        """For the input, do the predictions and return them.

        Args:
            input (a pandas dataframe): The data on which to do the predictions.
                There will be one prediction per row in the dataframe.
        """
        clf = cls.get_model()
        return clf[input]
def _read_txt(root=None):
    # Python 2 code: relies on unicode() and u"" literals.
    path = TXT_FILE
    result = {}
    for txt in res.get_texts(path, locale=True, root=root):
        lines = txt.split("\n")
        encoding_name = encoding.encoding(txt)
        for line in lines:
            try:
                line = line.strip()
                if line:
                    k, v = line.split(None, 1)
                    if v:
                        try:
                            v = unicode(v, encoding_name)
                        except ValueError:
                            v = unicode(v, encoding_name, "replace")
                            warning("in '%s', encoding error: %s", path, line)
                        result[k] = v
                    else:
                        warning("in '%s', line ignored: %s", path, line)
            except Exception:
                warning("in '%s', syntax error: %s", path, line)
    result["9998"] = u","
    result["9999"] = u"."
    return result
def goClick(self, key, sentence):
    # User-facing messages translated from Korean:
    # "오류" -> "Error"; "특수문자는 입력 불가입니다." -> "Special characters are not allowed."
    for k in key:
        if k in '~!@#$%^&*()_+`1234567890-=<>?,./':
            tkinter.messagebox.showinfo("Error", "Special characters are not allowed.")
            self.root.destroy()
            encodeGUI()
            return  # bail out; the original fell through and encoded anyway
    for s in sentence:
        if s in '~!@#$%^&*()_+`1234567890-=<>?,./':
            tkinter.messagebox.showinfo("Error", "Special characters are not allowed.")
            self.root.destroy()
            encodeGUI()
            return
    from GUI.result import ResultGUI
    self.root.destroy()
    encoding.encoding(key, sentence)
    ResultGUI()
import collections

import cv2
import numpy

import encoding as e  # assumed alias: `e` is the project's arithmetic-coding module


def encode(block_size, imagePath="test.jpg", encodedFile="encoded",
           probabilityFile="probability.npy", float_type='float64'):
    img = cv2.imread(imagePath, cv2.IMREAD_GRAYSCALE).flatten()
    # pad with zeros so the pixel stream divides evenly into blocks
    img = numpy.append(img, [0] * (block_size - len(img) % block_size))
    codes = numpy.array([])
    probability = collections.Counter(img)
    print("Encoding Started")
    prob = [None] * 256
    for i in range(0, 256):
        prob[i] = probability[i]
    if float_type not in ('float16', 'float32', 'float64'):
        float_type = 'float64'
    prob = numpy.asarray(prob)
    prob = numpy.true_divide(prob, len(img))
    for i in range(0, len(img), block_size):
        x = e.encoding(prob, img[i:(block_size + i)])
        if not (0 <= x <= 1):
            print("encoding error")
        codes = numpy.append(codes, [x])
    numpy.save('original.npy', img)
    print("Encoding Done:")
    numpy.save(encodedFile, codes.astype(float_type))  # save
    print(" ->" + encodedFile + " is created")
    numpy.save(probabilityFile, prob)  # save
    print(" ->" + probabilityFile + " is created\n")
    return prob
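A minimal usage sketch, assuming a grayscale-readable test.jpg in the working directory; the block size of 64 is an arbitrary illustrative choice:

prob = encode(block_size=64, imagePath="test.jpg")
# encoded.npy now holds one arithmetic code per 64-pixel block, and
# probability.npy the per-intensity symbol probabilities; prob sums to ~1.0.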
def label_apply(labels, tok, pos, signed=True, pc_relative=False):
    if tok.name == "jmp":
        pc_relative = True
    enc = encoding.encoding(tok)
    for j, arg in enumerate(tok.args):
        if isinstance(arg, tokens.Expression):
            label_apply(labels, arg, pos, False, pc_relative)
        elif isinstance(arg, tokens.Label):
            bits = 64
            signed = True
            # If we found an encoding then we're materializing a label within
            # an instruction and we want to ensure the proper size/sign.
            # Otherwise, we're inside an expression, so use a bit value that
            # won't truncate as expressions do their own bit checking.
            if enc:
                syntax = [x[1] for x in enc["ast"] if len(x) == 2][j]
                signed = syntax.startswith("s")
                bits = int(syntax[1:])
            tok.args[j] = label_find(labels, arg, pos, bits, signed,
                                     pc_relative)
def run(self):
    # initializing the vocabularies
    trainData = dataSet(self.training_file, 'train')
    # print(trainData.getIntentLabels())
    testData = dataSet(self.test_file, 'test', trainData.getWordVocab(),
                       trainData.getTagVocab(), trainData.getIntentVocab(),
                       trainData.getIndex2Word(), trainData.getIndex2Tag(),
                       trainData.getIntentLabels())

    intent_target_file = self.result_path + '/' + 'intent.list'
    with open(intent_target_file, 'w') as f:
        for intent in trainData.getIntentLabels():
            f.write(f"{intent}\n")
    tag_target_file = self.result_path + '/' + 'tag.list'
    with open(tag_target_file, 'w') as f:
        for tag in trainData.getIndex2Tag():
            f.write(f"{tag}\n")

    # preprocessing by padding 0 until maxlen
    X_train = sequence.pad_sequences(trainData.dataset['utterances'],
                                     maxlen=self.time_length, dtype='int32',
                                     padding='pre')
    X_test = sequence.pad_sequences(testData.dataset['utterances'],
                                    maxlen=self.time_length, dtype='int32',
                                    padding='pre')
    y_intent_train = trainData.dataset['intents']
    pad_y_tags_train = sequence.pad_sequences(trainData.dataset['tags'],
                                              maxlen=self.time_length,
                                              dtype='int32', padding='pre')
    y_intent_test = testData.dataset['intents']
    pad_y_tags_test = sequence.pad_sequences(testData.dataset['tags'],
                                             maxlen=self.time_length,
                                             dtype='int32', padding='pre')
    num_sample_train, max_len = np.shape(X_train)
    num_sample_test, _ = np.shape(X_test)

    if not self.nodev:
        validData = dataSet(self.validation_file, 'val', trainData.getWordVocab(),
                            trainData.getTagVocab(), trainData.getIntentVocab(),
                            trainData.getIndex2Word(), trainData.getIndex2Tag(),
                            trainData.getIntentLabels())
        X_dev = sequence.pad_sequences(validData.dataset['utterances'],
                                       maxlen=self.time_length, dtype='int32',
                                       padding='pre')
        y_intent_dev = validData.dataset['intents']
        pad_y_tag_dev = sequence.pad_sequences(validData.dataset['tags'],
                                               maxlen=self.time_length,
                                               dtype='int32', padding='pre')
        num_sample_dev, _ = np.shape(X_dev)

    # encoding input vectors
    self.input_vocab_size = trainData.getWordVocabSize()
    self.output_intent_size = trainData.getIntentVocabSize()
    self.output_vocab_size = trainData.getTagVocabSize()

    print('Building model architecture!!!!')
    self.build()
    print(self.model.summary())

    # data generation
    sys.stderr.write("Vectorizing the input.\n")
    y_intent_train = to_categorical(y_intent_train,
                                    num_classes=self.output_intent_size)
    y_tags_train = encoding(pad_y_tags_train, '1hot', self.time_length,
                            self.output_vocab_size)
    if not self.nodev:
        y_intent_dev = to_categorical(y_intent_dev,
                                      num_classes=self.output_intent_size)
        y_tags_dev = encoding(pad_y_tag_dev, '1hot', self.time_length,
                              self.output_vocab_size)

    # encode history for memory network
    H_train = sequence.pad_sequences(history_build(trainData, X_train),
                                     maxlen=(self.time_length * self.his_length),
                                     dtype='int32', padding='pre')
    H_test = sequence.pad_sequences(history_build(testData, X_test),
                                    maxlen=(self.time_length * self.his_length),
                                    dtype='int32', padding='pre')
    if not self.nodev:
        H_dev = sequence.pad_sequences(history_build(validData, X_dev),
                                       maxlen=(self.time_length * self.his_length),
                                       dtype='int32', padding='pre')

    # note: both branches below pass H_dev/X_dev to self.train, so they
    # effectively assume a dev set exists (self.nodev is False)
    if self.record_epoch != -1 and self.load_weight is None:
        total_epochs = self.max_epochs
        self.max_epochs = self.record_epoch
        # integer division: Python 3 range() needs an int
        for i in range(1, total_epochs // self.record_epoch + 1):
            num_iter = i * self.record_epoch
            self.train(H_train=H_train, X_train=X_train,
                       y_train=[y_intent_train, y_tags_train],
                       H_dev=H_dev, X_dev=X_dev,
                       y_dev=[y_intent_dev, y_tags_dev])
            # pad_data: the original referenced undefined pad_X_dev/pad_X_test;
            # X_dev/X_test (already padded above) are assumed here
            if not self.nodev:
                self.test(H=H_dev, X=X_dev, data_type='dev.' + str(num_iter),
                          tagDict=trainData.dataset['id2tag'], pad_data=X_dev)
            self.test(H=H_test, X=X_test, data_type='test.' + str(num_iter),
                      tagDict=trainData.dataset['id2tag'], pad_data=X_test)
            # save weights for the current model
            whole_path = self.mdl_path + '/' + self.model_arch + '.' + str(num_iter) + '.h5'
            sys.stderr.write("Writing model weight to %s...\n" % whole_path)
            self.model.save_weights(whole_path, overwrite=True)
    else:
        self.train(H_train=H_train, X_train=X_train,
                   y_train=[y_intent_train, y_tags_train],
                   H_dev=H_dev, X_dev=X_dev,
                   y_dev=[y_intent_dev, y_tags_dev])
        # if not self.nodev:
        #     self.test(H=H_dev, X=X_dev, data_type='dev', tagDict=trainData.dataset['id2tag'], pad_data=X_dev)
        # self.test(H=H_test, X=X_test, data_type='test', tagDict=trainData.dataset['id2tag'], pad_data=X_test)

    # model.to_json() already returns a JSON string, so write it directly;
    # the original opened the file for reading and swapped json.dump's arguments
    with open('model.json', 'w') as f:
        f.write(self.model.to_json())

    if self.load_weight is None:
        whole_path = (self.mdl_path + '/' + self.model_arch + '.final-' +
                      str(self.max_epochs) + '.h5')
        sys.stderr.write("Writing model weight to %s...\n" % whole_path)
        self.model.save_weights(whole_path, overwrite=True)
            continue
        writer.writerow([
            round(float(line[index] * 100), 3),
            route,
            cmd[i][dico_pivot['acheteur.dateCreation']],
            cmd[i][dico_pivot['id']],
            cmd[i][dico_pivot['typeParcours']],
            cmd[i][dico_pivot['modeLivraison.type']],
            cmd[i][dico_pivot['caracteristiquesLigne.techno']],
            cmd[i][dico_pivot['acte']],
            cmd[i][dico_pivot['historiques.historiqueStatut.valeur']]
        ])


if __name__ == '__main__':
    TRAIN, PREDICT, EPOCH, BATCH, RESTORE, PATH = gestion_arg()
    encode = encoding(PATH)
    by_model = ByModel()
    save_data = encode.recover_data(encode)
    pivot = save_data[0]
    dico_pivot = {}
    for i, d in enumerate(pivot):
        for a in LISTPIVOT:
            if a == d:
                dico_pivot[a] = i
    result = encode.encoding_list_ascii(encode)
    del result[0]
    if PREDICT:
        by_model.load_weights(RESTORE)
        result = scaler.fit_transform(result)
        try:
            prediction = by_model(result)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from missing_data import imputate_missing_data, drop_col_with_missing
from encoding import encoding
from model import xgboost_model, forest_model

train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')

y = train_data.SalePrice
X = train_data.drop(['SalePrice'], axis=1)

encoded_train, encoded_test = encoding(X, test_data)

# handle missing data
final_data = []
final_data.append(
    imputate_missing_data(encoded_train.copy(), encoded_test.copy()))
final_data.append(
    drop_col_with_missing(encoded_train.copy(), encoded_test.copy()))

# split the data set
for data in final_data:
    (data['train_X'], data['test_X'], data['train_y'],
     data['test_y']) = train_test_split(data['final_train'], y,
                                        random_state=43)

# train the models
for data in final_data:
    model = (xgboost_model(data['train_X'], data['train_y'],
                           data['test_X'], data['test_y']))
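The fragment imports mean_absolute_error and forest_model but stops before using them; a hedged sketch of how evaluation might continue, assuming the model helpers return a fitted estimator with a scikit-learn-style predict (not confirmed by the source):

for data in final_data:
    model = xgboost_model(data['train_X'], data['train_y'],
                          data['test_X'], data['test_y'])
    preds = model.predict(data['test_X'])  # assumes a fitted, sklearn-style model
    print("MAE:", mean_absolute_error(data['test_y'], preds))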
"batch_size": 1000, "target": "raw", # "raw" or "edges" "total_epochs": 50, "type_n2v": "n2v", # "n2v" or "nn2v" "n_hidden": [128, 32, 8, 2], "data_type": "npy" # "npy" or "pickle" } input_dir = configs["basedir"] for i in [ "encoded_vector", configs["type_n2v"], str(configs["h_size"]), configs["data_type"] ]: input_dir += i + "/" if not os.path.isdir(input_dir): os.mkdir(input_dir) output_dir = "../output/" if not os.path.isdir(output_dir): os.mkdir(output_dir) for i in [configs["type_n2v"], str(configs["h_size"]), configs["data_type"]]: output_dir += i + "/" if not os.path.isdir(output_dir): os.mkdir(output_dir) preprocessing(configs) encoding(configs) if configs["data_type"] == "npy": pickle2npy(configs) training(configs) testing(configs)