def add(self, data=None, row_ids=None, sync=False):
    '''add the data to the multiverso MatrixTable

    If row_ids is None, we will add all the data, which should cover the
    whole table. Otherwise we will add the data only to the rows given by
    row_ids.

    Data type of `data` is a two-dimensional numpy.ndarray.

    If sync is True, this call will block on IO until it finishes;
    otherwise it returns immediately.
    '''
    assert data is not None
    data = convert_data(data)
    if row_ids is None:
        assert data.size == self._size
        if sync:
            mv_lib.MV_AddMatrixTableAll(self._handler,
                                        data.ctypes.data_as(C_FLOAT_P),
                                        self._size)
        else:
            mv_lib.MV_AddAsyncMatrixTableAll(self._handler,
                                             data.ctypes.data_as(C_FLOAT_P),
                                             self._size)
    else:
        row_ids_n = len(row_ids)
        assert data.size == row_ids_n * self._num_col
        int_array_type = ctypes.c_int * row_ids_n
        if sync:
            mv_lib.MV_AddMatrixTableByRows(self._handler,
                                           data.ctypes.data_as(C_FLOAT_P),
                                           row_ids_n * self._num_col,
                                           int_array_type(*row_ids),
                                           row_ids_n)
        else:
            mv_lib.MV_AddAsyncMatrixTableByRows(self._handler,
                                                data.ctypes.data_as(C_FLOAT_P),
                                                row_ids_n * self._num_col,
                                                int_array_type(*row_ids),
                                                row_ids_n)
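A minimal usage sketch of this method, assuming the class is exposed as MatrixTableHandler in the multiverso Python binding and that init()/shutdown() wrap the session; the import path and table sizes below are assumptions, not taken from the snippet.

import numpy as np
# Assumed binding names in the style of the multiverso Python API.
import multiverso as mv
from multiverso.tables import MatrixTableHandler

mv.init()
table = MatrixTableHandler(4, 3)

# Blocking add of a delta covering the whole 4x3 table.
table.add(np.ones((4, 3), dtype=np.float32), sync=True)

# Asynchronous add that only touches rows 1 and 3.
table.add(np.full((2, 3), 0.5, dtype=np.float32), row_ids=[1, 3])
mv.shutdown()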
def run_models(model_ensemble, data_file):
    feature_names = model_ensemble[0].feature_names
    df = read_data(data_file)
    df = convert_data(df)

    # Drop columns that were not present during training
    extra_columns = set(df.columns) - set(feature_names)
    logging.info(f'Columns present now, but not in training: {extra_columns}')
    df.drop(list(extra_columns), axis=1, inplace=True)

    # Add columns that were present during training but are missing here
    missing_columns = set(feature_names) - set(df.columns)
    logging.info(f'Columns present in training, but not in test: {missing_columns}')
    for col in missing_columns:
        df[col] = 0

    # Reorder columns to match the training layout
    df = df[feature_names]

    preds = pd.DataFrame(model.predict(df) for model in model_ensemble)
    result = pd.DataFrame()
    result['ident'] = df.reset_index()['ident']
    result['probs'] = preds.mean() > 0.5   # majority vote across the ensemble
    return result
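For illustration, a small sketch of what preds.mean() > 0.5 does in the function above: rows of preds are models, columns are samples, so the column-wise mean followed by the 0.5 threshold is a majority vote per sample. The 0/1 predictions below are made up.

import pandas as pd

# Rows are models, columns are samples (hypothetical 0/1 predictions).
preds = pd.DataFrame([[1, 0, 1],
                      [1, 1, 0],
                      [0, 0, 1]])
votes = preds.mean() > 0.5   # column-wise mean, i.e. majority vote per sample
print(votes.tolist())        # [True, False, True]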
def add(self, data, sync=False):
    '''add the data to the multiverso ArrayTable

    Data type of `data` is a one-dimensional numpy.ndarray.

    If sync is True, this call will block on IO until it finishes;
    otherwise it returns immediately.
    '''
    data = convert_data(data)
    assert data.size == self._size
    if sync:
        mv_lib.MV_AddArrayTable(self._handler,
                                data.ctypes.data_as(C_FLOAT_P),
                                self._size)
    else:
        mv_lib.MV_AddAsyncArrayTable(self._handler,
                                     data.ctypes.data_as(C_FLOAT_P),
                                     self._size)
def __init__(self, size, init_value=None):
    '''Constructor for syncing an array-like (one-dimensional) value.

    `size` should be an int equal to the size of the value we want to sync.

    If init_value is None, zeros will be used to initialize the table;
    otherwise the table will be initialized with init_value.
    Notice: if init_value differs between processes, their average will be used.
    '''
    self._handler = ctypes.c_void_p()
    self._size = size
    mv_lib.MV_NewArrayTable(size, ctypes.byref(self._handler))
    if init_value is not None:
        init_value = convert_data(init_value)
        # A sync add is used to make sure the initial value has taken
        # effect by the time the call returns.
        self.add(init_value / api.workers_num(), sync=True)
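A small worked sketch of why this constructor divides by api.workers_num(): every worker adds init_value / n, and since the server sums the contributions, the table ends up holding the average of the workers' initial values. The numbers below are hypothetical.

import numpy as np

n_workers = 2                                   # hypothetical cluster size
init_worker_0 = np.array([2.0, 4.0])
init_worker_1 = np.array([4.0, 8.0])

# Each worker adds init_value / n_workers; the server sums the adds.
table_value = init_worker_0 / n_workers + init_worker_1 / n_workers
print(table_value)                              # [3. 6.] == element-wise average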
def __init__(self, size, init_value=None):
    '''Constructor for syncing an array-like (one-dimensional) value.

    `size` should be an int equal to the size of the value we want to sync.

    If init_value is None, zeros will be used to initialize the table;
    otherwise the table will be initialized with init_value.
    *Notice*: Only the init_value from the master will be used!
    '''
    self._handler = ctypes.c_void_p()
    self._size = size
    mv_lib.MV_NewArrayTable(size, ctypes.byref(self._handler))
    if init_value is not None:
        init_value = convert_data(init_value)
        # A sync add is used to make sure the initial value has taken
        # effect by the time the call returns. Every worker, master or
        # not, must call add so that it works correctly in sync mode:
        # non-masters contribute zeros.
        self.add(init_value if api.is_master_worker() else np.zeros(init_value.shape),
                 sync=True)
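A hedged usage sketch of the master-only initialization, assuming this constructor belongs to an ArrayTableHandler-style class with a get() method to pull the current server values; the import path, barrier() call, and table size are assumptions.

import numpy as np
# Assumed binding names; every process runs the same script.
import multiverso as mv
from multiverso.tables import ArrayTableHandler

mv.init()
weights = np.random.rand(10).astype(np.float32)
table = ArrayTableHandler(10, init_value=weights)   # only the master's weights are kept
mv.barrier()
print(table.get())   # all workers now observe the master's initial values
mv.shutdown()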
def main(*data_files):
    if len(data_files) < 1:
        raise TypeError(
            f'No data files found! Usage: python {os.path.basename(__file__)} data_file [data_file]...'
        )
    start = datetime.now()
    logger.info(f'Starting {start}')
    df = read_data(*data_files)
    df = convert_data(df)
    model = train_model(df)
    save_model(model=model, filename=MODEL_PATH.joinpath(SESSION_NAME + '.pkl'))
    end = datetime.now()
    logger.info(f'All done {end}, elapsed: {end - start}')
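A hedged sketch of how this main is likely invoked from the command line, assuming a standard __main__ guard; the script name and data file names are placeholders.

import sys

if __name__ == '__main__':
    # e.g.  python train.py data_2020.csv data_2021.csv   (hypothetical file names)
    main(*sys.argv[1:])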
def __init__(self, num_row, num_col, init_value=None):
    '''Constructor for syncing a matrix-like (two-dimensional) value.

    `num_row` is the number of rows and `num_col` is the number of columns.

    If init_value is None, zeros will be used to initialize the table;
    otherwise the table will be initialized with init_value.
    Notice: if init_value differs between processes, their average will be used.
    '''
    self._handler = ctypes.c_void_p()
    self._num_row = num_row
    self._num_col = num_col
    self._size = num_col * num_row
    mv_lib.MV_NewMatrixTable(num_row, num_col, ctypes.byref(self._handler))
    if init_value is not None:
        init_value = convert_data(init_value)
        # A sync add is used to make sure the initial value has taken
        # effect by the time the call returns.
        self.add(init_value / api.workers_num(), sync=True)
def __init__(self, num_row, num_col, init_value=None):
    '''Constructor for syncing a matrix-like (two-dimensional) value.

    `num_row` is the number of rows and `num_col` is the number of columns.

    If init_value is None, zeros will be used to initialize the table;
    otherwise the table will be initialized with init_value.
    *Notice*: Only the init_value from the master will be used!
    '''
    self._handler = ctypes.c_void_p()
    self._num_row = num_row
    self._num_col = num_col
    self._size = num_col * num_row
    mv_lib.MV_NewMatrixTable(num_row, num_col, ctypes.byref(self._handler))
    if init_value is not None:
        init_value = convert_data(init_value)
        # A sync add is used to make sure the initial value has taken
        # effect by the time the call returns. Every worker, master or
        # not, must call add so that it works correctly in sync mode:
        # non-masters contribute zeros.
        self.add(init_value if api.is_master_worker() else np.zeros(init_value.shape),
                 sync=True)
def de_parse(cls, prefix, json):
    '''De-parse a JSON-like dict into HBase-ready column values.'''
    result_dict = {}
    for key, value in json.iteritems():
        if key in {'_id'}:
            continue
        key_name = make_column_name(prefix, key)
        if key_name in USER_DATETIME_COLUMN_SET:
            result_dict[key_name] = parse_datetime_into_hbase(value)
        elif key_name in USER_BOOLEAN_COLUMN_SET:
            result_dict[key_name] = parse_boolean_into_hbase(value)
        elif key_name in USER_INT_COLUMN_SET:
            result_dict[key_name] = parse_int_into_hbase(value)
        elif key_name in USER_LIST_COLUMN_SET:
            # Avoid shadowing the `json` argument with the simplejson module.
            simplejson = import_simplejson()
            result_dict[key_name] = simplejson.dumps(value)
        else:
            result_dict[key_name] = convert_data(value)
    return result_dict
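A hedged example of how de_parse transforms one record. The class name, prefix, document fields, and column-set membership below are made up; the helpers are the ones referenced in the snippet above.

# Hypothetical call site; SomeMapper is a stand-in for the class that owns de_parse.
doc = {'_id': 'abc123',
       'signup_time': '2020-01-01 10:00:00',
       'is_active': True,
       'tags': ['a', 'b']}
columns = SomeMapper.de_parse('user', doc)
# '_id' is skipped; every other key is stored under make_column_name('user', key)
# after being routed through the matching parse_*_into_hbase branch, dumped as
# JSON if it is a list column, or passed through convert_data otherwise.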
    name_model = "dt_gridsearchcv.joblib"
    model_path = os.path.join(save_path, name_model)
    joblib.dump(tree_clf, model_path)
    tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)
    print('DecisionTree Classifier Cross Validation Score',
          str(round(tree_score.mean() * 100, 2)) + '%')
    print("-" * 80)


if __name__ == "__main__":
    path = "/home/hoangnv68/BankFraudDetection/creditcard.csv"
    df = read_data(path)
    sub_df = Downsample_data(df)
    sub_df = Remove_ouliers(sub_df, ["V14", "V12", "V10"], 25, 75, 1.5)
    X = sub_df.drop("Class", axis=1)
    y = sub_df["Class"]
    X_train, X_test, y_train, y_test = convert_data(X, y)
    classifiers = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "KNearest": KNeighborsClassifier(),
        "Support Vector Classifier": SVC(),
        "DecisionTreeClassifier": DecisionTreeClassifier()
    }
    save_path = "/home/hoangnv68/BankFraudDetection/supervise_model/pretrained_model"
    train(X_train, X_test, y_train, y_test, save_path)
    gridsearchCV(X_train, X_test, y_train, y_test, save_path)
def main(argv):
    print '\nSYSTEM START\n'
    print 'Emb Dim: %d\tHidden Dim: %d\tOptimization: %s\tLayer: %d\tEpoch: %d' %\
        (argv.emb, argv.hidden, argv.opt, argv.layer, argv.epoch)
    print 'Parameters to be saved: %s' % argv.save

    """ data preprocessing """
    print 'DATA Preprocessing...'
    corpus, vocab_word = utils.load_conll(argv.data)
    id_corpus = utils.convert_words_into_ids(corpus, vocab_word)
    train_samples = utils.convert_data(id_corpus)
    n_samples = len(id_corpus)
    print 'Samples: %d\tVocab: %d' % (n_samples, vocab_word.size())

    """ symbol definition """
    index = T.iscalar()
    w = T.ivector()
    d = T.ivector()
    n_hidden = argv.hidden
    n_words = argv.n_words
    batch_size = argv.batch

    """ model setup """
    print 'Compiling Theano Code...'
    model = lstm.LSTM(w=w, d=d, n_layers=argv.layer, vocab_size=vocab_word.size(),
                      n_in=n_hidden, n_h=n_hidden, n_words=n_words,
                      batch_size=batch_size)
    cost = model.nll
    opt = optimizers.main(name=argv.opt, cost=cost, params=model.params,
                          emb=model.emb, x=model.x, w=model.w)

    """ train """
    def _train():
        train_model = theano.function(
            inputs=[index],
            outputs=[model.nll, model.errors],
            updates=opt,
            givens={
                w: train_samples[index * n_words * batch_size: (index + 1) * n_words * batch_size],
                d: train_samples[index * n_words * batch_size + 1: (index + 1) * n_words * batch_size + 1]
            },
            mode='FAST_RUN'
        )

        n_batch_samples = n_samples / n_words / batch_size
        print 'Vocabulary Size: %d\tBatch Sample Size: %d' % (vocab_word.size(), n_batch_samples)

        print '\nTrain START'
        for epoch in xrange(argv.epoch):
            print '\nEpoch: %d' % (epoch + 1)
            print '\tIndex: ',
            start = time.time()

            losses = []
            errors = []
            for b_index in xrange(n_batch_samples):
                if b_index % 100 == 0 and b_index != 0:
                    print b_index,
                    sys.stdout.flush()

                loss, error = train_model(b_index)
                losses.append(loss)
                errors.append(error)
            avg_loss = np.mean(losses)
            end = time.time()

            print '\tTime: %f seconds' % (end - start)
            print '\tAverage Negative Log Likelihood: %f' % avg_loss

            total = 0.0
            correct = 0
            for sent in errors:
                total += len(sent)
                for y_pred in sent:
                    if y_pred == 0:
                        correct += 1
            print '\tTrain Accuracy: %f' % (correct / total)

            if argv.save:
                model.save()

    _train()
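A small sketch of the givens slicing used in train_model above: the target vector d is the word stream w shifted by one position, so each word is trained to predict its successor. The toy id stream and the slice length of 4 are made up.

# Toy id stream; in the real code train_samples is the flattened id corpus.
train_samples = [5, 7, 2, 9, 3, 1, 4, 8]
n_words_times_batch = 4
index = 0
w_slice = train_samples[index * n_words_times_batch:(index + 1) * n_words_times_batch]
d_slice = train_samples[index * n_words_times_batch + 1:(index + 1) * n_words_times_batch + 1]
print(w_slice)  # [5, 7, 2, 9]
print(d_slice)  # [7, 2, 9, 3] -- next-word targets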
def s2s_data_generator(s2s_df=duplets, all_catalog=catalog_images, batch_size=None):
    """
    A data generator that builds triplets, i.e. (input_image, positive_image,
    negative_image), on the fly during training.

    Select a random input image from the training set, pick a positive image
    with the same product id and a negative image with a different product id,
    and assemble a batch of triplets. The generator keeps yielding batches
    until the whole training process is complete.
    """
    orig_index_list = s2s_df.index.tolist()
    all_shop_index_list = all_catalog.index.tolist()
    dummy = np.zeros((1, 3 * N))
    while True:
        q_list = list()
        p_list = list()
        n_list = list()
        dummy_list = list()
        index_list = copy.copy(orig_index_list)
        while len(index_list) > 0:
            index = random.choice(index_list)
            product_id = s2s_df.loc[index, 'product_id']
            q_index = s2s_df.loc[index, 'street_images']
            p_index = s2s_df.loc[index, 'shop_images']

            # Keep sampling until a shop image with a different product id is found.
            while True:
                idx = random.choice(all_shop_index_list)
                prod_idx = all_catalog.loc[idx, 'product_id']
                if prod_idx != product_id:
                    n_index = random.choice(all_catalog.loc[idx, 'shop_images'])
                    break

            q_img = os.path.join(Path, q_index + '.jpeg')
            p_img = os.path.join(Path, p_index + '.jpeg')
            n_img = os.path.join(Path, n_index + '.jpeg')

            # Crop the query (street) image to its bounding box.
            res = bbox_mappings[q_index]
            left = res['left']
            top = res['top']
            right = left + res['width']
            bottom = top + res['height']
            query_img = Image.open(q_img)
            query_crop = query_img.crop((left, top, right, bottom))
            positive_img = Image.open(p_img)
            negative_img = Image.open(n_img)

            query = np.array(query_crop.resize((300, 300), Image.NEAREST))
            positive = np.array(positive_img.resize((300, 300), Image.NEAREST))
            negative = np.array(negative_img.resize((300, 300), Image.NEAREST))

            q_list.append(query)
            p_list.append(positive)
            n_list.append(negative)
            dummy_list.append(dummy)
            index_list.remove(index)

            if len(q_list) == batch_size or (len(index_list) == 0 and len(q_list) > 0):
                yield convert_data(q_list, p_list, n_list, dummy_list)
                q_list = list()
                p_list = list()
                n_list = list()
                dummy_list = list()
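A hedged sketch of how this generator is typically consumed by a Keras-style triplet model. The model object, epoch count, and batch size below are assumptions, not values from the snippet; only duplets, catalog_images, and the generator itself come from the code above.

# Hypothetical wiring; `triplet_model` is assumed to take three image inputs and
# to be compiled against the dummy target of shape (1, 3 * N).
batch_size = 32
steps = len(duplets) // batch_size
triplet_model.fit(
    s2s_data_generator(duplets, catalog_images, batch_size=batch_size),
    steps_per_epoch=steps,
    epochs=10,
)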
def main(argv):
    print '\nSYSTEM START'
    print '\nMODE: Training'
    print '\nRECURRENT HIDDEN UNIT: %s\n' % argv.unit

    print '\tTRAINING\t\tBatch: %d Epoch: %d Parameters Save: %s' % (argv.batch, argv.epoch, argv.save)
    print '\tINITIAL EMBEDDING\t %s' % argv.init_emb
    print '\tNETWORK STRUCTURE\tEmb Dim: %d Hidden Dim: %d Layers: %d' % (argv.emb, argv.hidden, argv.layer)
    print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f %f L2 Reg: %f' % (argv.opt, argv.lr1, argv.lr2, argv.reg)

    """ load corpus """
    print '\n\tCorpus Preprocessing...'
    train_corpus = load_conll(argv.train_data, exclude=True)
    print '\tTrain Sentences: %d' % len(train_corpus)

    if argv.dev_data:
        dev_corpus = load_conll(argv.dev_data)
        print '\tDev Sentences: %d' % len(dev_corpus)

    if argv.test_data:
        test_corpus = load_conll(argv.test_data)
        print '\tTest Sentences: %d' % len(test_corpus)

    """ load initial embedding file """
    print '\n\tInitial Embedding Loading...'
    init_emb, vocab_word = load_init_emb(init_emb=argv.init_emb)
    print '\tVocabulary Size: %d' % vocab_word.size()

    """ convert words into ids """
    print '\n\tConverting Words into IDs...'
    tr_id_sents, tr_id_ctx, tr_marks, tr_prds, train_y, arg_dict = get_id_samples(
        train_corpus, vocab_word=vocab_word, sort=True)

    if argv.dev_data:
        dev_id_sents, dev_id_ctx, dev_marks, dev_prds, dev_y, dev_arg_dict =\
            get_id_samples(dev_corpus, vocab_word=vocab_word, a_dict=arg_dict)
    if argv.test_data:
        te_id_sents, te_id_ctx, te_marks, te_prds, test_y, test_arg_dict =\
            get_id_samples(test_corpus, vocab_word=vocab_word, a_dict=arg_dict)

    print '\tLabel size: %d' % arg_dict.size()
    dump_data(data=arg_dict, fn=argv.train_dir + 'arg_dict-%d' % (arg_dict.size()))

    """ convert formats for theano """
    print '\n\tCreating Training/Dev/Test Samples...'
    train_sample_x, train_sample_y = convert_data(tr_id_sents, tr_prds, tr_id_ctx, tr_marks, train_y, init_emb)
    print '\tTrain Samples: %d' % len(train_sample_x)

    if argv.dev_data:
        dev_sample_x, dev_sample_y = convert_data_test(dev_id_sents, dev_prds, dev_id_ctx, dev_marks, dev_y, init_emb)
        print '\tDev Samples: %d' % len(dev_sample_x)

    if argv.test_data:
        test_sample_x, test_sample_y = convert_data_test(te_id_sents, te_prds, te_id_ctx, te_marks, test_y, init_emb)
        print '\tTest Samples: %d' % len(test_sample_x)

    """ symbol definition """
    x = T.ftensor3()
    d = T.imatrix()

    n_in = init_emb.shape[1]
    n_h = argv.hidden
    n_y = arg_dict.size()
    reg = argv.reg
    batch = argv.batch

    """ Model Setup """
    print '\nTheano Code Compiling...'
    tagger = RNN(unit=argv.unit, x=x, d=d, n_layers=argv.layer, n_in=n_in, n_h=n_h, n_y=n_y, reg=reg)

    train_model = theano.function(inputs=[x, d],
                                  outputs=[tagger.nll, tagger.errors],
                                  updates=tagger.updates,
                                  mode='FAST_RUN')
    test_model = theano.function(inputs=[x, d],
                                 outputs=[tagger.y_pred, tagger.errors],
                                 mode='FAST_RUN')

    """ Training """
    print '\nTRAIN START'

    best_dev_f = 0.0
    best_test_f = 0.0
    best_epoch = -1
    flag = False

    for epoch in xrange(argv.epoch):
        _train_sample_x, _train_sample_y = shuffle(train_sample_x, train_sample_y)
        print '\nEpoch: %d' % (epoch + 1)
        print '\tIndex: ',
        start = time.time()

        losses = []
        errors = []

        sample_index = 0
        for index in xrange(len(train_sample_x)):
            batch_x = _train_sample_x[index]
            batch_y = _train_sample_y[index]

            for b_index in xrange(len(batch_x) / batch + 1):
                sample_index += 1
                if sample_index % 100 == 0:
                    print '%d' % sample_index,
                    sys.stdout.flush()

                sample_x = batch_x[b_index * batch:(b_index + 1) * batch]
                sample_y = batch_y[b_index * batch:(b_index + 1) * batch]
                if len(sample_x) == 0:
                    continue

                loss, error = train_model(sample_x, sample_y)
                losses.append(loss)
                errors.extend(error)

        end = time.time()
        avg_loss = np.mean(losses)
        total, correct = count_correct(errors)

        print '\tTime: %f seconds' % (end - start)
        print '\tAverage Negative Log Likelihood: %f' % avg_loss
        print '\tTrain Accuracy: %f' % (correct / total)

        """ Check model performance """
        if argv.dev_data:
            dev_f, predicts = test(test_model, dev_sample_x, dev_sample_y, dev_arg_dict, 'Dev')
            if best_dev_f < dev_f:
                best_dev_f = dev_f
                best_epoch = epoch

                """ Save Parameters """
                if argv.save:
                    fn = 'Layer-%d_Dim-%d_Batch-%d_Hidden-%d_Reg-%f_Epoch-%d' % (
                        argv.layer, argv.hidden, argv.batch, argv.hidden, argv.reg, epoch)
                    dump_data(data=tagger, fn=argv.train_dir + fn)

                """ Output Results """
                output_results(
                    dev_corpus, dev_prds, arg_dict, predicts,
                    argv.train_dir + 'Dev-result.layer%d.batch%d.hidden%d.opt-%s.reg-%f.epoch%d.txt' % (
                        argv.layer, argv.batch, argv.hidden, argv.opt, argv.reg, epoch))
                flag = True

            print '\t### Best Dev F Score: %f Epoch: %d ###' % (best_dev_f, best_epoch + 1)

        if argv.test_data:
            test_f, predicts = test(test_model, test_sample_x, test_sample_y, test_arg_dict, 'Test')

            if flag:
                best_test_f = test_f
                flag = False

                output_results(
                    test_corpus, te_prds, arg_dict, predicts,
                    argv.train_dir + 'Test-result.layer%d.batch%d.hidden%d.opt-%s.reg-%f.epoch%d.txt' % (
                        argv.layer, argv.batch, argv.hidden, argv.opt, argv.reg, epoch))

            if argv.dev_data:
                print '\t### Best Test F Score: %f Epoch: %d ###' % (best_test_f, best_epoch + 1)
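The count_correct helper used above is not shown in this snippet. A minimal sketch of what it presumably does, assuming `errors` is a flat collection of 0/1 error flags where 0 means a correct prediction; this reconstruction is a guess, not the original implementation.

def count_correct(errors):
    # Hypothetical: count tokens and how many were predicted correctly.
    total = float(len(errors))
    correct = sum(1 for e in errors if e == 0)
    return total, correct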
testing_data = readData("KBQA_data/sq_relations/test.replace_ne.withpool", False)
print "Start to read validation data"
valid_data = readData("KBQA_data/sq_relations/valid.replace_ne.withpool", False)
print "\n"

print "Start to get word dictionaries for questions and relations"
question_words = wordStatForQuestion(training_data)
relation_words = wordStatForRelation(relation_list_seg, relation_list_seg_all, training_data)
print "\n"

print "Start to convert data to vectors..."
training_data_conv = convert_data(question_words, relation_words, relation_list_seg,
                                  relation_list_seg_all, training_data)
print "\nThere are", len(training_data_conv), "samples in the training data"
testing_data_conv = convert_data(question_words, relation_words, relation_list_seg,
                                 relation_list_seg_all, testing_data)
print "\nThere are", len(testing_data_conv), "samples in the testing data"
valid_data_conv = convert_data(question_words, relation_words, relation_list_seg,
                               relation_list_seg_all, valid_data)
print "\nThere are", len(valid_data_conv), "samples in the valid data"
print "\n"

print "Start to calculate the max sequence length..."
max_length_dict = data_static(training_data_conv, testing_data_conv, valid_data_conv)
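data_static is not shown here; as a hedged illustration only, a max-length computation over the converted samples could look like the sketch below. The per-sample structure (question ids at index 0, relation ids at index 1) and the dictionary keys are assumptions, not taken from the snippet.

# Hypothetical reconstruction of a max-length statistic over all splits.
all_samples = training_data_conv + testing_data_conv + valid_data_conv
max_question_len = max(len(sample[0]) for sample in all_samples)
max_relation_len = max(len(sample[1]) for sample in all_samples)
example_max_length_dict = {'question': max_question_len, 'relation': max_relation_len}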