def main(_):
    # count = [['<eos>', 20], ['word1', 24], ...]
    count = []
    # word2idx = {'<eos>': 0, 'word1': 1, ...}
    word2idx = {}

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    train_data = read_data('%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    # idx2word = {0: '<eos>', 1: 'word1', ...}
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))

    # number of words
    FLAGS.nwords = len(word2idx)
    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
def main(_):
    count = []
    word2idx = {}

    train_data = read_data('data/ptb.train.txt', count, word2idx)
    valid_data = read_data('data/ptb.valid.txt', count, word2idx)
    test_data = read_data('data/ptb.test.txt', count, word2idx)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
def simulate_skin(steps=5, max_iter=100, learning_rate=0.1):
    """Simulate learning skin data set."""
    data = read_data('Skin_NonSkin.txt')
    train_data, test_data = split_list(data, 0.75)
    start = len(train_data) / steps  # First step training set size.
    end = len(train_data)            # Final step training set size.
    sizes = []    # Training data set sizes.
    success = []  # Success rates according to training data set sizes.
    for i in xrange(steps):
        # Increase training data size according to iteration.
        size = start + i * end / steps
        current_train_data = train_data[:size]
        w = train(current_train_data, max_iter=max_iter, r=learning_rate)
        error = test(test_data, w)
        status(current_train_data, test_data, error)
        print
        # Record size-success statistics.
        sizes.append(size)
        success.append(100 - error)
    plot_success_per_size(sizes, success)
    show()
def main(_):
    count = []
    word2idx = {}

    train_data = read_data('%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)
    pp.pprint(tf.app.flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        model.run(train_data, valid_data)
def main():
    dataFile = "/home/laga/uni/kurse/2013/ss/dist_semantik/hausarbeit/nmr_relations_5class.data"
    # only nouns
    words = data.read_data(dataFile)

    # build vocabulary
    vocab = set()
    concepts = set()
    for word in words:
        vocab.update(wn.get_hypernyms(word["firstWordSynset"], 7))
        concepts.add(word["firstWordSynset"])
        concepts.add(word["secondWordSynset"])
        vocab.update(wn.get_hypernyms(word["secondWordSynset"], 7))

    s = sorted(vocab)
    print s

    indices = {}
    for x in range(0, len(s)):
        indices[s[x]] = x
    print indices

    vocabSize = len(indices)
    print "WN vocab size: %s" % len(indices)

    failed = set()
    # download images for the concepts
    for w in concepts:
        ret = imgnet.download_image_urls(w, "/home/laga/uni/kurse/2013/ss/dist_semantik/hausarbeit/imgnet-data/")
        # time.sleep(0.3)
        if not ret:
            failed.add(w)
    print "Failed to download img for synsets: %s" % failed

    rep = {}
    for word in words:
        vector = [0 for x in xrange(0, 2 * vocabSize)]
        w1 = word["firstWordSynset"]
        w2 = word["secondWordSynset"]
        if w1 in failed or w2 in failed:
            print "Skipping current word, one of the synsets does not have an image: %s" % word
            continue
        rep[(w1, w2)] = vector
        for w in wn.get_hypernyms(w1, 7):
            idx = indices[w]
            vector[idx] = 1
        for w in wn.get_hypernyms(w2, 7):
            idx = indices[w]
            vector[idx] = 1
        vector.append(word["relation"])

    fh = open("wordnet.data", "w")
    for (key, val) in rep.iteritems():
        fh.write(key[0].name)
        fh.write(",")
        fh.write(key[1].name)
        fh.write(",")
        for f in val:
            fh.write(str(f))
            fh.write(",")
        fh.write("\n")
    fh.close()
def main(_):
    count = []
    word2idx = {}

    train_data = read_data('%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    FLAGS.nwords = len(word2idx)
    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(sess, **(flags.FLAGS.__flags))
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
def data_processing_model():
    init_data = read_data("../data/raw/heart.csv")
    train_params = read_training_pipeline_params("../configs/config_lr.yml")
    classifier = Classifier(train_params.classifier_params, train_params.model_type)
    pipeline = DataProcessingPipeline(
        train_params.feature_params.categorical_features,
        train_params.feature_params.numerical_features)
    pipeline.fit(init_data)
    transformed_data = pipeline.transform(init_data)
    classifier.fit(transformed_data, init_data['target'].values)
    return pipeline, classifier
def get_k_day_return(code, days=5, start_date="2012-01-01", end_date="2017-01-01"):
    """
    Get the k-day return; formula: (today's close - close k days ago) / today's close
    """
    fname = data.get_filename(code)
    df = data.read_data(fname)
    df = df[df.index >= start_date]
    factor_name = "%d-day return" % (days)
    df[factor_name] = df['close'].pct_change(periods=days)
    return df[factor_name]
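# Toy illustration (added, synthetic data; not part of the original source): pandas'
# pct_change(periods=k) computes (close_t - close_{t-k}) / close_{t-k}, i.e. the change
# relative to the close k days ago, which is what the factor above stores.
import pandas as pd

close = pd.Series([10.0, 10.5, 11.0, 10.0, 12.0, 12.6])
print(close.pct_change(periods=5))  # last value: (12.6 - 10.0) / 10.0 = 0.26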
def main(verbose, path):
    ##
    # Function that reads all data
    ##
    # read all arguments
    if verbose:
        print('###\tStart reading data')
    start = timer()
    train, validation, test = dt.read_data(path)
    end = timer()
    if verbose:
        print('###\tTime for reading data: %.2f sec' % (end - start))
    return train, validation, test, path
def main(_):
    word2idx = {}
    max_words = 0
    max_sentences = 0

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    train_stories, train_questions, max_words, max_sentences = read_data(
        '{}/qa{}_train.txt'.format(FLAGS.data_dir, FLAGS.babi_task), word2idx, max_words, max_sentences)
    valid_stories, valid_questions, max_words, max_sentences = read_data(
        '{}/qa{}_valid.txt'.format(FLAGS.data_dir, FLAGS.babi_task), word2idx, max_words, max_sentences)
    test_stories, test_questions, max_words, max_sentences = read_data(
        '{}/qa{}_test.txt'.format(FLAGS.data_dir, FLAGS.babi_task), word2idx, max_words, max_sentences)

    pad_data(train_stories, train_questions, max_words, max_sentences)
    pad_data(valid_stories, valid_questions, max_words, max_sentences)
    pad_data(test_stories, test_questions, max_words, max_sentences)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)
    FLAGS.max_words = max_words
    FLAGS.max_sentences = max_sentences
    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_stories, valid_questions, test_stories, test_questions)
        else:
            model.run(train_stories, train_questions, valid_stories, valid_questions)
def main():
    # tweets, labels = data.read_data("../data/dataset_terremoto_iquique_2014.csv")
    tweets, labels = data.read_data(
        "../data/tweets-iquique-2014-tipo-informacion.csv")
    processed_tweets = process_tweet.process_tweets(tweets)
    X_train, X_test, y_train, y_test = train_test_split(tweets, labels,
                                                        test_size=0.30,
                                                        random_state=42)
    classifier(X_train, y_train, X_test, y_test)
def prepare_data():
    '''
    Prepare Data:
    1. read in audio meta data
    2. set up data loaders for training
    3. save categorical look-up tables
    '''
    train_data, valid_data = data.read_data(data.path)
    train_loader, valid_loader = data.gen_data_loader(train_data, valid_data)
    utils.save_cat_idx(train_data, 'models/idx2cat.pkl')
    return train_loader, valid_loader
def do_retrieval_experiments(descriptions='air/problem_descriptions',
                             solutions='air/solutions',
                             graph_types=['co-occurrence', 'dependency', 'random'],
                             use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_solutions': solutions,
               '_descriptions': descriptions,
               '_evaluation': 'retrieval'}

    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/' + descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/' + solutions + '_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)

    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            docs, labels = data.read_files(descriptions_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)

    print
    pp.pprint(results)
    return results
def main(_):
    count = []
    word2idx = {}

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    if FLAGS.hdfs:
        write_data_to_local(FLAGS.pai_data_dir, FLAGS.data_dir, FLAGS.data_name + '.train.txt')
        write_data_to_local(FLAGS.pai_data_dir, FLAGS.data_dir, FLAGS.data_name + '.valid.txt')
        write_data_to_local(FLAGS.pai_data_dir, FLAGS.data_dir, FLAGS.data_name + '.test.txt')

    train_data = read_data(
        os.path.join(FLAGS.data_dir, FLAGS.data_name + '.train.txt'), count, word2idx, FLAGS.hdfs)
    valid_data = read_data(
        os.path.join(FLAGS.data_dir, FLAGS.data_name + '.valid.txt'), count, word2idx, FLAGS.hdfs)
    test_data = read_data(
        os.path.join(FLAGS.data_dir, FLAGS.data_name + '.test.txt'), count, word2idx, FLAGS.hdfs)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)
    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
def main(_):
    source_count = []
    source_word2idx = {}

    train_data = read_data(FLAGS.train_data, source_count, source_word2idx)
    test_data = read_data(FLAGS.test_data, source_count, source_word2idx)

    FLAGS.pad_idx = source_word2idx['<pad>']
    FLAGS.nwords = len(source_word2idx)
    FLAGS.mem_size = train_data[4] if train_data[4] > test_data[4] else test_data[4]

    pp.pprint(flags.FLAGS.__flags)

    print('loading pre-trained word vectors...')
    FLAGS.pre_trained_context_wt = init_word_embeddings(source_word2idx)
    # pad idx has to be 0
    FLAGS.pre_trained_context_wt[FLAGS.pad_idx, :] = 0
    # FLAGS.pre_trained_target_wt = init_word_embeddings(target_word2idx)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
def get_k_day_average_return(code, days=5, start_date="2012-01-01", end_date="2017-01-01"):
    """
    Get the k-day average return
    """
    fname = data.get_filename(code)
    df = data.read_data(fname)
    df = df[df.index >= start_date]
    factor_name = "%d-day average return" % (days)
    df["return"] = df['close'].pct_change()
    df[factor_name] = df["return"].rolling(window=days).mean()
    return df[factor_name]
def get_k_day_volatility(code, days, start_date, end_date):
    """
    Get the k-day volatility (rolling std of daily returns, annualised with 243 trading days)
    """
    fname = data.get_filename(code)
    df = data.read_data(fname)
    df = df[df.index >= start_date]
    factor_name = "%d-day volatility" % (days)
    df["return"] = df["close"].pct_change()
    df[factor_name] = df["return"].rolling(window=days).std() * np.sqrt(243)
    return df[factor_name]
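# Toy illustration (added, synthetic prices; not part of the original source) of the rolling
# volatility computed above: the rolling standard deviation of daily returns is scaled by
# sqrt(243) to annualise it under the assumption of 243 trading days per year.
import numpy as np
import pandas as pd

close = pd.Series(np.cumprod(1 + np.random.normal(0, 0.01, 300)))  # synthetic price path
ret = close.pct_change()
vol = ret.rolling(window=20).std() * np.sqrt(243)
print(vol.dropna().tail())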
def main(_):
    source_count = []
    source_word2idx = {}

    if os.path.isfile("abc") and os.path.isfile("def"):
        train_data = pickle.load(open("abc", "rb"))
        test_data = pickle.load(open("def", "rb"))
    else:
        train_data = read_data(FLAGS.train_data, source_count, source_word2idx)
        test_data = read_data(FLAGS.test_data, source_count, source_word2idx)
        pickle.dump(train_data, open("abc", "wb"))
        pickle.dump(test_data, open("def", "wb"))
    # test_data = read_data("./data/cust_sent.xml", source_count, source_word2idx)

    source_word2idx = train_data[5]

    FLAGS.pad_idx = source_word2idx['<pad>']
    FLAGS.nwords = len(source_word2idx)
    FLAGS.mem_size = train_data[4] if train_data[4] > test_data[4] else test_data[4]

    pp.pprint(flags.FLAGS.__flags)

    print('loading pre-trained word vectors...')
    FLAGS.pre_trained_context_wt = init_word_embeddings(source_word2idx)
    # pad idx has to be 0
    FLAGS.pre_trained_context_wt[FLAGS.pad_idx, :] = 0
    # FLAGS.pre_trained_target_wt = init_word_embeddings(target_word2idx)

    # for i in range(15):
    #     print " Source_data: {}, target_data: {}, Label: {}, Og_source_data: {}, og_target_data:{}".format(
    #         test_data[0][i], test_data[2][i], test_data[3][i], test_data[6][i], test_data[7][i])

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        # print(np.array(train_data).shape)
        model.run(train_data, test_data)
def train_and_predict_by_self_realization(train_data_path, valid_data_path, vocab_path):
    """
    FastText implemented from scratch in PyTorch. Reaches its best result at epoch 14,
    with a test-set accuracy of 0.723.
    :param train_data_path: path to the training set
    :param valid_data_path: path to the test set
    :param vocab_path: path to the vocabulary
    """
    tokenizer, label_map, data = read_data(train_data_path, valid_data_path, vocab_path)
    model = self_fasttext(vocab_size=tokenizer.get_vocab_size(),
                          embedding_size=32,
                          label_size=len(label_map))
    learner = Learner(data, model)
    learner.fit(epochs=20, init_lr=0.001, opt_fn=optim.Adam)
def main():
    df = data.read_data('Fri', 'subset-100').set_index('Timestamp')
    time = pd.Timestamp(datetime(2014, 6, 6, 12))
    df2 = get_positions_at(df, time)

    im = data.read_image('Grey')
    fig, ax = plt.subplots()
    plt.imshow(im, extent=[0, 100, 0, 100])
    plt.plot()
    plt.show()

    ani = animation.FuncAnimation(fig, animate, blit=False, interval=10, repeat=False)
def main(_):
    count = []
    word2idx = {}

    if not os.path.exists(FLAGS.checkpoint_dir):  # if the checkpoint directory does not exist
        os.makedirs(FLAGS.checkpoint_dir)         # create it

    train_data = read_data('%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data('%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data('%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    # swapping the keys and values of word2idx gives the idx2word dict!
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)
    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
def main(_):
    count = []
    word2idx = {}

    train_data = read_data(
        '%s/%s.train.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    valid_data = read_data(
        '%s/%s.valid.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)
    test_data = read_data(
        '%s/%s.test.txt' % (FLAGS.data_dir, FLAGS.data_name), count, word2idx)

    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    FLAGS.nwords = len(word2idx)
    pp.pprint(flags.FLAGS.__flags)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()

        if FLAGS.is_test:
            model.run(valid_data, test_data)
        else:
            model.run(train_data, valid_data)
def update_frequency_factor_data(asset_df, frequency='m'):
    """
    Resample the factor data to the given frequency and save it, to speed up later reads
    """
    codes = asset_df.index
    for code in codes:
        fname = data.get_factor_filename(code, frequency='d')
        temp = data.read_data(fname)
        if frequency == "m":
            temp = temp.resample('M').last()
        if frequency == "y":
            temp = temp.resample('A').last()
        fname = data.get_factor_filename(code, frequency=frequency)
        temp.to_excel(fname)
def main(): """Entrypoint""" dataset = read_data('input.weight.txt') tasks = [ calc(data[0], [(data[i], data[i + 1]) for i in range(1, len(data), 2)]) for data in dataset ] width = max(len(x) for x, _ in tasks) collect = [ pad(x, (0, width - len(x)), mode='constant', constant_values=nan) for x, _ in tasks ] print('') print('Output:') print(mat(collect))
def main(): """Entrypoint""" raw = mat(read_data('input.super.txt')) count, diff = 0, 1 data = raw while count < 100 and diff > 0.5 * (10**-4): new = data * raw count += 1 diff = max([ row_diff([col for col in row if col != 0]) for row in new.tolist() ]) # pylint: disable=E1101 if isinf(diff): break data = new print(data)
def get_asset_factor_data(asset_df, factors, frequency='m'):
    """
    Get the assets' price and factor data
    frequency: m/y
    """
    print("getting asset factor data...")
    codes = asset_df.index
    dic = {}
    for code in codes:
        fname = data.get_factor_filename(code, frequency=frequency)
        temp = data.read_data(fname)
        temp = temp[factors + ["close"]]
        dic[code] = temp
    pnl = pd.Panel(dic)
    return pnl
def main(args):
    logging.info('Running trainer with {}'.format(args))

    language = led_parser.propositional_language()
    parser = data.Parser(language)
    n_ops = len(language.symbols)

    # construct a pwn using a treenn with a sat3 cell
    sat3 = csat.Sat3Cell(n_ops, args.num_units, args.batch_size, args.n_worlds)
    nn = treenn.TreeNN(sat3, parser, args.batch_size)
    possibleworldsnet = pwn.PossibleWorlds(nn, args.n_worlds, args.num_units)

    logging.info('N variables = {}'.format(
        np.sum([np.prod(var.shape) for var in possibleworldsnet.variables])))

    opt = tf.train.AdamOptimizer()
    checkpoint = tf.contrib.eager.Checkpoint(
        **{var.name: var for var in possibleworldsnet.variables})
    writer = tf.contrib.summary.create_file_writer(args.logdir)
    writer.set_as_default()

    for e in range(args.epochs):
        # Train
        for A, B, E in data.batch_data(
                data.read_data(os.path.join(args.datadir, 'train.txt')), args.batch_size):
            loss, grads, p = compute_step(possibleworldsnet, A, B, E)
            gnvs = zip(grads, possibleworldsnet.variables)
            step = tf.train.get_or_create_global_step()
            opt.apply_gradients(gnvs, global_step=step)

            logging.info('step: {} loss: {}'.format(step.numpy(), tf.reduce_mean(loss)))

            with tf.contrib.summary.record_summaries_every_n_global_steps(10):
                tf.contrib.summary.scalar('loss', loss)
                tf.contrib.summary.scalar('acc', accuracy(p, E))

        # Evaluate
        for test_name, test_set in data.fetch_test_sets(args.datadir, args.batch_size):
            logging.info('Eval: {}'.format(test_name))
            acc = np.mean([accuracy(possibleworldsnet(A, B), E) for A, B, E in test_set])
            tf.contrib.summary.scalar(test_name, acc)

        checkpoint.save(os.path.join(args.logdir, 'ckpt{}'.format(e)))
def run(): """ Run the experiment. """ is_ptr = False np.random.seed(RANDOM_SEED) max_val, max_length, pairs = read_data(name="test") np.random.shuffle(pairs) training_pairs = [tensors_from_pair(pair) for pair in pairs] data_dim = max_val + 1 hidden_dim = embedding_dim = 256 encoder = Encoder(input_dim=data_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device) if is_ptr: decoder = PtrDecoder(output_dim=data_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device) else: decoder = AttnDecoder(output_dim=data_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device) checkpoint = load_checkpoint("ptr" if is_ptr else "vanilla") if checkpoint: encoder.load_state_dict(checkpoint["encoder"]) decoder.load_state_dict(checkpoint["decoder"]) else: print("Count not find checkpoint file.") permutation_count, nondecreasing_count = 0, 0 for i in range(len(training_pairs)): input_tensor, target_tensor = training_pairs[i] output_tensor = evaluate(encoder=encoder, decoder=decoder, input_tensor=training_pairs[i][0], is_ptr=is_ptr) target, output = list(np.asarray( input_tensor.data).squeeze()), output_tensor[:-1] if is_permutation(target, output): permutation_count += 1 if nondecreasing(output) == 0: nondecreasing_count += 1 print("Permutation: %s" % (permutation_count / len(training_pairs))) print("Nondecreasing: %s" % (nondecreasing_count / len(training_pairs)))
def train_and_predict_by_textcnn(train_data_path, valid_data_path, vocab_path):
    """
    TextCNN model implemented in PyTorch. After 14 epochs the test-set accuracy is 0.702.
    :param train_data_path: path to the training set
    :param valid_data_path: path to the test set
    :param vocab_path: path to the vocabulary
    """
    tokenizer, label_map, data = read_data(train_data_path, valid_data_path, vocab_path)
    model = TextCNN(vocab_size=tokenizer.get_vocab_size(),
                    embedding_size=32,
                    max_seq_length=48,
                    kernel_num=32,
                    kernel_sizes=(2, 3, 4, 5),
                    dropout=0.3,
                    label_size=len(label_map))
    learner = Learner(data, model)
    learner.fit(epochs=20, init_lr=0.001, opt_fn=optim.Adam)
def main():
    dataset = 'abalone'
    data_path = '../data'
    out_path = '../out/rf/models'
    model_path = f'{out_path}/{dataset}.pkl'
    seed = 2019

    np.random.seed(seed)
    model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    data_dict = data.read_data(data_path, dataset, validation=False)
    trn_x = data_dict['trn_x']
    trn_y = data_dict['trn_y']

    model.fit(trn_x, trn_y)

    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(model, model_path)
def main():
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    df = data.read_data('Fri', 'subset-300')
    # df = df[(df.id % 3) == 0]
    p_id = 391338
    path = df[df['id'] == p_id]

    # plot_trajectories(ax=ax, df=path, show_map=True, colors='#2FB787', line_kw=dict(lw=2))
    plot_trajectories(ax=ax, df=path, show_map=True, line_kw=dict(lw=2))

    # for angle in range(0, 360, 10):
    #     print(angle)
    #     ax.view_init(30, angle)
    #     plt.savefig('Data61 angle {}.png'.format(angle), bbox_inches='tight')

    plt.show()
def predict_pipeline(params: PredictionPipelineParams):
    logger.info("Start prediction.")
    data = read_data(params.data_path)
    logger.info(f"Data loaded. Raw data shape: {data.shape}")

    pipeline = load_transformer(params.transformer_path)
    logger.info(f"Transformer loaded: {pipeline}")
    model = load_model(params.model_path)
    logger.info(f"Model loaded: {model}")

    train_features = make_features(pipeline, data)
    logger.info(f"Test features shape: {train_features.shape}")

    predictions = predict_model(train_features, model)
    predictions_path = save_prediction(predictions, params.output_path)
    logger.info(f"Predictions saved in {predictions_path}")
def main():
    path = os.getcwd()
    path_save, path_source = os.path.split(path)
    path_save = path_save + "/Results"
    if not os.path.exists(path_save):
        os.makedirs(path_save)

    # tweets, labels = data.read_data("../data/dataset_terremoto_iquique_2014.csv")
    # tweets, labels = data.read_data("../data/tweets-iquique-2014-tipo-informacion.csv")
    tweets_data = data.read_data("../data/results-random-inf.csv")
    # print(tweets_data[13])
    # group = 0
    for tweet_info in tweets_data:
        processed_tweets = process_tweet.process_tweets(tweets_data[tweet_info]["tweets"])

    # HERE THE "ACTIVE LEARNING" PROCESS BEGINS
    # group 0 is trained on its own initially; then, inside the loop, the remaining accumulated groups are trained
    print("entrenando grupo 0")
    best_score_svm, best_score_dt = data_split.train_cv_grid(
        tweets_data[0]["tweets"], tweets_data[0]["labels"], path_save, 0)

    tweets_data_new = {}
    dict_new = {}
    for group in tweets_data:
        for key in tweets_data[0].keys():
            if group == 0:
                dict_new[key] = tweets_data[group][key] + tweets_data[group + 1][key]
                tweets_data_new[group] = dict_new
            else:
                if (group + 1) in tweets_data:
                    dict_new[key] = dict_new[key] + tweets_data[group + 1][key]
                    tweets_data_new[group] = dict_new
        # print(dict_new)
        # X_train, X_test, y_train, y_test = train_test_split(dict_new["tweets"], dict_new["labels"],
        #                                                     stratify=dict_new["labels"],
        #                                                     test_size=0.20, random_state=199993)
        if group > 0:
            print("entrenando con grupo: ", group)
            best_score_svm, best_score_dt = data_split.train_cv_grid(
                dict_new["tweets"], dict_new["labels"], path_save, group)
def run(): """ Run the experiment. """ name = "train" is_ptr = True hidden_dim = embedding_dim = 256 n_epochs = 1 grad_clip = 2 teacher_force_ratio = 0.5 optimizer = optim.Adam optimizer_params = {} max_val, max_length, pairs = read_data(name) set_max_length(max_length) training_pairs = [tensors_from_pair(pair) for pair in pairs] data_dim = max_val + 1 encoder = Encoder(input_dim=data_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device) if is_ptr: decoder = PtrDecoder(output_dim=data_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device) else: decoder = AttnDecoder(output_dim=data_dim, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device) train(encoder=encoder, decoder=decoder, optim=optimizer, optim_params=optimizer_params, weight_init=weight_init, grad_clip=grad_clip, is_ptr=True, training_pairs=training_pairs, n_epochs=n_epochs, teacher_force_ratio=teacher_force_ratio, print_every=50, plot_every=50, save_every=100)
def main() -> None:
    """ main entry point for program """
    strategy = initialize()

    # read in the data (tokenized with tf-idf)
    x_train_padded, x_valid_padded, y_train, y_valid, max_len, word_indexes = read_data()

    # run base models
    # simple RNN model
    simple_rnn(strategy, x_train_padded, x_valid_padded, y_train, y_valid,
               max_len, len(word_indexes) + 1, EMBEDDING_SIZE_Y, EPOCHS_BASE)

    # create embeddings layer
    embeddings_output = build_embeddings(EMBEDDING_SIZE_Y, word_indexes)

    # train and run LSTM model
    run_lstm(strategy, x_train_padded, x_valid_padded, y_train, y_valid,
             max_len, len(word_indexes) + 1, EMBEDDING_SIZE_Y, embeddings_output, EPOCHS_BASE)

    # train and run GRU model
    run_gru(strategy, x_train_padded, x_valid_padded, y_train, y_valid,
            max_len, len(word_indexes) + 1, EMBEDDING_SIZE_Y, embeddings_output, EPOCHS_BASE)

    # train and run bidirectional LSTM model
    run_rnn(strategy, x_train_padded, x_valid_padded, y_train, y_valid,
            max_len, len(word_indexes) + 1, EMBEDDING_SIZE_Y, embeddings_output, EPOCHS_BASE)

    # max length for transformers models (different from base models above)
    attention_max_len = 192

    # read in attention data (into tensorflow datasets)
    x_train, x_valid, y_train, y_valid, train_dataset, \
        valid_dataset, test_dataset, batch_size = read_data_attention(strategy, attention_max_len)

    # build, train and test distilBERT model
    run_distilibert(strategy, x_train, x_valid, y_train, y_valid,
                    train_dataset, valid_dataset, test_dataset,
                    attention_max_len, EPOCHS_TRANSFORMERS, batch_size)

    # read second dataset (tokenized with xlm roberta tokenizer)
    x_train, x_valid, y_train, y_valid, train_dataset, \
        valid_dataset, test_dataset, batch_size = read_data_attention(strategy, attention_max_len)

    # build, train, and run xlm roberta
    run_roberta(strategy, x_train, x_valid, y_train, y_valid,
                train_dataset, valid_dataset, test_dataset,
                attention_max_len, EPOCHS_TRANSFORMERS, batch_size)
def doit():
    args = command_parser()
    print(args)
    states = args.states
    variables = args.vars
    datfilename = args.data[0] + ".csv"
    startdate = "2020-09-01"
    dim = [4, 2.5]

    if args.data:
        print("covid: saving data in " + datfilename)
        df = data.read_data(startdate, states, variables)
        df.to_csv(datfilename)

    if args.graph:
        graphfilename = args.graph[0]
        print("covid: generating graph from " + datfilename + " into " + graphfilename)
        df = (pd.read_csv(datfilename, parse_dates=['date'])
                .query("state in @states"))
        graph.graph_b(df, states, variables, graphfilename, dim)
def test_batch_image():
    data = read_data(gt_path='E:\All_My_Files\All_My_DataSets\derain_datasets\Derain_ML_Proj\\final_testset\ground_truth\\',
                     rain_path='E:\All_My_Files\All_My_DataSets\derain_datasets\Derain_ML_Proj\\final_testset\\rainy_image\\',
                     num_channel=3,
                     size_input=128,
                     batch_size=1)
    rain_image, gt_image = data

    out1 = torch.squeeze(rain_image)
    out1 = norm_range(out1, None)
    out1 = out1.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()  # W H C
    imag1 = Image.fromarray(out1)  # W H C

    out2 = torch.squeeze(gt_image)
    out2 = norm_range(out2, None)
    out2 = out2.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()  # W H C
    imag2 = Image.fromarray(out2)  # W H C

    imag1.save('C:\\Users\Administrator\Desktop\Derain_ML_proj_v2.5\\test\output\\rain.jpg')
    imag2.save('C:\\Users\Administrator\Desktop\Derain_ML_proj_v2.5\\test\output\\gt.jpg')
def train(feature_transform):
    inputs, labels = data.read_data()

    # shuffle data
    random.seed(1234)
    shuffled = zip(inputs, labels)
    random.shuffle(shuffled)
    inputs = numpy.asarray([example[0] for example in shuffled])
    labels = [numpy.asarray([1 if example[1] == label else 0 for example in shuffled])
              for label in range(4)]

    if feature_transform:
        inputs = numpy.log2(inputs)

    training_steps = 20000

    x = [T.matrix('x{}'.format(label)) for label in range(4)]
    y = [T.vector('y{}'.format(label)) for label in range(4)]
    w = [theano.shared(numpy.random.randn(16), name='w{}'.format(label)) for label in range(4)]
    b = [theano.shared(0., name='b{}'.format(label)) for label in range(4)]

    # Construct Theano expression graph
    p_1 = [1 / (1 + T.exp(-T.dot(x[l], w[l]) - b[l])) for l in range(4)]               # Probability that target = 1
    xent = [-y[l] * T.log(p_1[l]) - (1 - y[l]) * T.log(1 - p_1[l]) for l in range(4)]  # Cross-entropy loss function
    cost = [xent[l].mean() + 0.01 * (w[l] ** 2).sum() for l in range(4)]               # The cost to minimize
    gwb = [T.grad(cost[l], [w[l], b[l]]) for l in range(4)]                            # Compute the gradient of the cost

    train = [theano.function(
        inputs=[x[l], y[l]],
        outputs=[p_1[l], xent[l]],
        updates=((w[l], w[l] - 0.1 * gwb[l][0]),
                 (b[l], b[l] - 0.1 * gwb[l][1])))
        for l in range(4)]
    predict = [theano.function(inputs=[x[l]], outputs=p_1[l]) for l in range(4)]

    # Train one binary classifier per label
    for l in range(4):
        print('Training for label {}'.format(l))
        for i in range(training_steps):
            pred, err = train[l](inputs, labels[l])

    return predict
def do_classification_experiments(dataset='tasa/TASA900',
                                  graph_types=['co-occurrence', 'dependency', 'random'],
                                  use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset': dataset, '_evaluation': 'classification'}

    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(vectors, labels)

    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(vectors, labels)

    print
    pp.pprint(results)
    return results
def main():
    name, print_names = parse_options(argv)
    results, features, feature_names = read_data(name)
    features = np.hstack((features, results.reshape((len(results), 1))))
    correlation = find_correlation(features)
    correlationorder = sorted(correlation,
                              key=lambda x: abs(correlation.get(x)[0]),
                              reverse=True)

    # print("Feature numbers")
    # print()
    # print("number\tname")
    # print("------\t----")
    # for i in range(len(feature_names)):
    #     print("{}\t{}".format(i, feature_names[i]))
    # print()
    # print()
    # print("Correlations")
    # print()
    # print("ft1\tft2\tpearson")
    # print("---\t---\t-------")
    # for c in correlationorder:
    #     print("{}\t{}\t{}".format(c[0], c[1], correlation[c][0]))

    for c in correlationorder:
        if print_names:
            print('"{}","{}",{}'.format(feature_names[c[0]], feature_names[c[1]], correlation[c][0]))
        else:
            print('{},{},{}'.format(c[0], c[1], correlation[c][0]))
def main():
    relieff_file = open(RELIEFF_FILENAME, "r")
    relieff_features = parse_relieff_list(relieff_file.readlines())
    relieff_features.sort(key=lambda x: x[0], reverse=True)

    results, features, feature_names = read_data(FEATURE_FILENAME)
    # features = np.hstack((features, results.reshape((len(results), 1))))
    correlations = find_correlation(features)
    # print("\n".join("{}: {}".format(x, correlations[x]) for x in correlations))
    # return

    for threshold in frange(1.0, 0.0, -0.1):
        selected = select_features(features, relieff_features, correlations, threshold)
        print("============================================================")
        print("THRESHOLD =", threshold)
        print("Count:", len(selected))
        print("\n".join(["{} ({})".format(x, feature_names[x]) for x in selected]))
        print()
        print()
        print()
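# A minimal sketch (assumption: the original frange helper used above is not shown in this
# snippet) of a float-valued range generator compatible with frange(1.0, 0.0, -0.1).
def frange(start, stop, step):
    """Yield floats from start towards stop (exclusive) in increments of step."""
    x = start
    if step > 0:
        while x < stop:
            yield x
            x += step
    else:
        while x > stop:
            yield x
            x += step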
def train_with_val(learning_rate=0.13, n_epochs=3, batch_size=600):
    inputs, labels = data.read_data()

    # shuffle data
    random.seed(1234)
    shuffled = zip(inputs, labels)
    random.shuffle(shuffled)
    inputs = [example[0] for example in shuffled]
    labels = [example[1] for example in shuffled]

    def shared_dataset(data_x, data_y):
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=True)
        return (shared_x, T.cast(shared_y, 'int32'))

    # 80/10/10 train/validation/test split
    train_set_x, train_set_y = shared_dataset(inputs[:int(0.8 * len(inputs))],
                                              labels[:int(0.8 * len(inputs))])
    valid_set_x, valid_set_y = shared_dataset(inputs[int(0.8 * len(labels)):int(0.9 * len(inputs))],
                                              labels[int(0.8 * len(labels)):int(0.9 * len(inputs))])
    test_set_x, test_set_y = shared_dataset(inputs[int(0.9 * len(inputs)):],
                                            labels[int(0.9 * len(inputs)):])

    # train_set_x = numpy.asarray(inputs[:int(0.8 * len(inputs))])
    # train_set_y = numpy.asarray(inputs[:int(0.8 * len(labels))])
    # valid_set_x = numpy.asarray(inputs[int(0.8 * len(labels)):int(0.9 * len(labels))])
    # valid_set_y = numpy.asarray(inputs[int(0.8 * len(labels)):int(0.9 * len(labels))])
    # test_set_x = numpy.asarray(inputs[int(0.9 * len(inputs)):])
    # test_set_y = numpy.asarray(inputs[int(0.9 * len(labels)):])

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a minibatch)
    x = T.matrix('x')   # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'

    # early-stopping parameters
    patience = 5000                # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model
                    with open(REFLEX_MODEL_PICKLE, 'w') as f:
                        cPickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            ' with test performance %f %%'
        ) % (best_validation_loss * 100., test_score * 100.)
    )
import tensorflow as tf

from data import read_data
from model import MemN2N

count = []
word2idx = {}

train_data = read_data('data/ptb.train.txt', count, word2idx)
valid_data = read_data('data/ptb.valid.txt', count, word2idx)
test_data = read_data('data/ptb.test.txt', count, word2idx)
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

params = {
    'show': True,
    'nhop': 3,
    'edim': 150,
    'lindim': 75,
    'mem_size': 100,
    'batch_size': 125,
    'max_grad_norm': 50,
    'init_hid': 0.01,
    'nwords': 10000,
}

if __name__ == '__main__':
    with tf.Session() as sess:
        model = MemN2N(params, sess)
        model.build_model()
        model.run(train_data, test_data, 100)
        (query), sent.answer))
    return d


if __name__ == '__main__':
    import data

    vocab = collections.defaultdict(lambda: len(vocab))
    # data_dir = '/home/unno/qa/tasks_1-20_v1-2'
    data_dir = '../../data/tasks_1-20_v1-2'
    data_type = 'en'
    for data_id in range(1, 21):
        train_data = data.read_data(
            vocab,
            glob.glob('%s/%s/qa%d_*train.txt' % (data_dir, data_type, data_id))[0])
        test_data = data.read_data(
            vocab,
            glob.glob('%s/%s/qa%d_*test.txt' % (data_dir, data_type, data_id))[0])
        print('Training data: %d' % len(train_data))

        gpu = 0
        train_data = convert_data(train_data, gpu)
        test_data = convert_data(test_data, gpu)

        model = MemNN(20, len(vocab), 50)
        opt = optimizers.Adam()
        # opt = optimizers.SGD(lr=0.01)
        # opt.add_hook(chainer.optimizer.GradientClipping(40))
        batch_size = 100