def main():
    preprocess.main()
    nodes = []
    sentences = []
    with open('sentences.txt') as f:
        while True:
            line = f.readline()
            if line == '\n' or line == '':
                break
            nodes.append(sentence_node(0, line.strip('\n')))
    print len(nodes)
    for x in range(len(nodes)):
        sentences.append(nodes[x].sentence)

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    G = nx.Graph()
    for x in range(len(nodes)):
        G.add_node(x)
    # G.add_nodes_from(nodes)
    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i < j and similarity_matrix[i][j] != 0:
                G.add_edge(i, j, weight=similarity_matrix[i][j])

    for i in range(len(nodes)):
        if len(G[i]) == 0:
            print "No out edges"

    pr = nx.pagerank(G, alpha=0.85)
    # print pr
    sorted_pr = sorted(pr.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_pr[:10]
    for item in sorted_pr[:10]:
        print nodes[item[0]].sentence
def preprocess_lm_data(data_dir):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args([
        '--only-source',
        '--trainpref', os.path.join(data_dir, 'train.out'),
        '--validpref', os.path.join(data_dir, 'valid.out'),
        '--testpref', os.path.join(data_dir, 'test.out'),
        '--destdir', data_dir,
    ])
    preprocess.main(preprocess_args)
def preprocess_lm_data(data_dir):
    preprocess_parser = preprocess.get_parser()
    preprocess_args = preprocess_parser.parse_args([
        '--only-source',
        '--trainpref', os.path.join(data_dir, 'train.out'),
        '--validpref', os.path.join(data_dir, 'valid.out'),
        '--testpref', os.path.join(data_dir, 'test.out'),
        '--destdir', data_dir,
    ])
    preprocess.main(preprocess_args)
def preprocess_data(self, data_dir):
    preprocess_parser = preprocess.get_parser()
    preprocess_args = preprocess_parser.parse_args([
        '--source-lang', 'in',
        '--target-lang', 'out',
        '--trainpref', os.path.join(data_dir, 'train'),
        '--validpref', os.path.join(data_dir, 'valid'),
        '--testpref', os.path.join(data_dir, 'test'),
        '--thresholdtgt', '0',
        '--thresholdsrc', '0',
        '--destdir', data_dir,
    ])
    preprocess.main(preprocess_args)
def preprocess_translation_data(data_dir, extra_flags=None):
    preprocess_parser = options.get_preprocessing_parser()
    preprocess_args = preprocess_parser.parse_args(
        [
            '--source-lang', 'in',
            '--target-lang', 'out',
            '--trainpref', os.path.join(data_dir, 'train'),
            '--validpref', os.path.join(data_dir, 'valid'),
            '--testpref', os.path.join(data_dir, 'test'),
            '--thresholdtgt', '0',
            '--thresholdsrc', '0',
            '--destdir', data_dir,
        ] + (extra_flags or []),
    )
    preprocess.main(preprocess_args)
def preprocess_translation_data(data_dir, extra_flags=None):
    preprocess_parser = preprocess.get_parser()
    preprocess_args = preprocess_parser.parse_args(
        [
            '--source-lang', 'in',
            '--target-lang', 'out',
            '--trainpref', os.path.join(data_dir, 'train'),
            '--validpref', os.path.join(data_dir, 'valid'),
            '--testpref', os.path.join(data_dir, 'test'),
            '--thresholdtgt', '0',
            '--thresholdsrc', '0',
            '--destdir', data_dir,
        ] + (extra_flags or []),
    )
    preprocess.main(preprocess_args)
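# The wrappers above differ only in quoting style and in where the parser comes
# from (older fairseq exposed preprocess.get_parser(); newer versions use
# options.get_preprocessing_parser()). A minimal, hedged driver sketch of how
# such helpers are typically exercised from a test; create_dummy_data is a
# hypothetical helper that writes train/valid/test files into data_dir, not a
# function taken from the snippets above.
import tempfile

def example_binarize_dummy_corpus():
    with tempfile.TemporaryDirectory() as data_dir:
        create_dummy_data(data_dir)            # hypothetical: writes train/valid/test .in/.out files
        preprocess_translation_data(data_dir)  # binarize the paired data
        preprocess_lm_data(data_dir)           # binarize the target side for LM scoring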
def test_fl_sms(isolated_filesystem):
    os.chdir("advanced/Federated SMS Spam prediction/")
    Path("data").mkdir(parents=True, exist_ok=True)
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
    urllib.request.urlretrieve(url, "data.zip")
    with ZipFile("data.zip", "r") as zipObj:
        # Extract all the contents of the zip file in current directory
        zipObj.extractall()
    import preprocess
    preprocess.main()
    res = pm.execute_notebook(
        "Federated SMS Spam prediction.ipynb",
        "/dev/null",
        parameters={"epochs": 1},
        timeout=300,
    )
    assert isinstance(res, nbformat.notebooknode.NotebookNode)
def createDataDump():
    data = {}
    data['docList'], data['fullText'], data['classDict'] = preprocess.main()
    data['vocabList'] = createVocabList(data['docList'])
    f = open('yyy_all_data.pkl', 'wb')
    pickle.dump(data, f)
    f.close()
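# Counterpart sketch for the dump above: reload the pickled dictionary instead
# of re-running preprocess.main(). Only the file name and the keys ('docList',
# 'fullText', 'classDict', 'vocabList') come from createDataDump(); the helper
# itself is illustrative.
import pickle

def loadDataDump(path='yyy_all_data.pkl'):
    with open(path, 'rb') as f:
        return pickle.load(f)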
def get_all_tweets(screen_name):
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []
    result = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    user = api.get_user(screen_name=screen_name)
    location = user.location
    lang = user.lang
    print location
    if location and lang == "en":
        print 1
        new_tweets = api.user_timeline(screen_name=screen_name, count=200)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # save the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        # keep grabbing tweets until there are no tweets left to grab
        while len(new_tweets) > 0:
            # print "getting tweets before %s" % (oldest)
            # all subsequent requests use the max_id param to prevent duplicates
            new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

            # save most recent tweets
            alltweets.extend(new_tweets)

            # update the id of the oldest tweet less one
            oldest = alltweets[-1].id - 1
            # print "...%s tweets downloaded so far" % (len(alltweets))

        # transform the tweepy tweets into a 2D array that will populate the csv
        for count in range(len(alltweets)):
            char = alltweets[count].text.encode("utf-8")
            char = main(char)
            result.append(char)
        outtweets = [[alltweets[i].id_str, alltweets[i].created_at, result[i], location, screen_name]
                     for i in range(len(alltweets))]

        # write the csv
        direct = '/home/minghao/Downloads/big data proj/unsupervised/data_unsupervised'
        d = dirname(dirname(abspath(__file__)))
        with open(direct + '/%s_tweets.csv' % screen_name, 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(["id", "created_at", "text", "location", "screen_name"])
            writer.writerows(outtweets)
    pass
def eval_face(input_dir):
    """
    function that recognizes the face on a picture.
    input: location of the picture. (str)
    output: predicted label of the picture. (str)
    errors:
        + picture_name does not exist/is not an image
            - throw type error
        + no positive match to any of the labels
            - throw nonexistent error
    """
    # process the photo.
    pre.main(input_dir, input_dir, 180)
    # evaluate the photo with the model.
    return main(input_dir, conf["model_path"], conf["classifier_output_path"],
                conf["batch_size"], conf["num_threads"], conf["num_epochs"],
                conf["min_num_images_per_class"], conf["split_ratio"], False)
def main():
    raw_data, raw_data, duplicate_sets, question_texts = preprocess.main()
    number_of_categories = len(duplicate_sets)
    tokenized_sentences, word_index = tokenize_data(question_texts.values())
    # Y_processed = to_categorical(np.asarray(Y_raw), 2)
    embedded_sequences = make_embedding_layer(word_index)
    model = make_model(embedded_sequences, number_of_categories)
def test_train_mode(self):
    """Runs pipeline in train mode outputting train, test and eval filesets."""
    test_pipeline = TestPipeline()
    # Set extra options to the pipeline for test purpose
    test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
    self.addCleanup(shutil.rmtree, test_dir)
    # Checks that pipeline reaches state "Done"
    pipeline_verifiers = [PipelineStateMatcher()]
    extra_opts = {
        'project': PROJECT,
        'output_path': test_dir,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'runner': 'DirectRunner',
    }
    res = preprocess.main(
        test_pipeline.get_full_options_as_args(**extra_opts),
        query=self.TEST_QUERY,
        await_completion=True)

    # Check counts coming out of GetFirstClaim step.
    parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success')
    self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt)

    # Check counts coming out of AddFeatures step.
    add_features_cnt = get_pipeline_metric(res, 'create_features_success')
    self.assertEqual(self.TOTAL_RECORDS, add_features_cnt)

    # Check counts coming out of AddLabel step.
    broad_cnt = get_pipeline_metric(res, 'add_label_broad')
    narrow_cnt = get_pipeline_metric(res, 'add_label_narrow')
    self.assertEqual(self.TOTAL_RECORDS, broad_cnt + narrow_cnt)

    # Check the number of records coming out of the Train/Test split step.
    splits = ['train_cnt', 'eval_cnt', 'test_cnt']
    train_test_split_cnt = sum([get_pipeline_metric(res, m) for m in splits])
    self.assertEqual(self.TOTAL_RECORDS, train_test_split_cnt)

    # Check if number of protos created matched output of train/test split.
    create_proto_success = sum([
        get_pipeline_metric(res, 'create_proto_success', index=i)
        for i in range(3)
    ])
    self.assertEqual(self.TOTAL_RECORDS, create_proto_success)

    # Open a tf Example and check fields.
    example = read_example_proto(test_dir)
    for feature_name in preprocess.FEATURE_NAMES:
        self.assertGreaterEqual(get_tf_feature(example, feature_name), 0)

    # Make sure label feature is present.
    labels = ['broad', 'narrow']
    self.assertIn(get_tf_feature(example, 'label', 'bytes_list'), labels)
def infer_anomaly_model(clf, infer_data):
    print("<<<<<< preprocess data")
    df = preprocess.main(infer_data)
    X_test = np.array(df.iloc[:, 1:])
    X_test = X_test.astype('int')

    # normalize
    my_scaler = joblib.load('./scaler.gz')
    X_test_std = my_scaler.transform(X_test)

    y_test_pred = clf.predict(X_test_std)
    print(y_test_pred)
    return y_test_pred
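# Hedged calling sketch for infer_anomaly_model(): the snippet loads its scaler
# from './scaler.gz' but receives the classifier from the caller, so a driver
# presumably restores a persisted model first. The './model.gz' path and the
# input file name are assumptions, not taken from the snippet above.
import joblib

clf = joblib.load('./model.gz')                      # assumed location of the trained detector
y_pred = infer_anomaly_model(clf, 'new_events.csv')  # placeholder input file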
def add_face(input_dir):
    """
    function that retrains the model to add a new face.
    input: location of the images. (str)
    output: error code, 0 otherwise. (exception, none)
    errors:
        + pictures_folder is not a folder
            - throw type error
        + pictures_folder contains a non-image
            - fail silently and continue for the rest
        + pictures_folder does not have [enough] pictures
            - throw exception and exit
        + pictures_folder contains pictures that are not from the same people
            - this kills the model.
    """
    # process the photos.
    pre.main(input_dir, input_dir, 180)
    # load the original embeddings, retrain the model, and save the new model
    # and embeddings to the same directory.
    main(input_dir, conf["model_path"], conf["classifier_output_path"],
         conf["batch_size"], conf["num_threads"], conf["num_epochs"],
         conf["min_num_images_per_class"], conf["split_ratio"], True)
def main(input_file, file_type, label_col, model_file):
    if file_type == 'file':
        print("<<<<<< preprocess data")
        df = preprocess.main(input_file)
    if file_type == 'folder':
        print("<<<<<< preprocess data")
        df = preprocess.process_file_list(input_file)

    print("<<<<< data split")
    y_train, y_test, X_train, X_test = split_data(df, label_col)

    # data normalization
    mm = MinMaxScaler()
    mm.fit(X_train)
    joblib.dump(mm, './scaler.gz')
    X_train_std = mm.transform(X_train)
    X_test_std = mm.transform(X_test)

    for model_name in ['KNN', 'XGBOD']:
        print("<<<<< model: ", model_name)
        model_test(model_name, y_train, y_test, X_train_std, X_test_std,
                   model_file, '0')
def test_inference_mode(self):
    """Runs a pipeline in inference mode which should output one fileset."""
    test_pipeline = TestPipeline()
    # Set extra options to the pipeline for test purpose
    test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
    self.addCleanup(shutil.rmtree, test_dir)
    # Checks that pipeline reaches state "Done"
    pipeline_verifiers = [PipelineStateMatcher()]
    extra_opts = {
        'project': PROJECT,
        'output_path': test_dir,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'runner': 'DirectRunner',
        'pipeline_mode': 'inference',
    }
    res = preprocess.main(
        test_pipeline.get_full_options_as_args(**extra_opts),
        query=self.TEST_QUERY,
        await_completion=True)

    # Check counts coming out of GetFirstClaim step.
    parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success')
    self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt)

    # Ensure a proto is created for all input records
    create_proto_success = get_pipeline_metric(res, 'create_proto_success')
    self.assertEqual(self.TOTAL_RECORDS, create_proto_success)

    # Open a tf Example and check fields.
    example = read_example_proto(test_dir)
    for feature_name in preprocess.FEATURE_NAMES:
        self.assertGreaterEqual(get_tf_feature(example, feature_name), 0)

    # Make sure label feature is not present since we are in inference.
    with self.assertRaises(IndexError):
        get_tf_feature(example, 'label', 'bytes_list')
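# Both pipeline tests above lean on a get_pipeline_metric() helper that is not
# shown. A plausible sketch using Apache Beam's metrics API; the real helper in
# the original codebase may differ.
from apache_beam.metrics.metric import MetricsFilter

def get_pipeline_metric(result, metric_name, index=0):
    # Query committed counter values by name from a Beam PipelineResult.
    counters = result.metrics().query(
        MetricsFilter().with_name(metric_name))['counters']
    return counters[index].committed if counters else 0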
def main(argv):
    '''controls the over-arching implementation of the algorithms'''
    directory = argv[0]
    features = argv[1]
    algorithms = argv[2]

    # parsing
    print("parsing json data...")
    clusters, order, data, test_clusters, test_order, test_data, corpusdict = \
        parse_json.main([directory])

    # preprocessing
    vocab = preprocess.main([features, corpusdict])

    # featurization step 1
    print("generating observations and features...")
    train_scores = observations.main([clusters, order, data, directory, features, vocab])
    test_scores = observations.main([test_clusters, test_order, test_data, directory, features, vocab])

    # featurization step 2
    print("generating training and testing data...")
    train_data, train_target = features_and_labels.main([train_scores, features])
    test_data, test_target = features_and_labels.main([test_scores, features])

    # modeling
    print("running algorithms...")
    if algorithms.log_reg:
        predicted_labels, perform_results = log_reg.main([train_data, train_target, test_data, test_target])
    if algorithms.svm:
        predicted_labels, perform_results = svm.main([train_data, train_target, test_data, test_target])

    # results
    print("Algorithm details and Results:")
    print(perform_results)
def main(): """ Test for the feature extraction class :return: """ import preprocess ftr = FeatureExtraction(6) filename = "all_tweets.txt" lines = preprocess.main(filename) all_tweets = " ".join([" ".join(line[1]) for line in lines]) print "The most frequent bigrams are :", ftr.most_frequent_bigrams(all_tweets) print "The most frequent unigrams are :", ftr.most_frequent_unigrams(all_tweets) hashtag_dic = PatternsFeatures().pattern_classifier(lines, '#') print 'The 10 most frequent hashtags', PatternsFeatures().get_most_frequent_pattern(hashtag_dic) print "number of tweets without hashtag is %d, it's %d percent of the data set" % (len(hashtag_dic['no_pattern_tweet']), int(100*len(hashtag_dic['no_pattern_tweet'])/len(lines))) name_dic = PatternsFeatures().pattern_classifier(lines, '@') print 'The 10 most frequent usernames: ', PatternsFeatures().get_most_frequent_pattern(name_dic) print "number of tweets without a user name is %d, it's %d percent of the data set" % (len(name_dic['no_pattern_tweet']), int(100*len(name_dic['no_pattern_tweet'])/len(lines)))
def lm_scoring(preprocess_directory, bpe_status, gen_output, pre_gen,
               cur_lm_dict, cur_lm_name, cur_language_model, cur_lm_bpe_code,
               batch_size, lm_score_file, target_lang, source_lang, prefix_len=None):
    if prefix_len is not None:
        assert bpe_status == "different", "bpe status must be different to use prefix len"
    if bpe_status == "no bpe":
        # run lm on output without bpe
        write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                          gen_output.no_bpe_target,
                          pre_gen + "/rescore_data_no_bpe.de",
                          pre_gen + "/rescore_data_no_bpe.en",
                          pre_gen + "/reference_file_no_bpe")

        preprocess_lm_param = ["--only-source",
                               "--trainpref", pre_gen + "/rescore_data_no_bpe." + target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_directory]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [preprocess_directory,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--max-tokens", "1024",
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]
        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)

    elif bpe_status == "shared":
        preprocess_lm_param = ["--only-source",
                               "--trainpref", pre_gen + "/rescore_data." + target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_directory]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [preprocess_directory,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]
        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)

    elif bpe_status == "different":
        rescore_file = pre_gen + "/rescore_data_no_bpe"
        rescore_bpe = pre_gen + "/rescore_data_new_bpe"
        rescore_file += "."
        rescore_bpe += "."
        write_reprocessed(gen_output.no_bpe_source, gen_output.no_bpe_hypo,
                          gen_output.no_bpe_target,
                          rescore_file + source_lang,
                          rescore_file + target_lang,
                          pre_gen + "/reference_file_no_bpe",
                          bpe_symbol=None)

        # apply LM bpe to nbest list
        bpe_src_param = ["-c", cur_lm_bpe_code,
                         "--input", rescore_file + target_lang,
                         "--output", rescore_bpe + target_lang]
        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_src_param,
                        shell=False)
        # uncomment to use fastbpe instead of subword-nmt bpe
        # bpe_src_param = [rescore_bpe + target_lang, rescore_file + target_lang, cur_lm_bpe_code]
        # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False)

        preprocess_dir = preprocess_directory
        preprocess_lm_param = ["--only-source",
                               "--trainpref", rescore_bpe + target_lang,
                               "--srcdict", cur_lm_dict,
                               "--destdir", preprocess_dir]
        preprocess_parser = options.get_preprocessing_parser()
        input_args = preprocess_parser.parse_args(preprocess_lm_param)
        preprocess.main(input_args)

        eval_lm_param = [preprocess_dir,
                         "--path", cur_language_model,
                         "--output-word-probs",
                         "--batch-size", str(batch_size),
                         "--max-tokens", "1024",
                         "--sample-break-mode", "eos",
                         "--gen-subset", "train"]
        eval_lm_parser = options.get_eval_lm_parser()
        input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

        with open(lm_score_file, 'w') as f:
            with redirect_stdout(f):
                eval_lm.main(input_args)
print "+-----------------------------------------------------------------+" ################################################################################ print " time taken for the classification process %f sec " % (time() - t0) ##################################################################################################### x_axis = [i for i in range(10, 200, 20)] plt.figure(facecolor='white') fig1, = plt.plot(x_axis, accuracy_list_nb, 'r*-', label='Naive bayes accuracy') fig2, = plt.plot(x_axis, f_measure_list_nb, 'ro-', label='Naive bayes f-measure') fig3, = plt.plot(x_axis, accuracy_list_svm, 'g*-', label='SVM accuracy') fig4, = plt.plot(x_axis, f_measure_list_svm, 'go-', label='SVM f-measure') fig5, = plt.plot(x_axis, accuracy_list_maxent, '*-', label='max Entropy accuracy') fig6, = plt.plot(x_axis, f_measure_list_maxent, 'o-', label='max Entropy f-measure') plt.xlabel('Number of features') plt.ylabel('Results') plt.title('Results of the classification using unigrams and bigrams') plt.legend(handles=[fig1, fig2, fig3, fig4, fig5, fig6], loc=4) plt.show() t0 = time() filename = 'all_tweets.txt' lines = preprocess.main(filename) bigram_evaluation(lines) unigram_evaluation(lines) uni_and_bi_validation(lines)
KEEP_PROB = tf.placeholder(tf.float32)
### dropout end

### network definition starts here
with slim.arg_scope(xception_arg_scope()):
    Y_prediction, end_points = xception(X, num_classes=16, is_training=Is_training,
                                        scope='xception', keep_prob=KEEP_PROB)
### network definition ends here
Y_softmax = tf.nn.softmax(Y_prediction)

initialization()   # set up the training and test sets, get their sizes, and look up the class for each id
preprocess.main()  # create the TFRecord files

variables_to_restore = slim.get_variables_to_restore()
### saver
saver = tf.train.Saver(variables_to_restore, max_to_keep=1)  # save all variables, keeping at most one checkpoint
model_file = tf.train.latest_checkpoint('./save/')           # try to load the latest saved training result

with open("./prediction-split-softmax.csv", 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["file_id", "Blues", "Classical", "Country", "Easy Listening",
                     'Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental',
                     'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock',
                     'Soul-RnB', "Spoken"])
    with tf.Session() as sess:
        # load the latest model
        if model_file != None:
            saver.restore(sess, model_file)
import preprocess as p
import rbm
import tensorflow as tf
import numpy as np

input_matrix, labels = p.main()
print "Input matrix shape = ", input_matrix.shape[0]
print "labels shape = ", labels.shape[0]
print labels[0, 0:10]

#for row in input_matrix:
visible = input_matrix[0]
hidden = labels[0]
vis = tf.Variable(visible)
r = rbm.RBM("chr0.0", visible.shape[0], hidden.shape[0])

with tf.Session() as session:
    # Run the model
    #session.run(r)
    session.run(r.propup(vis))
    # Run just the variable y and print
    #print(session.run(y))
    #sess.Run(rbm
    #x = RBM("test",
# excerpt: body of the camera frame loop
    # get rects
    rects = img.find_rects(threshold=0)
    if len(rects) == 0:
        continue

    # draw raw rects
    for k, r in enumerate(rects):
        c = r.corners()
        for i, p in enumerate(c):
            p_ = c[i - 1]
            if draw:
                img.draw_line(p[0], p[1], p_[0], p_[1], 5, color=(0, 0, 0))

    try:
        theta, translation = preprocess.main(rects, img)
    except NoEdgeException as e:
        print("NoEdgeException")
        continue
    except NotEnoughDataException as e:
        print("NotEnoughDataException")
        continue
    except NoRectException as e:
        print("NoRectException")
        continue

    # gRotation = getGlobalRotation(gRotation, lRotation, theta)
    # protocol.feedGlobalRotation(gRotation, pyb.millis() - startTime, frame_id)
    protocol.feedLocalRotation(theta, pyb.millis() - startTime, frame_id)
    lRotation = theta
def UpdateData(self, dataLocation):
    preprocess.main(dataLocation)
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /CVSROOT/boost/libs/mpl/preprocessed/preprocess_set.py,v $
# $Date: 2007/10/29 07:32:56 $
# $Revision: 1.1.1.1 $

import preprocess

preprocess.main(["plain"], "set", "boost\\mpl\\set\\aux_\\preprocessed")
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /CVSROOT/boost/libs/mpl/preprocessed/preprocess_map.py,v $
# $Date: 2007/10/29 07:32:56 $
# $Revision: 1.1.1.1 $

import preprocess

preprocess.main(["plain", "typeof_based", "no_ctps"], "map",
                "boost\\mpl\\map\\aux_\\preprocessed")
#!/usr/bin/env python3

import numpy as np
import matplotlib.pyplot as plt
import icepack, icepack.plot

# This function pulls in the mesh and observational data that we'll use.
import preprocess
preprocess.main()

# Read in the observational data.
vx_obs = icepack.read_arc_ascii_grid(open("ross-vx.txt", "r"))
vy_obs = icepack.read_arc_ascii_grid(open("ross-vy.txt", "r"))
h_obs = icepack.read_arc_ascii_grid(open("ross-h.txt", "r"))

mesh = icepack.read_msh("ross.msh")

fig, ax = plt.subplots()
ax.set_aspect('equal')
icepack.plot.plot_mesh(ax, mesh)
plt.show(fig)

discretization = icepack.make_discretization(mesh, 1)
v = icepack.interpolate(discretization, vx_obs, vy_obs)
h = icepack.interpolate(discretization, h_obs)

# Make a dumb guess for the ice temperature. In "real life", you would want to
# use an inverse method that would tune the temperature to fit observations.
theta = icepack.interpolate(discretization, lambda x: 253.0)
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /cvsroot/boost/boost/libs/mpl/preprocessed/preprocess_set.py,v $
# $Date: 2006/11/23 19:57:11 $
# $Revision: 1.2.8.1 $

import preprocess
import os.path

preprocess.main(["plain"], "set",
                os.path.join("boost", "mpl", "set", "aux_", "preprocessed"))
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /home/project/cvs/hivm/version2/trunk/src/3rd_party/build/boost_1_33_1/libs/mpl/preprocessed/preprocess_list.py,v $
# $Date: 2006/08/07 05:34:09 $
# $Revision: 1.1 $

import preprocess

preprocess.main(["plain"], "list", "boost\\mpl\\list\\aux_\\preprocessed")
def gen_and_reprocess_nbest(args):
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert args.right_to_left1 is False, "prefix length not compatible with right to left models"
        assert args.right_to_left2 is False, "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = os.path.join(os.path.dirname(__file__)) + "/rerank_data/" + args.data_dir_name
    if not os.path.exists(store_data):
        os.makedirs(store_data)

    pre_gen, left_to_right_preprocessed_dir, right_to_left_preprocessed_dir, \
        backwards_preprocessed_dir, lm_preprocessed_dir = \
        rerank_utils.get_directories(args.data_dir_name, args.num_rescore, args.gen_subset,
                                     args.gen_model_name, args.shard_id, args.num_shards,
                                     args.sampling, args.prefix_len, args.target_prefix_frac,
                                     args.source_prefix_frac)

    assert not (args.right_to_left1 and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2 and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None and args.target_prefix_frac is not None), \
        "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    if not os.path.exists(pre_gen):
        os.makedirs(pre_gen)

    rerank1_is_gen = args.gen_model == args.score_model1 and args.source_prefix_frac is None
    rerank2_is_gen = args.gen_model == args.score_model2 and args.source_prefix_frac is None

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store preprocessed nbest list for reranking
    if not os.path.exists(left_to_right_preprocessed_dir):
        os.makedirs(left_to_right_preprocessed_dir)
    if not os.path.exists(right_to_left_preprocessed_dir):
        os.makedirs(right_to_left_preprocessed_dir)
    if not os.path.exists(lm_preprocessed_dir):
        os.makedirs(lm_preprocessed_dir)
    if not os.path.exists(backwards_preprocessed_dir):
        os.makedirs(backwards_preprocessed_dir)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen, args.prefix_len, args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1)
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen, args.prefix_len, args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2)

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list
    else:
        if not os.path.isfile(predictions_bpe_file):
            print("STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            param1 = [args.data,
                      "--path", args.gen_model,
                      "--shard-id", str(args.shard_id),
                      "--num-shards", str(args.num_shards),
                      "--nbest", str(args.num_rescore),
                      "--batch-size", str(args.batch_size),
                      "--beam", str(args.num_rescore),
                      "--max-sentences", str(args.num_rescore),
                      "--gen-subset", args.gen_subset,
                      "--source-lang", args.source_lang,
                      "--target-lang", args.target_lang]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            with open(predictions_bpe_file, 'w') as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file, bpe_symbol=args.remove_bpe, nbest=using_nbest,
        prefix_len=args.prefix_len, target_prefix_frac=args.target_prefix_frac)

    if args.diff_bpe:
        rerank_utils.write_reprocessed(
            gen_output.no_bpe_source, gen_output.no_bpe_hypo, gen_output.no_bpe_target,
            pre_gen + "/source_gen_bpe." + args.source_lang,
            pre_gen + "/target_gen_bpe." + args.target_lang,
            pre_gen + "/reference_gen_bpe." + args.target_lang)
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = ["-c", bitext_bpe,
                         "--input", pre_gen + "/source_gen_bpe." + args.source_lang,
                         "--output", pre_gen + "/rescore_data." + args.source_lang]
        bpe_tgt_param = ["-c", bitext_bpe,
                         "--input", pre_gen + "/target_gen_bpe." + args.target_lang,
                         "--output", pre_gen + "/rescore_data." + args.target_lang]

        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_src_param,
                        shell=False)
        subprocess.call(["python",
                         os.path.join(os.path.dirname(__file__),
                                      "subword-nmt/subword_nmt/apply_bpe.py")] + bpe_tgt_param,
                        shell=False)

    if (not os.path.isfile(score1_file) and not rerank1_is_gen) or \
            (args.score_model2 is not None and not os.path.isfile(score2_file) and not rerank2_is_gen):
        print("STEP 2: process the output of generate.py so we have clean text files with the translations")

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = rescore_file + "target_prefix_frac" + str(args.target_prefix_frac)
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = rescore_file + "source_prefix_frac" + str(args.source_prefix_frac)

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(
                    gen_output.source, gen_output.hypo, gen_output.target,
                    pre_gen + rescore_file + "." + args.source_lang,
                    pre_gen + rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file", bpe_symbol=args.remove_bpe)
            if args.prefix_len is not None:
                bw_rescore_file = prefix_len_rescore_file
                rerank_utils.write_reprocessed(
                    gen_output.source, gen_output.hypo, gen_output.target,
                    pre_gen + prefix_len_rescore_file + "." + args.source_lang,
                    pre_gen + prefix_len_rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file", prefix_len=args.prefix_len,
                    bpe_symbol=args.remove_bpe)
            elif args.target_prefix_frac is not None:
                bw_rescore_file = target_prefix_frac_rescore_file
                rerank_utils.write_reprocessed(
                    gen_output.source, gen_output.hypo, gen_output.target,
                    pre_gen + target_prefix_frac_rescore_file + "." + args.source_lang,
                    pre_gen + target_prefix_frac_rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file", bpe_symbol=args.remove_bpe,
                    target_prefix_frac=args.target_prefix_frac)
            else:
                bw_rescore_file = rescore_file

            if args.source_prefix_frac is not None:
                fw_rescore_file = source_prefix_frac_rescore_file
                rerank_utils.write_reprocessed(
                    gen_output.source, gen_output.hypo, gen_output.target,
                    pre_gen + source_prefix_frac_rescore_file + "." + args.source_lang,
                    pre_gen + source_prefix_frac_rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file", bpe_symbol=args.remove_bpe,
                    source_prefix_frac=args.source_prefix_frac)
            else:
                fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(
                gen_output.source, gen_output.hypo, gen_output.target,
                pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                pre_gen + "/right_to_left_reference_file",
                right_to_left=True, bpe_symbol=args.remove_bpe)

        print("STEP 3: binarize the translations")
        if not args.right_to_left1 or args.score_model2 is not None and not args.right_to_left2 or not rerank1_is_gen:
            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = ["--source-lang", scorer1_src,
                                       "--target-lang", scorer1_tgt,
                                       "--trainpref", pre_gen + bw_rescore_file,
                                       "--srcdict", bw_dict + "/dict." + scorer1_src + ".txt",
                                       "--tgtdict", bw_dict + "/dict." + scorer1_tgt + ".txt",
                                       "--destdir", backwards_preprocessed_dir]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = ["--source-lang", scorer1_src,
                                "--target-lang", scorer1_tgt,
                                "--trainpref", pre_gen + fw_rescore_file,
                                "--srcdict", args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                                "--tgtdict", args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                                "--destdir", left_to_right_preprocessed_dir]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = ["--source-lang", scorer1_src,
                                "--target-lang", scorer1_tgt,
                                "--trainpref", pre_gen + "/right_to_left_rescore_data",
                                "--srcdict", args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                                "--tgtdict", args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                                "--destdir", right_to_left_preprocessed_dir]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Id$
# $Date$
# $Revision$

import preprocess
import os.path

preprocess.main(["plain"], "list",
                os.path.join("boost", "mpl", "list", "aux_", "preprocessed"))
def preprocessing(main_config_fpath):
    '''Run preprocessing'''
    print 'Running preprocessing...'
    preprocess.main(main_config_fpath)
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /cvsroot/boost/boost/libs/mpl/preprocessed/preprocess_vector.py,v $
# $Date: 2004/09/02 15:41:30 $
# $Revision: 1.2 $

import preprocess

preprocess.main(["no_ctps", "plain", "typeof_based"], "vector",
                "boost\\mpl\\vector\\aux_\\preprocessed")
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Id$
# $Date$
# $Revision$

import preprocess
import os.path

preprocess.main(["plain", "typeof_based"], "vector",
                os.path.join("boost", "mpl", "vector", "aux_", "preprocessed"))
def main():
    ################################
    # Step 1: Pre-processing Check
    # Pipeline Starts Here: Data is checked to see if pre-processing is
    # necessary. If so, 'preprocess.py' is invoked.
    longdescription()  # Prints a description of the project.

    # Finding the counts of the country files relative to the biological sex
    # files (should be a 1:2 ratio)
    parsedfiles, originalfiles = countfiles()  # if the ratio is different, then runpreprocessing is called.

    # Checking to see if all of the files for each country have been parsed by
    # biological sex (represented by 1 or 2).
    if runpreprocessing(numofcountryfiles=originalfiles, numofsexfiles=parsedfiles):
        preprocess.main()

    print("\nBeginning Analysis")
    time.sleep(2)
    print("\n\n###########################################################################")
    print("# Step 2: Data Storage and Management")

    ################################
    # Step 2: Data Storage and Management
    # Two Dictionaries (for each sex) are created to house the data.
    # Minimum support criteria are also calculated here.
    sex1_file_dict = {}
    sex1_age_icd_support = {}
    sex2_file_dict = {}
    sex2_age_icd_support = {}

    for filename in os.listdir("."):
        sex1_datafile = re.match(r"(^Sex1_\w+)_\d+.csv", filename)
        sex2_datafile = re.match(r"(^Sex2_\w+)_\d+.csv", filename)
        if sex1_datafile:
            #print("Match Sex1:", filename, ":", sex1_datafile.group(1))
            argus = files2dictionary(filename, sex1_datafile.group(1), sex1_age_icd_support)
            print("Sex1", "\n", argus[0], "\n", argus[1])
            sex1_file_dict.update(argus[0]), sex1_age_icd_support.update(argus[1])  # concatenating dicts
        elif sex2_datafile:
            #print("Match Sex2:", filename, ":", sex2_datafile.group(1), "\n")
            argus = files2dictionary(filename, sex2_datafile.group(1), sex2_age_icd_support)
            print("Sex2", "\n", argus[0], "\n", argus[1])
            sex2_file_dict.update(argus[0]), sex2_age_icd_support.update(argus[1])

    print("\n\n###########################################################################")
    print("# Step 3: Apriori Algorithm")

    ################################
    # Step 3: Apriori Algorithm
    # Implementing a modified version of the Apriori algorithm for speeding up
    # an otherwise exhaustive HPC problem

    # Creating a list of all countries
    sex1_countries_list = sex1_file_dict.keys()
    sex2_countries_list = sex2_file_dict.keys()

    # Creating an age support dictionary. This is used to make sure the
    # minimum support count for each age is met.
    age_support_dict = {}
    for key, value in sex1_age_icd_support.items():
        try:
            int(key)
            if sex1_age_icd_support[key] == len(sex1_countries_list):  # checking minimum support counts
                age_support_dict[key] = value
        except ValueError:
            pass
    print("Age Support Dict: ", age_support_dict)

    signifOUTFH = open("results.tsv", "w")
    signifOUTFH.write("Sex\tAge\tSignificant_Combination\n")

    counter1 = 0
    print("Countries Evaluated: {}\n".format(sex1_countries_list))
    for country_age_dict in sex1_file_dict.values():
        counter1 += 1
        for age, icds_dict in country_age_dict.items():
            if age in age_support_dict and counter1 <= 1:  # meaning this age is in all six files
                qu = [str(i) for i in range(1, 36, 1)]
                #icd_count = round(float(sex1_file_dict[country][age][i]) * 1000000)
                qu, insig = bottom_up_trim(qu, sex1_file_dict, sex1_countries_list, age)
                print("Sex1:\tAge\t{}\nNew Queue\t{}\nInsignificant\t{}\n".format(age, qu, insig))
                # Tie in APRIORI ALGORITHM here :D
                significant_combinations = apriori_v3(qu, insig, sex1_file_dict,
                                                      sex1_countries_list, age)
                if len(significant_combinations) > 0:
                    signifOUTFH.write("{}\t{}\t{}\n".format("1", age, significant_combinations))
                    print("Apriori Significant Combs", significant_combinations)
    print("##################################\nSuccess!")

    print("Countries Evaluated: {}\n".format(sex2_countries_list))
    counter2 = 0
    for country_age_dict in sex2_file_dict.values():
        counter2 += 1
        for age, icds_dict in country_age_dict.items():
            if age in age_support_dict and counter2 <= 1:  # meaning this age is in all six files
                qu = [str(i) for i in range(1, 36, 1)]
                qu, insig = bottom_up_trim(qu, sex2_file_dict, sex2_countries_list, age)
                print("Sex2:\tAge\t{}\nNew Queue\t{}\nInsignificant\t{}\n".format(age, qu, insig))
                # Tie in APRIORI ALGORITHM here :D
                significant_combinations = apriori_v3(qu, insig, sex2_file_dict,
                                                      sex2_countries_list, age)
                if len(significant_combinations) > 0:
                    signifOUTFH.write("{}\t{}\t{}\n".format("2", age, significant_combinations))
                    print("Apriori Significant Combs", significant_combinations)
    print("##################################\nSuccess!")

    signifOUTFH.close()
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Id: preprocess_map.py 49241 2008-10-10 09:24:39Z agurtovoy $
# $Date: 2008-10-10 02:24:39 -0700 (Fri, 10 Oct 2008) $
# $Revision: 49241 $

import preprocess
import os.path

preprocess.main(["plain", "typeof_based", "no_ctps"], "map",
                os.path.join("boost", "mpl", "map", "aux_", "preprocessed"))
    ## Normalize the weight within [0,1]
    W = W / sum_W

    ## for each classifier, update their weights
    for k in range(0, 3):
        W_clf[k] += log((1 - error[k]) / error[k])
    print(W_clf)
    return W_clf


if __name__ == '__main__':
    # loading from preprocess.py
    train_X, train_y = preprocess.main(True)
    test_X, test_y = preprocess.main(False)

    # sklearn.metrics.precision_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)
    # sklearn.metrics.recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)

    # ---------------- KNN ---------------- #
    start_time = time.time()
    knn_classifier = KNN(train_X, train_y, k=20)  # create KNN classifier  ### need to change k here
    end_time = time.time()

    y_train_pred = knn_classifier.predict(train_X)
    y_test_pred = knn_classifier.predict(test_X)
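# The per-classifier update above is the standard AdaBoost-style vote,
# alpha_k = log((1 - err_k) / err_k): a lower weighted error earns a larger
# say in the ensemble. A tiny self-contained numeric check with made-up
# error rates (illustrative values only, not from the snippet above):
from math import log

error = [0.10, 0.25, 0.45]   # illustrative weighted error rates
W_clf = [0.0, 0.0, 0.0]
for k in range(3):
    W_clf[k] += log((1 - error[k]) / error[k])
print(W_clf)                 # approx [2.197, 1.099, 0.201]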
# Copyright Aleksey Gurtovoy 2001-2004
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Source: /cvsroot/boost/boost/libs/mpl/preprocessed/preprocess_map.py,v $
# $Date: 2004/09/02 15:41:30 $
# $Revision: 1.2 $

import preprocess

preprocess.main(["plain", "typeof_based"], "map",
                "boost\\mpl\\map\\aux_\\preprocessed")
import preprocess
import lgb

preprocess.main(update_means_only=False, forced_update=False)  # preprocessing data
lgb.main()                                                     # train model and predict
print('done.')
# Copyright Aleksey Gurtovoy 2001-2006
#
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)
#
# See http://www.boost.org/libs/mpl for documentation.

# $Id$
# $Date$
# $Revision$

import preprocess
import os.path

preprocess.main(["plain", "typeof_based", "no_ctps"], "map",
                os.path.join("boost", "mpl", "map", "aux_", "preprocessed"))
def main(arguments):
    global args
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--tasks', default=range(1, 21), type=int, nargs='+',
                        help='Tasks list')
    args = parser.parse_args(arguments)

    # Filter out the tasks expecting more than one output
    tasks = args.tasks
    for t in [19]:
        if t in tasks:
            tasks.remove(t)

    # Wrap up in an argument
    new_arguments = ['--task'] + [str(t) for t in tasks]

    # Preprocessing
    preprocess.main(new_arguments)

    # Loading the data
    sentences_train, questions_train, questions_sentences_train, answers_train = \
        read_preprocessed_matrix_data('new_train')
    with open('../Data/preprocess/new_word2index', 'rb') as file:
        word2index = pickle.load(file)

    # ###### Training the questions embeddings
    # Count the answer words (indexed from 1 to len(answer_words))
    answer_words = set(answers_train.flatten())
    aw_number = len(answer_words)

    # Questions embeddings
    questions_embeddings_train = train_question_vector(questions_train,
                                                       answers_train,
                                                       aw_number,
                                                       alpha=0.1)

    # ##### Predictions

    # ##### Train
    # Batch predictions
    predictions_train = batch_prediction(questions_train, questions_sentences_train,
                                         sentences_train, aw_number,
                                         questions_embeddings_train, word2index)

    # Select response (index starts at 1)
    output = np.argmax(predictions_train, axis=1) + 1

    # Compute global accuracy
    response = answers_train.flatten()
    print(len(response))
    accuracy = np.sum(output == response) / (1. * len(output))

    # Accuracy per task on train
    results_train = np.ones((len(tasks), 2))
    for i in xrange(len(tasks)):
        task_id = questions_train[1000 * i, 0]
        local_acc = np.sum(output[1000 * i:1000 * (i + 1)] ==
                           response[1000 * i:1000 * (i + 1)]) / 1000.
        results_train[i, 0] = task_id
        results_train[i, 1] = local_acc

    print('---------------TRAIN------------------')
    for i in xrange(len(tasks)):
        print 'Results for task {}'.format(results_train[i, 0])
        print 'Average Accuracy is {}'.format(results_train[i, 1])
    print('----------------------------------------')
    print('----------------------------------------')
    print('Number of possible answers {}'.format(aw_number))
    print 'Results for {}'.format(tasks)
    print 'Average Accuracy is {}'.format(accuracy)
    print('----------------------------------------')

    # ##### Test
    sentences_test, questions_test, questions_sentences_test, answers_test = \
        read_preprocessed_matrix_data('new_test')

    # Batch predictions
    predictions_test = batch_prediction(questions_test, questions_sentences_test,
                                        sentences_test, aw_number,
                                        questions_embeddings_train, word2index)

    # Select response (index starts at 1)
    output = np.argmax(predictions_test, axis=1) + 1

    # Compute global accuracy
    response = answers_test.flatten()
    print(len(response))
    accuracy = np.sum(output == response) / (1. * len(output))

    # Accuracy per task on test
    results_test = np.ones((len(tasks), 2))
    for i in xrange(len(tasks)):
        task_id = questions_test[1000 * i, 0]
        local_acc = np.sum(output[1000 * i:1000 * (i + 1)] ==
                           response[1000 * i:1000 * (i + 1)]) / 1000.
        results_test[i, 0] = task_id
        results_test[i, 1] = local_acc

    print('---------------TEST------------------')
    for i in xrange(len(tasks)):
        print 'Results for task {}'.format(results_test[i, 0])
        print 'Average Accuracy is {}'.format(results_test[i, 1])
    print('----------------------------------------')
    print('----------------------------------------')
    print('Number of possible answers {}'.format(aw_number))
    print 'Results for {}'.format(tasks)
    print 'Average Accuracy is {}'.format(accuracy)
    print('----------------------------------------')