def main(unused_args):
    myconfig = Config()
    w2v = WordEmbedding.Word2Vec(myconfig)
    start_time = time.time()
    w2v.loadWordFile("word_table_merge")
    end_time = time.time()
    sys.stderr.write(' %.2f' % (end_time - start_time) + ' seconds elapsed...\n')

    trainnames = getFileNames("data/")
    trainnames.sort()
    print(trainnames)

    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True

    with tf.Session(config=configproto) as sess:
        filenum = len(trainnames)
        lstm_model = createLstmModel(myconfig, w2v)
        init = tf.global_variables_initializer()
        sess.run(init)
        for i in range(myconfig.max_epoch):
            run_epoch(sess, lstm_model, w2v, trainnames[i % filenum], i + 1)
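# getFileNames is not defined in this listing; a minimal sketch of what it
# plausibly does, assuming it simply lists the regular files directly under
# the given directory:
import os

def getFileNames(dirname):
    # return the paths of all regular files directly under dirname
    return [os.path.join(dirname, f) for f in os.listdir(dirname)
            if os.path.isfile(os.path.join(dirname, f))]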
def readIntoBST():
    tree = None
    with open(embed_file, encoding="utf8") as ef:
        content = ef.readline()
        # go through every line in the file
        while content:
            word = content.split()
            # create a WordEmbedding with the information from the file
            n = WordEmbedding.WordEmbedding(word[0], word[1:])
            # insert into the BST
            tree = bst.Insert(tree, n)
            content = ef.readline()
    print("Done")
    return tree
def readIntoBTree(max_data):
    # max_data renamed from max to avoid shadowing the builtin
    tree = btree.BTree([], max_data=max_data)
    with open(embed_file, encoding="utf8") as ef:
        content = ef.readline()
        # go through every line in the file
        while content:
            word = content.split()
            # create a WordEmbedding with the information from the file
            n = WordEmbedding.WordEmbedding(word[0], word[1:])
            # insert into the B-tree (a stray no-op `btree.Print` reference
            # was removed here; it was never actually called)
            btree.Insert(tree, n)
            content = ef.readline()
    print("Done")
    return tree
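# A minimal harness comparing the two loaders above, assuming `embed_file`
# points at a whitespace-separated embedding text file (e.g. GloVe). The
# max_data value of 100 is an arbitrary choice for illustration.
import time

start = time.time()
bst_tree = readIntoBST()
print("BST build time: %.2f seconds" % (time.time() - start))

start = time.time()
btree_tree = readIntoBTree(100)
print("B-tree build time: %.2f seconds" % (time.time() - start))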
def main(unused_args):
    with tf.device('/cpu:0'):
        myconfig = Config()
        w2v = WordEmbedding.Word2Vec(myconfig)
        start_time = time.time()
        w2v.loadWordFile(myconfig.wordTablePath)
        end_time = time.time()
        sys.stderr.write(' %.2f' % (end_time - start_time) + ' seconds elapsed...\n')

        # read the data files in file-name order
        trainnames = getFileNames(myconfig.trainFileDir)
        trainnames.sort()
        print("train file list:\n")
        for i in trainnames:
            print(i)

        os.environ["CUDA_VISIBLE_DEVICES"] = myconfig.CUDA_VISIBLE_DEVICES
        configproto = tf.ConfigProto()
        #configproto.gpu_options.allow_growth = True
        configproto.allow_soft_placement = True
        configproto.log_device_placement = True

        timeList = []
        with tf.Session(config=configproto) as sess:
            filenum = len(trainnames)
            lstm_models = createLstmModel(myconfig)
            init = tf.global_variables_initializer()
            sess.run(init)
            var_list = tf.global_variables()
            print_variable(var_list)
            #loader = tf.train.Saver()
            #loader.restore(sess, "models/lstmp_imp_refine_100")
            steps = 0
            for i in range(myconfig.max_epoch):
                steps, elapsed_time = run_epoch(sess, lstm_models, w2v,
                                                trainnames[i % filenum], i + 1,
                                                steps, myconfig.saveModelEvery,
                                                myconfig.modelDir, myconfig.modelName,
                                                len(myconfig.gpu_list))
                timeList.append(elapsed_time)

        # total renamed from all to avoid shadowing the builtin
        total = 0
        for epoch, t in enumerate(timeList):
            print("epoch:{} used time:{}".format(epoch, t))
            total += t
        print("run {} epoch use time:{}".format(len(timeList), total))
def buildBST(T, file):
    """
    Read through the text file to retrieve each word and its embedding.
    If a word starts with an alphabetic character, create a WordEmbedding
    object for that word and its embedding and insert it in its proper
    place in the BST.
    """
    for line in file:
        word_line = line.split(' ')
        word = word_line[0]
        embedding = word_line[1:]
        embedding = [float(i) for i in embedding]
        if word[0].isalpha():
            word_emb_object = WordEmbedding.WordEmbedding(word, embedding)
            T = insert(T, word_emb_object)
    return T
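# A minimal usage sketch for buildBST, assuming a GloVe-style text file
# ('glove.6B.50d.txt' is a hypothetical path) where each line is a word
# followed by its space-separated embedding values.
T = None
with open('glove.6B.50d.txt', encoding='utf8') as f:
    T = buildBST(T, f)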
def main(unused_args):
    myconfig = Config()
    w2v = WordEmbedding.Word2Vec(myconfig)
    start_time = time.time()
    w2v.loadWordFile("word_table_merge")
    end_time = time.time()
    sys.stderr.write(' %.2f' % (end_time - start_time) + ' seconds elapsed...\n')

    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True

    with tf.Session(config=configproto) as sess:
        model = createModel(myconfig, w2v)
        loader = tf.train.Saver()
        loader.restore(sess, "models/qrnn_imp_refine_009")
        decode(sess, model, w2v, 'src_code/input.demo')
def main(unused_args):
    myconfig = Config()
    w2v = WordEmbedding.Word2Vec(myconfig)
    start_time = time.time()
    w2v.loadWordFile("word_table_merge")
    end_time = time.time()
    sys.stderr.write(' %.2f' % (end_time - start_time) + ' seconds elapsed...\n')

    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True

    with tf.Session(config=configproto) as sess:
        #w2v.loadEmbeddings(sess, 'word_embedding.tensorflow')
        #end_time = time.time()
        #sys.stderr.write(' %.2f' % (end_time - start_time) + ' seconds elapsed...\n')
        lstm_model = createLstmModel(myconfig, w2v)
        loader = tf.train.Saver()
        loader.restore(sess, "models/lstmp_tw_refine_199")
        decode(sess, lstm_model, w2v, 'data/test.txt')
import Datasets
import BugReportPreprocessing
import SourceCodePreprocessing
import rVSM
import WordEmbedding
import TokenMatch
import BugRecency
import Evaluation

# run each stage of the pipeline on the chosen dataset
data_set = Datasets.zxing

BugReportPreprocessing.main(data_set)
SourceCodePreprocessing.main(data_set)
rVSM.main(data_set)
TokenMatch.main(data_set)
WordEmbedding.main(data_set)
BugRecency.main(data_set)

print('---------------------------------------------------')
Evaluation.main(data_set)
import numpy as np

# assuming each of these project modules exposes a class of the same name;
# the original imported the bare modules and then called them, which fails
from Prepare import Prepare
from WordEmbedding import WordEmbedding
from SiameseModel import SiameseModel

word_embedding_file = 'glove_vector_dict_300d.pickle'
model_file = 'SRA_trained_model.h5'

word_embedding = WordEmbedding(word_embedding_file)
# data_frame is assumed to be loaded elsewhere (premise/hypothesis pairs)
input_ = Prepare(data_frame)

label_map = {0: 'correct', 1: 'incorrect', 2: 'contradictory'}

model = SiameseModel(word_embedding, input_)
model.load_weights(model_file, by_name=False, skip_mismatch=False)

predictions = model.predict(input_.premise, input_.hypothesis)
predicted_labels = [label_map[np.argmax(p)] for p in predictions]
BST_tree = None
start_time = time.time()
BST_tree = BST.buildBST(BST_tree, words_file)
end_time = time.time()

print('Binary Search Tree stats:')
num_nodes = BST.numNodes(BST_tree)
print('Number of nodes: ' + str(num_nodes))
height = BST.height(BST_tree)
print('Height: ' + str(height))
total_time = end_time - start_time
print('Running time for binary search tree construction: ' + str(round(total_time, 6)) + ' seconds.\n')

if compare_words == 'y':
    start_time = time.time()
    similarity = WordEmbedding.findSimilarityC(BST_tree, user_words)
    end_time = time.time()
    print('Word similarities found:')
    print('Similarity [' + user_words[0] + ',' + user_words[1] + '] = ' + str(round(similarity, 4)))
    print()
    total_time = end_time - start_time
    print('Running time for binary search tree query processing: ' + str(round(total_time, 6)) + ' seconds.\n')
elif compare_words == 'n':
    print('Reading word file to determine similarities\n')
    start_time = time.time()
    similarities = WordEmbedding.findSimilarityA(BST_tree, word_pairs_file)
    end_time = time.time()
    print('Word similarities found:')
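# The findSimilarity* helpers are not shown here; word-embedding similarity
# is conventionally cosine similarity, so a minimal sketch of that metric
# (the name below is illustrative, not the module's actual API):
import numpy as np

def cosine_similarity(e0, e1):
    # dot product of the two vectors divided by the product of their norms
    e0, e1 = np.asarray(e0, dtype=float), np.asarray(e1, dtype=float)
    return float(np.dot(e0, e1) / (np.linalg.norm(e0) * np.linalg.norm(e1)))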
def readIntoHashLP(size, choice):
    h = htlp.HashTableLP(size)
    # each choice selects a different insertion/hashing strategy on the table;
    # the original spelled out six near-identical branches, collapsed here
    # into a dispatch table without changing behavior
    inserts = {
        "1": h.insertS,
        "2": h.insertAscii,
        "3": h.insertPAscii,
        "4": h.insertSAscii,
        "5": h.insertRecursive,
        "6": h.insertFE,
    }
    insert = inserts.get(choice)
    if insert is not None:
        with open(embed_file, encoding="utf8") as ef:
            content = ef.readline()
            cnt = 0
            # read the first 10000 lines of the file
            for i in range(10000):
                word = content.split()
                # create a WordEmbedding with the information from the file
                n = WordEmbedding.WordEmbedding(word[0], word[1:])
                insert(n)
                cnt += 1
                # choice "6" reported progress every 10000 insertions
                if choice == "6" and cnt % 10000 == 0:
                    print(cnt)
                content = ef.readline()
    print("Done")
    return h
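# A small harness timing each insertion strategy, assuming `embed_file` is
# set to an embedding text file; the table size 20011 is an arbitrary prime
# chosen for illustration.
import time

for choice in "123456":
    start = time.time()
    table = readIntoHashLP(20011, choice)
    print("choice %s: %.2f seconds" % (choice, time.time() - start))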
def train():
    w2v_path = r'C:\Users\fkarl\PycharmProjects\Image2SequenceFiles\w2vModel\en.wiki.bpe.op200000.d300.w2v.bin'
    w2v_model = WordEmbedding.EmbeddingModel(300, w2v_path, True, True)

    # restrict to newer data for the sake of training time
    YEAR_START = 2010
    YEAR_END = 2017

    # produce model
    model = Models.get_stock_pred_model(None, HEADER_LENGTH, SENT_LENGTH)
    model.summary()

    epochs = 10
    for i in range(epochs):
        print('EPOCH: ' + str(i) + ' of ' + str(epochs))

        # outer loop for yearly validation
        for year in range(YEAR_START, YEAR_END):
            # load data for the year
            print('Year: ' + str(year))
            df_year_path = path_to_df + str(year)
            df_paths_per_month = [os.path.join(df_year_path, f)
                                  for f in os.listdir(df_year_path)
                                  if os.path.isfile(os.path.join(df_year_path, f))]
            print(df_paths_per_month)

            # hold out one random month of the year for validation
            rand_int = random.randint(0, len(df_paths_per_month) - 1)
            val_df = df_paths_per_month[rand_int]
            df_paths_per_month.remove(val_df)

            with open(val_df, 'rb') as f:
                data_frame = pickle.loads(f.read())
            data_frame = refine_dataframe(data_frame)

            Y_val = data_frame['BA'].values  # take only Boeing stock prices
            Y_val = get_targets(Y_val)  # adjust the targets; interested in large deviations
            print('Validation Set size', Counter(list(Y_val)))

            headers = data_frame['abstract'].map(str).map(str.lower).map(nltk.word_tokenize).values  # tokenize
            embeddings = [np.array(w2v_model.get_embeddings(sent)) for sent in headers]
            padded_headers = keras.preprocessing.sequence.pad_sequences(
                embeddings, maxlen=HEADER_LENGTH, dtype='float32',
                padding='post', truncating='post', value=0.0)
            X_headers_val = np.array(padded_headers)
            X_sents_val = get_refined_news(data_frame, w2v_model)
            assert len(X_headers_val) == len(Y_val)

            # inner loop for training
            for data_frame_path in df_paths_per_month:
                with open(data_frame_path, 'rb') as f:
                    data_frame = pickle.loads(f.read())
                data_frame = refine_dataframe(data_frame)

                Y = data_frame['BA'].values
                Y = get_targets(Y)
                print('Training Set size', Counter(list(Y)))

                headers = data_frame['abstract'].map(str).map(str.lower).map(nltk.word_tokenize).values
                embeddings = [np.array(w2v_model.get_embeddings(sent)) for sent in headers]
                padded_headers = keras.preprocessing.sequence.pad_sequences(
                    embeddings, maxlen=HEADER_LENGTH, dtype='float32',
                    padding='post', truncating='post', value=0.0)
                X_headers = np.array(padded_headers)
                X_sents = get_refined_news(data_frame, w2v_model)
                assert len(Y) == len(X_headers)
                assert len(headers) == len(X_headers)

                print('*** Training *** on ' + data_frame_path + ' with ' + str(len(Y)) + ' samples')
                model.fit([X_headers, X_sents], Y, batch_size=16,
                          validation_split=0.05, verbose=2, epochs=3)
                model.save('stock_pred_model.h5')

                print('*** Validation ***')
                predictions = model.predict([X_headers_val, X_sents_val], batch_size=64)
                # could also use conservative_results = [1 if elem >= TOL else 0 for elem in predictions]
                rounded_predictions = [np.round(elem[0]) for elem in predictions]
                print('TestResults', Counter(rounded_predictions))
                print(metrics.precision_recall_fscore_support(Y_val, rounded_predictions))

                fpr, tpr, threshold = metrics.roc_curve(Y_val, [elem[0] for elem in predictions])
                roc_auc = metrics.auc(fpr, tpr)
                plt.title('Receiver Operating Characteristic')
                plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
                plt.legend(loc='lower right')
                plt.plot([0, 1], [0, 1], 'r--')
                plt.xlim([0, 1])
                plt.ylim([0, 1])
                plt.ylabel('True Positive Rate')
                plt.xlabel('False Positive Rate')
                plt.show()
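# get_targets is not shown in this listing; given the comment about "large
# deviations", a plausible sketch turns the raw price series into binary
# labels marking day-over-day moves beyond some threshold. Both the function
# body and the 1% threshold are assumptions, not the project's actual code.
import numpy as np

def get_targets(prices, threshold=0.01):
    prices = np.asarray(prices, dtype=float)
    # relative day-over-day change, with a 0 prepended so the length matches
    returns = np.diff(prices, prepend=prices[0]) / prices
    # label 1 where the absolute move exceeds the threshold, else 0
    return (np.abs(returns) > threshold).astype(int)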