Example #1
def main(unused_args):
    myconfig = Config()
    w2v = WordEmbedding.Word2Vec(myconfig)
    start_time = time.time()
    w2v.loadWordFile("word_table_merge")
    end_time = time.time()
    sys.stderr.write(' %.2f' % (end_time - start_time) +
                     ' seconds elapsed...\n')
    trainnames = getFileNames("data/")
    trainnames.sort()
    print(trainnames)

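    # pin the job to GPU 3; allow_growth lets TensorFlow claim GPU memory on demand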
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True

    with tf.Session(config=configproto) as sess:
        filenum = len(trainnames)
        lstm_model = createLstmModel(myconfig, w2v)
        init = tf.global_variables_initializer()
        sess.run(init)

        for i in range(myconfig.max_epoch):
            run_epoch(sess, lstm_model, w2v, trainnames[i % filenum], i + 1)
Example #2
def readIntoBST():
    tree = None
    with open(embed_file, encoding="utf8") as ef:
        content = ef.readline()
        # iterate over every line in the file
        while content:
            word = content.split()
            # create a WordEmbedding from the line's word and embedding vector
            n = WordEmbedding.WordEmbedding(word[0], word[1:])
            # insert into the BST
            tree = bst.Insert(tree, n)
            content = ef.readline()
    print("Done")
    return tree
Example #3
def readIntoBTree(max_data):
    tree = btree.BTree([], max_data=max_data)
    with open(embed_file, encoding="utf8") as ef:
        content = ef.readline()
        # iterate over every line in the file
        while content:
            word = content.split()
            # create a WordEmbedding from the line's word and embedding vector
            n = WordEmbedding.WordEmbedding(word[0], word[1:])
            # insert into the B-tree
            btree.Insert(tree, n)
            content = ef.readline()
    print("Done")
    return tree
Example #4
def main(unused_args):
    with tf.device('/cpu:0'):
        myconfig = Config()
        w2v = WordEmbedding.Word2Vec(myconfig)
        start_time = time.time()
        w2v.loadWordFile(myconfig.wordTablePath)
        end_time = time.time()
        sys.stderr.write(' %.2f' % (end_time - start_time) +
                         ' seconds elapsed...\n')
        trainnames = getFileNames(myconfig.trainFileDir)

        # read the data files in file-name order
        trainnames.sort()
        print("train file list:\n")
        for i in trainnames:
            print(i)

        os.environ["CUDA_VISIBLE_DEVICES"] = myconfig.CUDA_VISIBLE_DEVICES
        configproto = tf.ConfigProto()
        #configproto.gpu_options.allow_growth = True
        configproto.allow_soft_placement = True
        configproto.log_device_placement = True
        timeList = []
        with tf.Session(config=configproto) as sess:
            filenum = len(trainnames)
            lstm_models = createLstmModel(myconfig)
            init = tf.global_variables_initializer()
            sess.run(init)

            var_list = tf.global_variables()
            print_variable(var_list)
            #loader = tf.train.Saver()
            #loader.restore(sess, "models/lstmp_imp_refine_100")
            steps = 0

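            # one training file per epoch, cycled round-robin over the file list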
            for i in range(myconfig.max_epoch):
                steps, elapsed_time = run_epoch(sess, lstm_models, w2v,
                                                trainnames[i % filenum], i + 1,
                                                steps, myconfig.saveModelEvery,
                                                myconfig.modelDir,
                                                myconfig.modelName,
                                                len(myconfig.gpu_list))
                timeList.append(elapsed_time)
        total = 0
        for epoch, t in enumerate(timeList):
            print("epoch:{}   used time:{}".format(epoch, t))
            total += t
        print("ran {} epochs, total time: {}".format(len(timeList), total))
Example #5
def buildBST(T, file):
    """
    Read through the text file to retrieve each word and its embedding. If a
    word starts with an alphabetic character, create a WordEmbedding object
    for that word and its embedding and insert it in its proper place in the
    BST.
    """
    for line in file:
        word_line = line.split(' ')
        word = word_line[0]
        embedding = word_line[1:]
        embedding = [float(i) for i in embedding]
        if word[0].isalpha():
            word_emb_object = WordEmbedding.WordEmbedding(word, embedding)
            T = insert(T, word_emb_object)
    return T
Example #6
def main(unused_args):
    myconfig = Config()
    w2v = WordEmbedding.Word2Vec(myconfig)
    start_time = time.time()
    w2v.loadWordFile("word_table_merge")
    end_time = time.time()
    sys.stderr.write(' %.2f' % (end_time - start_time) +
                     ' seconds elapsed...\n')

    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True

    with tf.Session(config=configproto) as sess:
        model = createModel(myconfig, w2v)
        loader = tf.train.Saver()
        loader.restore(sess, "models/qrnn_imp_refine_009")

        decode(sess, model, w2v, 'src_code/input.demo')
Example #7
def main(unused_args):
  myconfig = Config()
  w2v = WordEmbedding.Word2Vec(myconfig)
  start_time = time.time()
  w2v.loadWordFile("word_table_merge")
  end_time = time.time()
  sys.stderr.write(' %.2f' % (end_time - start_time) + ' seconds elapsed...\n')

  os.environ["CUDA_VISIBLE_DEVICES"] = "3"
  configproto = tf.ConfigProto()
  configproto.gpu_options.allow_growth = True
  configproto.allow_soft_placement = True

  with tf.Session(config=configproto) as sess:
    #w2v.loadEmbeddings(sess,'word_embedding.tensorflow')
    #end_time = time.time()
    #sys.stderr.write(' %.2f'%(end_time-start_time) + ' seconds escaped...\n')

    lstm_model = createLstmModel(myconfig, w2v)
    loader = tf.train.Saver()
    loader.restore(sess, "models/lstmp_tw_refine_199")

    decode(sess, lstm_model, w2v, 'data/test.txt')
Example #8
import Datasets
import BugReportPreprocessing
import SourceCodePreprocessing
import rVSM
import WordEmbedding
import TokenMatch
import BugRecency
import Evaluation

data_set = Datasets.zxing
BugReportPreprocessing.main(data_set)
SourceCodePreprocessing.main(data_set)
rVSM.main(data_set)
TokenMatch.main(data_set)
WordEmbedding.main(data_set)
BugRecency.main(data_set)
print('---------------------------------------------------')
Evaluation.main(data_set)
Example #9
from tensorflow.keras.models import load_model
import numpy as np  # needed for np.argmax below
import Prepare
import WordEmbedding
import SiameseModel

word_embedding_file = 'glove_vector_dict_300d.pickle'
model_file = 'SRA_trained_model.h5'

word_embedding = WordEmbedding(word_embedding_file)
input_ = Prepare(data_frame)
label_map = {0: 'correct', 1: 'incorrect', 2: 'contradictory'}

model = SiameseModel(word_embedding, input_)
model.load_weights(model_file, by_name=False, skip_mismatch=False)
predictions = model.predict(input_.premise, input_.hypothesis)

predicted_labels = [label_map[np.argmax(p)] for p in predictions]
Example #10
    BST_tree = None
    start_time = time.time()
    BST_tree = BST.buildBST(BST_tree, words_file)
    end_time = time.time()
    print('Binary Search Tree stats:')
    num_nodes = BST.numNodes(BST_tree)
    print('Number of nodes: ' + str(num_nodes))
    height = BST.height(BST_tree)
    print('Height: ' + str(height))
    total_time = end_time - start_time
    print('Running time for binary search tree construction: ' +
          str(round(total_time, 6)) + ' seconds.\n')

    if compare_words == 'y':
        start_time = time.time()
        similarity = WordEmbedding.findSimilarityC(BST_tree, user_words)
        end_time = time.time()
        print('Word similarities found:')
        print('Similarity [' + user_words[0] + ',' + user_words[1] + '] = ' +
              str(round(similarity, 4)))
        print()
        total_time = end_time - start_time
        print('Running time for binary search tree query processing: ' +
              str(round(total_time, 6)) + ' seconds.\n')

    elif compare_words == 'n':
        print('Reading word file to determine similarities\n')
        start_time = time.time()
        similarities = WordEmbedding.findSimilarityA(BST_tree, word_pairs_file)
        end_time = time.time()
        print('Word similarities found:')
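The similarity values printed above are, in all likelihood, cosine similarities between the two words' embedding vectors. findSimilarityC itself is not shown in this example; the sketch below only illustrates the underlying formula, and the function name and the assumption that each stored embedding is a plain Python list of floats are mine, not taken from the source.

import math

def cosine_similarity(e0, e1):
    # dot product of the two embedding vectors
    dot = sum(a * b for a, b in zip(e0, e1))
    # Euclidean norm of each vector
    norm0 = math.sqrt(sum(a * a for a in e0))
    norm1 = math.sqrt(sum(b * b for b in e1))
    return dot / (norm0 * norm1)

Looking up the two user words in the BST and passing their embedding lists to cosine_similarity would reproduce a score in the [-1, 1] range like the one printed above.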
Example #11
def readIntoHashLP(size, choice):
    h = htlp.HashTableLP(size)
    # map each menu choice to the matching insertion strategy
    insert_methods = {
        "1": h.insertS,
        "2": h.insertAscii,
        "3": h.insertPAscii,
        "4": h.insertSAscii,
        "5": h.insertRecursive,
        "6": h.insertFE,
    }
    insert = insert_methods.get(choice)
    if insert is not None:
        with open(embed_file, encoding="utf8") as ef:
            # read at most the first 10000 lines of the file
            for i in range(10000):
                content = ef.readline()
                if not content:  # stop early if the file has fewer lines
                    break
                word = content.split()
                # create a WordEmbedding from the line's word and embedding vector
                n = WordEmbedding.WordEmbedding(word[0], word[1:])
                # insert into the hash table using the chosen strategy
                insert(n)
                if choice == "6" and (i + 1) % 10000 == 0:
                    print(i + 1)  # progress counter (choice "6" only)
    print("Done")
    return h
Example #12
def train():
    w2v_path = r'C:\Users\fkarl\PycharmProjects\Image2SequenceFiles\w2vModel\en.wiki.bpe.op200000.d300.w2v.bin'
    w2v_model = WordEmbedding.EmbeddingModel(300, w2v_path, True, True)

    # restrict to newer data for sake of training time
    YEAR_START = 2010
    YEAR_END = 2017

    # produce model
    model = Models.get_stock_pred_model(None, HEADER_LENGTH, SENT_LENGTH)
    model.summary()
    epochs = 10
    for i in range(epochs):
        print('EPOCH: ' + str(i + 1) + ' of ' + str(epochs))
        # Outer Loop for yearly validation
        for year in range(YEAR_START, YEAR_END):

            # load data for year
            print('Year: ' + str(year))
            df_year_path = path_to_df + str(year)

            df_paths_per_month = [
                os.path.join(df_year_path, f) for f in os.listdir(df_year_path)
                if os.path.isfile(os.path.join(df_year_path, f))
            ]
            print(df_paths_per_month)

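            # hold out one randomly chosen month of this year as the validation set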
            rand_int = random.randint(0, len(df_paths_per_month) - 1)
            val_df = df_paths_per_month[rand_int]
            df_paths_per_month.remove(val_df)

            with open(val_df, 'rb') as f:
                data_frame = pickle.loads(f.read())

            data_frame = refine_dataframe(data_frame)

            Y_val = data_frame['BA'].values  # take only Boeing stock prices
            Y_val = get_targets(Y_val)  # adjust the targets; interested in large deviations
            print('Validation Set size', Counter(list(Y_val)))

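            # tokenize each abstract, embed its tokens, and pad/truncate to HEADER_LENGTH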
            headers = data_frame['abstract'].map(str).map(str.lower).map(
                nltk.word_tokenize).values  # tokenise
            embeddings = [
                np.array(w2v_model.get_embeddings(sent)) for sent in headers
            ]
            padded_headers = keras.preprocessing.sequence.pad_sequences(
                embeddings,
                maxlen=HEADER_LENGTH,
                dtype='float32',
                padding='post',
                truncating='post',
                value=0.0)
            X_headers_val = np.array(padded_headers)

            X_sents_val = get_refined_news(data_frame, w2v_model)

            assert (len(X_headers_val) == len(Y_val))

            # Inner loop for training
            for data_frame_path in df_paths_per_month:

                with open(data_frame_path, 'rb') as f:
                    data_frame = pickle.loads(f.read())

                data_frame = refine_dataframe(data_frame)

                Y = data_frame['BA'].values
                Y = get_targets(Y)
                print('Training Set size', Counter(list(Y)))

                headers = data_frame['abstract'].map(str).map(str.lower).map(
                    nltk.word_tokenize).values
                embeddings = [
                    np.array(w2v_model.get_embeddings(sent))
                    for sent in headers
                ]
                padded_headers = keras.preprocessing.sequence.pad_sequences(
                    embeddings,
                    maxlen=HEADER_LENGTH,
                    dtype='float32',
                    padding='post',
                    truncating='post',
                    value=0.0)
                X_headers = np.array(padded_headers)

                X_sents = get_refined_news(data_frame, w2v_model)

                assert (len(Y) == len(X_headers))
                assert (len(headers) == len(X_headers))

                print('*** Training *** on ' + data_frame_path + ' with ' +
                      str(len(Y)) + ' samples')

                model.fit([X_headers, X_sents],
                          Y,
                          batch_size=16,
                          validation_split=0.05,
                          verbose=2,
                          epochs=3)
                model.save('stock_pred_model.h5')

                print('*** Validation ***')
                predictions = model.predict([X_headers_val, X_sents_val],
                                            batch_size=64)
                rounded_predictions = [
                    np.round(elem[0]) for elem in predictions
                ]  # could also use conservative_results = [1 if elem >= TOL else 0 for elem in predictions]
                print('TestResults', Counter(rounded_predictions))
                print(
                    metrics.precision_recall_fscore_support(
                        Y_val, rounded_predictions))

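                # ROC curve computed over the raw prediction scores (before rounding)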
                fpr, tpr, threshold = metrics.roc_curve(
                    Y_val, [elem[0] for elem in predictions])
                roc_auc = metrics.auc(fpr, tpr)

                plt.title('Receiver Operating Characteristic')
                plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
                plt.legend(loc='lower right')
                plt.plot([0, 1], [0, 1], 'r--')
                plt.xlim([0, 1])
                plt.ylim([0, 1])
                plt.ylabel('True Positive Rate')
                plt.xlabel('False Positive Rate')
                plt.show()