Code Example #1
def generate(q1, q2, answer, model_google, options):
    sentences = []
    for i in options:
        sentences.append(q1 + answer[i] + q2)
    sentences = Word2Vec.cleanText(sentences)
    n_dim = 300
    vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in sentences]
    dataset = []
    for a in vectors:
        sentence = np.zeros((49, 300))
        m = len(a)
        start = int((49 - m) / 2)
        sentence[start:start + m] = a
        dataset.append(np.array(sentence))
    return dataset
Code Example #2
File: main.py  Project: lingyiliu016/Word2Vector
def main():
    print("数据预处理阶段")
    DataPretreat.prepare_data(windows_size=3)
    vocabulary_size = len(DataPretreat.vocabulary_list)
    SkipGram = Word2Vec.SkipGram(vocabulary_size)
    print("创建SkipGram神经网络")
    SkipGram.build_network()
    print("训练SkipGram神经网络")
    SkipGram.train()
    print("可视化SkipGram训练效果")
    SkipGram.visualize()
Code Example #3
 def yes():
     try:
         global bVectorSpace, rVectorSpace, up_to_date
         if up_to_date: return
         bVectorSpace, rVectorSpace = wv.start()
     except Exception as e:
         mw.MessageWindow(
             "Error",
             "Error occurred during generating vector spaces" + str(e))
         return
     mw.MessageWindow("Reload", "Vector spaces are up-to-date.")
     up_to_date = True
     del gf.bWords[:], gf.rWords[:]
Code Example #4
    def run(self, flag):
        nx_graphs, _ = Reader.multi_readG(self.path)

        if flag == "LN":
            r_t = Reader.true_cluster(self.path).tolist()
            print(clustering(r_t))
            cluster_true = [r[0] - 1 for r in r_t]
            k_list = [k for k in range(2, 11)]
        else:
            cluster_true = []
            k_list = [2, 3, 6, 8]
            for i in range(29):
                if i < 12:
                    cluster_true.append(0)
                else:
                    cluster_true.append(1)

        w_dict = Reader.weight(self.path)
        print(nx_graphs[0])
        MK_G = Node2Vec_LayerSelect.Graph(nx_graphs, self.p, self.q, 0.5)
        MK_G.preprocess_transition_probs(w_dict, 2)
        MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)

        MK_words = []
        for walk in MK_walks:
            MK_words.extend([str(step) for step in walk])

        M_L = Word2Vec.Learn(MK_words)
        M_matrix, M_mapping = M_L.train()

        result = {}
        for k in k_list:
            cluster_trained = KMeans(
                n_clusters=k, random_state=0).fit_predict(M_matrix).tolist()

            length = min(len(cluster_true), len(cluster_trained))

            r = normalized_mutual_info_score(cluster_true[0:length],
                                             cluster_trained[0:length])
            f = f1_score(cluster_true[0:length],
                         cluster_trained[0:length],
                         average='micro')
            print(cluster_trained)
            print(cluster_true)

            result[k] = (r, f)
            #pickle.dump(cluster_trained, open(self.path+str(k)+'.pickle', '+wb'))

        print(result)
Code Example #5
def predict(sentence, num_words, model):  # predict whether the sentence is a key sentence: return 1 if it is, 0 otherwise
    data = np.empty((1, num_words, size), dtype="float64")

    sentence = u.remove_useless(sentence)
    word_list = u.seg2words_long(sentence)
    word_list = word_list[:num_words]  # truncate to the length the model was trained with
    num = 0

    vector_model = wv.load_model()
    for i in range(len(word_list)):
        word = word_list[i].encode('utf-8')
        vector = wv.get_vector(vector_model, word)
        if vector == []:
            continue

        data[0, i, :] = vector
        num += 1

    for j in range(num, num_words):
        data[0, j, :] = -1

    prediction = model.predict(data)
    # print "%.2f%%" % (float(prediction[0][0]) * 100) + " " + "%.2f%%" % (float(prediction[0][1]) * 100)
    return np.argmax(prediction)
Code Example #6
def generate(q1, q2, answer, model_google, options):
    sentences = []
    for i in options:
        sentences.append(q1 + answer[i] + q2)
    sentences = Word2Vec.cleanText(sentences)
    n_dim = 300
    vectors = [
        Word2Vec.buildWordVector(model_google, z, n_dim) for z in sentences
    ]
    dataset = []
    for a in vectors:
        sentence = np.zeros((49, 300))
        m = len(a)
        start = int((49 - m) / 2)
        sentence[start:start + m] = a
        dataset.append(np.array(sentence))

    question = []
    for i in options:
        question.append(q1 + q2)
    question = Word2Vec.cleanText(question)
    n_dim = 300
    q = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in question]
    q_set = []
    for a in q:
        sentence = np.zeros((49, 300))
        m = len(a)
        start = int((49 - m) / 2)
        sentence[start:start + m] = a
        q_set.append(np.array(sentence))

    option = []
    for i in options:
        option.append(answer[i])
    option = Word2Vec.cleanText(option)
    n_dim = 300
    a_vecs = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in option]
    a_set = []
    for a in a_vecs:
        sentence = np.zeros((4, 300))
        m = len(a)
        if not m == 0:
            start = int((4 - m) / 2)
            sentence[start:start + m] = a
        a_set.append(np.array(sentence))
    return dataset, q_set, a_set
Code Example #7
    def run(self):
        path = self.path
        nx_graphs, total_edges = Reader.multi_readG(path)
        r_list, nx_graphs_sampled = Sampler.multi_sampling(path, self.s_p)
        print('%d edges sampled, graph length is %d' %
              (len(r_list), len(nx_graphs_sampled)))
        MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q)
        MK_G.preprocess_transition_probs()
        MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)

        MK_words = []
        for walk in MK_walks:
            MK_words.extend([str(step) for step in walk])

        M_L = Word2Vec.Learn(MK_words)
        M_matrix, M_mapping = M_L.train()

        r_set = set([node for edge in r_list for node in edge])

        eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs,
                                          r_set, self.e_p)
        M_precision = eval_p.eval()
        print("*** Merged graph precision: ", M_precision)
Code Example #8
    def run(self):
        path = self.path
        #### Step 1: reading and sampling graphs

        nx_graphs, airport_mapping, airport_dst = Reader.read_airline(path)
        print(nx_graphs[0].nodes())

        r_set = set()

        if self.flag == 0 or self.flag == 4:
            w_dict = {}

            MK_G = Node2Vec_LayerSelect.Graph(nx_graphs, self.p, self.q,
                                              self.r)
            MK_G.preprocess_transition_probs(w_dict, 1)
            MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)

            MK_words = []
            for walk in MK_walks:
                MK_words.extend([str(step) for step in walk])

            M_L = Word2Vec.Learn(MK_words)
            M_matrix, M_mapping = M_L.train()

            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs,
                                              r_set, self.e_p)
            precision, recall, F = eval_p.edge_list_eval(
                airport_dst, airport_mapping)
            print("*** MKII Random: precision %f, accuracy %f, F %f" %
                  (precision, recall, F))
            '''
            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ MKII Random AUC:", M_auc)
            '''
            print(
                "-----------------------DONE--------------------------------")
Code Example #9
def main():
	filename = "../Data/Final_Dataset_Word2Vec_Emoji2Vec.csv"
	print("1. Train with Word2Vec, 2. Train with Emoji2Vec 3. Both")
	print("Enter choice (1/2/3):")
	ch = int(input())

	if ch == 1:

		word_vec = w.main(filename)
		return word_vec

	elif ch == 2:
		
		Emoji_vec = ex.main(filename)
		return Emoji_vec

	elif ch == 3:

		print("Concatenating...")
		Concatenated_Vector = c.main()
		return Concatenated_Vector

	else:
		print("Invalid")
Code Example #10
    def run(self):
        path = self.path
        #### Step 1: reading and sampling graphs

        m_graph, nx_graphs, total_edges = Reader.multi_readG_with_Merg(path)
        print("%d total nodes" % len(m_graph.nodes()))
        r_list, m_graph_sampled, nx_graphs_sampled = Sampler.multi_sampling_with_Merg(
            path, self.s_p)
        print(
            "%d edges before sampling, %d edges after sampling. sampled %d " %
            (len(m_graph.edges()), len(m_graph_sampled.edges()), len(r_list)))

        r_set = set([node for edge in r_list for node in edge])

        if self.flag == 0 or self.flag == 1:

            #### Step 2: Aggregated graph
            #for i in range(2):

            M_G = Node2Vec.Graph(m_graph_sampled, self.p, self.q)
            M_G.preprocess_transition_probs()
            M_walks = M_G.simulate_walks(self.num_walks, self.walk_length)

            M_words = []
            for walk in M_walks:
                M_words.extend([str(step) for step in walk])

            M_L = Word2Vec.Learn(M_words)
            M_matrix, M_mapping = M_L.train()

            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, m_graph,
                                              r_set, self.e_p)
            precision, recall, F = eval_p.eval()
            print("*** Aggregated graph: precision %f, accuracy %f, F %f " %
                  (precision, recall, F))

            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, m_graph,
                                        m_graph_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ Merged graph AUC:", M_auc)

            print(
                "-----------------------DONE--------------------------------")
        #### Step 3: Aggregated result

        if self.flag == 0 or self.flag == 2:

            T_matrix = {}
            T_mapping = {}
            for g in nx_graphs_sampled:
                #print(g.edges())
                G = Node2Vec.Graph(g, self.p, self.q)
                G.preprocess_transition_probs()
                walks = G.simulate_walks(self.num_walks, self.walk_length)
                words = []
                for walk in walks:
                    words.extend([str(step) for step in walk])

                L = Word2Vec.Learn(words)
                matrix, mapping = L.train()
                T_matrix[g] = matrix
                T_mapping[g] = mapping

            eval_p_s = Evaluator.combining_Precision_Eval(
                T_matrix, T_mapping, nx_graphs, r_set, self.e_p)
            precision, recall, F = eval_p_s.eval()
            print("*** Aggregated result: precision %f, accuracy %f, F %f" %
                  (precision, recall, F))

            eval_a = Evaluator.combining_AUC_Eval(T_matrix, T_mapping,
                                                  nx_graphs, nx_graphs_sampled)
            S_auc = eval_a.eval_auc(1)
            print('@@@ Separated graph AUC:', S_auc)

            print(
                "-----------------------DONE--------------------------------")

        #### Step 4: MKII verification

        if self.flag == 0 or self.flag == 3:
            graph_list_sampled = []
            graph_list_sampled.append(m_graph_sampled)
            graph_list = []
            graph_list.append(m_graph)
            w_dict = {}
            MK_G = Node2Vec_LayerSelect.Graph(graph_list, self.p, self.q,
                                              self.r)
            MK_G.preprocess_transition_probs(w_dict, 1)
            MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)

            MK_words = []
            for walk in MK_walks:
                MK_words.extend([str(step) for step in walk])

            M_L = Word2Vec.Learn(MK_words)
            M_matrix, M_mapping = M_L.train()

            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping,
                                              graph_list[0], r_set, self.e_p)
            precision, recall, F = eval_p.eval()
            print("*** MKII verification: precision %f, accuracy %f, F %f" %
                  (precision, recall, F))

            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, m_graph,
                                        m_graph_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ Merged graph AUC:", M_auc)

            print(
                "-----------------------DONE--------------------------------")

        #### Step 5: MKII Random
        if self.flag == 0 or self.flag == 4:
            w_dict = Reader.weight(self.path)
            #print(w_dict)

            MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p,
                                              self.q, self.r)
            MK_G.preprocess_transition_probs(w_dict, 1)
            MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)

            MK_words = []
            for walk in MK_walks:
                MK_words.extend([str(step) for step in walk])

            M_L = Word2Vec.Learn(MK_words)
            M_matrix, M_mapping = M_L.train()

            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs,
                                              r_set, self.e_p)
            precision, recall, F = eval_p.eval()
            print("*** MKII Random: precision %f, accuracy %f, F %f" %
                  (precision, recall, F))

            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs,
                                        nx_graphs_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ MKII Random AUC:", M_auc)

            print(
                "-----------------------DONE--------------------------------")

        #### Step 6: MKII Weighted
        if self.flag == 0 or self.flag == 4:
            w_dict = Reader.weight(self.path)
            #print(w_dict)

            MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p,
                                              self.q, self.r)
            MK_G.preprocess_transition_probs(w_dict, 2)
            MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)

            MK_words = []
            for walk in MK_walks:
                MK_words.extend([str(step) for step in walk])

            M_L = Word2Vec.Learn(MK_words)
            M_matrix, M_mapping = M_L.train()

            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs,
                                              r_set, self.e_p)
            precision, recall, F = eval_p.eval()
            print("*** MKII Weighted: precision %f, accuracy %f, F %f" %
                  (precision, recall, F))

            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs,
                                        nx_graphs_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ MKII Weighted AUC:", M_auc)

            print(
                "-----------------------DONE--------------------------------")

        #### Step 7: MKII Biased
        if self.flag == 0 or self.flag == 4:
            w_dict = Reader.weight(self.path)
            #print(w_dict)

            MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p,
                                              self.q, self.r)
            MK_G.preprocess_transition_probs(w_dict, 0)
            MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)

            MK_words = []
            for walk in MK_walks:
                MK_words.extend([str(step) for step in walk])

            M_L = Word2Vec.Learn(MK_words)
            M_matrix, M_mapping = M_L.train()

            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs,
                                              r_set, self.e_p)
            precision, recall, F = eval_p.eval()
            print("*** MKII Biased: precision %f, accuracy %f, F %f" %
                  (precision, recall, F))
            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs,
                                        nx_graphs_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ MKII Biased AUC:", M_auc)

            print(
                "-----------------------DONE--------------------------------")

        #### Step 8: MKII Biased_ii
        if self.flag == 0 or self.flag == 4:
            w_dict = Reader.weight(self.path)
            #print(w_dict)

            MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p,
                                              self.q, self.r)
            MK_G.preprocess_transition_probs(w_dict, 3)
            MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)

            MK_words = []
            for walk in MK_walks:
                MK_words.extend([str(step) for step in walk])

            M_L = Word2Vec.Learn(MK_words)
            M_matrix, M_mapping = M_L.train()

            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs,
                                              r_set, self.e_p)
            precision, recall, F = eval_p.eval()
            print("*** MKII Biased_ii: precision %f, accuracy %f, F %f" %
                  (precision, recall, F))
            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs,
                                        nx_graphs_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ MKII Biased_ii AUC:", M_auc)

            print(
                "-----------------------DONE--------------------------------")

        if self.flag == 4:

            for r in range(11):

                r_t = r / 10.0

                if r_t == 0:
                    w_dict = Reader.weight(self.path)
                    #print(w_dict)

                    MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled,
                                                      self.p, self.q, 0.1)
                    MK_G.preprocess_transition_probs(w_dict, 1)
                    MK_walks = MK_G.simulate_walks(self.num_walks,
                                                   self.walk_length)

                    MK_words = []
                    for walk in MK_walks:
                        MK_words.extend([str(step) for step in walk])

                    M_L = Word2Vec.Learn(MK_words)
                    M_matrix, M_mapping = M_L.train()

                    eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping,
                                                      nx_graphs, r_set,
                                                      self.e_p)
                    precision, recall, F = eval_p.eval()
                    print("*** MKII Random: precision %f, accuracy %f, F %f" %
                          (precision, recall, F))
                    eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs,
                                                nx_graphs_sampled)
                    M_auc = eval_a.eval_auc(1)
                    print("@@@ MKII Random AUC:", M_auc)

                    print(
                        "-----------------------DONE--------------------------------"
                    )

                else:
                    w_dict = Reader.weight(self.path)
                    #print(w_dict)

                    MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled,
                                                      self.p, self.q, r_t)
                    MK_G.preprocess_transition_probs(w_dict, 3)
                    MK_walks = MK_G.simulate_walks(self.num_walks,
                                                   self.walk_length)

                    MK_words = []
                    for walk in MK_walks:
                        MK_words.extend([str(step) for step in walk])

                    M_L = Word2Vec.Learn(MK_words)
                    M_matrix, M_mapping = M_L.train()

                    eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping,
                                                      nx_graphs, r_set,
                                                      self.e_p)
                    precision, recall, F = eval_p.eval()
                    print(
                        "*** MKII Biased_ii with %f: precision %f, accuracy %f, F %f"
                        % (r_t, precision, recall, F))
                    eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs,
                                                nx_graphs_sampled)
                    M_auc = eval_a.eval_auc(1)
                    print("@@@ MKII Biased_ii AUC:", M_auc)

        #### Step 9: CommonNeighbors and Jaccard
        if self.flag == 0 or self.flag == 5:
            p = link_pred.Prediction()
            v_set = p.create_vertex(m_graph.edges())
            matrix_perm = p.create_adjmatrix(
                [edge for edge in itertools.combinations(r_set, 2)], v_set)
            matrix_ori = p.create_adjmatrix(m_graph.edges(), v_set)
            matrix_samp = p.create_adjmatrix(m_graph_sampled.edges(), v_set)
            cn = link_pred.CommonNeighbors()
            score_cn = cn.fit(matrix_ori)
            C_precision, C_recall, C_F = p.acc(score_cn, matrix_ori,
                                               matrix_perm, self.e_p)
            print("*** CommonNeighbors: precision %f, accuracy %f, F %f" %
                  (C_precision, C_recall, C_F))
            C_auc = p.auc_score(score_cn, matrix_ori, matrix_samp, "cc")
            print("@@@ CommonNeighbors: AUC %f", C_auc)

            ja = link_pred.Jaccard()
            score_ja = ja.fit(matrix_ori)
            J_precision, J_recall, J_F = p.acc(score_ja, matrix_ori,
                                               matrix_perm, self.e_p)
            print("*** Jaccard: precision %f, accuracy %f, F %f" %
                  (J_precision, J_recall, J_F))
            J_auc = p.auc_score(score_ja, matrix_ori, matrix_samp, "cc")
            print("@@@ Jaccard: AUC %f", J_auc)
            print(
                "-----------------------DONE--------------------------------")
Code Example #11
def featuresExtraction(dataSet):

    #calculate feature 1 - TFIDF
    textDataSet = []

    for line in dataSet:
        textDataSet.append(line[0])
        textDataSet.append(line[1])

    #add synonyms
    newDataSet = Tep.addSynonyms(textDataSet)
    finalDataSet = []

    #stemming
    for line in newDataSet:
        text = mnlp.stemming(line)
        finalDataSet.append(mnlp.convertText(text))

    # compute the TF-IDF vector of each sentence
    vector = Tfidf.calculateTFIDF(finalDataSet)

    similarities = []
    """aqui calculamos a distância do cosseno entre a frase 1 e a frase 2, ou seja, entre os pares de frases
       esse vector vai ter os vetores tf-idf de cada frase, no caso, é como se na posição 0 estivesse a frase 1, 
       na posição 1 estivesse a frase 2, na posição 2 estivesse a frase 3... e assim por diante
       Então se queremos calcular a similaridade entre a frase 1 e a frase 2 do nosso banco, devemos calcular
       a distância do cosseno entre vector[0] e vector[1]
       Por isso o for abaixo intera de 2 em 2 --> range(0, len(vector), 2)
    """

    for i in range(0, len(vector), 2):
        distance = spatial.distance.cosine(vector[i], vector[i + 1])
        similarities.append(1 - distance)

    # compute the other features
    # initialize the word2vec model
    word_vectors, model = Word2Vec.startModel()
    features = []

    # for each row of the csv, compute the similarity using the following methods:
    for x in range(len(dataSet)):

        featuresLine = []
        """calculando a feature 2 entre a coluna 0 e coluna 1 do meu csv
            esse método obtive de um trabalho da literatura, vou te passar o pdf dele também
        """
        feature2 = Word2Vec.wordOrderSimilarity(word_vectors, model,
                                                dataSet[x][0], dataSet[x][1])
        """A feature 3 é distância do cosseno entre os vetores de cada frase, ou seja, 
        o vetor de cada frase é a soma dos vetores de embeddings de cada palavra"""

        sim2 = Word2Vec.embeddingsSimilarity(model, dataSet[x][0],
                                             dataSet[x][1])
        if math.isnan(sim2):
            feature3 = 1.0
        else:
            feature3 = sim2
        """A feature 4 utiliza uma matriz de similaridades com tamanho: 
        numero de palavras da frase 1 X numero de palavras da frase 2 
        Esse é o método que utilizei na minha dissertação. O word2vec aqui foi utilizado para calcular a similaridade
        entre as palavras. E a similaridade entre as frases é obtida utilizando esse método da matriz"""

        sim3 = Word2Vec.calculateSimilarity(word_vectors, model, dataSet[x][0],
                                            dataSet[x][1])
        if math.isnan(sim3):
            feature4 = 1.0
        else:
            feature4 = sim3
        """Esse feature utiliza a mesma matriz da feature acima, só que no lugar de calcular a similaridade entre
        as palavras usando word2vec, nós usamos uma abordagem binária. Se as palavras forem iguais, a similaridade entre
        elas será 1, se forem diferentes a similaridade entre elas será 0"""

        feature5 = Word2Vec.binarySimilarity(dataSet[x][0], dataSet[x][1])
        """A feature 6 será o tamanho da frase menor dividido pelo tamanho da frase maior"""
        size1 = len(mnlp.tokenize(dataSet[x][0]))
        size2 = len(mnlp.tokenize(dataSet[x][1]))

        if (size1 > size2):
            feature6 = size2 / size1
        else:
            feature6 = size1 / size2

        # save a file with the extracted features and the class they belong to
        featuresLine.append(similarities[x])
        featuresLine.append(feature2)
        featuresLine.append(feature3)
        featuresLine.append(feature4)
        featuresLine.append(feature5)
        featuresLine.append(feature6)
        featuresLine.append(dataSet[x][2])  #similarity class

        #print(featuresLine)
        features.append(featuresLine)

        # print the similarity value obtained by combining the features
        similaridade = (0.3 * similarities[x]) + (0.1 * feature2) + (
            0.2 * feature3) + (0.2 * feature4) + (0.1 * feature5) + (0.1 *
                                                                     feature6)
        print(similaridade)
Code Example #12
    def run(self):
        path = self.path
        #### Step 1: reading and sampling graphs
        '''
        m_graph, nx_graphs, total_edges = Reader.multi_readG_with_Merg(path)
        print("%d total nodes"%len(m_graph.nodes()))
        r_list, m_graph_sampled, nx_graphs_sampled = Sampler.multi_sampling_with_Merg(path, self.s_p)
        print("%d edges before sampling, %d edges after sampling. sampled %d "%(len(m_graph.edges()), len(m_graph_sampled.edges()), len(r_list)))

        r_set = set([node for edge in r_list for node in edge])
        '''
        nx_graphs_sampled, _ = Reader.multi_readG(self.path)
        cluster_true = []
        for i in range(29):
            if i < 12:
                cluster_true.append(0)
            else:
                cluster_true.append(1)

        for r in range(11):

            r_t = r / 10.0

            if r_t == 0:
                w_dict = Reader.weight(self.path)
                #print(w_dict)

                MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p,
                                                  self.q, 0.1)
                MK_G.preprocess_transition_probs(w_dict, 1)
                MK_walks = MK_G.simulate_walks(self.num_walks,
                                               self.walk_length)

                MK_words = []
                for walk in MK_walks:
                    MK_words.extend([str(step) for step in walk])

                M_L = Word2Vec.Learn(MK_words)
                M_matrix, M_mapping = M_L.train()
                '''
                eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
                precision, recall, F = eval_p.eval()
                print("*** MKII Biased: precision %f, accuracy %f, F %f"%(precision, recall, F))
                eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
                M_auc = eval_a.eval_auc(1)
                print("@@@ MKII Biased AUC:", M_auc)
                '''

            else:
                w_dict = Reader.weight(self.path)
                #print(w_dict)

                MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p,
                                                  self.q, r_t)
                MK_G.preprocess_transition_probs(w_dict, 3)
                MK_walks = MK_G.simulate_walks(self.num_walks,
                                               self.walk_length)

                MK_words = []
                for walk in MK_walks:
                    MK_words.extend([str(step) for step in walk])

                M_L = Word2Vec.Learn(MK_words)
                M_matrix, M_mapping = M_L.train()
                '''
                eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
                precision, recall, F = eval_p.eval()
                print("*** MKII Biased_ii with %f: precision %f, accuracy %f, F %f"%(r_t, precision, recall, F))
                eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
                M_auc = eval_a.eval_auc(1)
                print("@@@ MKII Biased_ii AUC:", M_auc)
                '''

            cluster_trained = KMeans(
                n_clusters=2, random_state=0).fit_predict(M_matrix).tolist()

            length = min(len(cluster_true), len(cluster_trained))

            r = normalized_mutual_info_score(cluster_true[0:length],
                                             cluster_trained[0:length])
            mi_f = f1_score(cluster_true[0:length],
                            cluster_trained[0:length],
                            average='micro')
            ma_f = f1_score(cluster_true[0:length],
                            cluster_trained[0:length],
                            average='macro')
            print("r is %f: nmi %f, micro_f %f, macro_f %f" %
                  (r_t, r, mi_f, ma_f))
            print(
                "-----------------------DONE--------------------------------")
Code Example #13
    # Node2Vec
    # generate all the walks needed for learning
    p = 0.5
    q = 0.5
    num_walks = 10
    walk_length = 80

    G = Node2Vec.Graph(New_BFSlist, New_Edgelist, p, q)
    G.preprocess_transition_probs()
    walks = G.simulate_walks(num_walks, walk_length)
    print('walk list size', len(walks))
    words = []
    for walk in walks:
        words.extend([str(step) for step in walk])

    L = Word2Vec.Learn(words)
    matrix, mapping = L.train()
    percentage, AUC = T.run_test(Removelist, matrix, mapping, BFSlist)
    print(("the percetion of prediction is %f " % percentage))
    print("the AUC of prediction is %f" % AUC)
    results_file.write(str(AUC) + '\t' + str(percentage) + '\t')
    print(("Total time comsumed %fs" % (time.time() - start)))
    # Node2Vec END
    # --------------------------------------------------------------------------------- #

    # --------------------------------------------------------------------------------- #
    # Embedding
    # 1. Generate the inputs required by MultiNetsEmbedding
    os.system("python3 generate_facts.py %s" % filename)
    # 2. MultiEmbedding
    os.system("./Embedding -network_name %s -generate_flag 0" % filename)
Code Example #14
    def run(self):
        path = self.path
        online_dir = path + "online/"
        online_graphs, _ = Reader.multi_readG(online_dir)
        offline_dir = path + "offline/"
        offline_graphs, _ = Reader.multi_readG(offline_dir)

        ### Step 1: learning with N2V MKII
        if self.flag == 0 or self.flag == 1:
            off_G = Node2Vec_LayerSelect.Graph(offline_graphs, self.p, self.q)
            off_G.preprocess_transition_probs()
            off_walks = off_G.simulate_walks(self.num_walks, self.walk_length)

            off_words = []
            for walk in off_walks:
                off_words.extend([str(step) for step in walk])

            off_L = Word2Vec.Learn(off_words)
            off_matrix, off_mapping = off_L.train()

            on_G = Node2Vec_LayerSelect.Graph(online_graphs, self.p, self.q)
            on_G.preprocess_transition_probs()
            on_walks = on_G.simulate_walks(self.num_walks, self.walk_length)

            on_words = []
            for walk in on_walks:
                on_words.extend([str(step) for step in walk])

            on_L = Word2Vec.Learn(on_words)
            on_matrix, on_mapping = on_L.train()

            off_perm_list = common_nodes(off_mapping, online_graphs)

            off_eval = Evaluator.Precision_Eval(off_matrix, off_mapping,
                                                online_graphs, off_perm_list,
                                                self.e_p)
            off_precision = off_eval.eval()
            print("*** Off to on MKII precision: ", off_precision)

            off_eval_a = Evaluator.AUC_Eval(off_matrix, off_mapping,
                                            online_graphs, offline_graphs)
            off_auc = off_eval_a.eval_auc(0)
            print("@@@ Off to on MKII AUC:", off_auc)

            on_perm_list = common_nodes(on_mapping, offline_graphs)

            on_eval = Evaluator.Precision_Eval(on_matrix, on_mapping,
                                               offline_graphs, on_perm_list,
                                               self.e_p)
            on_precision = on_eval.eval()
            print("*** On to off MKII precision: ", on_precision)

            on_eval_a = Evaluator.AUC_Eval(on_matrix, on_mapping,
                                           offline_graphs, online_graphs)
            on_auc = on_eval_a.eval_auc(0)
            print("@@@ On to off MKII AUC:", on_auc)

        if self.flag == 0 or self.flag == 2:
            on_matrix = {}
            on_mapping = {}
            on_perm_list = []
            for g in online_graphs:
                G = Node2Vec.Graph(g, self.p, self.q)
                G.preprocess_transition_probs()
                walks = G.simulate_walks(self.num_walks, self.walk_length)
                words = []
                for walk in walks:
                    words.extend([str(step) for step in walk])

                L = Word2Vec.Learn(words)
                matrix, mapping = L.train()
                on_matrix[g] = matrix
                on_mapping[g] = mapping
                on_perm_list.extend(common_nodes(mapping, offline_graphs))

            on_perm_list = set([node for node in on_perm_list])
            #print(on_perm_list)
            #print(on_mapping)
            eval_p_on = Evaluator.combining_Precision_Eval(
                on_matrix, on_mapping, offline_graphs, on_perm_list, self.e_p)
            on_precision = eval_p_on.eval()
            print("*** on to off precision: ", on_precision)

            on_eval_a = Evaluator.combining_AUC_Eval(on_matrix, on_mapping,
                                                     offline_graphs,
                                                     online_graphs)
            on_auc = on_eval_a.eval_auc(0)
            print("@@@ On to off  AUC:", on_auc)

            off_matrix = {}
            off_mapping = {}
            off_perm_list = []
            for g in offline_graphs:
                G = Node2Vec.Graph(g, self.p, self.q)
                G.preprocess_transition_probs()
                walks = G.simulate_walks(self.num_walks, self.walk_length)
                words = []
                for walk in walks:
                    words.extend([str(step) for step in walk])

                L = Word2Vec.Learn(words)
                matrix, mapping = L.train()
                off_matrix[g] = matrix
                off_mapping[g] = mapping
                off_perm_list.extend(common_nodes(mapping, online_graphs))

            off_perm_list = set([node for node in off_perm_list])
            eval_p_off = Evaluator.combining_Precision_Eval(
                off_matrix, off_mapping, online_graphs, off_perm_list,
                self.e_p)
            off_precision = eval_p_off.eval()
            print("*** off to on precision: ", off_precision)

            off_eval_a = Evaluator.combining_AUC_Eval(off_matrix, off_mapping,
                                                      online_graphs,
                                                      offline_graphs)
            off_auc = off_eval_a.eval_auc(0)
            print("@@@ Off to on  AUC:", off_auc)
Code Example #15
def MainProcedure(sentence):
    tokens = token_String.tokenizer(sentence)
    count = 0
    finalMovieList = {}
    for token in tokens:
        count = count + 1
        if token[1] == "n":
            initialVec = MySqlConn.returnMovieIdFromTag(token[0])
            # print(initialVec)
        else:
            tagsMovieIds = MySqlConn.returnMovieIdFromTag(token[0])
            #print(tagsMovieIds)

            genresMovieIds = MySqlConn.returnMovieIdFromGenre(token[0])
            #print(genresMovieIds)

            MovieIdsTagsAndGenres = mergeArrays(tagsMovieIds, genresMovieIds)
            #print(MovieIdsTagsAndGenres)

            word2vecSynonims = Word2Vec.give_Word2VecSinonims(token[0])
            wordNetSynonims = wordnet.wordNet(token[0])

            synonims = mergeArrays(word2vecSynonims, wordNetSynonims)
            movieIdList = []
            for synonim in synonims:
                movieIdList.extend(MySqlConn.returnMovieIdFromTag(synonim))

            movieIdList = numpy.unique(movieIdList)
            movieIds = mergeArrays(list(movieIdList),
                                   list(MovieIdsTagsAndGenres))
            #print(tagsMovieIds)
            if count == 1:
                for movie in movieIds:
                    finalMovieList[movie] = 1
                # print(len(finalMovieList))
            else:
                for movie in movieIds:
                    if movie in finalMovieList:
                        finalMovieList[movie] = finalMovieList[movie] + 1
                    else:
                        finalMovieList[movie] = 1
    dicDeFrec = defaultdict(list)
    for movie in finalMovieList:
        rating = MySqlConn.returnRatingForMovieId(movie)
        dicDeFrec[finalMovieList[movie]].append({movie: rating})

    max_films = 1
    movielist = []
    frec = len(tokens)
    for i in range(len(tokens), 0, -1):
        if dicDeFrec[i] != []:
            for j in dicDeFrec[i]:
                if max_films != 0:
                    movielist.append(j)
                    max_films = max_films - 1
                else:
                    break

    # frec = len(tokens)
    # max_films = 5
    # for movie in finalMovieList:
    #     if max_films != 0:
    #         if finalMovieList[movie] == frec:
    #             rating = MySqlConn.returnRatingForMovieId(movie)
    #             dicDeFrec[finalMovieList[movie]].append({movie: rating})
    #             max_films = max_films - 1
    #         else:
    #             pass
    # print(movielist)
    return movielist
Code Example #16
def save_data(line_list, data_path, ignore):

    num_lines = 0
    largest_num = 0

    vector_model = wv.load_model()

    for i in range(len(line_list)):
        if (i + 1) % 50 == 0:
            print("第" + str(i + 1) + "行 (" + str(i + 1) + "/" +
                  str(len(line_list)) + ")")

        # remove irrelevant information
        line_list[i] = u.remove_useless(line_list[i])

        # process the label
        label = 0
        if "|" in line_list[i]:
            label = 1
            line_list[i] = line_list[i].replace("|", "")
        else:
            if ignore:
                if random.randint(0, 9) < 3:
                    label = 0
                else:
                    continue
            else:
                label = 0

        # convert to word vectors
        total_vector = []
        word_list = u.seg2words_long(line_list[i])
        for word in word_list:
            word = word.encode('utf-8')
            vector = wv.get_vector(vector_model, word)  # the model is utf-8 encoded
            if (vector == []):
                continue
            total_vector.append(vector)

        # track the maximum sequence length
        if len(total_vector) > largest_num:
            largest_num = len(total_vector)

        # skip empty lines
        if total_vector == []:
            continue

        num_lines += 1

        # append this sample to the file
        f = open(data_path, "a")
        f.write(str(label) + "\n")
        for vector in total_vector:
            for num in vector:
                f.write(str(num) + " ")
            f.write("\n")
        f.write("%\n")
        f.close()

    # prepend the data dimensions on the first two lines so they can be read when training is initialized
    f = open(data_path, 'r+')
    content = f.read()
    f.seek(0, 0)
    f.write(str(num_lines) + "\n")
    f.write(str(largest_num) + "\n")
    f.write(content)
    f.close()
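The function above writes a small ad-hoc text format: the first two lines hold the number of samples and the longest sequence, and each sample is stored as a label line, one line of space-separated floats per word, and a "%" terminator. The following is a minimal sketch of a matching reader; load_data is hypothetical (not part of the original project) and only assumes the layout described by the comments above.

def load_data(data_path):
    # Hypothetical reader for the layout produced by save_data above (an assumption,
    # not original code): line 1 = number of samples, line 2 = longest sequence,
    # then for each sample a label line, word-vector lines, and a "%" terminator.
    with open(data_path) as f:
        num_lines = int(f.readline())
        largest_num = int(f.readline())
        samples = []
        for _ in range(num_lines):
            label = int(f.readline())
            vectors = []
            line = f.readline().strip()
            while line != "%":
                vectors.append([float(x) for x in line.split()])
                line = f.readline().strip()
            samples.append((label, vectors))
    return largest_num, samples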
Code Example #17
def main():
    word_vec = w.main()
    Emoji_vec = ex.main()
    print("Concatenating...")
    Concatenated_Vector = JoinVectors(word_vec, Emoji_vec, len(word_vec))
    return Concatenated_Vector
Code Example #18
def precision(y, yhat):
    """ Precision for classifier """
    assert (y.shape == yhat.shape)
    return np.sum(y == yhat) * 100.0 / y.size


def softmax_wrapper(features, labels, weights, regularization=0.0):
    cost, grad, _ = softmaxRegression(features, labels, weights,
                                      regularization)
    return cost, grad


# Gradient check always comes first
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)
dimVectors = 10
C = 5
_, wordVectors0, _ = Word2Vec.load_saved_params()
wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :])

#dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
#dummy_features = np.zeros((10, dimVectors))
#dummy_labels = np.zeros((10,), dtype=np.int32)
#for i in xrange(10):
#    words, dummy_labels[i] = dataset.getRandomTrainSentence()
#    dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)
#print "==== Gradient check for softmax regression ===="
#gradcheck_naive(lambda weights: softmaxRegression(dummy_features, dummy_labels, weights, 1.0, nopredictions = True), dummy_weights)
#
#print "\n=== For autograder ==="
#print softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0)

# Try different regularizations and pick the best!
Code Example #19
# -*- coding: utf-8 -*-

import sys
sys.path.insert(
    0,
    '/Users/davichiar/Documents/ADDAVICHI/Python/Sentimental-Analysis-master/Bidirectional_LSTM'
)

import os
import tensorflow as tf
import Bi_LSTM
import Word2Vec
import gensim
import numpy as np

W2V = Word2Vec.Word2Vec()

Batch_size = 1
Vector_size = 300
Maxseq_length = 2600
learning_rate = 0.001
lstm_units = 128
num_class = 2
keep_prob = 1.0

X = tf.placeholder(tf.float32,
                   shape=[None, Maxseq_length, Vector_size],
                   name='X')
Y = tf.placeholder(tf.float32, shape=[None, num_class], name='Y')
seq_len = tf.placeholder(tf.int32, shape=[None])
Code Example #20
File: Main.py  Project: anwar-arif/SentimentAnalysis
import Helper
import Word2Vec
import CNN

positive, negative = Helper.Partition_Pos_Neg_Data()
sentences = Helper.Get_Sentences(positive, negative)

model = Word2Vec.Do_Word2Vec(sentences)

# print(model.similarity('মেসি', 'নেইমার'))
CNN.Do_CNN(positive, negative, model)

# words = Helper.Sentence2Word(positive[0])
# for w in words :
#     print(w)
Code Example #21
import Word2Vec
import gensim
import numpy as np
import pymysql.cursors

# ===========================================
# load data
connection = pymysql.connect(user='******', password='******', database='GRE')
cursor = connection.cursor()
commit = "select * from GRES"
cursor.execute(commit)
Sentences = [each[1] for each in cursor.fetchall()]
Sentences = Word2Vec.cleanText(Sentences)

# ===========================================
# Load model
model_google = gensim.models.Word2Vec.load_word2vec_format(
    '../model/GoogleNews-vectors-negative300.bin', binary=True)
# Word2Vec.Train_Wrod2VEc(Sentences, model_google)

# ===========================================
# Generalize words
n_dim = 300
train_vectors = [
    Word2Vec.buildWordVector(model_google, z, n_dim) for z in Sentences
]
Word2Vec.storeVecs(train_vectors, '../vectors/google_vecs.txt')
Code Example #22
# -*- coding: utf-8 -*-

import Word2Vec


#load = ["6CM00079.txt","6CM00080.txt","6CM00082.txt","6CM00083.txt","6CM00088.txt","6CM00090.txt","6CM00092.txt","6CM00093.txt","6CM00094.txt","6CM00095.txt"]
load = ["6CM00080.txt"]

# instantiate the model and set the vector size
vector_size = 10
#word2vec = Word2Vec.Word2Vec(pos,vector_size)
word2vec = Word2Vec.Word2Vec(vector_size,load)
final_embeddings, datas, count, dictionary, reverse_dictionary = word2vec.output()
# retrieve similar words
#print(dictionary)
result = word2vec.similarity("군대",100)
print(result)

# 1. Extract similar words for an input keyword
# 2. Display them for the main keywords
Code Example #23
import gensim
import pymysql.cursors
import Word2Vec

# ===========================================
# load data
connection = pymysql.connect(user='******', password='******', database='GRE')
cursor = connection.cursor()
commit = "select * from GRES2"
cursor.execute(commit)
Sentences = [each[1] for each in cursor.fetchall()]
Sentences = Word2Vec.cleanText(Sentences)

# ===========================================
# Load model
model_google = gensim.models.KeyedVectors.load_word2vec_format('../GoogleModel/GoogleNews-vectors-negative300.bin', binary=True)
# Word2Vec.Train_Wrod2VEc(Sentences, model_google)

# ===========================================
# Generalize words
n_dim = 300
train_vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in Sentences]
Word2Vec.storeVecs(train_vectors, '../data for input1/q_vecs.pkl')

commit = "select * from GRES2"
cursor.execute(commit)
Sentences = [each[2] for each in cursor.fetchall()]
Sentences = Word2Vec.cleanText(Sentences)

# Generalize words
train_vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in Sentences]
Code Example #24
File: example.py  Project: zaczou/Demo
from Word2Vec import *
import pymongo
db = pymongo.MongoClient().travel.articles


class texts:
    def __iter__(self):
        for t in db.find().limit(30000):
            yield t['words']


wv = Word2Vec(texts(),
              model='cbow',
              nb_negative=16,
              shared_softmax=True,
              epochs=2)  # build and train the model
wv.save_model('myvec')  # save to the myvec folder in the current directory

# after training, the model can be loaded like this
wv = Word2Vec()  # create an empty model
wv.load_model('myvec')  # load the model from the myvec folder in the current directory
Code Example #25
def precision(y, yhat):
    """ Precision for classifier """
    assert(y.shape == yhat.shape)
    return np.sum(y == yhat) * 100.0 / y.size

def softmax_wrapper(features, labels, weights, regularization = 0.0):
    cost, grad, _ = softmaxRegression(features, labels, weights, regularization)
    return cost, grad

# Gradient check always comes first
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)
dimVectors = 10
C = 5
_, wordVectors0, _ = Word2Vec.load_saved_params()
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])

#dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
#dummy_features = np.zeros((10, dimVectors))
#dummy_labels = np.zeros((10,), dtype=np.int32)
#for i in xrange(10):
#    words, dummy_labels[i] = dataset.getRandomTrainSentence()
#    dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)
#print "==== Gradient check for softmax regression ===="
#gradcheck_naive(lambda weights: softmaxRegression(dummy_features, dummy_labels, weights, 1.0, nopredictions = True), dummy_weights)
#
#print "\n=== For autograder ==="
#print softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0)

# Try different regularizations and pick the best!
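The closing comment invites a regularization sweep. Below is a minimal, self-contained sketch of one way such a sweep could look; the random dev features and labels are placeholders (assumptions, not from the original assignment), and the SGD training step that would normally use softmax_wrapper is elided as a comment.

import numpy as np

# Hypothetical sweep: try a few regularization strengths and keep the one with the
# best score on a held-out split (same percent-correct metric as precision() above).
dimVectors, numClasses = 10, 5            # matches the snippet above
rng = np.random.RandomState(0)
devFeatures = rng.randn(100, dimVectors)  # placeholder for sentence features
devLabels = rng.randint(0, numClasses, size=100)

bestReg, bestAcc = None, -1.0
for reg in [0.0, 1e-4, 1e-3, 1e-2, 1e-1]:
    weights = 0.1 * rng.randn(dimVectors, numClasses)
    # ... a real run would train `weights` here with SGD on softmax_wrapper(features, labels, weights, reg) ...
    yhat = np.argmax(devFeatures.dot(weights), axis=1)
    acc = np.sum(devLabels == yhat) * 100.0 / devLabels.size
    if acc > bestAcc:
        bestReg, bestAcc = reg, acc
print("Best regularization: %g (dev accuracy %.2f%%)" % (bestReg, bestAcc))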
Code Example #26
import Word2Vec

####Testing method

words1 = "Foi um ótimo dia!"
words2 = "Hoje está um dia lindo!"

# initialize the embeddings model
word_vectors, model = Word2Vec.startModel()

similaridadeMatrizWord2vec = Word2Vec.calculateSimilarity(
    word_vectors, model, words1, words2)
similaridadeVetoresEmbeddings = Word2Vec.embeddingsSimilarity(
    model, words1, words2)
similaridadeWordOrder = Word2Vec.wordOrderSimilarity(word_vectors, model,
                                                     words1, words2)
similaridadeBinaria = Word2Vec.binarySimilarity(words1, words2)

print("similaridade Matriz Word2vec = ", similaridadeMatrizWord2vec)
print("similaridade Vetores Embeddings = ", similaridadeVetoresEmbeddings)
print("similaridade Word Order = ", similaridadeWordOrder)
print("similaridade binaria = ", similaridadeBinaria)
"""TESTANDO COM A BASE DE DADOS"""

dados = open('DadosProcessados.csv', 'r', encoding='utf-8',
             errors='ignore').read().split('\n')

dataset = []

# store the database rows in dataset
for line in dados:
Code Example #27
import gensim
import pymysql.cursors
import Word2Vec
import Doc2Vec

# ===========================================
# Load dictionary
connection = pymysql.connect(user='******', password='******', database='GRE')
cursor = connection.cursor()
commit = "select * from GRES"
cursor.execute(commit)
Sentences = [each[1] for each in cursor.fetchall()]
Dictionary1 = Word2Vec.cleanText(Sentences)
Dictionary2 = Doc2Vec.Preprocessing(Sentences)

# ===========================================
# instantiate our DM and DBOW models
size = 400
model_dm = gensim.models.Doc2Vec(min_count=0,
                                 window=10,
                                 size=size,
                                 sample=1e-3,
                                 negative=5,
                                 workers=3)
model_dbow = gensim.models.Doc2Vec(min_count=0,
                                   window=10,
                                   size=size,
                                   sample=1e-3,
                                   negative=5,
                                   dm=0,
                                   workers=3)
Code Example #28
def featuresExtraction(dataSet):

    #calculate feature 1 - TFIDF
    textDataSet = []

    for line in dataSet:
        textDataSet.append(line[3])
        textDataSet.append(line[4])

    #add synonyms
    newDataSet = Tep.addSynonyms(textDataSet)
    finalDataSet = []

    #stemming
    for line in newDataSet:
        text = mnlp.stemming(line)
        finalDataSet.append(mnlp.convertText(text))

    print(dataSet)
    print(newDataSet)

    vector = Tfidf.calculateTFIDF(finalDataSet)

    similarities = []

    for i in range(0, len(vector), 2):
        distance = spatial.distance.cosine(vector[i], vector[i + 1])
        similarities.append(1 - distance)

    # calculate the other features
    word_vectors, model = Word2Vec.startModel()
    features = []

    for x in range(len(dataSet)):

        featuresLine = []

        #calculate feature 2
        feature2 = Word2Vec.wordOrderSimilarity(word_vectors, model,
                                                dataSet[x][3], dataSet[x][4])

        #calculate feature 3
        sim2 = Word2Vec.embeddingsSimilarity(model, dataSet[x][3],
                                             dataSet[x][4])
        if math.isnan(sim2):
            feature3 = 1.0
        else:
            feature3 = sim2

        #calculate feature 4
        sim3 = Word2Vec.calculateSimilarity(word_vectors, model, dataSet[x][3],
                                            dataSet[x][4])
        if math.isnan(sim3):
            feature4 = 1.0
        else:
            feature4 = sim3

        # calculate feature 5
        feature5 = Word2Vec.binarySimilarity(dataSet[x][3], dataSet[x][4])

        # calculate feature 6
        size1 = len(mnlp.tokenize(dataSet[x][3]))
        size2 = len(mnlp.tokenize(dataSet[x][4]))

        if (size1 > size2):
            feature6 = size2 / size1
        else:
            feature6 = size1 / size2

        featuresLine.append(similarities[x])
        featuresLine.append(feature2)
        featuresLine.append(feature3)
        featuresLine.append(feature4)
        featuresLine.append(feature5)
        featuresLine.append(feature6)
        featuresLine.append(dataSet[x][2])  #similarity class

        print(featuresLine)
        features.append(featuresLine)
Code Example #29
    header=0,
    delimiter="\t",
    quoting=3)
unlabeled_train = pd.read_csv(
    "/home/vivek/Desktop/Kaggle/Sentiment Analysis/unlabeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3)
model = gensim.models.Word2Vec.load(
    '300features_40minwords_10context')
clean_train_reviews = []

num_features = 300
for review in train["review"]:
    clean_train_reviews.append(
        Word2Vec.review_to_wordlist(review, remove_stopwords=True))

trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print "Creating average feature vecs for test reviews"
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(
        Word2Vec.review_to_wordlist(review, remove_stopwords=True))
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

# Fit a random forest to the training data, using 100 trees

forest = RandomForestClassifier(n_estimators=100)

print "Fitting a random forest to labeled training data..."
Code Example #30
    parameters.init()

    # Prepare data for training the seq2seq
    prepare = DataPreparation()
    text = prepare.make_disintegration
    sent = prepare.get_sentences(text)
    dicc = prepare.get_dictionary(text, stopwords, vocab_size)
    data = prepare.get_word_list(sent,
                                 stopwords,
                                 window_size=Word2Vec_window_size)

    print('Corpus properties: \n')
    print('\tDictionary with %d words' % (len(dicc['w2i'])))

    word_to_vec = Word2Vec(vocab_size, Word2Vec_embedding_dim,
                           Word2Vec_optimizer_step)
    x_train, y_train = word_to_vec.training_data(data)
    W1, b1 = word_to_vec.train(x_train, y_train)
    vocab_vectors = W1 + b1

    conversations = []
    for i in range(len(sent) - 2):
        if len(sent[i + 1]) != 0 and len(
                sent[i + 2]) != 0:  # to avoid empty sentences
            conversations.append([sent[i + 1], sent[i + 2]])

    # TRAIN THE MODEL

    # Initialize all the variables
    session = tf.Session()
    init_variables = tf.global_variables_initializer()
Code Example #31
import Word2Vec
import gensim
import numpy as np
import pymysql.cursors

# ===========================================
# load data
connection = pymysql.connect(user='******', password='******', database='GRE')
cursor = connection.cursor()
commit = "select * from GRES"
cursor.execute(commit)
Sentences = [each[1] for each in cursor.fetchall()]
Sentences = Word2Vec.cleanText(Sentences)

# ===========================================
# Train model
model_w2v = gensim.models.Word2Vec.load('../model/model_w2v')
Word2Vec.Train_Wrod2VEc(Sentences, model_w2v)

# ===========================================
# Generalize words
n_dim = 300
train_vectors = [
    Word2Vec.buildWordVector(model_w2v, z, n_dim) for z in Sentences
]
Word2Vec.storeVecs(train_vectors, '../model/w2v_vecs.txt')