def vector_training(self, dimension):
     # w2v setting
     t = w2v()
     t.hyperparameter(dimension=dimension)
     t.train_file_setting("segmentation.txt", "e2v_w2v_sg")
     # articles
     for article in self.articles:
         article_id = article[0]
         print("article_id:", end='')
         print(article_id)
         t.write_file(article[1], append=True)
         t.write_file(article[2], append=True)
     # movies
     for movie in self.movies:
         movie_id = movie[0]
         print("movie_id:", end='')
         print(movie_id)
         t.write_file(movie[1], append=True)
     t.train()
     t.load_model()
     print(t.term_ranking_in_corpus("教師節", 50))
     print(t.term_to_vector("爸爸"))
     print(t.terms_similarity("母親", "母親節"))
     print(1 - t.vectors_similarity(t.term_to_vector("在一起"),
                                    t.term_to_vector("過甜蜜")))
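
The last print in this example converts a similarity score into a distance. Assuming vectors_similarity returns cosine similarity (the snippet does not confirm this), the same quantity can be computed directly with numpy; a minimal sketch, not part of the w2v class:

import numpy as np

def cosine_distance(u, v):
    # cosine distance = 1 - cosine similarity; assumes non-zero vectors
    return 1.0 - np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))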
 def vector_training(self):
     # w2v setting
     t = w2v()
     t.train_file_setting("segmentation.txt", "sum_w2v_w2v_sg")
     # articles
     for article in self.articles:
         article_id = article[0]
         content = article[1]
         print("article_id:", end='')
         print(article_id)
         # print(content)
         sentences = re.sub(r'[、,★。？?；;：:~⋯]', '\n', content)
         sentence_list = sentences.split("\n")
         # print(sentence_list)
         for sentence in sentence_list:
             if sentence != '':
                 # print(sentence)
                 seg_list = jieba.cut(sentence, cut_all=False)
                 for seg in seg_list:
                     if seg not in self.stopwordset and seg != ' ':
                         print(seg, end=' ')
                         t.write_file(seg + " ", append=True)
             print('')
     # movies
     for movie in self.movies:
         movie_id = movie[0]
         storyline = movie[1]
         print("movie_id:", end='')
         print(movie_id)
         # print(content)
         sentences = re.sub(r'[、,★。？?；;：:~⋯]', '\n', storyline)
         sentence_list = sentences.split("\n")
         # print(sentence_list)
         for sentence in sentence_list:
             if sentence != '':
                 # print(sentence)
                 seg_list = jieba.cut(sentence, cut_all=False)
                 for seg in seg_list:
                     if seg not in self.stopwordset and seg != ' ':
                         print(seg, end=' ')
                         t.write_file(seg + " ", append=True)
             print('')
     t.train()
     t.load_model()
     print(t.term_ranking_in_corpus("教師節", 50))
     print(t.term_to_vector("爸爸"))
     print(t.terms_similarity("母親", "母親節"))
     print(1 - t.vectors_similarity(t.term_to_vector("母親"),
                                    t.term_to_vector("母親節")))
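
The article and movie loops in this example repeat the same split-and-segment pipeline. A hedged refactor of that shared logic (split_and_segment and PUNCT are illustrative names, not part of the original code):

import re
import jieba

PUNCT = re.compile(r'[、,★。？?；;：:~⋯]')

def split_and_segment(text, stopwords):
    # Split on punctuation, then segment each sentence with jieba,
    # dropping stopwords and bare spaces.
    for sentence in PUNCT.sub('\n', text).split('\n'):
        if sentence:
            for seg in jieba.cut(sentence, cut_all=False):
                if seg not in stopwords and seg != ' ':
                    yield seg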
Example #3
 def w2v_algorithm(self, dimension, start, end, rank):
     t = w2v()
     t.hyperparameter(dimension=dimension)
     t.train_file_setting("segmentation.txt", "e2v_w2v_sg")
     t.load_model()
     count = start
     print(len(self.emotion_dic))
     sql = "INSERT INTO experiment_entity2vec (id, emotion_entity, emotion_similarity) VALUES (%s, %s, %s)"
     for emotion in sorted(self.emotion_dic.items(),
                           key=lambda x: x[1],
                           reverse=True):
         print(emotion)
         emotion_entity = emotion[0]
         emotion_similarity = ""
         for similarity in t.term_ranking_in_corpus(emotion_entity, rank):
             emotion_similarity += similarity[0] + ": "
         count += 1
         val = (count, emotion_entity, emotion_similarity)
         try:
             self.cursor.execute(sql, val)
         except Exception:
             print("Emotion Term Insert Error")
         if count == end:
             print("emotion finish...", end="\n\n")
             count = start
             break
     print(len(self.event_dic))
     sql = "UPDATE experiment_entity2vec SET event_entity=%s, event_similarity=%s WHERE id=%s and event_entity = ''"
     for event in sorted(self.event_dic.items(),
                         key=lambda x: x[1],
                         reverse=True):
         print(event)
         event_entity = event[0]
         event_similarity = ""
         for similarity in t.term_ranking_in_corpus(event_entity, rank):
             event_similarity += similarity[0] + ": "
         count += 1
         val = (event_entity, event_similarity, count)
         self.cursor.execute(sql, val)
         if count == end:
             print("event finish...", end="\n\n")
             break
     self.db.commit()
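
Both loops above flatten the top-rank neighbours into a single text column, dropping the scores. Assuming term_ranking_in_corpus returns (term, score) pairs, as gensim's most_similar does, a variant that keeps the scores could look like this (serialize_neighbours is an illustrative helper, not from the example):

def serialize_neighbours(pairs):
    # pairs: iterable of (term, score); keep two decimals per score
    return " ".join("{}:{:.2f}".format(term, score) for term, score in pairs)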
Example #4
parser.add_argument(
    '-iv',
    help='Index to Vector numpy array mapping integer to vector',
    default='index_to_vector.npy')

args = parser.parse_args()
word_embedding_filename = args.iv
word_to_embedding_index_filename = args.wi

try:
    word_embedding = np.load(word_embedding_filename)
    word_to_embedding_index = np.load(word_to_embedding_index_filename,
                                      allow_pickle=True).item()
except FileNotFoundError:
    print('Word embedding not found, running word2vec')
    word2vec.w2v(corpus_filename='./corpus/imdb_train_corpus.txt')
    # reload the arrays that word2vec should have just written
    word_embedding = np.load(word_embedding_filename)
    word_to_embedding_index = np.load(word_to_embedding_index_filename,
                                      allow_pickle=True).item()

# normalize each row to unit length so cosine similarity is a plain dot product
embedding_norm = np.linalg.norm(word_embedding, axis=1).reshape(-1, 1)
normalized_word_embedding = word_embedding / embedding_norm
m = word_to_embedding_index
# Reverse dictionary to look up words from indices
embedding_index_to_word = dict(zip(m.values(), m.keys()))
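
Because every row of normalized_word_embedding has unit length, cosine similarity reduces to a dot product, so nearest neighbours come from one matrix-vector multiply. A sketch built on the names defined above (nearest_words is an illustrative helper):

def nearest_words(word, k=5):
    # rank all rows by dot product with the query word's unit vector
    idx = word_to_embedding_index[word]
    scores = normalized_word_embedding @ normalized_word_embedding[idx]
    best = np.argsort(-scores)[1:k + 1]  # position 0 is the word itself
    return [(embedding_index_to_word[i], float(scores[i])) for i in best]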

root = './aclImdb/test/posneg/'
for filename in os.listdir('./ggs_results/diffs/'):
    rv = rp.review(root + filename[0:-4] + '.txt')
    diff = np.load('./ggs_results/diffs/' + filename)
    prob = np.load('./ggs_results/probs/' + filename)
    print('Filename: ', filename, 'Initial Probability: ', prob[0][0])
    if rv.sentiment == 'pos':
 def save_vector(self):
     # w2v setting
     t = w2v()
     t.train_file_setting("segmentation.txt", "sum_w2v_w2v_sg")
     t.load_model()
     dimension = t.size
     # articles
     sql = "INSERT INTO articles_vector (id, sum_w2v_w2v_sg) VALUES (%s, %s)"
     for article in self.articles:
         article_sum_w2v_w2v_sg = np.zeros(dimension)
         article_id = article[0]
         content = article[1]
         print("article_id:", end='')
         print(article_id)
         # print(content)
         sentences = re.sub(r'[、,★。？?；;：:~⋯]', '\n', content)
         sentence_list = sentences.split("\n")
         # print(sentence_list)
         for sentence in sentence_list:
             if sentence != '':
                 # print(sentence)
                 seg_list = jieba.cut(sentence, cut_all=False)
                 for seg in seg_list:
                     if seg not in self.stopwordset and seg != ' ':
                         try:
                             seg_vector = t.term_to_vector(seg)
                             print(seg)
                             print(seg_vector[:5], end='\n\n')
                             article_sum_w2v_w2v_sg += seg_vector
                         except Exception:  # skip terms the model has no vector for
                             continue
             print('')
         val = (article_id, str(list(article_sum_w2v_w2v_sg)))
         print("sum")
         print(article_sum_w2v_w2v_sg[:5], end="\n\n")
         self.cursor.execute(sql, val)
         self.db.commit()
     # movies
     sql = "INSERT INTO movies_vector (id, sum_w2v_w2v_sg) VALUES (%s, %s)"
     for movie in self.movies:
         movie_sum_w2v_w2v_sg = np.zeros(dimension)
         movie_id = movie[0]
         storyline = movie[1]
         print("movie_id:", end='')
         print(movie_id)
         # print(content)
         sentences = re.sub(r'[、,★。？?；;：:~⋯]', '\n', storyline)
         sentence_list = sentences.split("\n")
         # print(sentence_list)
         for sentence in sentence_list:
             if sentence != '':
                 # print(sentence)
                 seg_list = jieba.cut(sentence, cut_all=False)
                 for seg in seg_list:
                     if seg not in self.stopwordset and seg != ' ':
                         try:
                             seg_vector = t.term_to_vector(seg)
                             print(seg)
                             print(seg_vector[:5], end='\n\n')
                             movie_sum_w2v_w2v_sg += seg_vector
                         except Exception:  # skip terms the model has no vector for
                             continue
             print('')
         val = (movie_id, str(list(movie_sum_w2v_w2v_sg)))
         print("sum")
         print(movie_sum_w2v_w2v_sg[:5], end="\n\n")
         self.cursor.execute(sql, val)
         self.db.commit()
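
Summing word vectors, as above, makes longer documents produce larger norms; dividing by the token count yields a length-invariant average instead. A hedged variant of the accumulation loop (average_vector is an illustrative name):

import numpy as np

def average_vector(vectors, dimension):
    # mean of the word vectors; zero vector if none were in vocabulary
    total = np.zeros(dimension)
    count = 0
    for v in vectors:
        total += v
        count += 1
    return total / count if count else total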
 def save_vector(self):
     # w2v setting
     t = w2v()
     t.train_file_setting("segmentation.txt", "e2v_w2v_sg")
     t.load_model()
     dimension = t.size
     # Access articles NER rows (ids 1 to 221269)
     self.cursor.execute(
         "SELECT id, emotion, event, person_object, time, location FROM articles_ner Where id >= 1 and id <= 221269"
     )
     articles_ner = self.cursor.fetchall()
     for article_ner in articles_ner:
         article_id = article_ner[0]
         emotion = article_ner[1]
         event = article_ner[2]
         person_object = article_ner[3]
         time = article_ner[4]
         location = article_ner[5]
         print("article_id:", end='')
         print(article_id)
         relationship_e2v_w2v_sg = []
         person_object_count = 0
         person_object_add = np.zeros(dimension)
         for po in person_object.split(" "):
             if po != "":
                 try:
                     person_object_add += t.term_to_vector(po)
                     person_object_count += 1
                 except Exception:  # skip entities the model has no vector for
                     continue
         if person_object_count == 0:
             person_object_count = 1
         relationship_e2v_w2v_sg = np.append(
             relationship_e2v_w2v_sg,
             person_object_add / person_object_count)
         scenario_e2v_w2v_sg = []
         emotion_count = 0
         emotion_add = np.zeros(dimension)
         for e in emotion.split(" "):
             if e != "":
                 try:
                     emotion_add += t.term_to_vector(e)
                     emotion_count += 1
                 except Exception:  # skip entities the model has no vector for
                     continue
         if emotion_count == 0:
             emotion_count = 1
         relationship_e2v_w2v_sg = np.append(relationship_e2v_w2v_sg,
                                             emotion_add / emotion_count)
         scenario_e2v_w2v_sg = np.append(scenario_e2v_w2v_sg,
                                         emotion_add / emotion_count)
         event_count = 0
         event_add = np.zeros(dimension)
         for e in event.split(" "):
             if e != "":
                 try:
                     event_add += t.term_to_vector(e)
                     event_count += 1
                 except Exception:  # skip entities the model has no vector for
                     continue
         if event_count == 0:
             event_count = 1
         relationship_e2v_w2v_sg = np.append(relationship_e2v_w2v_sg,
                                             event_add / event_count)
         scenario_e2v_w2v_sg = np.append(scenario_e2v_w2v_sg,
                                         event_add / event_count)
         location_count = 0
         location_add = np.zeros(dimension)
         for l in location.split(" "):
             if l != "":
                 try:
                     location_add += t.term_to_vector(l)
                     location_count += 1
                 except Exception:  # skip entities the model has no vector for
                     continue
         if location_count == 0:
             location_count = 1
         relationship_e2v_w2v_sg = np.append(relationship_e2v_w2v_sg,
                                             location_add / location_count)
         time_count = 0
         time_add = np.zeros(dimension)
         for ti in time.split(" "):
             if ti != "":
                 try:
                     time_add += t.term_to_vector(ti)
                     time_count += 1
                 except Exception:  # skip entities the model has no vector for
                     continue
         if time_count == 0:
             time_count = 1
         relationship_e2v_w2v_sg = np.append(relationship_e2v_w2v_sg,
                                             time_add / time_count)
         sql = "UPDATE articles_vector SET relationship_e2v_w2v_sg=%s, scenario_e2v_w2v_sg=%s WHERE id=%s"
         val = (str(list(relationship_e2v_w2v_sg)),
                str(list(scenario_e2v_w2v_sg)), article_id)
         self.cursor.execute(sql, val)
         self.db.commit()
     # Access movies NER rows (ids 1 to 3722)
     self.cursor.execute(
         "SELECT id, emotion, event FROM movies_ner Where id >= 1 and id <= 3722"
     )
     movies_ner = self.cursor.fetchall()
     for movie_ner in movies_ner:
         movie_id = movie_ner[0]
         emotion = movie_ner[1]
         event = movie_ner[2]
         print("movie_id:", end='')
         print(movie_id)
         scenario_e2v_w2v_sg = []
         emotion_count = 0
         emotion_add = np.zeros(dimension)
         for e in emotion.split(" "):
             if e != "":
                 try:
                     emotion_add += t.term_to_vector(e)
                     emotion_count += 1
                 except Exception:  # skip entities the model has no vector for
                     continue
         if emotion_count == 0:
             emotion_count = 1
         scenario_e2v_w2v_sg = np.append(scenario_e2v_w2v_sg,
                                         emotion_add / emotion_count)
         event_count = 0
         event_add = np.zeros(dimension)
         for e in event.split(" "):
             if e != "":
                 try:
                     event_add += t.term_to_vector(e)
                     event_count += 1
                 except Exception:  # skip entities the model has no vector for
                     continue
         if event_count == 0:
             event_count = 1
         scenario_e2v_w2v_sg = np.append(scenario_e2v_w2v_sg,
                                         event_add / event_count)
         sql = "UPDATE movies_vector SET scenario_e2v_w2v_sg=%s WHERE id=%s"
         val = (str(list(scenario_e2v_w2v_sg)), movie_id)
         self.cursor.execute(sql, val)
         self.db.commit()
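
Each of the five entity fields above repeats the same accumulate-and-average pattern. A refactor sketch of that shared logic (mean_entity_vector is an illustrative name; t is the w2v instance from the example):

import numpy as np

def mean_entity_vector(t, entities, dimension):
    # average the vectors of the space-separated entities found in vocabulary
    total, count = np.zeros(dimension), 0
    for entity in entities.split(" "):
        if entity:
            try:
                total += t.term_to_vector(entity)
                count += 1
            except Exception:  # skip out-of-vocabulary entities
                continue
    return total / max(count, 1)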
Example #7
 def __init__(self):
     self.importData()
     self.topicChooser = w2v()
     self.conDB()
     self.chatIDGen()
     self.cusPrint("hi, what can I help you with?")
from preprocessing import tokenize, read_preprocessed
import numpy as np
from word2vec import w2v
from sklearn.model_selection import train_test_split

use_preprocessed = True

if not use_preprocessed:
    print('Processing the data...')
    data_path = '../Data/'
    # split(data_path, 'tweets.csv')
    good_tweets = tokenize(data_path + 'good_tweets.csv')
    bad_tweets = tokenize(data_path + 'bad_tweets.csv')
else:
    print('Loading preprocessed data...')
    good_tweets = read_preprocessed('good_tweets')
    bad_tweets = read_preprocessed('bad_tweets')

print('Creation of x and y vectors...')
x_vector = good_tweets + bad_tweets
y_vector = (np.zeros(len(good_tweets)).tolist()) + (np.ones(len(bad_tweets)).tolist())

x_train, x_test, y_train, y_test = train_test_split(x_vector, y_vector, shuffle=True)

print('Creating the model...')
model = w2v(x_train, y_train)

print('Evaluation of the model...')
model.evaluate(x_test, y_test)
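
If the good and bad tweet sets are imbalanced, passing the labels to stratify keeps the class ratio equal in both splits; a sketch of the same call with that option (test_size shown at sklearn's default of 0.25):

x_train, x_test, y_train, y_test = train_test_split(
    x_vector, y_vector, test_size=0.25, shuffle=True, stratify=y_vector)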