import gensim
import numpy as np
from bert_embedding import BertEmbedding
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


class WordEmbedding:
    """Builds one vector per document in the corpus using the chosen method.

    `prep_obj` is expected to expose `tokenized_corpus` (a list of token
    lists) and `detokenized_corpus` (a list of strings).
    """

    def __init__(self, prep_obj, method='w2v_tfidf'):
        self.vector_corpus = []
        if method == 'w2v_tfidf':
            self.word_to_vec(prep_obj, use_tf_idf=True)
        elif method == 'w2v':
            self.word_to_vec(prep_obj, use_tf_idf=False)
        elif method == 'cv':
            self.count_vectorizer(prep_obj)
        elif method == 'tfidf':
            self.tf_idf(prep_obj)
        elif method == 'glove':
            self.glove(prep_obj)
        elif method == 'bert':
            self.bert(prep_obj)

    def bert(self, prep_obj):
        # Each result item is a (tokens, token_vectors) pair; a document is
        # represented by the mean of its 768-dimensional token vectors.
        bert_embedding = BertEmbedding(model='bert_12_768_12',
                                       dataset_name='book_corpus_wiki_en_cased')
        result = bert_embedding(prep_obj.detokenized_corpus)
        for sentence in result:
            # Reset per sentence; previously the vector accumulated across
            # sentences, so every document leaked into the next one.
            vec = np.zeros(768)
            for word_vec in sentence[1]:
                vec = np.add(vec, np.array(word_vec))
            vec = np.true_divide(
                vec, 1 if len(sentence[1]) == 0 else len(sentence[1]))
            self.vector_corpus.append(vec)

    def count_vectorizer(self, prep_obj):
        self.vector_corpus = CountVectorizer().fit_transform(
            prep_obj.detokenized_corpus)

    def tf_idf(self, prep_obj):
        vectorizer = TfidfVectorizer(lowercase=False, analyzer='word',
                                     token_pattern=r"\S*")
        vec = vectorizer.fit_transform(
            prep_obj.detokenized_corpus).todense().tolist()
        # get_feature_names() was removed in scikit-learn 1.2.
        feature_names = vectorizer.get_feature_names_out().tolist()
        # One TF-IDF weight per token, aligned with the tokenized corpus.
        self.vector_corpus = [[
            vec[index][feature_names.index(word)] for word in tweet
        ] for index, tweet in enumerate(prep_obj.tokenized_corpus)]

    def word_to_vec(self, prep_obj, use_tf_idf=False):
        self.tf_idf(prep_obj)
        tf_idf_vector = self.vector_corpus
        self.vector_corpus = []
        features = 100
        # `vector_size` was called `size` in gensim < 4.0.
        model = gensim.models.Word2Vec(prep_obj.tokenized_corpus, min_count=1,
                                       workers=4, vector_size=features,
                                       window=5, sg=0)
        for row, tweet in enumerate(prep_obj.tokenized_corpus):
            vec = np.zeros(features)  # reset per tweet
            for column, word in enumerate(tweet):
                # Optionally weight each word vector by its TF-IDF score.
                mul = tf_idf_vector[row][column] if use_tf_idf else 1
                vec = np.add(vec, np.array(model.wv[word]) * mul)
            vec = np.true_divide(vec, 1 if len(tweet) == 0 else len(tweet))
            self.vector_corpus.append(vec)

    def glove(self, prep_obj):
        embeddings_dict = {}
        features = 100
        with open("glove.twitter.27B." + str(features) + "d.txt", 'r',
                  encoding="utf-8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[word] = vector
        self.vector_corpus = []
        for tweet in prep_obj.tokenized_corpus:
            vec = np.zeros(features)  # reset per tweet
            for word in tweet:
                # If a word is not in the GloVe vocabulary, its vector is
                # taken as zero (i.e., it is skipped).
                try:
                    vec = np.add(vec, np.array(embeddings_dict[word]))
                except KeyError:  # dict lookup raises KeyError, not ValueError
                    pass
            vec = np.true_divide(vec, 1 if len(tweet) == 0 else len(tweet))
            self.vector_corpus.append(vec)
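# --- Usage sketch (illustrative; not part of the original module) ---
# WordEmbedding only needs an object exposing `tokenized_corpus` and
# `detokenized_corpus`. `_PrepStub` and `_embedding_demo` are hypothetical
# stand-ins for the project's real preprocessing step; 'cv' is chosen here
# because it needs no external model files.
class _PrepStub:
    def __init__(self, corpus):
        self.detokenized_corpus = corpus
        self.tokenized_corpus = [doc.split() for doc in corpus]


def _embedding_demo():
    prep = _PrepStub(["good movie", "bad movie", "not bad at all"])
    emb = WordEmbedding(prep, method='cv')  # sparse document-term matrix
    print(emb.vector_corpus.shape)          # -> (3, vocabulary_size)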
from collections import Counter
from itertools import chain

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# run_optimize and run_test are defined elsewhere in this project.


def prepare_sequences(k, k2):
    """Prepare padded index sequences and grid-search LSTM hyperparameters.

    `k` is the number of cross-validation folds; `k2` is the minimum word
    frequency for a word to keep its own index.
    """
    dataframe = pd.read_csv('./Data/address.txt', sep="\t", header=None)
    data = dataframe.values
    lst_labels_true = data[:, 2]
    sequences = data[:, 1]

    # Tokenize every sequence, tracking the longest one. Rows whose text is
    # not a string are dropped; their labels are removed after the loop so
    # indices do not shift while iterating (np.delete also returns a new
    # array, so its result must be assigned back).
    list_words = []
    invalid_rows = []
    max_review_length = 0
    for i, item in enumerate(sequences):
        if isinstance(item, str):  # `basestring` under Python 2
            tmp = CountVectorizer().build_tokenizer()(item)
            if len(tmp) > max_review_length:
                max_review_length = len(tmp)
            list_words.append(tmp)
        else:
            invalid_rows.append(i)
    lst_labels_true = np.delete(lst_labels_true, invalid_rows, 0)

    # Index every word occurring more than k2 times; everything else maps to 0.
    totals = Counter(chain.from_iterable(list_words))
    word_index = {}
    i = 1
    for item in totals.keys():
        if totals[item] > k2:
            word_index[item] = i
            i += 1

    # Convert each tokenized row to indices and zero-pad to max_review_length.
    max_count = max(totals.values())
    text = []
    for row in list_words:
        tmp = []
        for word in row:
            if k2 < totals[word] < max_count:
                tmp.append(word_index[word])
            else:
                tmp.append(0)
        while len(tmp) < max_review_length:
            tmp.append(0)
        text.append(tmp)
    totals = Counter(chain.from_iterable(text))
    top_words = len(totals.keys())

    # One-off generation of the cross-validation fold file:
    # len_data_set = len(text)
    # items = [i for i in range(k)] * int(math.ceil(len_data_set / float(k)))
    # items = items[:len_data_set]
    # random.shuffle(items)
    #
    # with open('./Data/cross_validation.txt', 'w') as the_file:
    #     for element in items:
    #         the_file.write("%s\n" % element)

    # Load the fold assignment of every row.
    items = []
    with open('./Data/cross_validation.txt', 'r') as f:
        for line in f:
            items.append(int(line.strip('\n')))

    # Class balance (computed here but not used further).
    class1 = 0
    class2 = 0
    for item in lst_labels_true:
        if item == 0:
            class1 += 1
        else:
            class2 += 1

    # Hyperparameter grids: embedding size, dropout after the embedding,
    # LSTM hidden units, dropout after the LSTM.
    embedding_arr = range(10, 30, 5)
    dropout_arr1 = [i / 10.0 for i in range(0, 10, 2)]
    lstm_hidden = range(500, 1301, 50)
    dropout_arr2 = [i / 10.0 for i in range(0, 10, 2)]
    epoch = 35
    batch = 64
    n = k
    results_cv = np.zeros([n, 1])
    parameters = np.array([[800, 0.1, 750, 0.1, epoch, batch]])
    # parameters = np.load('./Data/Results_RNN/lstm2_parameters_cv0.npy')

    # Coordinate-descent grid search: sweep one hyperparameter at a time,
    # fix it at its best value, move on to the next, and repeat the whole
    # pass twice per fold.
    grids = [(0, embedding_arr, 'embedding'),
             (1, dropout_arr1, 'dropout1'),
             (2, lstm_hidden, 'hidden'),
             (3, dropout_arr2, 'dropout2')]
    for cv in range(n):
        for i in range(2):
            for column, grid, name in grids:
                results = np.zeros([len(grid), 1])
                for ii, value in enumerate(grid):
                    print('lstm optimize #%d %s %d' % (i, name, ii))
                    parameters[0, column] = value
                    results[ii, :] = run_optimize(cv, text, items,
                                                  lst_labels_true, parameters,
                                                  max_review_length, top_words)
                results = results[~np.all(results == 0, axis=1)]
                tmp = np.argmax(results, axis=0)
                parameters[0, column] = grid[tmp[0]]
                print(parameters)

        # print('lstm test')
        # results_cv[cv, :] = run_test(cv, text, items, lst_labels_true, parameters)
        # gc.collect()

        np.save('./Data/Results_RNN/lstm2_parameters_cv%d.npy' % cv, parameters)
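# --- Usage sketch (illustrative; not part of the original module) ---
# The fold count and frequency cutoff below are assumptions for
# demonstration. The call expects ./Data/address.txt and
# ./Data/cross_validation.txt to exist, and run_optimize (and run_test)
# to be available from the project.
if __name__ == '__main__':
    prepare_sequences(k=10, k2=2)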