def ensemble_embedding(self, word_embedding, context_embedding):
    """Replace current syn0 with the sum of context and word embeddings.

    Parameters
    ----------
    word_embedding : str
        Path to word embeddings in GloVe format.
    context_embedding : str
        Path to context embeddings in word2vec_format.

    Returns
    -------
    numpy.ndarray
        Matrix with new embeddings.

    """
    glove2word2vec(context_embedding, context_embedding + '.w2vformat')
    w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding)
    c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding)
    # compare vocab words using keys of dict vocab
    assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabs are not same for both embeddings'

    # sort context embedding to have words in same order as word embedding
    prev_c_emb = copy.deepcopy(c_emb.syn0)
    for word_id, word in enumerate(w_emb.index2word):
        c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index]
    # add vectors of the two embeddings
    new_emb = w_emb.syn0 + c_emb.syn0
    self.syn0 = new_emb
    return new_emb
def load_wordrank_model(cls, model_file, vocab_file=None, context_file=None, sorted_vocab=1, ensemble=1):
    """Load model from `model_file`.

    Parameters
    ----------
    model_file : str
        Path to model in GloVe format.
    vocab_file : str, optional
        Path to file with vocabulary.
    context_file : str, optional
        Path to file with context-embedding in word2vec_format.
    sorted_vocab : {0, 1}, optional
        If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.
    ensemble : {0, 1}, optional
        If 1 - use ensemble of word and context vectors.

    """
    glove2word2vec(model_file, model_file + '.w2vformat')
    model = cls.load_word2vec_format('%s.w2vformat' % model_file)
    if ensemble and context_file:
        model.ensemble_embedding(model_file, context_file)
    if sorted_vocab and vocab_file:
        model.sort_embeddings(vocab_file)
    return model
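# Usage sketch for the two methods above. Assumes gensim 3.x, where this class
# shipped as gensim.models.wrappers.Wordrank; the file names are placeholders
# for WordRank training outputs.
from gensim.models.wrappers import Wordrank

wr = Wordrank.load_wordrank_model('wordrank.words', vocab_file='vocab.txt',
                                  context_file='wordrank.contexts')
print(wr.most_similar('king', topn=3))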
import pickle

import pandas as pd
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors


def main(*args, **kwargs):
    # store_location = "/home/rpatel12/ferraro_user/NAM_Modified_data/data_sets/WN_11/"
    # DATA_LOCATION = "/home/rpatel12/ferraro_user/WN11_1/"
    print(args)
    print(type(args))
    DATA_LOCATION = args[0]
    # model_location = "/home/rpatel12/ferraro_user/glove_data/"
    model_location = args[1]
    store_location = DATA_LOCATION
    data_set_type = args[2]

    data_list = pd.read_csv(DATA_LOCATION + "data_list_train.csv")
    print(data_list.head())
    token_list = get_train_entities(data_list, dataset=data_set_type)
    print(len(token_list))
    data_dev = pd.read_csv(DATA_LOCATION + "data_list_dev.csv")
    data_test = pd.read_csv(DATA_LOCATION + "data_list_test.csv")

    glove_file = datapath(model_location + "glove.840B.300d.txt")
    tmp_file = get_tmpfile(model_location + "test_word2vec.txt")
    # _ = glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file)

    # build word-to-id and id-to-word mappings
    word_2_id_dict, id_2_word_dict = word_2_id(token_list)
    # build the embedding matrix
    embedding_matrix, OOV_word = build_embedding_dict(model, 300, id_2_word_dict)
    print(OOV_word)

    # build relation-to-head and relation-to-tail dictionaries
    head = data_list["Head"].tolist()
    tail = data_list['tail'].tolist()
    rel = data_list['relation'].tolist()
    relation_2_head, relation_2_tail = get_rel_dict(head, tail, rel)

    train_vec_dict = build_test_dev_vec(data_list, word_2_id_dict, train=True, dataset=data_set_type)
    dev_vec_dict = build_test_dev_vec(data_dev, word_2_id_dict, dataset=data_set_type)
    test_vec_dict = build_test_dev_vec(data_test, word_2_id_dict, dataset=data_set_type)

    # saving embedding matrix
    with open(store_location + "embedding_matrix.pkl", 'wb') as output:
        pickle.dump(embedding_matrix, output, protocol=2)
    # saving train data
    with open(store_location + "train_vec_dict.pkl", 'wb') as output:
        pickle.dump(train_vec_dict, output, protocol=2)
    # saving dev data
    with open(store_location + "dev_vec_dict.pkl", 'wb') as output:
        pickle.dump(dev_vec_dict, output, protocol=2)
    # saving test data
    with open(store_location + "test_vec_dict.pkl", 'wb') as output:
        pickle.dump(test_vec_dict, output, protocol=2)
    # saving word to id dict
    with open(store_location + "word_2_id.pkl", 'wb') as output:
        pickle.dump(word_2_id_dict, output, protocol=2)
    # saving id to word dict
    with open(store_location + "id_2_word.pkl", 'wb') as output:
        pickle.dump(id_2_word_dict, output, protocol=2)
    # saving rel to tail
    with open(store_location + "relation_2_tail.pkl", 'wb') as output:
        pickle.dump(relation_2_tail, output, protocol=2)
    # saving rel to head
    with open(store_location + "relation_2_head.pkl", 'wb') as output:
        pickle.dump(relation_2_head, output, protocol=2)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import unicodedata
import six
import gensim
import random
import torch
import numpy as np
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec('glove.42B.300d.txt', 'word2vec.txt')
embed_lookup = KeyedVectors.load_word2vec_format("word2vec.txt")


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, list):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):  # noqa: F821 (Python 2 only)
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")
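# Quick usage check for convert_to_unicode (Python 3 path: str passes through,
# bytes are decoded as UTF-8):
print(convert_to_unicode("café"))                   # café
print(convert_to_unicode("café".encode("utf-8")))   # café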
import os

import numpy as np
import gensim
from gensim import utils
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import nltk
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

print("--- loading pre-trained model ---")
glove2word2vec(glove_input_file="glove.6B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")
model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)
print("--- DONE loading model ---")


def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]  # restrict tokens to alphabetic characters only
    return doc


def filter_docs(corpus, labels, condition_on_doc):
import os

from gensim.scripts.glove2word2vec import glove2word2vec

CWD = os.getcwd()
glove2word2vec(CWD + "\\glove\\vectors.txt", CWD + "\\glove\\word2vec.txt")
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 22 19:21:32 2018

@author: RAJDEEP PAL
"""

from gensim.scripts.glove2word2vec import glove2word2vec

glove_path = 'F:/year 2/hpg/project/attribute_embedding/GloVe/glove.6B.300d.txt'
w2v_path = 'F:/year 2/hpg/project/attribute_embedding/GloVe/glove.6B.300d.txt.word2vec'
glove2word2vec(glove_path, w2v_path)

#%%

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
results = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3)
print(results)

#%%

words = list(model.vocab)
print(len(words))
print(model['walk'])
print('brush_teeth' in words)
# taken from https://stackoverflow.com/a/47465278
from gensim.scripts.glove2word2vec import glove2word2vec
from embeddings.glove import original_path, gensim_path

glove2word2vec(glove_input_file=original_path, word2vec_output_file=gensim_path)
from gensim.scripts.glove2word2vec import glove2word2vec

path_old = '../data/glove.42B.300d.txt'
path_new = '../data/glove.42B.300d_gensim.txt'
glove2word2vec(path_old, path_new)
def glove2word():
    from gensim.scripts.glove2word2vec import glove2word2vec
    glove2word2vec("/users5/yjtian/Downloads/glove.840B.300d.txt",
                   "/users5/yjtian/Downloads/glove.840B.300d.w2v.txt")
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec('glove.6B.300d.txt', 'test.txt')
import pickle
from pathlib import Path

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

import predict_emotion
from predict_emotion import take_image_classify_emotion
from bio_summarization import summarize_doc
import nltk

nltk.download('averaged_perceptron_tagger')

with open('fairytale.pkl', 'rb') as file:
    lm = pickle.load(file)

unzipped_folder = "glove.twitter.27B/"  # ENTER THE PATH TO THE UNZIPPED `glove.twitter.27B` HERE

# use glove2word2vec to convert GloVe vectors in text format into the word2vec text format:
if not Path('gensim_glove_vectors_200.txt').exists():
    # assumes you've downloaded and extracted the glove stuff
    glove2word2vec(glove_input_file=unzipped_folder + "glove.twitter.27B.200d.txt",
                   word2vec_output_file="gensim_glove_vectors_200.txt")

# read the word2vec txt to a gensim model using KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors_200.txt", binary=False)


class UI:
    def __init__(self):
        self.db = Database()
        self.db.load('celebrities.pkl')
        self.logged_in = False
        save_camera_config(0, exposure=0.2)

    def forgetful_new_person(self, db, new_name):
import codecs

import numpy as np
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors


def build_vocab_glove(vocab, glove_file, embedding_dim, binary, pre_trained_vocab_embedding_file):
    vocab_size = vocab.get_vocab_size()

    # init with small random values; known rows are overwritten below
    vocab_embedded = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim))

    # random vectors for the special tokens
    pad_embedded = np.random.uniform(-0.25, 0.25, (embedding_dim, ))
    unk_embedded = np.random.uniform(-0.25, 0.25, (embedding_dim, ))
    sos_embedded = np.random.uniform(-0.25, 0.25, (embedding_dim, ))
    eos_embedded = np.random.uniform(-0.25, 0.25, (embedding_dim, ))

    # load the GloVe vectors into a gensim model
    print("Load glove file: {} to gensim model. \n".format(glove_file))
    glove_file = datapath(glove_file)
    tmp_file = get_tmpfile("tmp_word2vec.txt")
    from gensim.scripts.glove2word2vec import glove2word2vec
    glove2word2vec(glove_file, tmp_file)
    glove_model = KeyedVectors.load_word2vec_format(fname=tmp_file)

    out_of_vocab_count = 0
    out_of_vocab_words = []

    # codecs.open always operates on the underlying file in binary mode,
    # so 'w' and 'wb' behave identically here
    if binary:
        save_f = codecs.open(pre_trained_vocab_embedding_file, 'w', encoding='utf-8')
    else:
        save_f = codecs.open(pre_trained_vocab_embedding_file, 'wb', encoding='utf-8')

    # write header
    header = "%d %d\n" % (vocab_size, embedding_dim)
    save_f.write(header)

    for id, word in vocab.idx2word.items():
        if id == vocab.padid:
            word_embedded = pad_embedded
        elif id == vocab.sosid:
            word_embedded = sos_embedded
        elif id == vocab.eosid:
            word_embedded = eos_embedded
        elif id == vocab.unkid:
            word_embedded = unk_embedded
        else:
            try:
                word_embedded = glove_model[word]
            except KeyError:
                out_of_vocab_words.append(word)
                out_of_vocab_count += 1
                word_embedded = unk_embedded
        vector_str = ' '.join([str(s) for s in word_embedded])
        save_f.write('%s %s\n' % (word, vector_str))
        vocab_embedded[id] = word_embedded
    save_f.close()

    del glove_model
    return vocab_embedded, out_of_vocab_count, out_of_vocab_words
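# Usage sketch for build_vocab_glove. The Vocab class below is a hypothetical
# stand-in exposing the interface the function expects (get_vocab_size, idx2word,
# padid/sosid/eosid/unkid); the file paths are placeholders, and note that the
# function routes glove_file through gensim's datapath helper.
class Vocab:
    def __init__(self, words):
        self.idx2word = dict(enumerate(['<pad>', '<sos>', '<eos>', '<unk>'] + words))
        self.padid, self.sosid, self.eosid, self.unkid = 0, 1, 2, 3

    def get_vocab_size(self):
        return len(self.idx2word)


demo_vocab = Vocab(['the', 'cat', 'sat'])
embedded, oov_count, oov_words = build_vocab_glove(
    demo_vocab, 'glove.6B.50d.txt', 50, binary=False,
    pre_trained_vocab_embedding_file='vocab_embedding.txt')
print(embedded.shape, oov_count)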
def __init__(self):
    glove_file = 'glove.840B.300d.txt'
    tmp_file = "glove.840B.300d.tmp"
    glove2word2vec(glove_file, tmp_file)
    self.model = KeyedVectors.load_word2vec_format(tmp_file)
from gensim.scripts.glove2word2vec import glove2word2vec


def glove_word2vec():
    glove_input_file = 'glove.6B.100d.txt'
    word2vec_output_file = 'glove2word2vec.txt'
    glove2word2vec(glove_input_file, word2vec_output_file)
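# Usage sketch: convert once, then load the converted vectors (assumes
# glove.6B.100d.txt is present in the working directory):
from gensim.models import KeyedVectors

glove_word2vec()
model = KeyedVectors.load_word2vec_format('glove2word2vec.txt', binary=False)
print(model.most_similar('king', topn=3))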
def generate_words(self, string):
    """Generate similar words by vector proximity, using the n-dimensional GloVe embeddings."""
    # declare the n-dimensional GloVe dataset
    glove_file = 'data/sampled_glove.6B.50d.txt'
    word2vec_file = 'data/sampled_word2vec-glove.6B.50d.txt'
    file = pathlib.Path(word2vec_file)
    # check whether it already exists on disk
    if file.exists():
        # if it does, no need to generate it again
        print('word2vec_file {} already exists, loading existing one, not generated'.format(word2vec_file))
    else:
        # otherwise, generate it
        print('word2vec_file {} doesn\'t exist, generating new one'.format(word2vec_file))
        glove2word2vec(glove_file, word2vec_file)
    # load the model
    model = KeyedVectors.load_word2vec_format(word2vec_file)

    # stopword list used for filtering
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    word = ''
    # split word by word, needed when the input string contains several words;
    # keeps the last non-stopword as the query word
    string_splitted = string.split()
    for i in range(len(string_splitted)):
        if string_splitted[i].lower() not in spacy_stopwords:
            word = string_splitted[i]

    # fetch the 100 most similar words by vector proximity
    list_words = []
    if word != '':
        try:
            list_words = model.most_similar(positive=[word], topn=100)
        except KeyError:
            list_words = []

    # both answer lists start with the query word itself
    list_close_words = [word]
    list_far_words = [word]

    # pick the 4 options for the multiple-answer question (one or more correct answers)
    count = 0
    for i in range(len(list_words)):
        if word not in list_words[i][0]:
            list_close_words.append(list_words[i][0])
            count = count + 1
            if count == 3:
                break
    list_words.reverse()

    # pick the 4 options for the single-answer question (exactly one correct answer)
    count = 0
    for i in range(len(list_words)):
        if word not in list_words[i][0]:
            list_far_words.append(list_words[i][0])
            count = count + 1
            if count == 3:
                break

    dict_words = {}
    dict_words['list_far_words'] = list_far_words
    dict_words['list_close_words'] = list_close_words
    return dict_words
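# Usage sketch (hypothetical: generate_words is a method, so it would be called
# on whatever quiz object defines it, with the sampled GloVe file under data/):
# quiz = WordQuiz()
# print(quiz.generate_words('the bright sun'))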
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile
from nltk import word_tokenize

glove_file = datapath('./glove.6B.50d.txt')
tmp_file = get_tmpfile("glove_word2vec.txt")
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)


def sent_embedding(sentence, model=model):
    # add check for 1) empty sentence 2) sentence containing all words which are out of vocab
    tokens = [w for w in word_tokenize(sentence.lower()) if w.isalpha()]
    sent_emb = np.mean([model[t] if t in model else model['unk'] for t in tokens], axis=0)
    return sent_emb
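# Usage sketch for sent_embedding (with glove.6B.50d the result is a 50-d vector):
vec = sent_embedding("The cat sat on the mat.")
print(vec.shape)  # (50,)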
from pathlib import Path

import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

from configuration.config import data_dir

glove_file = Path(data_dir) / 'Tencent_AILab_ChineseEmbedding_for_el.txt'
glove_file = datapath(glove_file)
w2v_file = get_tmpfile(Path(data_dir) / 'tmpfile')

glove2word2vec(glove_file, w2v_file)
m = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
m.save(str(Path(data_dir) / 'tencent_embed_for_el2019'))
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
import pyemblib
import gensim
import os

parent = os.path.abspath("../../embeddings/")
path = os.path.abspath("../../embeddings/GoogleNews-vectors-negative300.bin")
glove = os.path.abspath("../../embeddings/glove.840B.300d.txt")
wikitext_path = os.path.abspath("../../embeddings/wiki-news-300d-1M-subword.vec")

# gensim working.
# google_news = KeyedVectors.load_word2vec_format(path, binary=True)
glove2word2vec(glove_input_file=glove,
               word2vec_output_file=os.path.join(parent, "glove.840B.300d_Word2Vec_format.txt"))

# pyemblib not working.
# embedding = pyemblib.read(path, format='Word2Vec', mode=pyemblib.Mode.Binary)
wikitext = pyemblib.read(wikitext_path, format='Word2Vec', mode=pyemblib.Mode.Text)
print("Successful read.")
import os

import gensim
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

PATH_TO_WORD2VEC = os.path.expanduser("~/data/word2vec/GoogleNews-vectors-negative300.bin")
PATH_TO_GLOVE = os.path.expanduser("~/data/glove/glove.840B.300d.txt")

word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC, binary=True)

# In[7]:

tmp_file = "/tmp/glove.840B.300d.w2v.txt"
glove2word2vec(PATH_TO_GLOVE, tmp_file)
glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

# In[1]:

import csv

PATH_TO_FREQUENCIES_FILE = "data/sentence_similarity/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "data/sentence_similarity/doc_frequencies.tsv"


def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader:
import numpy as np

print('Indexing word vectors.')

embeddings_index = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath('/content/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

model.most_similar('post')

result = model.most_similar(positive=['post', 'tweet'])
print("{}: {:.4f}".format(*result[0]))
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors


def loadEmbedding(filename):
    """Load the GloVe vectors, convert them to word2vec format, then load the resulting word2vec model."""
    word2vec_temp_file = 'word2vec_temp.txt'
    glove2word2vec(filename, word2vec_temp_file)
    model = KeyedVectors.load_word2vec_format(word2vec_temp_file)
    return model
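# Usage sketch (the path is a placeholder for any GloVe-format text file):
embed_model = loadEmbedding('glove.6B.100d.txt')
print(embed_model.most_similar('computer', topn=3))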
import configparser

from keras.models import Sequential
from keras.layers import *
from keras.utils.np_utils import to_categorical
from keras.initializers import Constant
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

from train_am_pm_helper import *

labeled_examples, unlabeled_examples = parse_labeled_unlabeled_examples()
am_pm_set = parse_am_pm_set(labeled_examples)
train_am_pm_set, test_am_pm_set = train_test_split_am_pm_set(am_pm_set)
merged_am_pm_set = construct_merged_am_pm_set(train_am_pm_set, 1)

config = configparser.ConfigParser()
config.read("../config.ini")
glove_input_file = config["Paths"]["glove_input_file"]
word2vec_output_file = glove_input_file + '.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

max_features = 20000
sequence_length = 300

# Tokenizer and clean_str are expected to come from train_am_pm_helper
tokenizer = Tokenizer(num_words=max_features, split=' ', oov_token='<unw>', filters=' ')
texts = []
for hour in range(12):
    for sent, lab in am_pm_set[hour]:
        texts.append(clean_str(sent))
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
    lines = fin.read().split("\n")
    lines = [l.split()[0] for l in lines if len(l) > 0]
    return set(lines)


if __name__ == "__main__":
    import time

    from gensim.scripts.glove2word2vec import glove2word2vec

    #voc = get_vocab()
    my_glove_file = "data/my-glove.txt"
    #get_embeddings("/home/diesel/Projects/Datasets/Datasets/glove_data/glove.6B/glove.6B.300d.txt", my_glove_file, voc)

    t0 = time.time()
    glove_w2v_file = "data/glove-w2v.txt"
    glove2word2vec(my_glove_file, glove_w2v_file)

    W2vecextractor = Word2vecExtractor(glove_w2v_file)
    t1 = time.time()
    print("done loading word vectors: ", (t1 - t0) / 60.0)

    doc = "A fisherman was catching fish by the sea. A monkey saw him, and wanted to imitate what he was doing. The man went away into a little cave to take a rest, leaving his net on the beach. The monkey came and grabbed the net, thinking that he too would go fishing. But since he didn't know anything about it and had not had any training, the monkey got tangled up in the net, fell into the sea, and was drowned. The fisherman seized the monkey when he was already done for and said, 'You wretched creature! Your lack of judgment and stupid behaviour has cost you your life!'"

    feature_dict = W2vecextractor.get_doc2vec_feature_dict(doc)
    print(feature_dict)
    t2 = time.time()
    print("execution time:", (t2 - t1) / 60.0)
# Usage: python3 generate_pretrained.py <pretrained_file>
import os
import sys

import numpy as np
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec

vocab = {}
embedding_size = 300
path_to_bin = sys.argv[1]

with open("../dataset/CoNLL-2003/vocab_dict") as f:
    for line in f:
        pairs = line.strip().split()
        vocab[pairs[0]] = int(pairs[1])  # word: index

if "glove" in path_to_bin:
    word2vec_output_file = path_to_bin + '.word2vec'
    if not os.path.exists(word2vec_output_file):
        glove2word2vec(path_to_bin, word2vec_output_file)
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
else:
    model = gensim.models.KeyedVectors.load_word2vec_format(path_to_bin, binary=True)

word_embedding = np.zeros((len(vocab), embedding_size))
for word, index in vocab.items():
    try:
        word_embed = np.asarray(model[word])
    except KeyError:
        word_embed = 2 * np.random.rand(embedding_size) - 1.0  # [-1, 1]
    word_embedding[index] = word_embed
import sys

from gensim.scripts.glove2word2vec import glove2word2vec

if __name__ == "__main__":
    input_vec = sys.argv[1]
    gensim_vec = sys.argv[2]
    glove2word2vec(glove_input_file=input_vec, word2vec_output_file=gensim_vec)
def setupfiles():
    glove2word2vec(glove_path, word2vec_output_file)
import os

import numpy as np
import torch
import torch.optim as optim
from tqdm import tqdm
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors


def main(hidden_dim, batch_size):
    global device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if not os.path.exists('glove.6B.50d.w2v.txt'):
        print("w2v file not found, generating...")
        glove2word2vec(glove_input_file='glove.6B.50d.txt',
                       word2vec_output_file='glove.6B.50d.w2v.txt')
    global w2v
    w2v = KeyedVectors.load_word2vec_format('glove.6B.50d.w2v.txt', binary=False)

    print("Fetching data...")
    train_data, valid_data = fetch_data()  # X_data is a list of pairs (document, y); y in {0,1,2,3,4}

    model = RNN(50, hidden_dim, 5, batch_size)
    model.double()
    model.to(device)

    print("Vectorizing data...")
    train_vecs, train_labs = vectorize_data(train_data)
    valid_vecs, valid_labs = vectorize_data(valid_data)
    print("Finished vectorizing data")

    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=False)
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    iters = 10
    while iters > 0:  # How will you decide to stop training and why
        model.train()
        optimizer.zero_grad()
        minibatch_size = 16
        N = len(train_data)

        perm = np.random.permutation(N)
        train_vecs = [train_vecs[i] for i in perm]
        train_labs = train_labs[perm]

        total = 0
        correct = 0
        epoch = 10 - iters
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                gold_label = train_labs[minibatch_index * minibatch_size + example_index].long()
                predicted_vector = model(train_vecs[minibatch_index * minibatch_size + example_index].to(device))
                predicted_label = torch.argmax(predicted_vector)
                correct += int(predicted_label == gold_label)
                total += 1
                example_loss = model.compute_Loss(predicted_vector.view(1, -1),
                                                  torch.tensor([gold_label]).to(device))
                if loss is None:
                    loss = example_loss
                else:
                    loss += example_loss
            loss = loss / minibatch_size
            loss.backward()
            optimizer.step()

        optimizer.zero_grad()
        N = len(valid_data)
        total = 0
        correct = 0
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                gold_label = valid_labs[minibatch_index * minibatch_size + example_index].long()
                predicted_vector = model(valid_vecs[minibatch_index * minibatch_size + example_index].to(device))
                predicted_label = torch.argmax(predicted_vector)
                correct += int(predicted_label == gold_label)
                total += 1
        print("Validation completed for epoch {}".format(epoch + 1))
        print("Validation accuracy for epoch {}: {}".format(epoch + 1, correct / total))
        #scheduler.step()
        iters -= 1
from pathlib import Path

import numpy as np
import pandas as pd
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from sklearn import tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


def get_predictive_model(classifier):
    # set directories based on run-time environment
    if in_docker == 'True':
        model_dir = '/data/models/'
        data_dir = '/data/data/'
    else:
        model_dir = 'models/'
        data_dir = 'data/'

    # get model and convert to w2v
    glove_input_file = model_dir + 'w2v_glove_300.txt'
    word2vec_output_file = '/tmp/w2v.txt'
    glove2word2vec(glove_input_file, word2vec_output_file)
    model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

    # get stop words
    sw = data_dir + "/stopwords.txt"
    with open(sw) as f:
        stop_words = f.read().splitlines()

    def get_sentence_vector(sentence):
        word_list = word_tokenize(sentence)
        word_list = [word.lower() for word in word_list if word.lower() not in stop_words]
        word_vectors = []
        for x in word_list:
            try:
                w_vec = model.get_vector(x)
                word_vectors.append(w_vec)
            except KeyError:
                pass
        return sum(word_vectors) / len(word_vectors)

    # load prepartitioned train/test sets
    test = pd.read_csv(data_dir + "/test.csv")
    train = pd.read_csv(data_dir + "/train.csv")
    test['vec'] = [get_sentence_vector(x) for x in test.text]
    train['vec'] = [get_sentence_vector(x) for x in train.text]

    train_grouped_abbr = train.groupby('abbrev')
    test_grouped_abbr = test.groupby('abbrev')

    # load full data set
    frames = [test, train]
    df = pd.concat(frames)

    # Loop through different abbreviations
    for abbr in train.abbrev.unique():
        train_abbr = train_grouped_abbr.get_group(abbr)
        test_abbr = test_grouped_abbr.get_group(abbr)

        X_train = np.array(list(train_abbr.vec))
        y_train = train_abbr.expansion
        X_test = np.array(list(test_abbr.vec))
        y_test = test_abbr.expansion

        if classifier == 'svm':
            # set up SVM
            clf = SVC(C=1.0, kernel='linear', degree=1).fit(X_train, y_train)
        elif classifier == 'log':
            clf = LogisticRegression().fit(X_train, y_train)
        elif classifier == 'mlp':
            clf = MLPClassifier().fit(X_train, y_train)
        elif classifier == 'bag':
            clf = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1)).fit(X_train, y_train)
        elif classifier == 'boost':
            num_trees = 70
            clf = AdaBoostClassifier(n_estimators=num_trees, random_state=1032).fit(X_train, y_train)
        elif classifier == 'rf':
            clf = RandomForestClassifier().fit(X_train, y_train)
        else:
            print('INVALID OPTION!')

        pred = clf.predict(X_test)

        output_dir = Path(data_dir + "output")
        output_dir.mkdir(parents=True, exist_ok=True)
        (pd.DataFrame({'predictions': pred})).to_csv(output_dir / "{}_{}.csv".format(classifier, abbr))

        cm = confusion_matrix(y_test, pred, labels=list(set(df.expansion)))
        print()
        print("MODEL -> ", classifier)
        print("##" * 20)
        print(" " * 20 + abbr)
        print("##" * 20)
        print(classification_report(y_test, pred))
        print()
        print('examples (first 5 cases)\t\t\t\t\t\ttrue_abbr\t\t\tpred_abbr')

        # Print first 5 cases
        i = 0
        for input_row, true_abbr, pred_abbr in zip(train_abbr.iterrows(), y_test, pred):
            sn_start = max(input_row[1].start - 25, 0)
            sn_end = min(input_row[1].end + 25, len(input_row[1].text))
            example_text = input_row[1].text[sn_start:sn_end]
            print(f'... {example_text} ...\t{true_abbr:<35}\t{pred_abbr}')
            if i == 5:
                break
            i += 1
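# Usage sketch (hypothetical: assumes the module-level in_docker flag is set and
# the models/ and data/ files described above exist):
# get_predictive_model('svm')
# get_predictive_model('rf')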
inverted_idx = data_dict['inverted_idx']
qlist_seg = data_dict['qlist_seg']
alist = data_dict['alist']
X = data_dict['X']

# Load NLTK's built-in stopword list; it is fairly generic, so a narrow domain
# may still call for a hand-curated list
sw = set(stopwords.words('english'))
# for questions, these words arguably should not be removed
sw -= {'who', 'when', 'why', 'where', 'how'}
# also strip a few punctuation-like tokens
sw.update(['\'s', '``', '\'\''])
ps = PorterStemmer()

# convert GloVe vectors to word2vec format once, then load them
if not os.path.exists('./data/glove2word2vec.6B.100d.txt'):
    _ = glove2word2vec('./data/glove.6B.100d.txt', './data/glove2word2vec.6B.100d.txt')
model = KeyedVectors.load_word2vec_format('./data/glove2word2vec.6B.100d.txt')


def text_preprocessing(text):
    """Process a single text.

    text: str
    return: list of tokens after segmentation
    """
from gensim.scripts.glove2word2vec import glove2word2vec


def transform_glove_in_word2vec(glove_input_file: str, word2vec_output_file: str):
    glove2word2vec(glove_input_file, word2vec_output_file)
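# Usage sketch (file names are placeholders):
transform_glove_in_word2vec('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt')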
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors


def build_word2vec():
    info('converting from glove to word2vec format')
    glove2word2vec(glove_file, w2v_file)
    info('loading word2vec model')
    model = KeyedVectors.load_word2vec_format(w2v_file, binary=False)
    model.save(w2v_model)
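# Usage sketch (assumption: these module-level paths are defined elsewhere in the project):
# glove_file = 'glove.6B.300d.txt'
# w2v_file = 'glove.6B.300d.w2v.txt'
# w2v_model = 'glove.6B.300d.w2v.model'
# build_word2vec()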