def __init__(self, sentences=None, corpus_file_path: str = None, dim: int = 100,
             saved_model_path: str = None, save_path=None) -> None:
    """Constructor for FastTextEmbedding classes (RAII)."""
    super().__init__()
    if sentences is not None:
        self._impl = FastText(size=dim, sentences=sentences)
    elif corpus_file_path is not None:
        self._impl = FastText(size=dim, corpus_file=corpus_file_path)
    elif saved_model_path is not None:
        # load from a saved Facebook FastText embedding file (*.bin)
        self._impl = load_facebook_model(saved_model_path)
    else:
        raise ValueError(
            'one of sentences, corpus_file_path or saved_model_path must not be None')
    self.vocab = self._impl.wv.index2word
    self.vocab_size = len(self.vocab)
    self.embedding_matrix = []
    for word in self.vocab:
        self.embedding_matrix.append(self._impl.wv[word])
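# --- Usage sketch (not from the original source): FastTextEmbedding is the class
# this constructor belongs to; the toy sentences are assumptions. Repeating them
# keeps every word above gensim's default min_count=5 so the vocab is non-empty.
toy_sentences = [['hello', 'world'], ['hello', 'fasttext']] * 10
emb = FastTextEmbedding(sentences=toy_sentences, dim=100)
print(emb.vocab_size, len(emb.embedding_matrix))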
def loadWikiModel(self):
    """
    Build the FastText word-embedding model from Korean Wikipedia data.
    : if a previously saved model exists, return it
    : if the Korean Wikipedia vectors (./data/cc.ko.300.bin.gz) are missing, download them
    : if no saved model exists, load the pretrained Facebook binary
    : save the result to ./we_model
    - export : CONST.wiki_model_path
    """
    model = None
    if not os.path.isfile(CONST.wiki_model_path):
        print('🐈 No saved word-embedding model found.')
        if not os.path.isfile(CONST.wiki_data_path):
            print('🐈 Starting download of the data needed for the word-embedding model.')
            downloadByURL(CONST.wiki_data_url, CONST.wiki_data_path)
        print('🐈 Loading the pretrained word-embedding model.')
        model = fasttext.load_facebook_model(CONST.wiki_data_path)
        print('🐈 Saving the word-embedding model.')
        model.save(CONST.wiki_model_path)
    else:
        model = FastText.load(CONST.wiki_model_path)
        # print(f'vocab size : {len(model.wv.vocab)}')  # 2,000,000
    return model
def __init__(
        self,
        data_path='/data2/mrony/submission_soccer/data/soccer/',
        vec_dim=300,
        # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin'):
        fasttext_model='/data2/mrony/submission_soccer/data/wiki.simple.bin'):
    self.data_path = data_path
    # self.max_similarity = 85
    self.vec_dim = vec_dim
    cap_path = datapath(fasttext_model)
    self.word_emb = load_facebook_model(cap_path)
    self.stop = set(stopwords.words('english'))
    self.punc = string.punctuation
    self.er_dict, self.global_ent, self.eo_dict, self.objlist = self.get_kg(
        self.data_path + 'KG/')
    self.args = get_args()
    self.train_dataset = self.get_data('train')
    self.val_dataset = self.get_data('val')
    self.test_dataset = self.get_data('test')
    self.entitites = [d['e'] for d in self.train_dataset]
    self.entitites = list(set(self.entitites))
    print("self.entitites: ", self.entitites)
    self.entities = [tmp.lower() for tmp in self.entitites]
    self.global_ent = [unidecode(tmp.lower()) for tmp in self.global_ent]
    print("self.global_ent: ", self.global_ent)
    self.allent = list(set(self.global_ent + self.entitites + self.objlist))
    # Create the vocab
    self.vocab = defaultdict(float)
    self.get_vocab(self.train_dataset)
    self.get_vocab(self.test_dataset)
    self.get_vocab(self.val_dataset)
    # Add additional tokens to vocab
    self.vocab[self.args.unk_tok] += 1.0
    self.vocab[self.args.sos_tok] += 1.0
    self.vocab[self.args.eou_tok] += 1.0
    self.vocab[self.args.eos_tok] += 1.0
    self.vocab[self.args.no_ent_tok] += 1.0
    # Reserve index 0 for the pad token
    self.stoi = dict(zip(self.vocab.keys(), range(1, len(self.vocab) + 1)))
    self.stoi[self.args.pad_tok] = 0
    self.itos = {v: k for k, v in self.stoi.items()}
    print(len(self.stoi))
    self.n_words = len(self.stoi)
    self.vectors = np.zeros((len(self.stoi), vec_dim))
    for w, w2i in self.stoi.items():
        self.vectors[w2i] = self.get_w2v(w)
    self.ent_dict = dict(zip(self.allent, range(0, len(self.allent))))
    self.ent_dict["<no_ent>"] = len(list(self.ent_dict.keys()))
def load_fasttext_model(model_name: str) -> FastText:
    if normpath(model_name).lower().endswith('.bin'):
        fasttext_model = load_facebook_model(datapath(normpath(model_name)))
    else:
        fasttext_model = FastText.load(datapath(normpath(model_name)))
    # Precompute L2-normalized vectors in place to save memory.
    fasttext_model.init_sims(replace=True)
    return fasttext_model
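# --- Usage sketch (the paths are placeholders, not project files). The helper
# picks its loader from the extension: *.bin goes through load_facebook_model,
# any other extension is treated as a native gensim save.
model = load_fasttext_model('wiki.simple.bin')    # Facebook binary
# model = load_fasttext_model('my_model.model')   # native gensim save
print(model.wv.most_similar('king', topn=3))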
def get_embeddings(self,
                   corpus_path: str,
                   model_path: str = 'models',
                   transfer: bool = False,
                   overwrite: bool = False):
    """
    Build FastText embeddings for a given corpus if no embeddings exist yet
    or existing embeddings are to be overwritten. Loads and returns existing
    embeddings if they can be detected. TODO: implement that last bit.

    :param corpus_path: The path to the text corpus used to generate embeddings.
    :param model_path: The path where the word embeddings are to be stored.
    :param transfer: Encodes whether the new embeddings should be added "on top" of existing embeddings.
    :param overwrite: If a trained model already exists but the user still wants to train one from scratch, this is true.
    """
    # Make sure the target directory exists before listing it.
    os.makedirs("models/transfer_learned", exist_ok=True)
    if overwrite or len(os.listdir("models/transfer_learned")) == 0:
        print("Loading pretrained model...")
        # Forward-slash path: "models\wiki.en.bin" relies on backslash escapes
        # and breaks outside Windows.
        model = load_facebook_model("models/wiki.en.bin")
        model.build_vocab(sentences=self.corpus, update=True)
        print("Successfully loaded pretrained model!\nStart transfer-learning...")
        model.train(sentences=self.corpus,
                    total_examples=len(self.corpus),
                    epochs=5)
        print("Successfully finished transfer learning!")
        model.save("models/transfer_learned/big_model.model")
    else:
        print("Loading word embeddings...")
        model = FastText.load("models/transfer_learned/big_model.model")
        print("Word embeddings loaded!")
    return model
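# --- The transfer-learning core of get_embeddings in isolation (a sketch; the
# sentence list and the .bin path are assumptions, not project files):
from gensim.models.fasttext import load_facebook_model

sentences = [['new', 'domain', 'words'], ['more', 'domain', 'text']] * 10
ft = load_facebook_model('models/wiki.en.bin')
ft.build_vocab(sentences=sentences, update=True)  # grow the vocab in place
ft.train(sentences=sentences, total_examples=len(sentences), epochs=5)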
def __init__(self): print( f"{datetime.now():%Y-%m-%d %H:%M:%S} FastText model loading started" ) self._model = load_facebook_model(_MODEL_PATH) print( f"{datetime.now():%Y-%m-%d %H:%M:%S} FastText model loading ended")
def load_data(base_path, language, drop_columns, unreliable_sampling): datasets = {} for ds in tqdm(["train_reliable", "train_unreliable", "dev", "test"], file=sys.stdout): if ds == "train_unreliable" and unreliable_sampling == 0: continue df = pd.read_parquet( path.join(base_path, f"{language}", f"{ds}.parquet")).drop(drop_columns, axis=1, errors="ignore") if ds == "train_unreliable" and 0 < unreliable_sampling < 1: df = df.groupby(["category"]).apply(lambda cat: cat.sample( frac=unreliable_sampling)).reset_index(drop=True) elif ds == "train_unreliable" and unreliable_sampling > 1: df = df.groupby(["category"]).apply(lambda cat: cat.sample(n=int( unreliable_sampling))).reset_index(drop=True) if ds == "train_reliable": datasets["train"] = df elif ds == "train_unreliable": datasets["train"] = pd.concat([datasets["train"], df], ignore_index=True) else: datasets[ds] = df w2v = load_facebook_model( path.join(base_path, f"{language}", "fasttext.bin")) return datasets, w2v
def __init__( self, data_path='data/soccer/', vec_dim=300, # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin'): fasttext_model='/data/dchaudhu/soccerbot_acl/vocab/wiki.en.bin'): self.data_path = data_path self.max_similarity = 85 self.vec_dim = vec_dim cap_path = datapath(fasttext_model) self.word_emb = load_facebook_model(cap_path) # print (self.max_er_vec) self.stop = set(stopwords.words('english')) self.punc = string.punctuation self.train_dataset = self.get_data('train') self.val_dataset = self.get_data('val') self.test_dataset = self.get_data('test') self.max_er_vec = [] # max er vector combination size for dat in self.train_dataset: self.max_er_vec.append(sum(len(v) for k, v in dat['kgER'].items())) self.max_out_reln = np.max(self.max_er_vec) self.inp_graph_max_size = np.max( [len(getER_vec(kg['kgER'])) for kg in self.train_dataset]) print('input graph size:' + str(self.inp_graph_max_size)) print(self.max_out_reln) self.objects = ['o' + str(j) for j in range(self.max_out_reln)] self.args = get_args() # Create vocabulary and word2id self.vocab = defaultdict(float) self.get_vocab(self.train_dataset) self.get_vocab(self.test_dataset) self.get_vocab(self.val_dataset) self.vocab[self.args.unk_tok] += 1.0 self.vocab[self.args.sos_tok] += 1.0 self.vocab[self.args.eos_tok] += 1.0 for o in self.objects: self.vocab[o] += 1.0 self.stoi = dict(zip(self.vocab.keys(), range(0, len(self.vocab)))) # add additional tokens # self.stoi[self.args.unk_tok] = len(self.stoi) # self.stoi[self.args.sos_tok] = len(self.stoi) # self.stoi[self.args.eos_tok] = len(self.stoi) # print(len(self.stoi)) # self.itos = {v: k for k, v in self.stoi.items()} # for j in range(self.max_out_reln): # self.stoi['o'+str(j)] = len(self.stoi)+1 # del self.stoi self.itos = {v: k for k, v in self.stoi.items()} print(len(self.stoi)) self.n_words = len(self.stoi) self.vectors = np.zeros((len(self.stoi), vec_dim)) for w, w2i in self.stoi.items(): if w2i < self.stoi[self.args.eos_tok]: self.vectors[w2i] = self.word_emb.wv[w]
def load(self, file_name="SO_fasttext_vectors_200.bin"):
    start = time.time()
    file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             file_name)
    logging.info(f"Loading model from {file_path}")
    self.model = load_facebook_model(file_path)
    end = time.time()
    logging.info(">> model loaded")
    logging.info(">> took %.2f seconds" % (end - start))
def load_bin_model(modelPath):
    print("Loading the model...")
    # path = os.getcwd() + modelPath
    model = load_facebook_model(modelPath)
    # model = FastText.load_fasttext_format(path, full_model=True)  # legacy loader
    print("Loading done.")
    print_model_info(modelPath, model)
    return model
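# --- Usage sketch (the path is a placeholder):
model = load_bin_model('wiki.en.bin')
print(model.wv.similarity('cat', 'dog'))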
def get_new_model():
    model_names = [
        'word2vec-skipgram', 'word2vec-cbow', 'glove', 'fasttext-skipgram',
        'fasttext-cbow', 'en-glove', 'en-fasttext-skipgram', 'en-fasttext-cbow'
    ]
    model = None
    while model is None:
        chosen_model = input('choose one of ' + str(model_names) + ': ')
        if chosen_model in model_names:
            print("loading model...")
            if chosen_model == 'word2vec-skipgram':
                model = Word2Vec.load('models/word2vec-skipgram/word2vec.model',
                                      mmap='r').wv
            elif chosen_model == 'word2vec-cbow':
                model = Word2Vec.load('models/word2vec-cbow/word2vec.model',
                                      mmap='r').wv
            elif chosen_model == 'glove':
                vectors_file = 'models/simplewiki_glove.txt'
                tmp_file = "models/gensim_glove_vectors.txt"
                glove2word2vec(glove_input_file=vectors_file,
                               word2vec_output_file=tmp_file)
                model = KeyedVectors.load_word2vec_format(tmp_file, binary=False)
            elif chosen_model == 'en-glove':
                vectors_file = 'models/enwiki_glove.txt'
                tmp_file = "models/gensim_glove_vectors.txt"
                glove2word2vec(glove_input_file=vectors_file,
                               word2vec_output_file=tmp_file)
                model = KeyedVectors.load_word2vec_format(tmp_file, binary=False)
            elif chosen_model == 'fasttext-skipgram':
                model = fasttext.load_facebook_model(
                    'models/simplewiki_fasttext_skipgram.bin').wv
            elif chosen_model == 'fasttext-cbow':
                model = fasttext.load_facebook_model(
                    'models/simplewiki_fasttext_cbow.bin').wv
            elif chosen_model == 'en-fasttext-skipgram':
                model = fasttext.load_facebook_model(
                    'models/enwiki_fasttext_skipgram.bin').wv
            elif chosen_model == 'en-fasttext-cbow':
                model = fasttext.load_facebook_model(
                    'models/enwiki_fasttext_cbow.bin').wv
        else:
            print('unrecognized model name. choose again')
    return model
def __init__( self, data_path='data/incar/', vec_dim=300, # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin'): fasttext_model='/home/deep/Emphatic_VW/emotion_classifer-cnn/vectors/wiki.en.bin' ): self.data_path = data_path # self.max_similarity = 85 self.vec_dim = vec_dim cap_path = datapath(fasttext_model) self.word_emb = load_facebook_model(cap_path) # print (self.max_er_vec) self.stop = set(stopwords.words('english')) self.punc = string.punctuation self.er_dict, self.global_ent, self.eo_dict = self.get_kg( self.data_path + 'KG/') self.args = get_args() self.train_dataset = self.get_data('train') self.val_dataset = self.get_data('val') self.test_dataset = self.get_data('test') self.entitites = [d['e'] for d in self.train_dataset] self.entitites = list(set(self.entitites)) # Create the vocab self.vocab = defaultdict(float) # self.vocab[pos] self.get_vocab(self.train_dataset) self.get_vocab(self.test_dataset) self.get_vocab(self.val_dataset) # Add additional tokens to vocab self.vocab[self.args.unk_tok] += 1.0 self.vocab[self.args.sos_tok] += 1.0 # self.vocab[self.args.ent_tok] += 1.0 if not self.args.use_bert: self.vocab[self.args.eou_tok] += 1.0 self.vocab[self.args.eos_tok] += 1.0 self.stoi = dict(zip(self.vocab.keys(), range(1, len(self.vocab) + 1))) self.stoi[self.args.pad_tok] = 0 self.itos = {v: k for k, v in self.stoi.items()} print(len(self.stoi)) self.n_words = len(self.stoi) self.vectors = np.zeros((len(self.stoi), vec_dim)) for w, w2i in self.stoi.items(): # if w2i < self.stoi[self.args.eos_tok]: self.vectors[w2i] = self.get_w2v(w) self.ent_dict = dict( zip(list(self.entitites), range(0, len(self.entitites))))
def __init__(self, index_name, es_text_field, save_model_pth, num_training_epochs, use_analyzed_field=False, init_from_pretrained=False, pretrained_path=None, bsize=128, emb_dim=300, window=5, min_count=1, workers=12, max_vocab_size=None, must_have_fields=None, must_not_have_fields=None): assert (not init_from_pretrained and not pretrained_path) or (init_from_pretrained and pretrained_path), \ "If init_from_pretrained=True, pretrained_path must be provided" if must_have_fields is None: must_have_fields = [] if must_not_have_fields is None: must_not_have_fields = [] # General parameters self.index_name = index_name self.es_text_field = es_text_field self.bsize = bsize self.use_analyzed_field = use_analyzed_field self.save_model_pth = save_model_pth self.num_training_epochs = num_training_epochs self.must_have_fields = must_have_fields + [es_text_field] self.must_not_have_fields = must_not_have_fields # Training parameters self.emb_dim = emb_dim # May be updated below if using a pretrained model self.window = window self.min_count = min_count self.workers = workers self.max_vocab_size = max_vocab_size # Initialize model if init_from_pretrained and pretrained_path: try: self.model = load_facebook_model(pretrained_path) except NotImplementedError: self.model = FastText.load(pretrained_path) except Exception as e: raise RuntimeError("Fasttext model is neither facebook nor gensim model: {}".format(e)) self.emb_dim = self.model.vector_size print('Ignoring arg `emb_dim` since loading a pretrained model. Emb_dim is set to {}'.format(self.emb_dim)) # Update parameters self.model.workers = self.workers # Start iterating corpus and building vocab print('Updating vocabulary with first pass over corpus') self.model.build_vocab( sentences=TextFieldIterator( index_name, es_text_field, self.must_have_fields, self.must_not_have_fields, bsize=self.bsize, use_analyzed_field=self.use_analyzed_field ), update=True) else: # More parameters can be exposed when creating a new model; # https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText # Instantiate model self.model = FastText(size=emb_dim, window=window, min_count=min_count, workers=workers, max_vocab_size=max_vocab_size) # Start iterating corpus and building vocab self.model.build_vocab( sentences=TextFieldIterator( index_name, es_text_field, self.must_have_fields, self.must_not_have_fields, bsize=self.bsize, use_analyzed_field=self.use_analyzed_field, ) )
def prepare_wiki_fasttext():
    tokenized_corpus = [doc for _, doc in text_from_file()]
    t = time.time()
    ft_model = fasttext.load_facebook_model(fasttext_wiki_base_model)
    t2 = time.time()
    print(f"Took {t2-t:.1f}s to load the model")
    t = time.time()
    ft_model.build_vocab(tokenized_corpus, update=True)
    t2 = time.time()
    print(f"Took {t2-t:.1f}s to update the model vocabulary with the corpus")
    ft_model.save(fasttext_wiki_our_prepared_model)
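# --- Entry-point sketch (assumption: fasttext_wiki_base_model and
# fasttext_wiki_our_prepared_model are module-level path constants):
if __name__ == '__main__':
    prepare_wiki_fasttext()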
def __init__(self, dataset, conv2kg, kgs, fasttext_emb_path): logger.info("Initializing Memory Generator ....") self.conv2kg = conv2kg self.kgs = kgs self.mapping = json.load(open("data/" + dataset + "/ERmapping.json")) self.maxEntity, self.maxRel = self.read_dataset(dataset) logger.info("MaxENT: " + str(self.maxEntity) + " maxREL: " + str(self.maxRel)) self.matrix_dim = self.maxEntity + self.maxRel self.word_emb = load_facebook_model( datapath(os.getcwd() + "/" + fasttext_emb_path)) logger.info("READY: Memory Generator")
def _setup_model(self): if self.verbose: print('Loading model') loaded_model = load_facebook_model(self.default_model) self.model = loaded_model.wv if self.verbose: print('Model loaded') self.similarity_index = WordEmbeddingSimilarityIndex(self.model) self.model_ready = True
def load_pretrained_vectors(dest_path, file_name="wiki.simple.bin"):
    """ Method that loads fastText vectors. Downloads if it doesn't exist.

    Args:
        dest_path(str): Path to the directory where fastText vectors exist or
            will be downloaded.
        file_name(str): Name of the fastText file.

    Returns:
        gensim.models.fasttext.FastText: Loaded fastText model
    """
    file_path = _maybe_download_and_extract(dest_path, file_name)
    model = load_facebook_model(file_path)

    return model
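# --- Usage sketch (the directory is a placeholder; _maybe_download_and_extract
# is the helper referenced above):
model = load_pretrained_vectors('data', file_name='wiki.simple.bin')
print(model.wv['apple'][:5])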
def train_model(self, corpus): if self.model is not None: return self.model logging.info(f"Start fine tuning model {self.pretrained_model_path}") self.model = load_facebook_model(self.pretrained_model_path) self.model.min_count = 1 self.model.build_vocab(sentences=corpus.data, update=True) self.model.train( sentences=corpus.data, total_examples=len(corpus.data), callbacks=[EpochLogger("Finetuned")], epochs=self.epochs) self.model = self.model.wv self.model.init_sims(True) logging.info(f"Finished fine tuning model {self.pretrained_model_path}") return self.model
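# --- Usage sketch (assumptions: `trainer` is an instance of the class that owns
# train_model, and the corpus wrapper only needs a .data list of token lists):
class ToyCorpus:
    data = [['fine', 'tune', 'me'], ['small', 'domain', 'corpus']] * 10

vectors = trainer.train_model(ToyCorpus())  # returns L2-normalized KeyedVectors
print(vectors.most_similar('domain', topn=3))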
def findSimilar2():
    # zh_model = FastText2.load_fasttext_format("wiki.zh.bin")
    zh_model = FastText3.load_facebook_model("wiki.zh.bin")
    # print('程序员' in zh_model.wv.vocab)
    # words = []
    # for word in zh_model.words:
    #     words.append(word)
    # print("Total number of words in the pretrained model: {}".format(len(words)))
    # print(words[:10])
    find_similar_to = "程序员"  # "programmer"
    for similar_word in zh_model.wv.similar_by_word(find_similar_to, topn=5):
        print("Word: {0}, Similarity: {1:.2f}".format(similar_word[0],
                                                      similar_word[1]))
def get_embedding_matrix_from_fasttext_model(path, word2idx, embed_dim=300,
                                             MAX_NB_WORDS=200000):
    model = load_facebook_model(path)
    nb_words = min(MAX_NB_WORDS, len(word2idx))
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word2idx.items():
        if i >= nb_words:
            continue
        try:
            embedding_vector = model.wv[word.strip('_')]
        except KeyError:
            # Words whose character n-grams are all unknown stay all-zeros.
            continue
        embedding_matrix[i] = embedding_vector
    return embedding_matrix
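# --- Usage sketch with a toy vocabulary (the .bin path is a placeholder):
word2idx = {'<pad>': 0, 'apple': 1, 'banana': 2}
matrix = get_embedding_matrix_from_fasttext_model('cc.en.300.bin', word2idx,
                                                  embed_dim=300)
print(matrix.shape)  # (3, 300)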
def update_model(cls, corpus_path, pmodel_name, use_iknow_entities=True, tokenize_concepts=True, installdir=''):
    """
    Updates an already existing model by continuing its training on a new corpus.

    Parameters
    -----------
    corpus_path (str) - path to the corpus being used to update the model
    pmodel_name (str) - name of the model to be updated ('.bin' is appended if missing)

    Return
    -----------
    True if model was updated, else False

    Throws
    -----------
    FileNotFoundError - if corpus or model not found
    """
    model_path = installdir + IKFastTextModeling.__PATH_PREFIX__
    try:
        if pmodel_name[-4:] != '.bin':
            pmodel_name = pmodel_name + '.bin'
        path = os.path.join(model_path, pmodel_name)
        model = ft.load_facebook_model(path)
        super().update_model(corpus_path, model, use_iknow_entities, tokenize_concepts)
        # Clear the stored model file first, since gensim doesn't overwrite it
        os.remove(path)
        ft.save_facebook_model(model, path=path)
        return True
    except FileNotFoundError as err:
        raise FileNotFoundError(
            "Model could not be updated, check specified corpus and model names"
        ) from err
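# --- Usage sketch (the corpus path and model name are placeholders; '.bin' is
# appended to the model name automatically):
IKFastTextModeling.update_model('data/new_corpus.txt', 'my_domain_model')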
def make_fasttext_pretrained(target_corpus, pretrained_model):
    target_corpus_name = target_corpus + '.txt'
    corpus_path = os.path.join(CONFIG.DATA_PATH, "corpus", target_corpus_name)
    fb_path = os.path.join(CONFIG.EMBEDDING_PATH, "facebook", pretrained_model)
    sentences = word2vec.LineSentence(corpus_path)
    embedding_model = load_facebook_model(fb_path)  # load pretrained model
    print("embedding started")
    embedding_model.build_vocab(sentences=sentences, update=True)
    print(embedding_model.epochs)
    print("train started")
    embedding_model.train(sentences=sentences,
                          total_examples=embedding_model.corpus_count,
                          epochs=10)
    print("train completed")
    model_name = "FASTTEXT_" + target_corpus + "_pretrained.model"
    embedding_model.wv.save(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    print("embedding completed")
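# --- Usage sketch (assumption: 'news_corpus' and 'cc.ko.300.bin' are
# placeholder names under the CONFIG paths used above):
make_fasttext_pretrained('news_corpus', 'cc.ko.300.bin')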
def __init__(self):
    """
    When constructed, this class will automatically download pretrained
    English word embeddings from Facebook's FastText and save them for
    later use.

    WARNING: The pretrained vector file is 6GB, so make sure you download
    it on reliable WiFi.
    """
    if not path.exists(self.ZIP_FILE) and not path.exists(self.MODEL_FILE):
        print("Downloading model...", file=sys.stderr)
        # Download the archive; extracting it below yields MODEL_FILE.
        request.urlretrieve(self.DOWNLOAD_LINK, self.ZIP_FILE)
    if not path.exists(self.MODEL_FILE):
        print("Unzipping model...", file=sys.stderr)
        with zipfile.ZipFile(self.ZIP_FILE, "r") as zip:
            zip.extract(self.MODEL_FILE)
    print("Reading file...", file=sys.stderr)
    self.fb_model = load_facebook_model(
        path.join(os.getcwd(), self.MODEL_FILE))
    self.model = self.fb_model.wv
def main(): random.seed(142) np.random.seed(142) parser = ArgumentParser() parser.add_argument('-s', '--src', dest='source_fasttext_name', type=str, required=True, help='A binary file with a source Facebook-like FastText model (*.bin).') parser.add_argument('-d', '--dst', dest='destination_fasttext_name', type=str, required=True, help='A binary file with a destination Facebook-like FastText model (*.bin) after ' 'its fine-tuning.') parser.add_argument('-c', '--cache_dir', dest='cache_dir', type=str, required=True, help='A directory with cached data for training.') parser.add_argument('--epochs', dest='max_epochs', type=int, required=False, default=5, help='A number of epochs to train the FastText model.') args = parser.parse_args() source_fasttext_name = os.path.normpath(args.source_fasttext_name) assert os.path.isfile(source_fasttext_name), 'File `{0}` does not exist!'.format(source_fasttext_name) destination_fasttext_name = os.path.normpath(args.destination_fasttext_name) destination_fasttext_dir = os.path.dirname(destination_fasttext_name) assert os.path.isdir(destination_fasttext_dir), 'Directory `{0}` does not exist!'.format(destination_fasttext_dir) cache_data_dir = os.path.normpath(args.cache_dir) assert os.path.isdir(cache_data_dir), 'Directory `{0}` does not exist!'.format(cache_data_dir) ruwordnet_occurrences_file = os.path.join(cache_data_dir, 'ruwordnet_occurrences_in_texts.json') submission_occurrences_file = os.path.join(cache_data_dir, 'submission_occurrences_in_texts.json') assert os.path.isfile(ruwordnet_occurrences_file), 'File `{0}` does not exist!'.format(ruwordnet_occurrences_file) assert os.path.isfile(submission_occurrences_file), 'File `{0}` does not exist!'.format(submission_occurrences_file) texts = load_text_corpus(ruwordnet_occurrences_file) + load_text_corpus(submission_occurrences_file) print('{0} texts have been loaded...'.format(len(texts))) fasttext_model = load_facebook_model(datapath(source_fasttext_name)) print('The FastText model has been loaded...') fasttext_model.workers = max(os.cpu_count(), 1) fasttext_model.min_count = 1 fasttext_model.callbacks = [EpochLogger()] + list(fasttext_model.callbacks) fasttext_model.build_vocab(texts, update=True) fasttext_model.train(texts, total_examples=len(texts), epochs=args.max_epochs) fasttext_model.callbacks = () fasttext_model.save(destination_fasttext_name)
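# --- Invocation sketch for the fine-tuning script above (the script name and
# all paths are placeholders):
# python finetune_fasttext.py -s cc.ru.300.bin -d cc.ru.300.finetuned.bin \
#     -c cache_dir --epochs 5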
def init__usif(self):
    """
    Train and save uSIF embedding model.
    :return:
    """
    embeddings = []
    i = 0
    # load dump of review analysis sentences
    with open(self.embedding_path, 'r') as f:
        for line in f:
            line = line[:-1]
            line = line.lower()
            line = re.sub(r'\d+', '', line)     # numbers
            line = re.sub(r'\p{P}+', '', line)  # punctuation; \p{P} needs the `regex` module (import regex as re)
            embeddings.append((line.split(), i))
            i += 1
    print('Total lines {}'.format(i))
    # get fasttext vectors and train the uSIF model on them
    ft_model = load_facebook_model(self.fasttext_path)
    self.usif_model = uSIF(ft_model, workers=16)
    self.usif_model.train(embeddings)
    self.usif_model.save(self.fse_path)
users = list(set(ratings[['user']].values.flatten())) all_items = set(pd.read_csv('../datasets/goodbooks-10k-master/books.csv').index.values.flatten()) items_tags = pd.read_csv('../datasets/books-tags.csv', encoding='latin-1') if MODE == Technique.WORD_2_VEC: wv = api.load('word2vec-google-news-300') elif MODE == Technique.DOC_2_VEC: wv = Doc2Vec.load('../datasets/goodbooks-10k-master/doc2vec/doc2vec.bin') elif MODE == Technique.TFIDFMODEL: wv = api.load("glove-wiki-gigaword-50") else: wv = load_facebook_model('../datasets/goodbooks-10k-master/fasttext/wiki.simple.bin') items_content, items_vectors = build_items_content() output_path = OUTPUT_FOLDER + OUTPUT_FILE_NAME + pick_tag() + '.csv' # write csv doc header f = open(output_path, 'w') f.write('user,item,score\n') f.close() for user in users: print(user) user_ratings = ratings.query('user == @user') rated_items = set(user_ratings[['item']].values.flatten())
def read_embedding(switch, data_addr):
    if switch == 'FastText':
        fb_model = fasttext.load_facebook_model(path=data_addr)
        return fb_model
    raise ValueError("Unsupported embedding type: {}".format(switch))
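# --- Usage sketch (the path is a placeholder):
fb_model = read_embedding('FastText', 'cc.en.300.bin')
print(fb_model.wv['hello'].shape)  # (vector_size,)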
def load_fasttext_model(self): fasttext = load_facebook_model( settings.PATH_TO_FASTTEXT_PT_EMBEDDINGS, encoding="utf-8") return fasttext
def __init__(self, data_path='data/incar/', pretrained_weights='bert-base-uncased', use_bert=True, fasttext_model='data/wiki.simple.bin', batch_size=32, max_sent_len=20, vec_dim=300, max_resp_len=15, gpu=False, domain='incar'): # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin', self.args = get_args() self.data_path = data_path self.batch_size = batch_size self.max_sent_len = max_sent_len self.max_out_len = max_resp_len self.vec_dim = vec_dim self.gpu = gpu self.n_graph_features = 1 cap_path = datapath(fasttext_model) self.word_emb = load_facebook_model(cap_path) self.tokenizer = BertTokenizer.from_pretrained(pretrained_weights) # SRC and TRG vocabularies self.src_vocab = defaultdict(float) self.trg_vocab = defaultdict(float) self.trg_vocab[self.args.sos_tok] = 1.0 self.trg_vocab[self.args.eos_tok] = 1.0 self.trg_vocab[self.args.unk_tok] = 1.0 self.src_vocab[self.args.unk_tok] = 1.0 # Load Datasets and preprocess files self.train_dataset = np.load(self.data_path + 'preproc_files_kg/' + 'train.npy', allow_pickle=True) # random.shuffle(self.train_dataset) self.val_dataset = np.load(self.data_path + 'preproc_files_kg/' + 'val.npy', allow_pickle=True) self.test_dataset = np.load(self.data_path + 'preproc_files_kg/' + 'test.npy', allow_pickle=True) # Create vocabularies self.create_vocab(self.train_dataset) self.create_vocab(self.val_dataset) self.create_vocab(self.test_dataset) self.src_stoi = dict( zip(self.src_vocab.keys(), range(0, len(self.src_vocab.keys())))) self.src_itos = {v: k for k, v in self.src_stoi.items()} self.trg_stoi = dict( zip(self.trg_vocab.keys(), range(0, len(self.trg_vocab.keys())))) self.trg_itos = {v: k for k, v in self.trg_stoi.items()} # self.stoi = np.load(self.data_path+'preproc_files_kg/'+'stoi.npy', allow_pickle=True).item() self.etoi = np.load(self.data_path + 'preproc_files_kg/' + 'etoi.npy', allow_pickle=True).item() # self.vectors = np.load(self.data_path+'preproc_files_kg/'+'wemb.npy', allow_pickle=True) # Remove vectors which are not present in source stoi self.src_vectors = np.zeros((len(self.src_stoi), self.vec_dim)) self.trg_vectors = np.zeros((len(self.trg_stoi), self.vec_dim)) # for w, i in self.src_stoi.items(): # self.src_vectors[i] = self.get_w2v(w) # for w, i in self.trg_stoi.items(): # self.trg_vectors[i] = self.get_w2v(w) # self.itos = {v: k for k, v in self.stoi.items()} self.itoe = {v: k for k, v in self.etoi.items()} self.er_dict, self.global_ent, self.eo_dict, self.e_o_1hop, self.e_r_l = self.get_kg( data_path + 'KG/', dat=domain) # Maximum graph input feature # self.max_er_vec = [] # max er vector combination size # for dat in self.train_dataset: # self.max_er_vec.append(sum(len(v) for k, v in dat['kgER'].items())) # self.max_out_reln = np.max(self.max_er_vec) # Data Statistics # self.n_words = len(self.stoi) self.n_train = len(self.train_dataset) self.n_val = len(self.val_dataset) self.n_test = len(self.test_dataset)
existing_f = open("existing_full.txt", "r")
emerging_f = open("emerging_full.txt", "r")

sentences = []
for l in existing_f.readlines():
    sentences.append(l.split(" "))
for l in emerging_f.readlines():
    sentences.append(l.split(" "))

# Disabled alternative, kept as a string literal in the original script:
"""
ko_txt = open("ko.txt", "r")
sentences = []
for l in ko_txt.readlines():
    sentences.append(l.split(" "))

model = load_facebook_model("./ko.bin")
print(model.wv.most_similar("립스틱"))  # "립스틱" means "lipstick"

sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'semi-groups']]
model.build_vocab(sentences, update=True)
model.train(sentences=sentences, total_examples=len(sentences), epochs=100)
print(model.wv.most_similar("립스틱"))
model.save("./ko_aihub_100.bin")
"""

# ko_model = gensim.models.Word2Vec.load('./ko.vec')
# model = KeyedVectors.load_word2vec_format("./ko.bin", binary=True)
# print(model.wv.most_similar("강아지"))  # "강아지" means "puppy"
ko_model = Word2Vec(size=200, min_count=54)