def update_fasttext():
    model = FastText.load_fasttext_format('/data/dataset/wiki.zh')
    # Note: because this is the official pre-trained fastText model,
    # model.corpus_count is 0, so a corpus count has to be set manually
    # before continuing training.
    model.corpus_count = 1
    model.train([['今天', '天气', '很好', '。']], total_examples=model.corpus_count, epochs=model.epochs)
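# The snippet above continues training a natively loaded fastText model. A minimal,
# hedged sketch of the fuller continued-training pattern; the path and the example
# sentence are placeholders, not taken from the original, and behaviour assumes gensim 3.x.
from gensim.models import FastText

model = FastText.load_fasttext_format('/data/dataset/wiki.zh')  # hypothetical path
new_sentences = [['今天', '天气', '很好', '。']]

# Fold any new tokens into the existing vocabulary, then train with an explicit
# example count, since corpus_count is 0 after load_fasttext_format.
model.build_vocab(new_sentences, update=True)
model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)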
def test_load_model_with_non_ascii_vocab(self):
    model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
    self.assertTrue(u'který' in model)
    try:
        model[u'který']
    except UnicodeDecodeError:
        self.fail('Unable to access vector for utf8 encoded non-ascii word')
def _load_gensim_format_embeddings(self):
    if not os.path.exists(self.word_embedding_file):
        raise Exception("{} is not found!".format(self.word_embedding_file))
    if self.word_embedding_mode.lower() == "fasttext":
        if self.word_embedding_file.endswith(".model"):
            model = FastText.load(self.word_embedding_file)
        else:
            model = FastText.load_fasttext_format(self.word_embedding_file)
    elif self.word_embedding_file.endswith(".bin"):
        model = KeyedVectors.load_word2vec_format(self.word_embedding_file, binary=True)
    else:
        model = Word2Vec.load(self.word_embedding_file)
    embedding_size = model["and"].size
    unknown_vec = np.random.uniform(-0.25, 0.25, embedding_size)
    embeddings = [unknown_vec] * (self.n_words())
    embeddings[0] = np.zeros(embedding_size)
    for word in self.word2index:
        try:
            embeddings[self.word2index[word]] = model[word]
        except KeyError:
            # self.word2index[word] = self.word2index[self.UNK_TOKEN]
            pass
    self.word_embedding_size = len(embeddings[0])
    embeddings = np.array(embeddings, dtype=np.float32)
    return embeddings
def create_and_train_nn_prediction_from_file(
        fasttext: str, data: str, dump: str = None,
        num_neurons: int = DEFAULT_NUM_NEURONS, batch_size: int = DEFAULT_BATCH_SIZE,
        lr: float = DEFAULT_LR, decay: float = DEFAULT_DECAY,
        num_epochs: int = DEFAULT_NUM_EPOCHS) -> keras.models.Sequential:
    """
    Train NN model for correction embedding prediction from files.

    :param fasttext: Path to the binary dump of a FastText model.
    :param data: Path to a CSV dump of pandas.DataFrame containing columns \
                 [Columns.CorrectToken, Columns.Token].
    :param dump: Path to the file where to dump the trained NN model.
    :param num_neurons: Number of neurons in each hidden layer.
    :param batch_size: Batch size for training.
    :param lr: Learning rate.
    :param decay: Learning rate exponential decay per epoch.
    :param num_epochs: Number of training passes over the dataset.
    :return: Trained Keras model.
    """
    fasttext_model = FastText.load_fasttext_format(fasttext)
    df = pandas.read_csv(data, index_col=0).infer_objects()
    return create_and_train_nn_prediction(fasttext_model, df, dump, num_neurons,
                                          batch_size, lr, decay, num_epochs)
def test_load_model_non_utf8_encoding(self):
    model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
    self.assertTrue(u'který' in model)
    try:
        model[u'který']
    except KeyError:
        self.fail('Unable to access vector for cp-852 word')
def load_fasttext(path):
    model = FastText.load_fasttext_format(path)
    model.init_sims(replace=True)
    return model
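# Usage sketch for the helper above; the path and query word are assumptions, not
# values from the original. init_sims(replace=True) L2-normalises the vectors in
# place, which saves memory but discards the original un-normalised vectors.
model = load_fasttext('/path/to/wiki.en.bin')
print(model.wv.most_similar('king', topn=5))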
def _embedding_load_pre_fasttext(pretrained_path):
    try:
        model_embedding = FastText.load_fasttext_format(pretrained_path)
        print("[+] FastText Embedding model successfully loaded from {}".format(pretrained_path))
        return model_embedding
    except Exception:
        raise FileNotFoundError("[!] FastText Embedding model couldn't be loaded from {}".format(pretrained_path))
def __init__(self, project, embeddingSize):
    self.ft = FT.load_fasttext_format('./EmbeddingModel/fastText.bin')
    self.embeddingSize = embeddingSize
    try:
        self.embedder = torch.load('./EmbeddingModel/{}-EmbeddingLayer.pt'.format(project))
    except FileNotFoundError:
        self.embedder = nn.Embedding(num_embeddings=len(super().wordSet), embedding_dim=embeddingSize)
        torch.save(self.embedder, './EmbeddingModel/{}-EmbeddingLayer.pt'.format(project))
def load_embeddings(embedding_file):
    '''
    Load embeddings from file.
    Input: path to the embedding file.
    Output: embeddings in a dict-like structure available for look-up,
            and the vocab covered by the embeddings as a set.
    '''
    print('Using embeddings: ', embedding_file)
    if embedding_file.endswith('.txt') or embedding_file.endswith('.vec'):
        w2v = {}
        vocab = []
        try:
            f = open(embedding_file, 'r')
            for line in f:
                values = line.split()
                word = values[0]
                try:
                    float(values[1])
                except ValueError:
                    continue
                coefs = np.asarray(values[1:], dtype='float')
                w2v[word] = coefs
                vocab.append(word)
        except UnicodeDecodeError:
            f = open(embedding_file, 'rb')
            for line in f:
                values = line.split()
                word = values[0]
                try:
                    float(values[1])
                except ValueError:
                    continue
                coefs = np.asarray(values[1:], dtype='float')
                w2v[word] = coefs
                vocab.append(word)
        f.close()
    else:
        try:
            w2v = FT_gensim.load(embedding_file)
            vocab = w2v.wv.vocab.keys()
            print('using FastText gensim...')
        except Exception:
            try:
                w2v = FT_gensim.load_fasttext_format(embedding_file)
                vocab = w2v.wv.vocab.keys()
                print('using gensim Facebook FastText...')
            except Exception:
                w2v, vocab = load_vectors(embedding_file)
                print('using Facebook fastText')
    try:
        print("Done.", len(w2v), " words loaded!")
    except Exception:
        pass
    return w2v, vocab
def __init__(self, train_X, train_Y, test_X, test_Y, model):
    self.model_path = model
    # embedding_index = {}
    # for i, line in enumerate(open('glove.6B/glove.6B.100d.txt')):
    #     values = line.split()
    #     embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
    embedding_index = FastText.load_fasttext_format('cc.id.300.bin')

    # Create tokenizer object
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(train_X)
    word_index = tokenizer.word_index

    # Convert text to padded sequences of tokens and load the previous model
    # if available, skipping training
    self.test_seq_X = sequence.pad_sequences(tokenizer.texts_to_sequences(test_X), maxlen=70)
    if os.path.isfile(self.model_path):
        self.classifier = load_model(self.model_path)
        return

    # Save if no previous model loaded
    self.train_seq_X = sequence.pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=70)
    self.train_Y = train_Y
    self.test_Y = test_Y
    if os.path.isfile(self.model_path):
        self.classifier = load_model(self.model_path)
        return

    # Create word embeddings mapping
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        if word in embedding_index.wv:
            embedding_matrix[i] = embedding_index.wv[word]

    # Create layers
    # Add input layer
    input_layer = layers.Input((70, ))

    # Add the word embedding layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300,
                                       weights=[embedding_matrix],
                                       trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add LSTM layer
    lstm_layer = layers.LSTM(self.hidden_state)(embedding_layer)

    # Output layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    self.classifier = model
    logging.info("LSTM model created")
def load_word_embeddings_bin(filename, algorithm='fasttext'):
    print('loading model...')
    global wv_model
    if algorithm == 'fasttext':
        wv_model = FastText.load_fasttext_format(filename)
    elif algorithm == 'word2vec':
        wv_model = KeyedVectors.load_word2vec_format(filename, encoding='utf8', binary=True)
    print('Done!')
    return wv_model
def __init__(self, f_emb: str, f_db):
    super().__init__()

    log.info('loading embeddings')
    self._emb = FastText.load_fasttext_format(f_emb)

    log.info('loading ungol index')
    with open(f_db, 'rb') as fd:
        self.db = pickle.load(fd)
def load_fasttext_model(path):
    try:
        model = FastText.load(path).wv
    except Exception:
        try:
            model = FastText.load_fasttext_format(path).wv
        except Exception:
            model = gensim.models.KeyedVectors.load(path)
    return model
def embedding_yukle_fasttext(dizin):
    try:
        model_embedding = FastText.load_fasttext_format(dizin)
        print("[+] Pretrained embedding model loaded from {}.".format(dizin))
        return model_embedding
    except Exception:
        raise FileNotFoundError(
            "[!] Pretrained embedding model could not be loaded from {}.".format(dizin))
def import_pretrained_fasttext(in_file, out_file, mmap='r'):
    try:
        return GensimKeyedVectors.load(out_file, mmap=mmap)
    except IOError:
        pass
    ft = FastText.load_fasttext_format(in_file)
    ft.init_sims(replace=True)
    model = GensimKeyedVectors(ft.wv)
    model.save(out_file)
    return GensimKeyedVectors.load(out_file, mmap=mmap)
def load_embedding(embedding_path, embedding_dim, format, file_type, with_head=False, word_set=None):
    """
    Args:
        format: 'glove', 'word2vec' or 'fasttext'
        file_type: 'text' or 'binary'
    """
    embedding_dict = dict()

    if format == 'word2vec' or format == 'fasttext':
        if file_type == 'text':
            vector_total = KeyedVectors.load_word2vec_format(
                embedding_path, binary=False, unicode_errors='ignore')
        else:
            if format == 'word2vec':
                vector_total = KeyedVectors.load_word2vec_format(
                    embedding_path, binary=True, unicode_errors='ignore')
            elif format == 'fasttext':
                vector_total = FastText.load_fasttext_format(embedding_path, encoding='utf8')
        assert vector_total.vector_size == embedding_dim
        if word_set is None:
            embedding_dict = vector_total
        else:
            if not (format == 'fasttext' and file_type == 'binary'):
                # vector_total.index2word is the word list
                word_total = vector_total.index2word
            else:
                word_total = vector_total.wv.index2word
            for word in word_total:
                if word in word_set:
                    embedding_dict[word] = vector_total[word]
    elif format == 'glove':
        with codecs.open(embedding_path, 'r', encoding='utf-8') as fin:
            if with_head == True:
                _ = fin.readline()
            for idx, line in enumerate(fin):
                line = line.rstrip()
                if idx == 0 and len(line.split()) == 2:
                    continue
                if len(line) > 0:
                    word, vec = line.split(" ", 1)
                    if (word_set and word in word_set) or (word_set is None):
                        vector = [float(num) for num in vec.split(" ")]
                        assert len(vector) == embedding_dim
                        embedding_dict[word] = vector
    else:
        raise Exception(
            'The supported formats are glove, word2vec and fasttext; %s is not supported.' % format)

    return embedding_dict
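# Minimal usage sketch for the loader above, which dispatches on (format, file_type);
# the file name, dimensionality and vocabulary here are illustrative assumptions,
# not values from the original code.
word_set = {"and", "the", "of"}
embedding_dict = load_embedding("cc.en.300.bin", embedding_dim=300,
                                format="fasttext", file_type="binary",
                                word_set=word_set)
print(len(embedding_dict), "vectors kept")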
def create_fasttext_embeddings(words, embeddings_path):
    print("Loading fasttext model...")
    ff_model = FastText.load_fasttext_format(embeddings_path)
    fasttext_word_dict = dict()
    for w in words:
        try:
            fasttext_word_dict[w] = ff_model[w]
        except KeyError:
            # no vocabulary entry and no usable character n-grams for this word
            pass
    print("total words not found in Fasttext:", len(words) - len(fasttext_word_dict))
    return fasttext_word_dict
def make_pretrained_embeddings(word_index):
    model = FastText.load_fasttext_format('/models/wiki.ru')

    # create embedding_matrix: "index of word - vector"
    embedding_matrix = np.zeros((len(word_index), EMBEDDING_DIM))
    for lem, i in word_index.items():
        if lem in model:
            embedding_matrix[i] = model[lem]

    # write embedding_matrix to file in the output directory of Floyd
    np.save('/output/embedding_matrix_lemma.npy', embedding_matrix)
    return embedding_matrix
def test_load_fasttext_new_format(self):
    try:
        new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
    except Exception as exc:
        self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
    vocab_size, model_size = 1763, 10
    self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size))
    self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size)
    self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size))

    expected_vec = [
        -0.025627, -0.11448, 0.18116, -0.96779, 0.2532, -0.93224,
        0.3929, 0.12679, -0.19685, -0.13179
    ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
    self.assertTrue(np.allclose(new_model["hundred"], expected_vec, atol=1e-4))

    # vector for oov words are slightly different from original FastText due to discarding unused ngrams
    # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
    expected_vec_oov = [
        -0.53378, -0.19, 0.013482, -0.86767, -0.21684, -0.89928,
        0.45124, 0.18025, -0.14128, 0.22508
    ]
    self.assertTrue(np.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4))

    self.assertEqual(new_model.min_count, 5)
    self.assertEqual(new_model.window, 5)
    self.assertEqual(new_model.iter, 5)
    self.assertEqual(new_model.negative, 5)
    self.assertEqual(new_model.sample, 0.0001)
    self.assertEqual(new_model.bucket, 1000)
    self.assertEqual(new_model.wv.max_n, 6)
    self.assertEqual(new_model.wv.min_n, 3)
    self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size))
    self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, new_model.vector_size))
def test_load_fasttext_format(self):
    try:
        model = FT_gensim.load_fasttext_format(self.test_model_file)
    except Exception as exc:
        self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
    vocab_size, model_size = 1762, 10
    self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size))
    self.assertEqual(len(model.wv.vocab), vocab_size, model_size)
    self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size))

    expected_vec = [
        -0.57144, -0.0085561, 0.15748, -0.67855, -0.25459, -0.58077,
        -0.09913, 1.1447, 0.23418, 0.060007
    ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
    self.assertTrue(np.allclose(model["hundred"], expected_vec, atol=1e-4))

    # vector for oov words are slightly different from original FastText due to discarding unused ngrams
    # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
    expected_vec_oov = [
        -0.23825, -0.58482, -0.22276, -0.41215, 0.91015, -1.6786,
        -0.26724, 0.58818, 0.57828, 0.75801
    ]
    self.assertTrue(np.allclose(model["rejection"], expected_vec_oov, atol=1e-4))

    self.assertEqual(model.min_count, 5)
    self.assertEqual(model.window, 5)
    self.assertEqual(model.iter, 5)
    self.assertEqual(model.negative, 5)
    self.assertEqual(model.sample, 0.0001)
    self.assertEqual(model.bucket, 1000)
    self.assertEqual(model.wv.max_n, 6)
    self.assertEqual(model.wv.min_n, 3)
    self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size))
    self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size))
def test_load_fasttext_new_format(self):
    try:
        new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
    except Exception as exc:
        self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
    vocab_size, model_size = 1763, 10
    self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size))
    self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size)
    self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size))

    expected_vec = [
        -0.025627, -0.11448, 0.18116, -0.96779, 0.2532, -0.93224,
        0.3929, 0.12679, -0.19685, -0.13179
    ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
    self.assertTrue(np.allclose(new_model["hundred"], expected_vec, atol=1e-4))

    # vector for oov words are slightly different from original FastText due to discarding unused ngrams
    # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
    expected_vec_oov = [
        -0.49111, -0.13122, -0.02109, -0.88769, -0.20105, -0.91732,
        0.47243, 0.19708, -0.17856, 0.19815
    ]
    self.assertTrue(np.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4))

    self.assertEqual(new_model.min_count, 5)
    self.assertEqual(new_model.window, 5)
    self.assertEqual(new_model.iter, 5)
    self.assertEqual(new_model.negative, 5)
    self.assertEqual(new_model.sample, 0.0001)
    self.assertEqual(new_model.bucket, 1000)
    self.assertEqual(new_model.wv.max_n, 6)
    self.assertEqual(new_model.wv.min_n, 3)
    self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size))
    self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, new_model.vector_size))
def test_load_fasttext_format(self):
    try:
        model = FT_gensim.load_fasttext_format(self.test_model_file)
    except Exception as exc:
        self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
    vocab_size, model_size = 1762, 10
    self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size))
    self.assertEqual(len(model.wv.vocab), vocab_size, model_size)
    self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size))

    expected_vec = [
        -0.57144, -0.0085561, 0.15748, -0.67855, -0.25459, -0.58077,
        -0.09913, 1.1447, 0.23418, 0.060007
    ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
    self.assertTrue(np.allclose(model["hundred"], expected_vec, atol=1e-4))

    # vector for oov words are slightly different from original FastText due to discarding unused ngrams
    # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
    expected_vec_oov = [
        -0.21929, -0.53778, -0.22463, -0.41735, 0.71737, -1.59758,
        -0.24833, 0.62028, 0.53203, 0.77568
    ]
    self.assertTrue(np.allclose(model["rejection"], expected_vec_oov, atol=1e-4))

    self.assertEqual(model.min_count, 5)
    self.assertEqual(model.window, 5)
    self.assertEqual(model.iter, 5)
    self.assertEqual(model.negative, 5)
    self.assertEqual(model.sample, 0.0001)
    self.assertEqual(model.bucket, 1000)
    self.assertEqual(model.wv.max_n, 6)
    self.assertEqual(model.wv.min_n, 3)
    self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size))
    self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size))
def get_pretrained_embeddings(filename, vocab):
    def get_vectors(filename):
        objects = dict()
        with open(filename, "rb") as openfile:
            while True:
                try:
                    objects = pickle.load(openfile)
                except EOFError:
                    break
        return objects

    # vectors = embeddings.load_from_dir(filename)
    model = FastText.load_fasttext_format(filename)
    size = model.vector_size
    embs_matrix = np.random.rand(len(vocab), size)
    for i, token in enumerate(vocab.token2id):
        if token in model:
            embs_matrix[i] = model[token]
    return torch.FloatTensor(embs_matrix)
def train():
    print('Loading fasttext...')
    fasttext = FastText.load_fasttext_format(path_fasttext)
    fasttext_dict = {}
    for word in tqdm(fasttext.wv.vocab):
        fasttext_dict[word] = fasttext[word]
    del fasttext

    print('Counting input...')
    count_lines = 0
    with open(path_news_shuffled, 'r') as in_news:
        for _ in tqdm(in_news):
            count_lines += 1
    train_size = int(count_lines * .8)
    test_size = int(count_lines * .1)
    val_size = count_lines - (int(count_lines * 0.8) + int(count_lines * 0.1))
    print('Train size:', train_size, '; test size:', test_size, '; val size:', val_size)

    print('Training...')
    with tf.device('/gpu:0'):
        cnn_model = bilstm_model(units=[128, 64, 32], hidden_dims=18)
        checkpoint = ModelCheckpoint(
            path_data + 'bilstm_all_1_weights.{epoch:03d}-{val_acc:.4f}.hdf5',
            monitor='val_acc', verbose=1, mode='auto')
        cnn_model.fit_generator(
            embedded_news_generator_all(path_news_train_all, batch_size, fasttext_dict, max_words),
            steps_per_epoch=train_size // batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=embedded_news_generator_all(path_news_val_all, batch_size, fasttext_dict, max_words),
            validation_steps=val_size // batch_size,
            callbacks=[checkpoint])
def _load_subword_embeddings(self):
    if not os.path.exists(self.word_embedding_file):
        raise Exception("{} is not found!".format(self.word_embedding_file))
    model = FastText.load_fasttext_format(self.word_embedding_file)
    embedding_size = model["and"].size
    unknown_vec = np.random.uniform(-0.25, 0.25, embedding_size)
    embeddings = [unknown_vec] * (self.n_words())
    embeddings[0] = np.zeros(embedding_size)
    for word in self.word2index:
        try:
            embeddings[self.word2index[word]] = model[word]
        except KeyError:
            # self.word2index[word] = self.word2index[self.UNK_TOKEN]
            pass
    self.word_embedding_size = len(embeddings[0])
    embeddings = np.array(embeddings, dtype=np.float32)
    return embeddings
def load_model(self):
    files, dirs = get_dir(self.sourceDict["smallPath"])
    fileToLoad = [file for file in files if ".bin" in file]
    if len(fileToLoad) > 0:
        try:
            print("Loading with FastText.load")
            model = FastText.load(fileToLoad[0])
            print(fileToLoad[0] + " was loaded.")
            return model
        except Exception:
            try:
                print("Loading with FastText.load_fasttext_format")
                model = FastText.load_fasttext_format(fileToLoad[0])
                print(fileToLoad[0] + " was loaded.")
                return model
            except Exception:
                print("Unable to load " + fileToLoad[0] + " file. Please retrain or redownload the weights.")
    else:
        print("No model found. Please download pretrained weights or train custom ones.")
    return
def load_pretrained_fasttext():
    return FastText.load_fasttext_format("../../../corpus/analyzed/saved_models/wiki.si.bin")
def load_homemade_fasttext():
    # return FastText.load_fasttext_format("../../../corpus/analyzed/saved_models/fasttext_model_skipgram_300.bin")
    # return FastText.load_fasttext_format("../../../corpus/analyzed/saved_models/fasttext_model_skipgram_remove300_5.bin")
    return FastText.load_fasttext_format("../../../corpus/analyzed/saved_models/xxx.bin")
from gensim.models.fasttext import FastText as ft

MODEL = "../model/cc.fr.300.bin"
MODEL2 = "../model/fr.bin"
WORDS = ["Ah non, c'est de la merde", 'Trop biiiieeenn', 'Excellent putain !']
tokens = ['enedis', 'energie', 'heureux', 'machine', 'propre', 'merde']

# classifier = load_model(MODEL)
model = ft.load_fasttext_format(MODEL2)
oov_vector = model[tokens]
print(oov_vector)
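# The lookup just above works even for tokens missing from the vocabulary, because
# gensim's FastText composes out-of-vocabulary vectors from character n-grams.
# A small illustrative check; the word below is an assumption, not from the original script.
word = 'electricite'
print(word in model.wv.vocab)  # True only if the exact word is in the vocabulary
print(word in model)           # True whenever a vector can be produced, including via n-grams
print(model[word].shape)       # vector assembled from n-grams for an OOV word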
def __init__(self, language='en'):
    if language == 'en':
        super().__init__(FastText.load_fasttext_format(config.pretrained_fasttext_path))
    elif language == 'de':
        super().__init__(FastText.load_fasttext_format(config.pretrained_fasttext_de_path))
    print("the fasttext model loaded")
def write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    wg_path = '/home/adam/Documents/Magisteruppsats_VT18/ddata/word_embeddings/corpora/w2v_newsa_min1'
    wsm = gs.models.Word2Vec.load(wg_path)

    cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/cc.sv.300.bin'
    csm = FastText.load_fasttext_format(cg_path)
    # cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/saldo_embeddings_window5_skipgram_negsampling_fasttext'
    # csm = FastText.load(cg_path)

    epit = epitran.Epitran('swe-Latn')

    col_names = [
        'sw1_charemb_score', 'sw2_charemb_score', 'blend_charemb_score',
        'sw1_sw2_charemb_sim', 'sw1_blend_charemb_sim', 'sw2_blend_charemb_sim',
        'sw1_wordemb_score', 'sw2_wordemb_score', 'blend_wordemb_score',
        'sw1_blend_wordemb_sim', 'sw2_blend_wordemb_sim', 'sw1_sw2_wordemb_sim',
        'splits',
        'sw1_sw2_char_bigramsim', 'sw2_sw1_char_bigramsim',
        'sw1_sw2_char_trigramsim', 'sw2_sw1_char_trigramsim',
        'lcs_sw1_sw2',
        'sw1_blend_IPA_lev_dist', 'sw2_blend_IPA_lev_dist', 'sw1_sw2_IPA_lev_dist',
        'sw1_blend_lev_dist', 'sw2_blend_lev_dist', 'sw1_sw2_lev_dist',
        'sw1_graphemes', 'sw2_graphemes', 'sw1_syllables', 'sw2_syllables',
        'sw1_len', 'sw2_len', 'sw1_contrib', 'sw2_contrib', 'sw1_sw2_removal',
        'sw1_aff_c', 'sw1_N_c', 'sw2_aff_c', 'sw2_N_c',
        'sp1', 'sp2', 'sp3',
        'LABEL', 'BLEND', 'CW1', 'CW2', 'CW1_split', 'CW2_split'
    ]

    csvf = open('overlap_splitp_040918.csv'.format(lexicon), '+w', newline='')
    csvw = csv.DictWriter(csvf, delimiter=',', fieldnames=col_names)

    T, F = 0, 0

    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'
    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)

    # overlap
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    # noverlap
    # candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blends_candidates_noverlap_1/'

    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        print('#', i, 'reading', blend)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                sw1, sw2 = gold_blends[blend]
                # print('### blend:', blend, 'gold:', (sw1, sw2), 'sample:', (cw1, cw2))
                feature_set = extract_sample_features(blend, cw1, cw2, lexicon, corpus,
                                                      sw1, sw2, freqd, wsm, csm, epit)
                for features, label in feature_set:
                    # entry = list(map(lambda x: str(x), features.values()))
                    csvw.writerow(features)

    csvf.close()
def setUp(self):
    ft_home = os.environ.get('FT_HOME', None)
    self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
    self.test_model_file = datapath('lee_fasttext')
    self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
    self.test_new_model_file = datapath('lee_fasttext_new')
def test_load_model_supervised(self):
    with self.assertRaises(NotImplementedError):
        FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext'))
import random

import jieba
import visdom
from tqdm import tqdm
from gensim.models.fasttext import FastText
from torch.autograd import Variable
from torch import nn
from pytorch_pretrained_bert import BertAdam
from pytorch_pretrained_bert.optimization import WarmupHalfLinearSchedule, WarmupCosineWithWarmupRestartsSchedule

import modeling  # project-local module providing BertNoEmbed
from loss import FocalLoss

random.seed(0)
vis = visdom.Visdom()
EPOCH = 2000

jieba.load_userdict('bert-model/dict-traditional.txt')
jieba.suggest_freq('<newline>', True)

# Load vocabularies
word2vec = FastText.load_fasttext_format('bert-model/wordvec-large.dim1024')
vocab = {}
idx2vocab = {}
vec = []
with open('bert-model/TF.csv') as TF:
    print('Building word vectors...')
    for idx, line in enumerate(tqdm(TF)):
        term = line.split(',')[0]
        vocab[term] = idx
        idx2vocab[idx] = term
        vec.append(word2vec[term])
del word2vec

# BERT Model
model = modeling.BertNoEmbed(vocab=vocab, hidden_size=1024, enc_num_layer=3)