def train_model(book_name):
    print(book_name + ' fasttext model train start')
    # extract the file name (without extension) to use as the model name
    model_name = book_name.split('.')
    # train and save the model
    fasttext.skipgram(book_name, model_name[0] + "_fasttext_model")
    print(book_name + " fasttext model saved")
def create_word_vectors(input_file, output_dir, dim, word_ngram):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    # assumes output_dir ends with '/' so [-2] is the directory name
    filename = output_dir.split('/')[-2]
    output_file = os.path.join(output_dir, filename)
    fasttext.skipgram(input_file, output_file, dim=dim, word_ngrams=word_ngram)
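# A minimal usage sketch for create_word_vectors above (paths hypothetical).
# Note that output_dir must end with a '/' for split('/')[-2] to pick out the
# directory name, and the parent directory must already exist for os.mkdir.
create_word_vectors('corpus.txt', 'vectors/my_model/', dim=100, word_ngram=1)
# -> trains and writes vectors/my_model/my_model.bin and .vec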
def fasttext_model(doc, model):
    fasttext.skipgram(doc, model, dim=param.dim, ws=param.ws,
                      min_count=param.min_count, t=param.t, thread=param.thread)
def train_embedding_fasttext():
    """Generate character-level and word-level word embeddings with fastText."""
    # Skipgram model
    logging.info('generating CHAR embedding %s with fasttext using %s algorithm',
                 'char2vec_fastskip256', 'Skipgram')
    model = fasttext.skipgram(CHAR_LEVEL_CORPUS,
                              os.path.join(MODEL_DIR, 'char2vec_fastskip256'),
                              word_ngrams=2, ws=5, min_count=10, dim=256)
    del model

    # CBOW model
    logging.info('generating CHAR embedding %s with fasttext using %s algorithm',
                 'char2vec_fastcbow256', 'CBOW')
    model = fasttext.cbow(CHAR_LEVEL_CORPUS,
                          os.path.join(MODEL_DIR, 'char2vec_fastcbow256'),
                          word_ngrams=2, ws=5, min_count=10, dim=256)
    del model

    # Skipgram model
    logging.info('generating WORD embedding %s with fasttext using %s algorithm',
                 'word2vec_fastskip256', 'Skipgram')
    model = fasttext.skipgram(WORD_LEVEL_CORPUS,
                              os.path.join(MODEL_DIR, 'word2vec_fastskip256'),
                              word_ngrams=2, ws=5, min_count=10, dim=256)
    del model

    # CBOW model
    logging.info('generating WORD embedding %s with fasttext using %s algorithm',
                 'word2vec_fastcbow256', 'CBOW')
    model = fasttext.cbow(WORD_LEVEL_CORPUS,
                          os.path.join(MODEL_DIR, 'word2vec_fastcbow256'),
                          word_ngrams=2, ws=5, min_count=10, dim=256)
    del model
def generate_model(self, file_name='doc.txt', model_name='model', load=True):
    """Generate a model from the given document and model names.

    Args:
        file_name : String, morphologically analyzed document data
        model_name: String, name of the file produced by skip-gram training
        load      : Boolean, whether to load the model after it is generated
    """
    ft.skipgram(file_name, model_name, **self.__params_fasttext)
    if load:
        self.model = ft.load_model(model_name + '.bin')
        self.df_word = self.model2df()
def train_w2v(train_file, model_file):
    # doc: https://pypi.org/project/fasttext/
    model = fasttext.skipgram(train_file, model_file,
                              lr=0.01, dim=256,
                              min_count=1, thread=30,
                              t=1e-4, ws=5, neg=5,
                              epoch=10, silent=False)
def train(self, txt_path, config=DEF_CONFIG):
    if self.mode == "skipgram":
        self.model = fasttext.skipgram(txt_path, self.model_path, **config)
    elif self.mode == "cbow":
        self.model = fasttext.cbow(txt_path, self.model_path, **config)
    elif self.mode == "supervised":
        self.model = fasttext.supervised(txt_path, self.model_path, **config)
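# A hedged usage sketch for the train() method above, assuming a hypothetical
# enclosing class (called FastTextTrainer here) that sets self.mode and
# self.model_path in its constructor and defines DEF_CONFIG.
trainer = FastTextTrainer(mode="skipgram", model_path="models/ft")
trainer.train("corpus.txt", config={"dim": 100, "epoch": 5})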
def word_embedding(fname='filtered_tweets.txt'):
    vec_dim = 10
    maximum = 24
    tokenizer = RegexpTokenizer(r'\w+')
    with open(fname) as f:
        tweets = f.readlines()
    m = len(tweets)
    for i in range(m):
        tweets[i] = tokenizer.tokenize(tweets[i])
    model = fasttext.skipgram(fname, 'model', dim=vec_dim)
    for i in range(m):
        print('\n\ntweet #' + str(i))
        tweets[i] = np.array([model[w] for w in tweets[i]])
        if len(tweets[i]) == 0:
            tweets[i] = np.zeros([maximum, vec_dim])
        elif len(tweets[i]) < maximum:
            padlen = maximum - len(tweets[i])
            padding = np.zeros([padlen, vec_dim])
            tweets[i] = np.concatenate((np.array(tweets[i]), padding), axis=0)
        else:
            # truncate overly long tweets so every vector has the same length
            tweets[i] = tweets[i][:maximum]
        tweets[i] = tweets[i].flatten()
    return tweets
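# A minimal usage sketch for word_embedding above, assuming the default
# 'filtered_tweets.txt' exists with one tweet per line. Each returned entry is
# a flat vector of length maximum * vec_dim = 24 * 10 = 240.
embedded = word_embedding('filtered_tweets.txt')
print(len(embedded), embedded[0].shape)  # e.g. 1000 (240,)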
def train_fasttext(path):
    # https://pypi.python.org/pypi/fasttext
    import fasttext
    model = fasttext.skipgram(
        path, 'embeddings_models/model_fasttext_' + str(TRAINING_SENTENCES),
        dim=300)
def Item2vec(self, fileL):
    file_list = self.Fname(fileL)
    # train the model on the tokenized corpus
    filename_cut = 'news_fasttext_train.txt'
    model = fasttext.skipgram(filename_cut, 'model')
    fout = open('item2vec.txt', 'w+')
    i = 9
    for file in file_list:
        i += 1
        # sum up the vectors of this document's keywords
        vec = []
        filename_key = file + '_keywords.txt'
        fin = open(filename_key, 'r')
        while True:
            line = fin.readline()
            if line:
                keyword = line.strip()
                k_vec = model[keyword]
                if len(vec) == 0:
                    vec = k_vec
                else:
                    vec = list(map(lambda x: x[0] + x[1], zip(vec, k_vec)))
            else:
                break
        fin.close()
        fout.write(str(vec) + '\n')
    fout.close()
def building_word_vector_model(option, sentences, embed_dim, workers, window, y_train):
    """Builds the word vector model.

    Args:
        option = {int} 0 for Word2Vec, 1 for gensim FastText, 2 for Fasttext 2018
        sentences = {list} list of tokenized words
        embed_dim = {int} embedding dimension of the word vectors
        workers = {int} no. of worker threads used to train the model
                  (faster training on multicore machines)
        window = {int} max distance between the current and predicted word
        y_train = training labels

    Returns:
        model = Word2Vec / gensim FastText / Fasttext 2018 model trained on
        the training corpus
    """
    if option == 0:
        print("Training a word2vec model")
        model = Word2Vec(sentences=sentences, size=embed_dim, workers=workers,
                         window=window, iter=10)
        print("Training complete")
    elif option == 1:
        print("Training a Gensim FastText model")
        model = FastText(sentences=sentences, size=embed_dim, workers=workers,
                         window=window, iter=10)
        print("Training complete")
    elif option == 2:
        print("Training a Fasttext model from Facebook Research")
        y_train = ["__label__positive" if i == 1 else "__label__negative"
                   for i in y_train]
        with open("imdb_train.txt", "w") as text_file:
            for i in range(len(sentences)):
                print(sentences[i], y_train[i], file=text_file)
        model = fasttext.skipgram("imdb_train.txt", "model_ft_2018_imdb",
                                  dim=embed_dim)
        print("Training complete")
    return model
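# A hedged usage sketch for building_word_vector_model above; `docs` (tokenized
# sentences) and `labels` (0/1 sentiment labels) are hypothetical stand-ins for
# the training corpus the surrounding code prepares.
model = building_word_vector_model(option=2, sentences=docs, embed_dim=100,
                                   workers=4, window=5, y_train=labels)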
def learn_embeddings(self, inp_path, out_path, emb_epoch=40, emb_lr=0.01,
                     emb_dim=100, encoding_type='utf-8'):
    import fasttext
    fasttext.skipgram(inp_path, out_path, epoch=emb_epoch, lr=emb_lr, dim=emb_dim)
    from gensim.models.wrappers import FastText
    return FastText.load_fasttext_format(out_path + '.bin', encoding=encoding_type)
def execute():
    # Verify that mandatory arguments are present
    if "-i" not in args:
        return "ERROR: No input file was given"
    if "-t" not in args:
        return "ERROR: No model type was given"
    # Extract arguments
    train_file = args[args.index("-i") + 1]
    model_type = args[args.index("-t") + 1]
    # Extract optional arguments
    epoch = get_optional_param('--epoch', 5)
    ngrams = get_optional_param('--ngrams', 1)
    label_prefix = get_optional_param('--label', '__label__')
    # Create temporary file
    tmp, modelname = tempfile.mkstemp()
    # Train the requested model type, writing output to the temporary file.
    # Note: label_prefix only applies to supervised training; the unsupervised
    # skipgram/cbow trainers do not accept it.
    if model_type == "supervised":
        model = fasttext.supervised(train_file, modelname, epoch=epoch,
                                    word_ngrams=ngrams, label_prefix=label_prefix)
    elif model_type == "skipgram":
        model = fasttext.skipgram(train_file, modelname, epoch=epoch,
                                  word_ngrams=ngrams)
    elif model_type == "cbow":
        model = fasttext.cbow(train_file, modelname, epoch=epoch,
                              word_ngrams=ngrams)
    # Return the temporary file name
    return modelname
def create_fast_text_model(folder, merged_spellcheck_path):
    start_time_fasttext = time.time()
    path = './' + str(folder) + '/model'
    model = fasttext.skipgram(merged_spellcheck_path, path)
    print("Time used to create Fasttext model: ", get_time(start_time_fasttext))
    return model
def trainFastText(data_path, embedding_size=300, context_window=5, min_count=5,
                  save_path='outputs/FastText'):
    # Fits a fastText model; the training corpus must be contained in a single file.
    print('Training FastText Model...')
    # train model
    model = fasttext.skipgram(data_path, save_path + '/fasttext',
                              dim=embedding_size, ws=context_window,
                              min_count=min_count)
    # keep only the vocabulary shared with data/words.txt and save the vectors
    with open('data/words.txt', 'r') as f:
        words = f.read().split('\n')[:-1]
    shared_words = list(set(words).intersection(set(model.words)))
    fasttext_dict = {}
    for word in shared_words:
        fasttext_dict[word] = model[word]
    np.save('DSMs/fasttext.npy', fasttext_dict)
def fasttext_lstm():
    dimens = 100
    # ftmodel = ft.supervised('data/trainprocess.txt', 'model/train', label_prefix='__label__')
    # ftmodel = ft.load_model('model/model_sentiment.bin', encoding='utf-8', label_prefix='__label__')
    ftmodel = ft.skipgram('data/trainprocess.txt', 'skip_gram', dim=dimens)
    train_embed = []
    test_embed = []
    # embed every token of each training / test text with the fastText model
    for text in train_data:
        tokens = separate(text)
        embed = []
        for token in tokens:
            vec = ftmodel[token]
            embed.append(vec)
        train_embed.append(embed)
    for text in test_data:
        tokens = separate(text)
        embed = []
        for token in tokens:
            vec = ftmodel[token]
            embed.append(vec)
        test_embed.append(embed)
    train_embed = np.array(train_embed)
    test_embed = np.array(test_embed)
    return train_embed, train_labels, test_embed, test_labels
def pre_train_fasttext():
    with open('data2.txt', 'w', encoding='utf-8') as thefile:
        for i in merge:
            sentence = clean_sent(i)
            thefile.write("%s\n" % sentence)
    # fasttext.skipgram expects a file path, not a file object
    model = fasttext.skipgram('data2.txt', 'model')
    return model
def train_embedding_fasttext():
    # Skipgram model (character level)
    model = fasttext.skipgram(model_dir + 'train_char.txt',
                              model_dir + 'char2vec_fastskip256',
                              word_ngrams=2, ws=5, min_count=10, dim=256)
    del model
    # CBOW model (character level)
    model = fasttext.cbow(model_dir + 'train_char.txt',
                          model_dir + 'char2vec_fastcbow256',
                          word_ngrams=2, ws=5, min_count=10, dim=256)
    del model
    # Skipgram model (word level)
    model = fasttext.skipgram(model_dir + 'train_word.txt',
                              model_dir + 'word2vec_fastskip256',
                              word_ngrams=2, ws=5, min_count=10, dim=256)
    del model
    # CBOW model (word level)
    model = fasttext.cbow(model_dir + 'train_word.txt',
                          model_dir + 'word2vec_fastcbow256',
                          word_ngrams=2, ws=5, min_count=10, dim=256)
    del model
def trainWord2Vector():
    model1 = fasttext.skipgram(
        '/Users/didi/workspace/data/tensorflow/resume_data/resume_train.txt',
        'model', lr=0.01, dim=300)
    # for word in model1.words:
    # print the vector of the word '模型' ("model"); indexing the model itself
    # (not model1.words) returns the word vector
    print(model1['模型'])
def train_model(self, train_txt, vector_len=100):
    """Trains a new fasttext model"""
    print("Creating new fasttext model...")
    output_path = self._ft_dir + "ft_model"
    # Using skipgram model to learn vector representations
    self.model = ft.skipgram(train_txt, output_path, dim=vector_len)
    self._retrieve_word_embeddings()
    print("Done")
def fasttext_model(self, doc, model):
    """Generate an unlabeled corpus file, save it, and train the fastText
    model on it. (Declared as a method, since the hyperparameters are read
    from self.)

    :param doc -> string: data path
    :param model -> string: fasttext model path
    :return:
    """
    with open('./cnews/without.dat', 'w', encoding='utf8') as f:
        data = read_file(doc)
        for d, _ in data:
            f.write(d)
        f.flush()
    fasttext.skipgram('./cnews/without.dat', model, dim=self.dim, ws=self.ws,
                      min_count=self.min_count)
def trainSkipgram(inFile, mdlfile):
    # Skipgram model
    model = fasttext.skipgram(input_file=inFile,
                              output=mdlfile,
                              lr=0.1,
                              dim=200,
                              epoch=10,
                              word_ngrams=3,
                              bucket=5000000)
def trainWordToVec():
    """Train word vectors from Wikipedia text.

    Make sure that hewiki-latest-pages-articles.xml.bz2 has been downloaded to
    the data folder. This function creates the word2vec model.
    :return:
    """
    print("WordToVec model exists: {}".format(os.path.isfile(utils.word2VecFiles + ".bin")))
    from gensim.corpora import WikiCorpus
    # stop if the model has already been created
    if os.path.isfile(utils.word2VecFiles + ".bin"):
        return
    # download from wikipedia
    if not os.path.isfile(utils.wikiTar):
        import urllib.request
        tarLocation = 'https://dumps.wikimedia.org/hewiki/latest/hewiki-latest-pages-articles.xml.bz2'
        wikiConn = urllib.request.urlopen(tarLocation)
        with open(utils.wikiTar, 'wb') as wikiSaver:
            wikiSaver.write(wikiConn.read())
    # parse the dump into plain text, one article per line
    if not os.path.isfile(utils.wikiFull):
        i = 0
        output = open(utils.wikiFull, 'w')
        wiki = WikiCorpus(utils.wikiTar, lemmatize=False, dictionary={})
        for text in wiki.get_texts():
            article = " ".join([t for t in text])
            output.write("{}\n".format(article))
            i += 1
            if i % 500 == 0:
                print("{} items loaded".format(str(i)))
        output.close()
    # train the embeddings
    fasttext.skipgram(utils.wikiFull, utils.word2VecFiles)
    print("step 1 - {0} created".format(utils.word2VecFiles))
def text2vec_fast(data_file, method='cbow', modelname='model', **kargs):
    clean_data(data_file, 'cleaned.txt')
    if method == 'skipgram':
        model = fasttext.skipgram('cleaned.txt', modelname, **kargs)
    else:
        model = fasttext.cbow('cleaned.txt', modelname, **kargs)
    vector_dict, index_dict = extract_word_vectors(modelname + '.vec')
    os.remove('cleaned.txt')
    return vector_dict, index_dict
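# A minimal usage sketch for text2vec_fast above (file name hypothetical);
# extra fasttext options such as dim and epoch are forwarded through **kargs.
vector_dict, index_dict = text2vec_fast('raw_corpus.txt', method='skipgram',
                                        modelname='news_model', dim=100, epoch=5)
print(len(vector_dict), 'word vectors extracted')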
def train(self, questions_path, vector_length=100):
    self.dim = vector_length
    model_name = 'model_' + os.path.split(questions_path)[-1].split('.')[0]
    model_path = os.path.join(self.files_path, model_name)
    self.model = fasttext.skipgram(questions_path, model_path, dim=self.dim,
                                   thread=4)
def fasttext_tfidf():
    dimens = 150
    vectorizer = TfidfVectorizer(analyzer=separate)
    vectorizer = vectorizer.fit(train_data)
    train_tfidf = vectorizer.transform(train_data)
    test_tfidf = vectorizer.transform(test_data)
    vocab = vectorizer.vocabulary_
    print(type(vocab))
    print(test_tfidf.shape)
    ftmodel = ft.skipgram('data/trainprocess.txt', 'skip_gram', dim=dimens)

    train_embed = []
    test_embed = []
    # average the tf-idf-weighted fastText vectors of each document's tokens
    for j in range(len(train_data)):
        tokens = separate(train_data[j])
        embed = [0.0] * dimens
        for token in tokens:
            vec = ftmodel[token]
            multi = train_tfidf[j, vocab[token]] if token in vocab else 1
            for i in range(dimens):
                embed[i] += vec[i] * multi
        for i in range(dimens):
            embed[i] = embed[i] / len(tokens)
        train_embed.append(embed)
    for j in range(len(test_data)):
        tokens = separate(test_data[j])
        embed = [0.0] * dimens
        for token in tokens:
            vec = ftmodel[token]
            multi = test_tfidf[j, vocab[token]] if token in vocab else 1
            for i in range(dimens):
                embed[i] += vec[i] * multi
        for i in range(dimens):
            embed[i] = embed[i] / len(tokens)
        test_embed.append(embed)

    train_embed = np.array(train_embed)
    test_embed = np.array(test_embed)
    print(train_embed.shape)
    print(test_embed.shape)
    return train_embed, train_labels, test_embed, test_labels
def run_model(db_path, model_path, model_type, model_params):
    print('Training FastText model...')
    start_time = clock()
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    if not os.path.isdir(model_path):
        os.makedirs(model_path)
    # dump the preprocessed articles to a temporary training file
    temp_fpath = os.path.join(model_path, '__ft_temp.txt')
    get_text_command = 'SELECT preprocessed FROM rss_data ORDER BY cachedate DESC'
    results = cursor.execute(get_text_command)
    with open(temp_fpath, 'w', newline='', encoding='utf-8') as tmpfile:
        for r in results:
            if r[0] is not None:
                tmpfile.write(r[0] + '.\n')
    conn.close()
    model_fpath = os.path.join(model_path, 'fasttext_model')
    model = None
    p_ngrams = model_params.get('ngram', 1)
    p_dim = model_params.get('dim', 100)
    p_ws = model_params.get('ws', 5)
    p_epoch = model_params.get('epoch', 5)
    p_loss = model_params.get('loss', 'ns')
    p_min_count = model_params.get('min_count', 5)
    p_silent = model_params.get('silent', 1)
    if model_type == 'cbow':
        model = fasttext.cbow(temp_fpath, model_fpath, word_ngrams=p_ngrams,
                              dim=p_dim, ws=p_ws, epoch=p_epoch, loss=p_loss,
                              silent=p_silent, min_count=p_min_count)
    else:
        model = fasttext.skipgram(temp_fpath, model_fpath, word_ngrams=p_ngrams,
                                  dim=p_dim, ws=p_ws, epoch=p_epoch, loss=p_loss,
                                  silent=p_silent, min_count=p_min_count)
    os.remove(temp_fpath)
    end_time = clock()
    print('Model trained in {} seconds.'.format(int(end_time - start_time)))
    return model
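# A hedged usage sketch for run_model above; the paths are hypothetical, and
# model_params only needs the keys the function actually reads (ngram, dim,
# ws, epoch, loss, min_count, silent) -- anything omitted falls back to the
# defaults shown in the function body.
params = {'ngram': 2, 'dim': 200, 'ws': 5, 'epoch': 10}
model = run_model('data/rss.db', 'models/fasttext', 'skipgram', params)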
def train(inp="wiki2.he.text", out_model="wiki.he.fasttext.model", alg="CBOW"):
    start = time.time()
    if alg == "skipgram":
        # Skipgram model
        model = fasttext.skipgram(inp, out_model)
    else:
        # CBOW model
        model = fasttext.cbow(inp, out_model)
    print(model.words)  # list of words in dictionary
    print(time.time() - start)
    # Note: skipgram/cbow already write <out_model>.bin and <out_model>.vec to
    # disk during training; the returned model object has no save() method.
def train_word_repr(self):
    documents = []
    for xlsx in os.listdir(self.excel_dir):
        print('reading Excel: %s ...' % xlsx)
        df = pd.read_excel(os.path.join(self.excel_dir, xlsx), encoding='GBK',
                           index_col=None)
        df = df.dropna(how='any')
        documents += df['content'].tolist()
    lines = []
    for i in range(len(documents)):
        line = " ".join(jieba.cut(str(documents[i]).strip()))
        lines.append(line)
    # one segmented document per line
    with open('./comments.txt', mode='wt', encoding='utf-8') as f:
        f.write("\n".join(lines))
    fasttext.skipgram('comments.txt', 'fastTextRepr')
def grid_search(dims, lr, train, full_data, test, test_labels, epoch, ngrams, ws):
    """Performs a grid search over the given hyperparameters.

    params:
        dims : list of dimension parameters
        lr   : list of learning rate parameters
        epoch, ngrams, ws : lists of epoch counts, word n-gram sizes and
        context window sizes
    Returns : Best accuracy and best parameters.
    """
    # Initializing best accuracy and best parameters
    best_accuracy = 0.0
    best_params = (0.0, 0.0)
    # Preprocessing the test set.
    actual = [1 if '__label__positive' in t else -1 for t in test_labels]
    # Iterating over parameters
    for grams in ngrams:
        print('ngrams = ', grams)
        for w in ws:
            print('ws = ', w)
            for k in epoch:
                print('epoch = ', str(k))
                for i in dims:
                    print('dim = ', str(i))
                    for j in lr:
                        print('learning rate = ', str(j))
                        # writing files
                        write_to_file(train, 'train.txt')
                        write_to_file(full_data, 'data.txt')
                        # building the model.
                        model = fasttext.skipgram('data.txt', 'model', dim=i,
                                                  lr=j, epoch=k,
                                                  word_ngrams=grams, ws=w)
                        classifier = fasttext.supervised(
                            'train.txt', 'model', label_prefix='__label__')
                        labels = classifier.predict(test)
                        pred = [1 if t == ['positive'] else -1 for t in labels]
                        # Computing accuracy.
                        accuracy = 1 - np.mean(
                            np.abs(np.array(pred) - np.array(actual)) / 2)
                        print('accuracy = ' + str(accuracy))
                        if accuracy > best_accuracy:
                            best_accuracy = accuracy
                            best_params = (i, j, k, w, grams)
    return best_accuracy, best_params
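# A rough usage sketch for grid_search above; train_set/full_data/test_set/
# test_labels are hypothetical stand-ins for the splits the surrounding code
# prepares. best_params is returned as (dim, lr, epoch, ws, ngrams).
best_acc, best_params = grid_search(dims=[50, 100], lr=[0.05, 0.1],
                                    train=train_set, full_data=full_data,
                                    test=test_set, test_labels=test_labels,
                                    epoch=[5], ngrams=[1, 2], ws=[5])
print('best accuracy:', best_acc, '(dim, lr, epoch, ws, ngrams):', best_params)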
def test_train_skipgram_model(self):
    # set params
    lr = 0.005
    dim = 10
    ws = 5
    epoch = 5
    min_count = 1
    neg = 5
    word_ngrams = 1
    loss = 'ns'
    bucket = 2000000
    minn = 3
    maxn = 6
    thread = 4
    lr_update_rate = 10000
    t = 1e-4
    silent = 1

    # train skipgram model
    model = ft.skipgram(input_file, output, lr, dim, ws, epoch, min_count,
                        neg, word_ngrams, loss, bucket, minn, maxn, thread,
                        lr_update_rate, t, silent)

    # Make sure the model is generated correctly
    self.assertEqual(model.dim, dim)
    self.assertEqual(model.ws, ws)
    self.assertEqual(model.epoch, epoch)
    self.assertEqual(model.min_count, min_count)
    self.assertEqual(model.neg, neg)
    self.assertEqual(model.loss_name, loss)
    self.assertEqual(model.bucket, bucket)
    self.assertEqual(model.minn, minn)
    self.assertEqual(model.maxn, maxn)
    self.assertEqual(model.lr_update_rate, lr_update_rate)
    self.assertEqual(model.t, t)

    # Make sure .bin and .vec are generated
    self.assertTrue(path.isfile(output + '.bin'))
    self.assertTrue(path.isfile(output + '.vec'))

    # Make sure the vectors have the right dimension
    self.assertEqual(len(model['the']), dim)

    # Make sure we support unicode characters
    unicode_str = 'Καλημέρα'
    self.assertTrue(unicode_str in model.words)
    self.assertTrue(unicode_str in model)
    self.assertEqual(len(model[unicode_str]), model.dim)
def test_train_skipgram_model_default(self):
    default_args = default_params.read_file(params_txt)
    model = ft.skipgram(input_file, output)

    # Make sure the default params of skipgram are equal
    # to the fasttext(1) default params
    self.assertEqual(model.model_name, 'skipgram')
    self.assertEqual(model.dim, int(default_args['dim']))
    self.assertEqual(model.ws, int(default_args['ws']))
    self.assertEqual(model.epoch, int(default_args['epoch']))
    self.assertEqual(model.min_count, int(default_args['minCount']))
    self.assertEqual(model.neg, int(default_args['neg']))
    self.assertEqual(model.word_ngrams, int(default_args['wordNgrams']))
    self.assertEqual(model.loss_name, default_args['loss'])
    self.assertEqual(model.bucket, int(default_args['bucket']))
    self.assertEqual(model.minn, int(default_args['minn']))
    self.assertEqual(model.maxn, int(default_args['maxn']))
    self.assertEqual(model.lr_update_rate, float(default_args['lrUpdateRate']))
    self.assertEqual(model.t, float(default_args['t']))
def main():
    data_path = '/Users/ruizhang/Documents/NLP_dataset/'

    # Load train set
    train_file = data_path + 'dbpedia_csv/train.csv'
    df = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'dbpedia_csv/test.csv'
    df_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Mapping from class number to class name
    class_dict = {
        1: 'Company',
        2: 'EducationalInstitution',
        3: 'Artist',
        4: 'Athlete',
        5: 'OfficeHolder',
        6: 'MeanOfTransportation',
        7: 'Building',
        8: 'NaturalPlace',
        9: 'Village',
        10: 'Animal',
        11: 'Plant',
        12: 'Album',
        13: 'Film',
        14: 'WrittenWork',
    }
    df['class_name'] = df['class'].map(class_dict)
    df.head()

    # summary statistics per class
    desc = df.groupby('class')
    desc.describe().transpose()

    # Transform datasets
    df_train_clean = clean_dataset(df, True, False)
    df_test_clean = clean_dataset(df_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'dbpedia.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False,
                          columns=['class', 'name', 'description'])
    test_file_clean = data_path + 'dbpedia.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False,
                         columns=['class', 'name', 'description'])

    # Train a classifier
    output_file = data_path + 'dp_model'
    classifier = fasttext.supervised(train_file_clean, output_file,
                                     label_prefix='__label__')
    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    sentence1 = ['Picasso was a famous painter born in Malaga, Spain. He revolutionized the art in the 20th century.']
    labels1 = classifier.predict(sentence1)
    class1 = int(labels1[0][0])
    print("Sentence: ", sentence1[0])
    print("Label: %d; label name: %s" % (class1, class_dict[class1]))

    sentence2 = ['One of my favourite tennis players in the world is Rafa Nadal.']
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class2 as string
    print("Sentence: ", sentence2[0])
    print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    sentence3 = ['Say what one more time, I dare you, I double-dare you m**********r!']
    number_responses = 3
    labels3 = classifier.predict_proba(sentence3, k=number_responses)
    print("Sentence: ", sentence3[0])
    for l in range(number_responses):
        class3, prob3 = labels3[0][l]
        print("Label: %s; label name: %s; certainty: %f" % (class3, class_dict[int(class3)], prob3))

    # Load train set
    train_file = data_path + 'amazon_review_polarity_train.csv'
    df_sentiment_train = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'amazon_review_polarity_test.csv'
    df_sentiment_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Transform datasets
    df_train_clean = clean_dataset(df_sentiment_train, True, False)
    df_test_clean = clean_dataset(df_sentiment_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'amazon.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False,
                          columns=['class', 'name', 'description'])
    test_file_clean = data_path + 'amazon.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False,
                         columns=['class', 'name', 'description'])

    dim = 10
    lr = 0.1
    epoch = 5
    min_count = 1
    word_ngrams = 2
    bucket = 10000000
    thread = 12
    label_prefix = '__label__'

    # Train a classifier
    output_file = data_path + 'amazon_model'
    classifier = fasttext.supervised(train_file_clean, output_file, dim=dim,
                                     lr=lr, epoch=epoch, min_count=min_count,
                                     word_ngrams=word_ngrams, bucket=bucket,
                                     thread=thread, label_prefix=label_prefix)

    # Evaluate classifier
    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    class_dict = {
        1: "Negative",
        2: "Positive",
    }

    sentence1 = ["The product design is nice but it's working as expected"]
    labels1 = classifier.predict_proba(sentence1)
    class1, prob1 = labels1[0][0]  # it returns class as string
    print("Sentence: ", sentence1[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class1, class_dict[int(class1)], prob1))

    sentence2 = ["I bought the product a month ago and it was working correctly. But now is not working great"]
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class as string
    print("Sentence: ", sentence2[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    url = "https://twitter.com/miguelgfierro/status/805827479139192832"
    response = urlopen(url).read()
    title = str(response).split('<title>')[1].split('</title>')[0]
    print(title)

    # # Format tweet
    # tweet = unescape(title)
    # print(tweet)
    #
    # # Classify tweet
    # label_tweet = classifier.predict_proba([tweet])
    # class_tweet, prob_tweet = label_tweet[0][0]
    # print("Label: %s; label name: %s; certainty: %f" % (class_tweet, class_dict[int(class_tweet)], prob_tweet))

    wiki_dataset_original = data_path + 'enwik9'
    wiki_dataset = data_path + 'text9'
    if not os.path.isfile(wiki_dataset):
        os.system("perl wikifil.pl " + wiki_dataset_original + " > " + wiki_dataset)

    output_skipgram = data_path + 'skipgram'
    if os.path.isfile(output_skipgram + '.bin'):
        skipgram = fasttext.load_model(output_skipgram + '.bin')
    else:
        skipgram = fasttext.skipgram(wiki_dataset, output_skipgram, lr=0.02,
                                     dim=50, ws=5, epoch=1, min_count=5, neg=5,
                                     loss='ns', bucket=2000000, minn=3, maxn=6,
                                     thread=4, t=1e-4, lr_update_rate=100)
    print(np.asarray(skipgram['king']))
    print("Number of words in the model: ", len(skipgram.words))

    # Compare distances between some word vectors
    Droyals = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['queen']), 2)).sum()
    print(Droyals)
    Dpeople = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople)
    Dpeople2 = np.sqrt(pow(np.asarray(skipgram['man']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople2)
    print(len(skipgram.words))

    targets = ['man', 'woman', 'king', 'queen', 'brother', 'sister', 'father',
               'mother', 'grandfather', 'grandmother', 'cat', 'dog', 'bird',
               'squirrel', 'horse', 'pig', 'dove', 'wolf', 'kitten', 'puppy']
    classes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    X_target = []
    for w in targets:
        X_target.append(skipgram[w])
    X_target = np.asarray(X_target)
    word_list = list(skipgram.words)[:10000]
    X_subset = []
    for w in word_list:
        X_subset.append(skipgram[w])
    X_subset = np.asarray(X_subset)
    X_target = np.concatenate((X_subset, X_target))
    print(X_target.shape)

    # Project to 2D with t-SNE and plot the target words
    X_tsne = TSNE(n_components=2, perplexity=40, init='pca', method='exact',
                  random_state=0, n_iter=200, verbose=2).fit_transform(X_target)
    print(X_tsne.shape)
    X_tsne_target = X_tsne[-20:, :]
    print(X_tsne_target.shape)
    plot_words(X_tsne_target, targets, classes=classes)
    plot_words(X_tsne_target, targets, xlimits=[0.5, 0.7], ylimits=[-3.7, -3.6])
import fasttext

INPUT_TXT = '/path/to/file.txt'
OUTPUT_PATH_SKIPGRAM = '/tmp/skipgram'
OUTPUT_PATH_CBOW = '/tmp/cbow'

# Learn the word representation using the skipgram model
skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM, lr=0.02, dim=300,
                             ws=5, epoch=1, min_count=5, neg=5, loss='ns',
                             bucket=2000000, minn=3, maxn=6, thread=4, t=1e-4,
                             lr_update_rate=100)

# Get the vector of some word
print(skipgram['word'])

# Learn the word representation using the cbow model
cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW, lr=0.02, dim=300, ws=5,
                     epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000,
                     minn=3, maxn=6, thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(cbow['word'])

# Load a pre-trained skipgram model
SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin'
skipgram = fasttext.load_model(SKIPGRAM_BIN)
print(skipgram['word'])

# Load a pre-trained cbow model
CBOW_BIN = OUTPUT_PATH_CBOW + '.bin'
cbow = fasttext.load_model(CBOW_BIN)
print(cbow['word'])