def test_subword_vector(self):
    """Check that four ways of assembling a word vector agree.

    For every vocabulary word plus 10000 random words, the vector returned
    by ``get_word_vector`` is compared (absolute tolerance 1e-5) against
    vectors rebuilt by averaging subword input vectors, by indexing the
    input matrix with the subword indices, and by indexing it with ids
    obtained from ``get_word_id`` / ``get_subword_id``.
    """
    model = load_model(self.output + '.bin')
    vocab, _ = model.get_words(include_freq=True)
    vocab += get_random_words(10000, 1, 200)
    matrix = model.get_input_matrix()

    def same(left, right):
        # Element-wise comparison with an absolute tolerance only.
        return np.isclose(left, right, atol=1e-5, rtol=0).all()

    for word in vocab:
        # Universal api to get word vector
        vec1 = model.get_word_vector(word)

        # Build word vector from subwords
        subwords, subinds = model.get_subwords(word)
        stacked = np.stack([model.get_input_vector(ind) for ind in subinds])
        vec2 = np.sum((stacked / len(subwords)), 0)

        # Build word vector from subinds
        vec3 = np.sum(matrix[subinds] / len(subinds), 0)

        # Build word vectors from word and subword ids
        wid = model.get_word_id(word)
        if wid >= 0:
            # In-vocabulary word: the first subword is skipped and the
            # word's own id is appended instead.
            ids = [model.get_subword_id(sub) for sub in subwords[1:]]
            ids.append(wid)
        else:
            ids = [model.get_subword_id(sub) for sub in subwords]
        ids = np.array(ids)
        vec4 = np.sum(matrix[ids] / len(ids), 0)

        self.assertTrue(same(vec1, vec2))
        self.assertTrue(same(vec2, vec3))
        self.assertTrue(same(vec3, vec4))
        self.assertTrue(same(vec4, vec1))
def test_vocab(self):
    """Load the trained model and fetch its vocabulary with frequencies.

    Nothing is compared: the test only confirms that the calls succeed and
    then reports why no stricter check exists.
    """
    model = load_model(self.output + '.bin')
    # The unpacking is kept so a result that is not a pair still fails here.
    _vocab, _counts = model.get_words(include_freq=True)
    message = (
        "There is no way to access words from the cli yet. "
        "Therefore there can be no rigorous test."
    )
    self.eprint(message)
def test_subwords(self):
    """Call ``get_subwords`` on vocabulary words and 10 random words.

    Only checks that the calls do not raise; the results are discarded.
    """
    model = load_model(self.output + '.bin')
    vocab, _ = model.get_words(include_freq=True)
    vocab += get_random_words(10, 1, 10)
    for word in vocab:
        model.get_subwords(word)
    message = (
        "There is no way to access words from the cli yet. "
        "Therefore there can be no test."
    )
    self.eprint(message)
def load_fasttext_model(path):
    """
    Load a binarized fastText model.

    The ``fastText`` package is imported lazily, so it is only required when
    a model is actually loaded.

    Args:
        path: location of the model file, passed to ``fastText.load_model``.

    Returns:
        The object returned by ``fastText.load_model(path)``.

    Raises:
        ImportError: if the ``fastText`` package cannot be imported.
            ``ImportError`` subclasses ``Exception``, so callers that caught
            the generic ``Exception`` raised previously keep working.
    """
    try:
        import fastText
    except ImportError:
        raise ImportError("Unable to import fastText. Please install fastText for Python: "
                          "https://github.com/facebookresearch/fastText")
    return fastText.load_model(path)
def __init__(self, model_name="model_cat1", k=5, threshold=0):
    """Load the named tagging model and store the prediction settings.

    The model file is ``<resources_path>models/<model_name>.bin``. A
    ``ValueError`` raised while loading is logged and then re-raised.

    Args:
        model_name: base name of the ``.bin`` model file.
        k: stored on the instance as ``self.k``.
        threshold: stored on the instance as ``self.threshold``.
    """
    try:
        model_file = resources_path + "models/{}.bin".format(model_name)
        self.model = fastText.load_model(model_file)
        log.info('Load tagging model {}'.format(model_name))
    except ValueError as exc:
        failure = 'Error loading tagging model {}: {}'.format(model_name, exc)
        log.error(failure)
        # Propagate the same exception object to the caller.
        raise
    self.k = k
    self.threshold = threshold
def test_getvector(self):
    """Compare Python word vectors with those printed by the fastText binary.

    Vectors for the vocabulary plus 100 random words are requested through
    ``get_word_vectors_from_list`` and each output line is checked against
    ``get_word_vector`` for the same word.
    """
    model = load_model(self.output + '.bin')
    vocab, _ = model.get_words(include_freq=True)
    vocab += get_random_words(100, 1, 100)
    raw_output = self.get_word_vectors_from_list(self.output, vocab)
    # The final split element is dropped (text after the last newline).
    lines = raw_output.decode('utf-8').split('\n')[:-1]
    for line in lines:
        fields = line.split(' ')
        word = fields[0]
        # The last space-separated field is excluded, as in the original parse.
        expected = np.array([float(value) for value in fields[1:-1]])
        actual = model.get_word_vector(word)
        # The fasttext cli returns floats with 5 digits,
        # but we use the full 6 digits.
        self.assertTrue(np.allclose(expected, actual, rtol=1e-04))
def get_word_vector(data, model):
    """Time reading/tokenizing a text file and looking up every token's vector.

    Args:
        data: path of the text file to read.
        model: path of the fastText model, passed to ``load_model``.

    Prints the read time, the number of tokens and the total lookup time to
    stdout; writes a progress fraction to stderr every 10000 tokens.
    """
    read_start = time.time()
    print("Reading")
    with open(data, 'r') as handle:
        tokens = tokenize(handle.read())
    read_end = time.time()
    print("Read TIME: " + str(read_end - read_start))
    print("Read NUM : " + str(len(tokens)))
    # A distinct name keeps the closed file handle and the model apart.
    ft_model = load_model(model)

    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    lookup_start = time.time()
    total = len(tokens)
    for count, token in enumerate(tokens, 1):
        ft_model.get_word_vector(token)
        if count % 10000 == 0:
            # Convert before dividing so the fraction is not truncated to 0
            # by integer division on Python 2.
            sys.stderr.write("\ri: " + str(float(count) / total))
            sys.stderr.flush()
    lookup_end = time.time()
    print("\nVectoring: " + str(lookup_end - lookup_start))
def load(self, *args, **kwargs) -> Fasttext.FastText._FastText:
    """
    Load the fastText binary model found at ``self.load_path``.

    Args:
        *args: accepted but not used by this method
        **kwargs: accepted but not used by this method

    Returns:
        the model returned by ``Fasttext.load_model``

    If ``self.load_path`` is unset or is not an existing file, an error is
    logged and the process exits with status 1.
    """
    if not (self.load_path and self.load_path.is_file()):
        log.error('No pretrained fasttext model provided or provided load_path "{}" is incorrect.'
                  .format(self.load_path))
        sys.exit(1)

    log.info("[loading embeddings from `{}`]".format(self.load_path))
    return Fasttext.load_model(str(self.load_path))
prev_time = m.timestamp feat_vector.append(curr_cnt) # print(user, len(feat_vector)) user_density[user] = feat_vector return user_density def calculate_chat_density(user_time): user_density = {} for username in user_time: times = user_time[username] if max(times) != min(times): density = len(times) / (max(times) - min(times)) if density > 1e-2: continue user_density[username] = density * 1e5 print(username, user_density[username]) return user_density if __name__ == "__main__": user_log = parse_log("chat_log_target.csv") # load_data(user_log, output_file="pretrain_data.txt") # print("generating model...") # model = train_unsupervised(input="pretrain_data.txt", model='skipgram', dim=500) # model.save_model("pretrain_token.bin") model = fastText.load_model("pretrain_token.bin") feat_path = "fasttext_feat_500_sent_target/" generate_fasttext_embeddings(user_log, model)
def test_predict(self):
    """Compare ``predict`` from the Python bindings with the fastText binary.

    ``_test`` builds a list of random words (optionally extended with words
    drawn from the model vocabulary), runs ``predict`` on each and asks the
    binary for predictions on the same list.
    """
    # TODO: I went a little crazy here as an exercise for
    # a rigorous test case. This could be turned into
    # a few utility functions.
    f = load_model(self.output_sup + '.bin')

    def _test(N, min_length, max_length, k, add_vocab=0):
        words = get_random_words(N, min_length, max_length)
        if add_vocab > 0:
            vocab, _ = f.get_words(include_freq=True)
            for _ in range(add_vocab):
                # randrange excludes len(vocab); the previous
                # randint(0, len(vocab)) was inclusive and could index one
                # past the end of the vocabulary.
                ind = random.randrange(len(vocab))
                words += [vocab[ind]]
        all_labels = []
        all_probs = []
        gotError = False
        for w in words:
            try:
                labels, probs = f.predict(w, k)
            except ValueError:
                gotError = True
                continue
            all_labels.append(labels)
            all_probs.append(probs)
        preds, _, retcode = self.get_predictions_from_list(
            self.output_sup, words, k
        )
        if gotError and retcode == 0:
            self.eprint(
                "Didn't get error. Make sure your compiled "
                "binary kept the assert statements"
            )
            self.assertTrue(False)
        else:
            return
        # NOTE(review): both branches above leave the function (a failed
        # assertion or a return), so the comparison below never runs.
        # Confirm whether the ``else: return`` was meant to apply only when
        # an error occurred before changing it.
        preds = preds.split('\n')[:-1]
        self.assertEqual(len(preds), len(all_labels))
        for i in range(len(preds)):
            labels = preds[i].split()
            probs = np.array(list(map(float, labels[1::2])))
            labels = np.array(labels[::2])
            self.assertTrue(np.allclose(probs, all_probs[i], rtol=1e-04))
            self.assertTrue(np.array_equal(labels, all_labels[i]))

    # Fixed corner cases: (N, min_length, max_length, k[, add_vocab]).
    _test(0, 0, 0, 0)
    _test(1, 0, 0, 0)
    _test(10, 0, 0, 0)
    _test(1, 1, 1, 0)
    _test(1, 1, 1, 1)
    _test(1, 2, 3, 0)
    _test(1, 2, 3, 1)
    _test(10, 1, 1, 1)
    _test(1, 1, 1, 0, add_vocab=10)
    _test(1, 1, 1, 1, add_vocab=10)
    _test(1, 2, 3, 0, add_vocab=10)
    _test(1, 2, 3, 1, add_vocab=10)

    # Randomised cases with every parameter drawn from [0, reach].
    reach = 10
    for _ in range(10):
        N = random.randint(0, reach)
        init = random.randint(0, reach)
        offset = random.randint(0, reach)
        k = random.randint(0, reach)
        _test(N, init, init + offset, k)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
from fastText import load_model

embed_file = '../model/wiki.en.bin'
ft_model = load_model(embed_file)
input_matrix = ft_model.get_input_matrix()
output_matrix = ft_model.get_output_matrix()

# Print mean then standard deviation, first for the input matrix and then
# for the output matrix; emb_mean / emb_std end up holding the output values.
for _matrix in (input_matrix, output_matrix):
    emb_mean, emb_std = _matrix.mean(), _matrix.std()
    print(emb_mean)
    print(emb_std)


def load_crawl_embed_index(file_path):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    The first line and the final newline-split element are skipped. On each
    remaining line the first space-separated field is the word and the
    fields after it, except the last one, become the vector.
    """
    with open(file_path) as f:
        rows = f.read().split("\n")[1:-1]

    embed_index = {}
    for row in rows:
        fields = row.split(" ")
        embed_index[fields[0]] = np.asarray(fields[1:-1], dtype='float32')

    print('Found %s word vectors.' % len(embed_index))
    return embed_index
# Sanity check: the rows removed for having an empty text field must account
# for the whole difference between the initial and the current row count.
if initial_rows - rows_with_text_empty == len(df):
    print "Out of {} rows, {} rows were found with empty text field. They were dropped, and the number of rows now is {}".format(
        initial_rows, rows_with_text_empty, len(df))
else:
    print "Check again!"

# Drop the 'photourl' column in place.
df.drop('photourl', axis=1, inplace=True)
print list(df.columns.values)

# Add a column holding extract_hashtags(text) for every row.
df['ext_hashtags'] = df.apply(lambda row: extract_hashtags(row['text']), axis=1)
print list(df.columns.values)

# Load language identification model
# NOTE(review): detect_lang is defined outside this view and presumably reads
# this module-level `model`; keep this assignment before the apply() below.
model = fastText.load_model('fastText_models/lid.176.bin')

# Get predictions for language identification
df['langid'] = df['text'].apply(lambda x: detect_lang(x))

# Drop rows whose 'langid' value is missing (language identification
# presumably returned nothing for them -- confirm against detect_lang).
df.dropna(subset=['langid'], inplace=True)

# Save DataFrame to disk as a tab-separated file, quoting non-numeric fields.
print 'Saving to a file'
df.to_csv('lang_instagram.tsv', sep='\t', quoting=csv.QUOTE_NONNUMERIC, encoding='latin-1', index=False)
def before_request():
    """Load the fastText model named in the app config and store it on ``g``.

    The path is read from ``app.config["FT_SERVER_MODEL_PATH"]`` and the
    loaded model is assigned to ``g.ft_model``.
    """
    # NOTE(review): if this is registered as a per-request hook, the model is
    # reloaded from disk on every request -- confirm that is intended.
    model_path = app.config["FT_SERVER_MODEL_PATH"]
    g.ft_model = fastText.load_model(model_path)
import fastText
import dropbox_helper

# Prefer the newest tags model fetched through dropbox_helper; fall back to
# the bundled default model when a ValueError is raised along the way.
try:
    dropbox_helper.load('./ml/tags/model/tags_model_new', '/tags_model_new')
    model = fastText.load_model('ml/tags/model/tags_model_new')
    print('loaded last tags model')
except ValueError:
    model = fastText.load_model('ml/tags/model/tags_model')
    print('loaded default tags model')


def predict(text):
    """Return up to five predicted tags for ``text``.

    Each tag is a dict with ``label`` (the ``__label__`` marker removed) and
    ``probability`` (rounded to three decimals). The raw prediction is also
    printed.
    """
    labels, probs = model.predict([text], k=5)
    print(labels, probs)
    # A single text was submitted, so only the first result row is used.
    return [
        {
            'label': label.replace('__label__', ''),
            'probability': round(probs[0][position], 3)
        }
        for position, label in enumerate(labels[0])
    ]


# fast code for result
# def build_model() {
# Replace miscellaneous characters train['comment_text'] = train['comment_text'].str.replace('ı', 'i') test['comment_text'] = test['comment_text'].str.replace('ı', 'i') # Normalize comment_text (IMPLEMENTED IN GENERATOR) #train['comment_text'] = train['comment_text'].apply(normalize) #test['comment_text'] = test['comment_text'].apply(normalize) # Split comment_text (IMPLEMENTED IN GENERATOR) #train['comment_text'] = train['comment_text'].str.split() #test['comment_text'] = test['comment_text'].str.split() """ Loading FT model """ print('Loading FT model') ft_model = load_model('/home/kazuki_onodera/wiki.en.bin') # Embedding dimension n_features = ft_model.get_dimension() """ Define models """ def build_model(logdir='.'): # Bidirectional-LSTM if logdir is not None and os.path.exists(logdir): tb_cb = TensorBoard(log_dir='.', histogram_freq=0, write_graph=True) inp = Input(shape=(window_length, 300)) x = Bidirectional( LSTM(50, return_sequences=True, dropout=0.1,
zero, diag_margin = Variable(zero), Variable(diag_margin) x = x / torch.norm(x, 2, 1, keepdim=True) v = v / torch.norm(v, 2, 1, keepdim=True) prod = torch.matmul(x, v.transpose(0, 1)) diag = torch.diag(prod) for_x = torch.max(zero, margin - torch.unsqueeze(diag, 1) + prod) - diag_margin for_v = torch.max(zero, margin - torch.unsqueeze(diag, 0) + prod) - diag_margin return (torch.sum(for_x) + torch.sum(for_v)) / x.size(0) if __name__ == '__main__': print('Loading a pretrained fastText model...') word_embedding = fasttext.load_model(args.fasttext_model) print('Loading a dataset...') train_data = ReedICML2016( args.img_root, args.caption_root, args.trainclasses_file, word_embedding, args.max_nwords, transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ])) word_embedding = None train_loader = data.DataLoader(train_data,
# ---- training -------------------------------------------------------------
corpus_path = "senti_corpus.csv"
df_corpus = pd.read_csv(corpus_path, encoding="utf-8")
# One training line per row: "<label> , <segmented text>".
train_list = (df_corpus["label"] + " , " + df_corpus["seged_weibo"]).tolist()

# input must be a filepath
train_path = "train.txt"
model_path = "senti-model.bin"
with open(train_path, "w", encoding="utf_8_sig") as fw:
    fw.writelines(u"{}\n".format(line) for line in train_list)

# train
model_classifier = fastText.train_supervised(
    train_path,
    label="__label__",
    dim=200,
    lr=0.2,
    epoch=25,
    wordNgrams=2,
)
model_classifier.save_model(model_path)

# ---- test -----------------------------------------------------------------
test_path = "senti_test.csv"
df_test = pd.read_csv(test_path, encoding="utf-8")
test_list = (df_test["label"] + " , " + df_test["seged_weibo"]).tolist()
test_path2 = "test.txt"
with open(test_path2, "w", encoding="utf_8_sig") as fw:
    fw.writelines(u"{}\n".format(line) for line in test_list)

# Reload the model that was just saved and evaluate it on the test file.
model_path = "senti-model.bin"
model_classifier = fastText.load_model(model_path)
# input must be a filepath
result = model_classifier.test(test_path2)
print(result[1])  # accuracy
class FastTextDataGenerator():
    """Turn dataframe text into batches of fastText word-vector tensors.

    The fastText model is loaded while the class body executes and is kept
    on the class, so all instances and the static helpers share one model
    (``ft_model``) and one embedding size (``n_features``).
    """

    # class variables
    print('loading FastText model...',
          flush=True)  # flush set true has no effect?
    ft_model_path = '/home/kai/data/resources/FastText/wiki.en.bin'
    ft_model = load_model(ft_model_path)
    n_features = ft_model.get_dimension()
    print('fasttext model loaded. embedding dimemsion: {}'.format(n_features))

    def __init__(self,
                 df,
                 label_cols,
                 text_column_name,
                 window_length,
                 batch_size,
                 shuffle=True):
        """
        Params:
            df: (dataframe) at least contains a text column and label columns
            label_cols: (list) names of label columns
                        e.g.: ['toxic', 'severe_toxic', 'obscene', 'threat',
                               'insult', 'identity_hate']
            text_column_name: (str) column holding the text to embed
                        e.g.: comment or comment_text_cleaned
            window_length: (int) number of word slots per text; see
                        ``text_to_vector`` for which words are kept
            batch_size: (int) number of rows per generated batch
            shuffle: (boolean) whether to reshuffle df on every pass
        """
        self._df = df
        self._label_cols = label_cols
        self._text_column_name = text_column_name
        self._window_length = window_length
        self._batch_size = batch_size
        self._shuffle = shuffle
        self.training_steps_per_epoch = round(len(df) / batch_size)

    def load_new_ft_model(self, new_ft_model_path):
        """Replace the class-wide model (and ``n_features``) with a new one."""
        print('loading new model...', flush=True)
        cls = FastTextDataGenerator
        cls.ft_model = load_model(new_ft_model_path)
        cls.n_features = cls.ft_model.get_dimension()
        print('fasttext model loaded. embedding dimemsion: {}'.format(
            cls.n_features))

    def data_gen(self):
        """
        Yield ``(batch_x, batch_y)`` pairs forever.

        Rows are consumed in dataframe order (reshuffled at the start of
        every pass when shuffling is enabled). A batch is yielded only once
        it holds ``batch_size`` rows; a partially filled batch carries over
        into the next pass.
        """
        cls = FastTextDataGenerator
        filled = 0  # Rows written into the current batch so far
        batch_x = None  # The current batch's x data
        batch_y = None  # The current batch's y data
        while True:  # Loop forever
            if self._shuffle:
                self._df = self._df.sample(frac=1)
            for _, row in self._df.iterrows():
                # add [0] to get the string from the pd.Series
                comment = row[self._text_column_name][0]
                if batch_x is None:
                    # Fresh buffers are allocated for every batch.
                    batch_x = np.zeros(
                        (self._batch_size, self._window_length,
                         cls.n_features),
                        dtype='float32')
                    batch_y = np.zeros(
                        (self._batch_size, len(self._label_cols)),
                        dtype='float32')
                batch_x[filled] = cls.text_to_vector(comment,
                                                     self._window_length)
                batch_y[filled] = row[self._label_cols].values
                filled += 1
                if filled == self._batch_size:
                    # Ready to yield the batch
                    yield batch_x, batch_y
                    batch_x = None
                    batch_y = None
                    filled = 0

    @staticmethod
    def text_to_vector(text, window_length):
        """
        Normalize ``text``, split it on whitespace and embed its words.

        Only the last ``window_length`` words are kept. The result has shape
        ``(window_length, n_features)``; slots beyond the number of kept
        words stay zero.
        """
        cls = FastTextDataGenerator
        tokens = cls.normalize(text).split()
        kept = tokens[-window_length:]
        x = np.zeros((window_length, cls.n_features))
        for slot, token in enumerate(kept):
            x[slot, :] = cls.ft_model.get_word_vector(token).astype('float32')
        return x

    @staticmethod
    def normalize(s):
        """
        Spell out '&', '@' and the digits 0-9 as space-padded English words.

        Lower-casing, IP replacement, punctuation isolation and removal of
        special characters are not applied by this method.
        """
        # Replace numbers and symbols with language, in this fixed order.
        replacements = (
            ('&', ' and '),
            ('@', ' at '),
            ('0', ' zero '),
            ('1', ' one '),
            ('2', ' two '),
            ('3', ' three '),
            ('4', ' four '),
            ('5', ' five '),
            ('6', ' six '),
            ('7', ' seven '),
            ('8', ' eight '),
            ('9', ' nine '),
        )
        for symbol, spoken in replacements:
            s = s.replace(symbol, spoken)
        return s

    @staticmethod
    def df_to_data(df, text_column_name, window_length):
        """
        Embed every value of ``df[text_column_name]`` into one float32 array
        of shape ``(len(df), window_length, n_features)``.
        """
        cls = FastTextDataGenerator
        x = np.zeros((len(df), window_length, cls.n_features),
                     dtype='float32')
        for position, comment in enumerate(df[text_column_name].values):
            x[position, :] = cls.text_to_vector(comment, window_length)
        return x
fin.close() return test_data, test_lable # fact=argv[1] if __name__ == '__main__': stop_words = read_stop_words('F:/FastTextpredict/stopwords.txt') # # print('train acc...') # classifier = ff.train_supervised(input="acc_train.txt", label="__label__" ,wordNgrams=2) # classifier.save_model('acc_train.model.bin') #load训练好的模型 classifier = ff.load_model('F:/FastTextpredict/acc_train.model.bin') #测试模型 #系统运行的代码 # list =[] # list.append(fact) # list =cut_text(list,stop_words) # lable, pro = classifier.predict(list) # for i,text in enumerate(lable): # print(text[9:]+"的概率为:"+str(pro[i])) #end here fact = '昌宁县人民检察院指控,2014年4月19日下午16时许,被告人段某驾拖车经过鸡飞乡澡塘街子,' \ '时逢堵车,段某将车停在“冰凉一夏”冷饮店门口,被害人王某的侄子王2某示意段某靠边未果,' \ '后上前敲打车门让段某离开,段某遂驾车离开,但对此心生怨愤。同年4月21日22时许,被告人' \
#!/usr/bin/env python import re import sys import json from waitress import serve import fastText as fasttext from flask import Flask, request, jsonify model = fasttext.load_model('model.bin') app = Flask(__name__) def tokenize(line): line = re.sub(r'[,./<>?;:\"!@#$%^&*()=\[\]{}()]', ' ', line) line = re.sub(r'[ \t]{2,}', ' ', line).lower() line = re.sub(r'(.)\1\1+', r'\1\1\1', line) return line def predict(line): line = tokenize(line) predict = model.predict(line) return { "labels": list(predict[0]), "scores": list(predict[1]), } @app.route("/") def handler(): review = request.args.get('review')
def __init__(self, model_path):
    """Create the layer with weights copied from a fastText input matrix.

    The model at ``model_path`` is loaded and kept as ``self.model``. The
    parent initializer receives the two dimensions of the model's input
    matrix, and ``self.weight`` is then overwritten with that matrix.
    """
    self.model = load_model(model_path)
    weights = self.model.get_input_matrix()
    shape = weights.shape
    super().__init__(shape[0], shape[1])
    self.weight.data.copy_(torch.FloatTensor(weights))
offsets = Variable(torch.LongTensor(word_offsets)) return super().forward(ind, offsets) def random_word(N): return ''.join( random.choices( string.ascii_uppercase + string.ascii_lowercase + string.digits, k=N ) ) if __name__ == "__main__": ft_emb = FastTextEmbeddingBag("fil9.bin") model = load_model("fil9.bin") num_lines = 200 total_seconds = 0.0 total_words = 0 for _ in range(num_lines): words = [ random_word(random.randint(1, 10)) for _ in range(random.randint(15, 25)) ] total_words += len(words) words_average_length = sum([len(word) for word in words]) / len(words) start = time.clock() words_emb = ft_emb(words) total_seconds += (time.clock() - start) for i in range(len(words)): word = words[i]
def test_dimension(self):
    """Smoke test: load the trained model and call ``get_dimension``.

    The returned value is not checked.
    """
    model = load_model(self.output + '.bin')
    model.get_dimension()
help='path to out file', required=True) parser.add_argument('--fasttext-model', action='store', dest='model', help='.bin model of fasttext', required=True) args = parser.parse_args() # # load data & work # ------------------------------- # model = fastText.load_model(args.model) ind_stats = [] with open(args.allen_vocab, "r", encoding="utf8") as in_file: with open(args.out_file, "w", encoding="utf8") as out_file: for line in tqdm(in_file): line = line.strip() _, indices = model.get_subwords(line) ind_stats.append(len(indices)) out_file.write(line + " " + " ".join(map(str, map(lambda x: x + 1, indices))) + "\n")
def create_fasttext_embedding(tokens, bin_path):
    """Map each token to its fastText word vector.

    Args:
        tokens: iterable of token strings.
        bin_path: path of the fastText model, passed to
            ``fastText.load_model``.

    Returns:
        dict mapping every token to ``model.get_word_vector(token)``;
        duplicate tokens collapse to a single key.
    """
    model = fastText.load_model(bin_path)
    # The former trailing ``return {}`` could never execute and was removed.
    return {tok: model.get_word_vector(tok) for tok in tokens}
words = ' '.join(words) return words # print(sentences_to_words(celebratity['Tom Cruise'])) celebratity_type = dict() import fastText model_name = 'model/' + 'classify_with_videos' + '.model' type_set = [ 'ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP', 'ESTP', 'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ' ] classifier = fastText.load_model(model_name) count = 0 for name in celebratity.keys(): count += 1 words = sentences_to_words(celebratity[name]) print(words) words = words.replace('\n', '') # with open('words%d.txt'%count,'w') as f: # f.write('i love you') results = classifier.predict(words, 16) print(name, results) type, proba = results real_type = type[0][-4:] print(real_type) type = [item[-4:] for item in type] proba = [item for item in proba]
from fastText import load_model
import sys
import os
import joblib

# Usage: <script> <directory of text files> <fastText model file>
repos, model_f = sys.argv[1], sys.argv[2]
model = load_model(model_f)

# One sentence vector per file in the directory, keyed by file name.
res = {}
for repo_voc in os.listdir(repos):
    full_path = os.path.join(repos, repo_voc)
    print(full_path)
    with open(full_path) as handle:
        sent = handle.read()
    # NOTE(review): the whole file, newlines included, is passed as a single
    # "sentence" -- confirm the installed fastText version accepts that.
    res[repo_voc] = model.get_sentence_vector(sent)

joblib.dump(res, 'repo_vectors')
def read_polyglot(path, word_tokenizer):
    """Build an embedding matrix from a pickled ``{word: vector}`` mapping.

    The matrix has ``len(word_tokenizer.word_index) + 1`` rows and as many
    columns as the first vector in the mapping. Row ``i`` holds the vector
    of the word whose index is ``i``; words absent from the mapping keep an
    all-zero row.
    """
    # NOTE(review): pickle.load runs arbitrary code from the file -- only
    # use this with trusted embedding files.
    with open(path, 'rb') as handle:
        vectors = pickle.load(handle, encoding='bytes')

    vocab = word_tokenizer.word_index
    dim = len(list(vectors.values())[0])
    embedding = np.zeros([len(vocab) + 1, dim])
    for word, row in vocab.items():
        if word in vectors:
            embedding[row] = vectors[word]
    return embedding


if __name__ == '__main__':
    from data_utils.constants import WORD_VEC_PATH
    import pickle

    with open('output/word_tokenizer.pkl', 'rb') as file:
        word_tokenizer = pickle.load(file)
    print(word_tokenizer.word_index[word_tokenizer.oov_token])
    print(len(word_tokenizer.word_index))

    model = fastText.load_model(WORD_VEC_PATH)
    embedding = read_word_vec(WORD_VEC_PATH, word_tokenizer)
    # Every component of the stored row must equal the model's vector.
    for probe in ('nhà', 'Bán', 'tầng', '<UNK>'):
        assert (np.sum(embedding[word_tokenizer.word_index[probe]] ==
                       model.get_word_vector(probe)) == model.get_dimension())
    print(embedding[word_tokenizer.word_index[word_tokenizer.oov_token]])
from gensim.models import KeyedVectors import fastText as ft import numpy as np import os import sys import gc import shutil print("loading vocabulary") embeddings = KeyedVectors.load("Norsk_embeddings") print("loading out of vocabulary") outofvocab = ft.load_model("norsk.bin") print("loading data") basepath = os.path.normpath(os.path.realpath(__file__)) while os.path.basename(basepath) != "Minerva": basepath = os.path.dirname(basepath) directory = os.path.normpath( os.path.join(basepath, "data/clean/mftd_norwegian")) newWords = {} seen = set() print("Loaded data") for filename in os.listdir(directory): with open(directory + "/" + filename, 'r') as file: data = file.read() tokens = [token for token in data.split(" ") if token != ""] for token in tokens: if token in seen: continue seen.add(token)
for del_key in delete_key_words: del ast_word2word[del_key] # Normalize comment_text (IMPLEMENTED IN GENERATOR) #train['comment_text'] = train['comment_text'].apply(normalize) #test['comment_text'] = test['comment_text'].apply(normalize) # Split comment_text (IMPLEMENTED IN GENERATOR) #train['comment_text'] = train['comment_text'].str.split() #test['comment_text'] = test['comment_text'].str.split() """ Loading FT model """ print('Loading FT model') ft_model = load_model('../external_data/pretrained/fasttext/wiki/wiki.en.bin') # Embedding dimension n_features = ft_model.get_dimension() """ Define models """ def build_lstm_stack_model(logdir='attention'): # Bidirectional-LSTM inp = Input(shape=(window_length, 300)) inp_dr = SpatialDropout1D(0.05)(inp) l_lstm = Bidirectional(CuDNNGRU(512, return_sequences=True))(inp_dr) l_lstm = Dropout(0.05)(l_lstm) x_gmp = GlobalMaxPool1D()(l_lstm)
def __call__(self, data=None, save_csv=False, full=False, verbose=True, runKNN=False):
    """
    Run the whole pipeline on a series of strings.

    If no data is passed, the loader falls back to the data referenced by
    the configuration. That is a fallback; when Optimus is integrated into
    a larger pipeline, data is expected to be passed in.

    Parameters
    ----------
    data : pd.core.series.Series
        a series object containing the strings that need to be
        processed (default=None)

    save_csv : bool
        forwarded to ``handle_output``; controls whether the full prowl
        is saved as a csv (default=False)

    full : bool
        forwarded to ``handle_output``; controls whether a full dataframe
        or just a series of predicted labels is returned (default=False)

    verbose : bool
        stored as ``self.verbose``; controls how much is printed. If
        false only a few lines will be output. (default=True)

    runKNN : bool
        intended to enable a K Nearest Neighbour pass over the labels not
        picked up by the normal run. The code that used it is commented
        out below, so the flag currently has no effect.

    Returns
    -------
    pd.core.series.Series / pd.core.frame.DataFrame
        whatever ``handle_output`` returns: depending on ``full``, the
        output of the last depth or a dataframe with outputs from each
        iteration
    """

    # set the verbosity setting
    self.verbose = verbose

    # notes
    self.vprint('-- Performing setup')
    self.vprint('_' * 79)

    # load config before each run
    self.config = self.load_config(self.config_path)

    # reformat provided series into accepted format
    data = self.catch_input(data)

    # build looping mechanism, adding 1 to the depth of
    # ratchet and changing the dataset passing through the classes

    # free text loading
    self.vprint("-- Loading descriptions")
    # NOTE(review): this truthiness test assumes catch_input returns a value
    # with an unambiguous truth value (e.g. not a raw pandas Series) -- confirm.
    if data:
        self.vprint("-- Ingesting provided series")
        L = Loader(self.config, data)
    else:
        self.vprint("-- No custom data provided, using data from config")
        L = Loader(self.config)

    # start a dataframe that will track the labels at each level
    prowl = pd.DataFrame.from_dict(L.linked, orient='index')
    prowl = prowl.reset_index()
    prowl.columns = ['original', 'current_labels']

    # embed the words using fastText
    # (the loaded model is cached on the instance, so later calls reuse it)
    if hasattr(self, "matrix"):
        self.vprint("-- Model already loaded")
    else:
        self.vprint("-- Loading model")
        self.matrix = ft.load_model(self.config['model'])

    self.vprint("-- Embedding")
    clusterer = Clusterer(L, self.matrix, self.config)

    # clustering
    self.vprint("-- Clustering")
    CC = ClusterConstructor(clusterer, self.config)

    # start the loop for each depth
    self.vprint('_' * 79)  # some decoration

    # Each pass sends the remaining clusters through four classifiers in a
    # fixed order; what one stage rejects becomes the input of the next.
    while CC.iterate:

        self.vprint(f"-- Depth: {CC.distance}")  # some decoration
        self.vprint('_' * 79)  # some decoration

        # edit distance based metrics
        ED = EditDistance(CC, self.config)

        # push the rejected clusters back to the ClusterConstructor
        # for the next phase
        CC.clusters = ED.rejected
        self.vprint(
            f" ** | Edit Distance | classified: {len(ED.accepted)}")

        # class for word n-gram scoring
        WG = WordGram(CC, self.config)

        # push the rejected clusters back to CC for the next phase
        CC.clusters = WG.rejected
        self.vprint(
            f" ** | Word Grams | classified: {len(WG.accepted)}")

        # class for character n-gram scoring
        CG = CharGram(CC, self.config)

        # push the rejected clusters back to CC for the next phase
        CC.clusters = CG.rejected
        self.vprint(
            f" ** | Character Grams | classified: {len(CG.accepted)}")

        # class for finding suitable hypernyms from WordNet
        HN = Hypernyms(CC, self.config)

        # push the rejected clusters back to CC for the next phase
        CC.clusters = HN.rejected
        self.vprint(
            f" ** | Hyponyms | classified: {len(HN.accepted)}")

        # gatekeeper, overwrites the CC with what it needs for the
        # next push
        H = Gatekeeper(CC, ED, WG, CG, HN, self.matrix, self.config, prowl)
        CC = H.clusterconstructor
        prowl = H.prowl

        self.vprint('_' * 79)

    # if requested run a KNN on the non_labeled data
    #if runKNN:
    #    self.vprint(f"-- Performing KNN")
    #    K = KNN(H, self.matrix)
    #    self.KNN_predictions = pd.DataFrame(K())
    #    self.KNN_predictions.to_csv('knn_results.csv')
    #    self.vprint(self.KNN_predictions)

    # clean up after yourself
    self.clean_up()

    # return output
    return self.handle_output(prowl, save_csv=save_csv, full=full)
import json import fastText model_file = './model/ft_train.bin' number_of_labels = 3 with open('./model/class_label.json','r',encoding = 'utf-8') as f: class_label = json.load(f) model = fastText.load_model(model_file) def predict_field(text): res = [] tmp = model.predict(text,number_of_labels) labels = tmp[0] prob = tmp[1] for i,label in enumerate(labels): # if prob[i] > 0.05: res.append(label.replace('__label__','').replace('_',' ')) return res def intersect(a, b): return list(set(a) & set(b)) def getListJD(text,field): if (field not in class_label.keys()): return [] return class_label[field] def retrieval_jd(text,fields): list_jd = []
import pandas as pd from keras.layers import Bidirectional, CuDNNGRU, SpatialDropout1D, GlobalAveragePooling1D, concatenate, Dropout, Dense from keras.models import Model from fastText import load_model import tensorflow as tf import os import keras.backend.tensorflow_backend as KTF classes = ['requires_reply'] path = './' training_filename = '' test_filename = '' fasttext_filename = 'cc.de.300.bin' ft_model = load_model(path + fasttext_filename) n_features = ft_model.get_dimension() print('Dimensions ' + str(n_features)) train = pd.read_csv(path + training_filename) test = pd.read_csv(path + test_filename) def get_session(gpu_fraction=0.9): num_threads = os.environ.get('OMP_NUM_THREADS') gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction) if num_threads: return tf.Session(config=tf.ConfigProto( gpu_options=gpu_options, intra_op_parallelism_threads=num_threads))
# Text of every row: description and title joined by a space, with missing
# values treated as empty strings.
X = (train["description"].fillna("") + " " + train["title"].fillna("")).values
X_test = (test["description"].fillna("") + " " + test["title"].fillna("")).values
y = pd.read_csv("../input/train.csv")["deal_probability"].values

embed_size = EMBEDDING_SIZE
maxlen_desc = 120
maxlen_title = 21

# The "UNKNOWN" entry is registered first, so it receives index 0 and a
# zero vector.
embedding_vectors = {}
word_idx = {}
embedding_vectors["UNKNOWN"] = np.zeros(embed_size)
word_idx["UNKNOWN"] = len(word_idx)

model = fastText.load_model(EMBEDDING_FILE)  #.replace(".vec", ".bin"))
print("getting embedding vectors...")
for text in tqdm(list(X) + list(X_test)):
    # max_length, padding,
    for w in text.split():
        try:
            embedding_vectors[w] = model.get_word_vector(w)
            if w not in word_idx:
                word_idx[w] = len(word_idx)
        except Exception:
            # Best effort: a word that cannot be embedded is skipped.
            # Catching Exception rather than using a bare ``except`` lets
            # KeyboardInterrupt / SystemExit stop this long loop.
            pass

idx_to_word = {v: k for k, v in word_idx.items()}
# The model is no longer needed once every vector has been collected.
del model
train_test_index_list_1 = []
BIG_CATEGORY = 'mobile' N_CLASSES = 27 if __name__ == "__main__": psychic_learners_dir = Path.cwd().parent valid_data = os.path.join(psychic_learners_dir, 'data', f'{BIG_CATEGORY}_valid_split.csv') test_data = os.path.join(psychic_learners_dir, 'data', f'{BIG_CATEGORY}_test_split.csv') ROOT_PROBA_FOLDER = os.path.join(psychic_learners_dir, 'data', 'probabilities', BIG_CATEGORY, 'extractions_fasttext') valid_data = pd.read_csv(valid_data) test_data = pd.read_csv(test_data) model = load_model( str(psychic_learners_dir / 'data' / 'fasttext_models' / f'{BIG_CATEGORY}_extractions_model.bin')) valid_preds = [] test_preds = [] for title in valid_data['extractions'].values: if title == '0': # comment out if normal title valid_preds.append(np.zeros(N_CLASSES)) continue title = ' '.join(literal_eval(title)) # comment out if normal title pred = model.predict(title, k=N_CLASSES) pred = sorted(zip(pred[0], pred[1]), key=lambda x: x[0]) pred = [x[1] for x in pred] valid_preds.append(pred) for title in test_data['extractions'].values: if title == '0': # comment out if normal title
import xml.etree.cElementTree as ET from xml.dom import minidom import sys import json import re import os import datetime import fnmatch from language_tagger import tag_language from trending_videos_difference_server import difference import fastText # Load fastText language detection model langdetect = fastText.load_model( '/disk/data/share/MTproject/fastText/langdetect.bin') categories = { '1': 'Film & Animation', '2': 'Cars & Vehicles', '10': 'Music', '15': 'Pets & Animals', '17': 'Sport', '19': 'Travel & Events', '20': 'Gaming', '22': 'People & Blogs', '23': 'Comedy', '24': 'Entertainment', '25': 'News & Politics', '26': 'How-to & Style', '27': 'Education', '28': 'Science & Technology',
def load_model(cls, model_fpath):
    """Create a ``FasttextClassifier`` backed by the model at ``model_fpath``.

    The loaded fastText model is stored on the new object as
    ``classifier_`` and a confirmation line is printed.

    Returns:
        the new ``FasttextClassifier`` instance.
    """
    clf = FasttextClassifier()
    clf.classifier_ = fasttext.load_model(model_fpath)
    # The single-argument parenthesised form prints the same text on
    # Python 2 and is also valid on Python 3. The one-element tuple keeps a
    # tuple-valued path from being expanded as multiple format arguments.
    print('load model from %s success!' % (model_fpath,))
    return clf
def __init__(self):
    """Load the pickled encoder and the fastText classifier from disk.

    ``encoder2.7.p`` and ``models/fasttext.bin`` are opened relative to the
    current working directory.
    """
    # The context manager closes the file handle, which the previous
    # ``open(...).read()`` expression left to the garbage collector.
    # NOTE(review): unpickling executes code from the file; keep it trusted.
    with open("encoder2.7.p", "rb") as encoder_file:
        self.encoder = cPickle.loads(encoder_file.read())
    self.classifier = load_model('models/fasttext.bin')
# Can pass on the entirety of lstm_out to the next layer if it is a seq2seq prediction # y_pred = self.linear(lstm_out[-1].view(self.batch_size, -1)) # print(lstm_out.size()) # print(lstm_out.contiguous().view(-1, self.hidden_dim*2).size()) y_pred = self.linear(lstm_out.contiguous().view( -1, self.hidden_dim * 2)) y_pred = y_pred.view(batch_size, -1, self.output_dim) # y_pred = torch.nn.functional.softmax(y_pred) # y_pred = torch.nn.Softmax(dim=-1)(y_pred) # print(y_pred.size()) # y_pred = nn.Softmax()(y_pred) return y_pred print("loading embeddings...") ft_hi = fastText.load_model( "/home1/zishan/WordEmbeddings/FastText/wiki.bn.bin") print("loading dictionaries...") class_index = json.load(open("../../Data/Crosslingual/class_index.json")) word_index = pd.read_pickle( "../../Data/Crosslingual/universal_word_index.pickle") print("loading training data...") train = pd.read_pickle('../../Data/Crosslingual/Bengali_train.pickle') print("loading testing data...") test = pd.read_pickle('../../Data/Crosslingual/Bengali_test.pickle') test_trig = np.asarray(test['trigger'].tolist()) test_trig = test_trig.reshape(len(test_trig), 75, 1) #test_trig = to_categorical(test_trig,2) test_sentences = np.asarray(test['sentences_token'].tolist())
def _index_tokens(tokens, model, model_s, dic, back):
    """Translate tokens into integer ids, registering unseen tokens.

    Args:
        tokens: Iterable of tokens to convert.
        model: Object exposing ``get_word_vector(token)``.
        model_s: List of embedding rows; row ``k`` belongs to id ``k``.
            Extended in place for every new token.
        dic: Mapping token -> id, updated in place.
        back: Mapping id -> token, updated in place.

    Returns:
        List with the id of each token, in input order.
    """
    ids = []
    for token in tokens:
        if token not in dic:
            # Ids are dense, so the next free id is the current row count.
            new_id = len(model_s)
            dic[token] = new_id
            back[new_id] = token
            # TODO: still need to handle the case where the model has no
            # vector for this token.
            model_s.append(model.get_word_vector(token))
        ids.append(dic[token])
    return ids


def process_data(data):
    """Convert a nested articles/paragraphs/questions dataset to id sequences.

    ``data['data']`` is a sequence of articles; each article has
    ``'paragraphs'``, each paragraph has a ``'context'`` string and a list
    ``'qas'`` whose items carry ``'question'`` and ``'answers'`` (only the
    text of the first answer is used).

    One entry is appended to every output list per question, so the context
    of a paragraph is repeated once for each of its questions.

    Args:
        data: The dataset dictionary described above.

    Returns:
        An 11-tuple ``(model_s, dic, back, max_par_size, max_que_size,
        contexts_input, contexts_len, questions_input, questions_len,
        ans_start, ans_end)`` where

        * ``model_s`` is the list of embedding rows indexed by token id,
        * ``dic`` / ``back`` map token -> id and id -> token,
        * ``max_par_size`` / ``max_que_size`` are the longest context and
          question (in tokens) among the emitted entries,
        * the remaining lists hold, per question, the context ids, context
          length, question ids, question length and the answer span returned
          by ``find_span(context, answer)``.
    """
    model = load_model("wiki.simple.bin")

    # Ids 0 and 1 are reserved: '$' marks the end, '&' stands for "none".
    # Both share one zero vector (assumes 300-dimensional embeddings --
    # TODO confirm against the loaded model).
    zero = np.zeros(300)
    model_s = [zero, zero]
    print(model_s[0])
    dic = {'$': 0, '&': 1}
    back = {0: '$', 1: '&'}

    max_par_size = 0
    max_que_size = 0
    contexts_input = []
    contexts_len = []
    questions_input = []
    questions_len = []
    ans_start = []
    ans_end = []

    # Walk over every article.
    for article in tqdm(data['data']):
        # Walk over every paragraph of the article.
        for parag in article['paragraphs']:
            context = parag['context']
            # Numeric (id) description of the paragraph text.
            context_vector = _index_tokens(
                tokenize(context), model, model_s, dic, back)
            context_size = len(context_vector)

            # Walk over every question asked about this paragraph.
            for qn in parag['qas']:
                quest_vector = _index_tokens(
                    tokenize(qn['question']), model, model_s, dic, back)
                quest_size = len(quest_vector)

                answer = qn['answers'][0]['text']
                start, end = find_span(context, answer)

                # Record one aligned entry per question.
                contexts_input.append(context_vector)
                contexts_len.append(context_size)
                questions_input.append(quest_vector)
                questions_len.append(quest_size)
                ans_start.append(start)
                ans_end.append(end)

                max_que_size = max(quest_size, max_que_size)
                max_par_size = max(context_size, max_par_size)

    return (model_s, dic, back, max_par_size, max_que_size, contexts_input,
            contexts_len, questions_input, questions_len, ans_start, ans_end)
import fastText from nltk import sent_tokenize from gensim.models import Word2Vec import numpy as np import json import sys import time from nltk import ngrams from nltk.stem import PorterStemmer from nltk import sent_tokenize from nltk import word_tokenize ps = PorterStemmer() model = fastText.load_model( '../../../Divers_Data_Maitrise/wiki.simple/wiki.simple.bin') #model = fastText.load_model('../embeding_perso_fastText/data_embeding.bin') #model = fastText.load_model('../embeding_perso_fastText/train_steam_embeding.bin') path_data = '../../../Data_Maitrise/data/' path_dest = '../../../Data_Maitrise/data_perso/' time1 = time.time() with_steming_param = False k_best_sentences = 1 n = 2 similarity_type = 1 #1 : cosine, 2:dice def get_best_sentence_no_repli(model: fastText, list_sentence,
help="Model to use", ) parser.add_argument( "question_words", help="word questions similar to tmikolov's file (see help for link)", ) parser.add_argument( "threshold", help="threshold used to limit number of words used", ) args = parser.parse_args() args.threshold = int(args.threshold) # Retrieve list of normalized word vectors for the first words up # until the threshold count. f = load_model(args.model) # Gets words with associated frequeny sorted by default by descending order words, freq = f.get_words(include_freq=True) words = words[:args.threshold] vectors = np.zeros((len(words), f.get_dimension()), dtype=float) for i in range(len(words)): wv = f.get_word_vector(words[i]) wv = wv / np.linalg.norm(wv) vectors[i] = wv total_correct = 0 total_qs = 0 total_num_lines = 0 total_se_correct = 0 total_se_qs = 0
# Lazily populated so the model file is read from disk only once per process.
_LID_MODEL = None


def fastText_predict(text):
    """Return the language code fastText predicts for ``text``.

    The ``lid.176.ftz`` model is loaded on the first call and reused
    afterwards instead of being re-read on every call.

    Args:
        text: A single string to classify.

    Returns:
        The top predicted label with its ``__label__`` prefix removed, e.g.
        ``'en'``. Stripping the prefix (rather than keeping the last two
        characters) keeps three-letter codes intact, so ``__label__bar`` is
        reported as ``'bar'`` and no longer collides with ``'ar'``.
    """
    global _LID_MODEL
    if _LID_MODEL is None:
        _LID_MODEL = fastText.load_model('lid.176.ftz')

    label = _LID_MODEL.predict(text)[0][0]
    prefix = '__label__'
    if isinstance(label, str) and label.startswith(prefix):
        return label[len(prefix):]
    # Unexpected label shape: keep the historical "last two items" result.
    return label[-2:]