def gen_test_tokenize(self, kwargs):
    self.assertEqual(["asdf", "asdb"], fastText.tokenize("asdf asdb"))
    self.assertEqual(["asdf"], fastText.tokenize("asdf"))
    self.assertEqual([fastText.EOS], fastText.tokenize("\n"))
    self.assertEqual(["asdf", fastText.EOS], fastText.tokenize("asdf\n"))
    self.assertEqual([], fastText.tokenize(""))
    self.assertEqual([], fastText.tokenize(" "))
    # An empty string is not a token (it's just whitespace),
    # so the minimum word length must be 1.
    words = get_random_words(100, 1, 20)
    self.assertEqual(words, fastText.tokenize(" ".join(words)))
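# The generated test above relies on a get_random_words helper from its test
# utilities. A minimal sketch is given below; the (count, min_length,
# max_length) signature is inferred from the call site, not copied from the
# real fastText test suite.
import random
import string

def get_random_words(count, min_length, max_length):
    # Build `count` random lowercase ASCII words; pure-alpha words of
    # length >= 1 round-trip through fastText.tokenize unchanged.
    words = []
    for _ in range(count):
        length = random.randint(min_length, max_length)
        words.append("".join(random.choice(string.ascii_lowercase)
                             for _ in range(length)))
    return words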
def cleanData():
    data = pd.read_csv("data.csv", sep=";")
    data["text"] = data["text"].apply(str)
    data["text"] = data["text"].apply(lambda x: x.lower())
    # data["text"] = data["text"].apply(lambda x: re.sub("(\xc3\x84|ä)", "ae", x))
    # data["text"] = data["text"].apply(lambda x: re.sub("(\xc3\x9c|ü)", "ue", x))
    # data["text"] = data["text"].apply(lambda x: re.sub("(\xc3\xb6|ö)", "oe", x))
    # see for tokenization rules: https://github.com/facebookresearch/fastText/blob/master/python/README.md
    tokenizer = lambda x: " ".join(fastText.tokenize(x))
    data["text"] = data["text"].apply(tokenizer)
    # delete any non-alpha character
    data["text"] = data["text"].apply(lambda x: re.sub("[^A-Za-z ]", "", x))
    # drop rows whose text is now empty
    data = data[data["text"] != ""]
    data.to_csv("data.csv", sep=";", index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)
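# Imports cleanData assumes, plus a quick check of what fastText.tokenize
# does: it splits on whitespace (mapping "\n" to the EOS marker), so the
# regex above is what actually removes punctuation. "data.csv" is a
# placeholder path with a semicolon-separated "text" column.
import csv
import re
import fastText
import pandas as pd

print(fastText.tokenize("hello, world!"))  # ['hello,', 'world!']
cleanData()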
def _data_generator(self):
    # Yield batches of (per-sentence word-vector sequences, labels).
    batch_size = self.opt['batch_size']
    for i in range(len(self.X_train) // batch_size):
        X_sample = self.X_train[i * batch_size:(i + 1) * batch_size]
        y_sample = self.y_train[i * batch_size:(i + 1) * batch_size]
        # Tokenize each sentence and look up a fastText vector per token.
        X_sample = [[
            self.ft.get_word_vector(word)
            for word in fastText.tokenize(sent)
        ] for sent in X_sample]
        yield X_sample, y_sample
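# The generator yields ragged batches (sentences tokenize to different
# lengths), so a consumer typically pads before stacking. A minimal sketch,
# assuming numpy and vectors of dimension dim = self.ft.get_dimension();
# pad_batch is a hypothetical helper, not part of the original class.
import numpy as np

def pad_batch(X_sample, dim):
    # Right-pad each sentence with zero vectors up to the longest one.
    max_len = max(len(sent) for sent in X_sample)
    batch = np.zeros((len(X_sample), max_len, dim), dtype=np.float32)
    for i, sent in enumerate(X_sample):
        if sent:
            batch[i, :len(sent)] = np.stack(sent)
    return batch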
def get_word_vectors(text):
    """
    Tokenize text and return fastText word vectors.
    Requires a fastText model to be loaded (see :func:`load_fasttext_model`)

    :param text: input text
    :return: word vector matrix
    """
    try:
        matrix = [
            _fasttext_model.get_word_vector(w)
            for w in fastText.tokenize(util.normalize_message_text(text))
        ]
    except Exception as e:
        logger.error('Failed to tokenize line: {}'.format(e))
        matrix = [get_word_vector('')]

    if len(matrix) == 0:
        matrix = [get_word_vector('')]

    return np.array(matrix)
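# get_word_vectors relies on module-level state (_fasttext_model, logger,
# util) defined elsewhere in its project. A sketch of plausible definitions
# for the model-related pieces follows; the bodies are assumptions, only
# the names appear in the snippet above.
import fastText

_fasttext_model = None

def load_fasttext_model(path):
    # Load the binary model once and cache it at module level.
    global _fasttext_model
    _fasttext_model = fastText.load_model(path)

def get_word_vector(word):
    # Vector for a single word; used above with '' as a fallback row.
    return _fasttext_model.get_word_vector(word)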
def get_word_vector(data, model):
    t1 = time.time()
    print("Reading")
    with open(data, 'r') as f:
        tokens = tokenize(f.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))
    f = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    t3 = time.time()
    i = 0
    for t in tokens:
        f.get_word_vector(t)
        i += 1
        if i % 10000 == 0:
            sys.stderr.write("\ri: " + str(float(i / len(tokens))))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))
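# Imports the timing benchmark above assumes, and a sample invocation; both
# paths are placeholders for a plain-text corpus and a trained .bin model.
import sys
import time
from fastText import load_model, tokenize

get_word_vector("data/corpus.txt", "model.bin")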
songs = pd.read_csv('../dataset/dataset.csv')

print("Loading the embedding model for querying...")
model = ft.load_model("./embeddings_lyrics.bin")
dim = model.get_dimension()

print("Loading the embeddings for songs to compare...")
embedding_of_lyrics = pd.read_csv('../database/song_embedings.csv').values[1:, 1:]

print("Making the KDTree...")
tree = KDTree(embedding_of_lyrics)

while True:
    query = input("Enter your query : ")
    tokens = ft.tokenize(query)
    # print(tokens)

    # embedding matrix for the query: one fastText vector per token
    query_embedding_matrix = []
    for x in tokens:
        query_embedding_matrix.append(model.get_word_vector(x))

    # embedding of the query, using fastText's averaging method
    # (get_word_vector on the raw query string would treat the whole
    # query as a single token rather than averaging its words)
    embedding_of_query = model.get_sentence_vector(query)

    # find the indices of the num_search_results closest songs
    dist, ind = tree.query([embedding_of_query], k=num_search_results)
    print(ind)
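# Pieces the search loop above assumes: imports (KDTree is taken here to be
# sklearn's), a value for num_search_results (arbitrary), and a way to map
# the returned indices back to rows of the songs frame. The "title" column
# and the row alignment between embeddings and songs are assumptions about
# the dataset's schema.
import pandas as pd
import fastText as ft
from sklearn.neighbors import KDTree

num_search_results = 10

def print_results(songs, ind):
    # tree.query returns row indices into embedding_of_lyrics, assumed
    # to be aligned with the songs DataFrame.
    for i in ind[0]:
        print(songs.iloc[i]["title"])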
def test_tokenize(self):
    train, _, _ = self.build_paths("fil9", "rw/rw.txt", "fil9")
    with open(train, 'r') as f:
        _ = tokenize(f.read())