Example #1
 def gen_test_tokenize(self, kwargs):
     self.assertEqual(["asdf", "asdb"], fastText.tokenize("asdf asdb"))
     self.assertEqual(["asdf"], fastText.tokenize("asdf"))
     self.assertEqual([fastText.EOS], fastText.tokenize("\n"))
     self.assertEqual(["asdf", fastText.EOS], fastText.tokenize("asdf\n"))
     self.assertEqual([], fastText.tokenize(""))
     self.assertEqual([], fastText.tokenize(" "))
     # An empty string is not a token (it's just whitespace)
     # So the minimum length must be 1
     words = get_random_words(100, 1, 20)
     self.assertEqual(words, fastText.tokenize(" ".join(words)))
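Taken together, the assertions above describe the tokenizer's contract. A minimal standalone sketch of the same calls (using the fastText module name as elsewhere on this page):

import fastText

# tokens are separated by whitespace; a trailing newline maps to the EOS marker
print(fastText.tokenize("asdf asdb"))   # ['asdf', 'asdb']
print(fastText.tokenize("asdf\n"))      # ['asdf', fastText.EOS]
print(fastText.tokenize(" "))           # [] - whitespace alone yields no token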
Example #3
def cleanData():
    data = pd.read_csv("data.csv", sep=";")
    data["text"] = data["text"].apply(str)
    data["text"] = data["text"].apply(lambda x: x.lower())
    # data["text"] = data["text"].apply(lambda x: re.sub("(\xc3\x84|ä)", "ae", x))
    # data["text"] = data["text"].apply(lambda x: re.sub("(\xc3\x9c|ü)", "ue", x))
    # data["text"] = data["text"].apply(lambda x: re.sub("(\xc3\xb6|ö)", "oe", x))
    ##see for tokenization rules: https://github.com/facebookresearch/fastText/blob/master/python/README.md
    tokenizer = lambda x: " ".join(fastText.tokenize(x))
    data["text"] = data["text"].apply(tokenizer)
    ##delete any non-alpha character
    data["text"] = data["text"].apply(lambda x: re.sub("[^A-Za-z ]", "", x))
    ##delete if text empty
    data = data[data["text"]!=""]
    data.to_csv("data.csv", sep=";", index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)
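Because fastText.tokenize only splits on whitespace (see Example #1), the tokenizer lambda above mainly collapses irregular spacing. A small sketch with a made-up line:

import fastText

line = "some   text  with\nodd   spacing"
print(" ".join(fastText.tokenize(line)))
# -> 'some text with </s> odd spacing'
# (the newline becomes fastText.EOS, i.e. '</s>'; the [^A-Za-z ] regex in
#  cleanData would then strip that token down to a stray 's')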
Example #4
    def _data_generator(self):
        batch_size = self.opt['batch_size']
        # integer division: a trailing partial batch is silently dropped
        for i in range(len(self.X_train) // batch_size):
            X_sample = self.X_train[i * batch_size:(i + 1) * batch_size]
            y_sample = self.y_train[i * batch_size:(i + 1) * batch_size]

            # one list of fastText word vectors per sentence in the batch
            X_sample = [[
                self.ft.get_word_vector(word)
                for word in fastText.tokenize(sent)
            ] for sent in X_sample]

            yield X_sample, y_sample
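For reference, the batch slicing above boils down to this pattern (made-up data; note that the integer division drops any trailing partial batch):

batch_size = 4
X_train = list(range(10))
for i in range(len(X_train) // batch_size):
    print(X_train[i * batch_size:(i + 1) * batch_size])
# -> [0, 1, 2, 3] then [4, 5, 6, 7]; items 8 and 9 are never yielded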
Example #5
def get_word_vectors(text):
    """
    Tokenize text and return fastText word vectors.
    Requires a fastText model to be loaded (see :func:`load_fasttext_model`)

    :param text: input text
    :return: word vector matrix
    """
    try:
        matrix = [
            _fasttext_model.get_word_vector(w)
            for w in fastText.tokenize(util.normalize_message_text(text))
        ]
    except Exception as e:
        logger.error('Failed to tokenize line: {}'.format(e))
        matrix = [get_word_vector('')]

    if len(matrix) == 0:
        matrix = [get_word_vector('')]

    return np.array(matrix)
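A hypothetical call, assuming a model of dimension d has been loaded via load_fasttext_model() and that util.normalize_message_text leaves a plain two-word string unchanged:

vectors = get_word_vectors("hello world")
print(vectors.shape)   # (2, d) - one row per token from fastText.tokenize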
Example #6
def get_word_vector(data, model):
    t1 = time.time()
    print("Reading")
    with open(data, 'r') as f:
        tokens = tokenize(f.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))
    f = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    t3 = time.time()
    i = 0
    for t in tokens:
        f.get_word_vector(t)
        i += 1
        if i % 10000 == 0:
            sys.stderr.write("\ri: " + str(float(i / len(tokens))))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))
Example #8
songs = pd.read_csv('../dataset/dataset.csv')

print("Loading the embedding model for querying...")
model = ft.load_model("./embeddings_lyrics.bin")
dim = model.get_dimension()

print("Loading the embeddings for songs to compare...")
embedding_of_lyrics = pd.read_csv('../database/song_embedings.csv').values[1:, 1:]

print("Making the KDTree tree...")
tree = KDTree(embedding_of_lyrics)

while True:
    query = input("Enter your query : ")
    tokens = ft.tokenize(query)
    #print(tokens)

    # embedding matrix for the query
    query_embedding_matrix = []
    for x in tokens:
        query_embedding_matrix.append(model.get_word_vector(x))

    # embedding of the query: get_word_vector treats the whole query as a
    # single token (model.get_sentence_vector(query) would instead average
    # the word vectors, which is what the "avg method" usually refers to)
    embedding_of_query = model.get_word_vector(query)

    # find the indices of the num_search_results closest songs
    dist, ind = tree.query([embedding_of_query], k=num_search_results)
    print(ind)
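    # (sketch) resolving the matches: ind holds row indices into
    # embedding_of_lyrics; assuming those rows line up one-to-one with the
    # rows of dataset.csv - an assumption, given the [1:, 1:] slice above -
    # the closest songs could be printed like this:
    for idx in ind[0]:
        print(songs.iloc[idx])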
Example #9
 def test_tokenize(self):
     train, _, _ = self.build_paths("fil9", "rw/rw.txt", "fil9")
     with open(train, 'r') as f:
         _ = tokenize(f.read())