Example #1
    def get_one_sentence_vector(cls, tm, sentence):
        import fasttext
        tokens = fasttext.tokenize(sentence)
        if isinstance(tm, fasttext.FastText._FastText):
            # Native fastText model: indexing by token returns a numpy vector
            result = torch.tensor([tm[t] for t in tokens])
        elif isinstance(tm, torchnlp.word_to_vector.char_n_gram.CharNGram):
            # CharNGram already yields torch tensors, so stack them directly
            result = torch.stack([tm[t] for t in tokens])
        else:
            # Fall back to indexing the model with the whole token list
            result = tm[tokens]
        return result
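For the first branch above, a minimal per-token sketch, assuming only that a plain fastText .bin model is available locally (the file name is an assumption, not from the original):

# Hedged sketch: the model file name below is an assumption.
import fasttext
import torch

ft = fasttext.load_model("cc.en.300.bin")
tokens = fasttext.tokenize("hello world")
sentence_matrix = torch.tensor([ft[t] for t in tokens])
print(sentence_matrix.shape)  # torch.Size([2, 300]) for a 300-dimensional model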
Example #2
def tokenize(keyword):
    """
    Tokenizes using default fasttext tokenizer.

    Args:
        keyword: Keyword string (can be multi-word phrase!).

    Returns:
        List of words (tokens) from the keyword.
    """
    return fasttext.tokenize(keyword)
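A quick illustration of the underlying fasttext tokenizer on a multi-word phrase (only the standard fasttext package is assumed):

import fasttext

print(fasttext.tokenize("machine learning engineer"))
# ['machine', 'learning', 'engineer']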
Example #3
def is_similar_fasttext(str1: str, str2: str) -> Tuple[bool, float]:
    """Checks whether two strings are similar.

    Arguments:
        str1 {str} -- first string to compare
        str2 {str} -- second string to compare
    Returns:
        Tuple[bool, float] -- whether the Levenshtein ratio reaches
        SIMILARITY_THRESHOLD, and the ratio itself
    """
    # !!the model weighs 6.7 GB!!
    kaz_ft_model = fasttext.load_model(path="./models/cc.kk.300.bin")

    str1_tok = fasttext.tokenize(text=str1)
    str2_tok = fasttext.tokenize(text=str2)

    ratio = Levenshtein.ratio(str1, str2)

    if ratio < SIMILARITY_THRESHOLD:
        return (False, ratio)
    else:
        return (True, ratio)
Example #4
    def get_nltk_vectors(self, texts: List[str]):
        # https://gist.github.com/japerk/1909413
        from textblob import TextBlob
        sid = self.nltk_sid
        vsid = self.vader_sid
        pdict = self.pdict
        n_tokens_in = self.n_tokens_in
        rake = self.rake_nltk
        nltk_texts = [fasttext.tokenize(text) for text in texts]
        textblob_sentiments = [[sentiment.polarity, sentiment.subjectivity] for sentiment in [TextBlob(text).sentiment for text in texts]]
        textblob_sentiments = torch.tensor(textblob_sentiments).unsqueeze(1).expand(len(texts), n_tokens_in, 2)
        textblob_sentiments = textblob_sentiments.to(get_device())

        mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), nltk_texts)), n_tokens_in)
        mask = mask.to(get_device())
        mask = self.is_mask_em(mask)
        has_digit = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), nltk_texts)), n_tokens_in)
        has_digit = has_digit.to(get_device())
        has_digit = self.has_digit_em(has_digit)

        m = self.text_model
        nltk_emb = stack_and_pad_tensors([torch.tensor([m[t] for t in sent]) for sent in nltk_texts], n_tokens_in) # if t in m else np.zeros(m.vector_size)
        nltk_emb = nltk_emb.to(get_device())
        sid_vec = torch.tensor([list(sid.polarity_scores(t).values()) for t in texts])
        sid_vec = sid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, sid_vec.size(1))
        sid_vec = sid_vec.to(get_device())
        vsid_vec = torch.tensor([list(vsid.polarity_scores(t).values()) for t in texts])
        vsid_vec = vsid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, vsid_vec.size(1))
        vsid_vec = vsid_vec.to(get_device())
        conlltags = [[ptags for ptags in nltk.tree2conlltags(ne_chunk(pos_tag(x)))] for x in nltk_texts]

        pos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[tag.lower()] for token, tag, ne in x]), conlltags)), n_tokens_in)
        pos = pos.to(get_device())
        pos_emb = self.tag_em(pos)
        ner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[ne.lower().split("-")[-1]] for token, tag, ne in x]), conlltags)), n_tokens_in)
        ner = ner.to(get_device())
        ner_emb = self.tag_em(ner)

        phrases = [get_rake_nltk_phrases(rake, t) for t in texts]

        key_wc_rake_nltk = [get_rake_nltk_wc(tokens, phr) for tokens, phr in zip(nltk_texts, phrases)]
        key_wc_rake_nltk = stack_and_pad_tensors(key_wc_rake_nltk, self.n_tokens_in)
        key_wc_rake_nltk = key_wc_rake_nltk.to(get_device())
        nltk_rake_vectors = self.key_wc_rake_nltk(key_wc_rake_nltk)

        result = torch.cat([vsid_vec, nltk_emb, textblob_sentiments, pos_emb, ner_emb, nltk_rake_vectors, sid_vec, mask, has_digit], 2)
        result = result.to(get_device())
        result = self.nltk_nn(result)
        return result
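stack_and_pad_tensors, get_device and the other helpers above come from the surrounding project and are not shown; a minimal sketch of what stack_and_pad_tensors is assumed to do (pad or truncate each tensor's first dimension to a fixed length, then stack into a batch):

import torch

def stack_and_pad_tensors(tensors, max_len):
    # Assumed helper: pad/truncate each tensor's first dimension to max_len, then stack.
    padded = []
    for t in tensors:
        t = t[:max_len]
        pad_shape = (max_len - t.size(0),) + tuple(t.shape[1:])
        padded.append(torch.cat([t, torch.zeros(pad_shape, dtype=t.dtype)]))
    return torch.stack(padded)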
Example #5
    def gen_test_tokenize(self, kwargs):
        self.assertEqual(["asdf", "asdb"], fasttext.tokenize("asdf asdb"))
        self.assertEqual(["asdf"], fasttext.tokenize("asdf"))
        self.assertEqual([fasttext.EOS], fasttext.tokenize("\n"))
        self.assertEqual(["asdf", fasttext.EOS], fasttext.tokenize("asdf\n"))
        self.assertEqual([], fasttext.tokenize(""))
        self.assertEqual([], fasttext.tokenize(" "))
        # An empty string is not a token (it's just whitespace)
        # So the minimum length must be 1
        words = get_random_words(100, 1, 20)
        self.assertEqual(words, fasttext.tokenize(" ".join(words)))
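get_random_words is a test helper that is not shown above; a minimal sketch of what it is assumed to do (produce `count` random lowercase words with lengths between the two bounds):

import random
import string

def get_random_words(count, min_len, max_len):
    # Assumed helper: random lowercase words, each between min_len and max_len characters.
    return [
        "".join(random.choices(string.ascii_lowercase, k=random.randint(min_len, max_len)))
        for _ in range(count)
    ]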
Example #6
    def input_vectors(self, sentences: list):

        max_num_words = 4
        tokens = []
        batch_size = len(sentences)

        h, w = (self.dim, 1)

        #loop over the batches to tokenize the inputs
        for i in range(batch_size):
            #Tokenize words using the default fasttext tokenizer, which creates tokens
            # by splitting at word-separating characters
            tokens.append(fasttext.tokenize(sentences[i]))

        #Create a matrix with batch_size batches, max_num_words token channels and dim x 1 matrices to store the embeddings
        in_vector = np.zeros((batch_size, max_num_words, h, w))

        #cycle over the tokens and get their vectors, reshape them to dim x 1 and store them in the corresponding
        #channel of the return variable

        #cycle over the entire batch
        for j in range(len(tokens)):

            #counter for tokens
            i = 0

            #cycle over tokens
            for token in tokens[j]:

                #get the embedding for the single token
                vector = torch.tensor(self.ft[token].astype(np.double))

                #reshape it to desired dims
                vector = vector.reshape(h, w)

                #Store it in the input vectors matrix
                in_vector[j][i] = vector

                #increment the position of the word index within the given sentence;
                #if it reaches the max word count, stop
                i = i + 1
                if (i == max_num_words):
                    break

        #create a tensor object to return
        in_vector = torch.tensor(in_vector)

        return in_vector
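The same packing idea as a standalone sketch, outside the class; the model file name and the sample sentences are assumptions:

import fasttext
import numpy as np
import torch

ft = fasttext.load_model("cc.en.300.bin")  # assumed model file
sentences = ["two words", "a slightly longer sentence"]
max_num_words, dim = 4, ft.get_dimension()

in_vector = np.zeros((len(sentences), max_num_words, dim, 1))
for j, sentence in enumerate(sentences):
    for i, token in enumerate(fasttext.tokenize(sentence)[:max_num_words]):
        in_vector[j, i] = ft[token].reshape(dim, 1)

in_vector = torch.tensor(in_vector)
print(in_vector.shape)  # torch.Size([2, 4, 300, 1])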
Example #7
    def _process_variable(self, word):
        labels = []
        words = []

        variable_word = word.replace('{', '').replace('}', '')
        try:
            words = fasttext.tokenize(
                self.literals[variable_word].get().lower())
        except Exception:
            print(word, variable_word)

        for i in range(len(words)):
            label = BEGINNING + variable_word if i == 0 else INSIDE + variable_word
            labels.append(label)

        return words, labels
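For context, a hedged illustration of the labelling scheme used above; the "{city}" placeholder, its literal value and the BEGINNING/INSIDE prefixes are all assumptions:

import fasttext

BEGINNING, INSIDE = "B-", "I-"  # assumed prefix constants

words = fasttext.tokenize("new york city")  # e.g. the literal behind "{city}"
labels = [BEGINNING + "city" if i == 0 else INSIDE + "city" for i in range(len(words))]
print(words)   # ['new', 'york', 'city']
print(labels)  # ['B-city', 'I-city', 'I-city']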
Example #8
def get_fasttext_embeddings(x: List[str], ft=None, path: str = None):
    if ft is None:
        if path is None:
            raise Exception("Both path and ft can't be None")
        ft = fasttext.load_model(path)

    embeddings = []
    for sentence in x:
        tokens = fasttext.tokenize(sentence)
        representation = []
        for token in tokens:
            representation.append(ft[token])
        embeddings.append(representation)

    embeddings = pad(embeddings, [0 for _ in range(100)], 32)
    return embeddings
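The pad helper used on the last line is not shown; a minimal sketch of what it is assumed to do (truncate or right-pad every token sequence to a fixed length with a given padding vector):

def pad(sequences, pad_value, max_len):
    # Assumed helper: cut each sequence to max_len items and right-pad with pad_value.
    padded = []
    for seq in sequences:
        seq = list(seq)[:max_len]
        seq = seq + [pad_value] * (max_len - len(seq))
        padded.append(seq)
    return padded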
Example #9
def get_word_vector(data, model):
    t1 = time.time()
    print("Reading")
    with open(data, 'r') as f:
        tokens = tokenize(f.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))
    f = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    t3 = time.time()
    i = 0
    for t in tokens:
        f.get_word_vector(t)
        i += 1
        if i % 10000 == 0:
            sys.stderr.write("\ri: " + str(float(i / len(tokens))))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))
Example #10
def preprocess_text_for_language_detection(text: str):
    """
    Cleans the text as per fasttext requirements.
    The requirements can be found here: https://pypi.org/project/fasttext/
    
    :text: str: text to clean
    :returns: str: cleaned text
    """
    # fastText assumes UTF-8 encoded text
    text = str(text)

    # fastText is not aware of UTF-8 whitespace
    # Replace all white space with space
    text = white_space_pattern.sub(" ", text)

    # Tokenize text with the fasttext tokenizer and rejoin
    tokens = tokenize(text)
    text = " ".join(tokens)
    n = len(tokens)

    # Remove the "</s>" end-of-sentence marker the tokenizer inserts, as it affects the model accuracy
    text = text.replace("</s>", "")

    return text.lower()
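white_space_pattern is defined elsewhere in the module; a reasonable assumption for it, plus a sample call:

import re

white_space_pattern = re.compile(r"\s+")  # assumed definition

print(preprocess_text_for_language_detection("Bonjour\tle  monde\n"))
# bonjour le monde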
Example #11
    def encode(self, text):
        tokens = tokenize(text.lower().replace('\n', ' ') + '\n')
        return [
            self.vocab[t] if t in self.vocab else self.vocab['[UNK]']
            for t in tokens
        ]
Example #12
def get_corpus_description(data):
    corpus = ''.join(data["text"])
    corpus_tokens = fasttext.tokenize(corpus)
    print("raw number of tokens: %d" % len(corpus_tokens))
    counts = Counter(corpus_tokens)
    print("raw number of disctinct tokens: %d" % len(counts))

    print("#### running the spellchecker###")
    # the only spellcheck I'm doing is removing repeated letters
    checked_corpus = re.sub(r"(.)\1{2,}", r"\1\1", corpus)
    corpus_tokens = fasttext.tokenize(checked_corpus)
    print("raw number of tokens: %d" % len(corpus_tokens))
    counts = Counter(corpus_tokens)
    print("raw number of disctinct tokens: %d" % len(counts))

    print('#### removing html tags')
    html_free_corpus = re.sub(
        '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\\u201c|\\u2019', '',
        checked_corpus)
    html_free_corpus_tokens = fasttext.tokenize(html_free_corpus)
    print("number of tokens: %d" % len(html_free_corpus_tokens))
    html_counts = Counter(html_free_corpus_tokens)
    print("number of disctinct tokens: %d" % len(html_counts))

    print('#### removing links')
    link_free_corpus = re.sub(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
        '', html_free_corpus)
    link_free_corpus_tokens = fasttext.tokenize(link_free_corpus)
    print("number of tokens: %d" % len(link_free_corpus_tokens))
    link_counts = Counter(link_free_corpus_tokens)
    print("number of disctinct tokens: %d" % len(link_counts))

    print("#### removing special symbols and numbers ###")
    # remove special characters and numbers
    clean_corpus = re.sub('[^A-Za-z\s]+', '', link_free_corpus)
    clean_corpus_tokens = fasttext.tokenize(clean_corpus)
    print("number of tokens: %d" % len(clean_corpus_tokens))
    clean_counts = Counter(clean_corpus_tokens)
    print("number of disctinct tokens: %d" % len(clean_counts))

    # only the special symbols
    dirty_corpus = re.sub('[A-Za-z\s]+', '', corpus)
    distinct_symbols = Counter(dirty_corpus)
    print("Number of distinct removed special symbols: %d" %
          len(distinct_symbols))

    print("#### removing stop words ####")
    stop_words = set(stopwords.words('english'))
    stop_words = [re.sub('[^A-Za-z\s]+', '', word) for word in stop_words]
    print("number of stop words: %d" % len(stop_words))
    corpus_wo_stop_words = [
        token for token in clean_corpus_tokens if not token in stop_words
    ]
    counts_wo_stopwords = Counter(corpus_wo_stop_words)
    print("Number of tokens wo stopwords: %d" % len(corpus_wo_stop_words))
    print("Number of distinct tokens: %d" % len(counts_wo_stopwords))

    print("#### lemmatization ####")
    lemmatizer = WordNetLemmatizer()
    lemmatized_corpus = [
        lemmatizer.lemmatize(x) for x in tqdm(corpus_wo_stop_words)
    ]
    counts_lemmatized = Counter(lemmatized_corpus)
    print("Number lemmatized tokens: %d" % len(lemmatized_corpus))
    print("Number of distinct lemmatized tokens: %d" % len(counts_lemmatized))
    return counts_lemmatized.most_common(25000)
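A hedged usage sketch; the toy DataFrame and the one-time NLTK downloads are assumptions about how the function is meant to be driven:

import nltk
import pandas as pd

nltk.download("stopwords")   # resources the function above relies on
nltk.download("wordnet")

data = pd.DataFrame({"text": ["I loooove this movie!!! <br/> 10/10 ",
                              "Worst film everrr, see http://example.com "]})
print(get_corpus_description(data)[:10])  # most common lemmatized tokens with counts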
Example #13
def description_to_tensor(model, desc):
    return torch.stack([
        torch.tensor(model.get_word_vector(w)) for w in fasttext.tokenize(desc)
    ])
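An illustrative call; the model file name and the description text are assumptions:

import fasttext
import torch

model = fasttext.load_model("cc.en.300.bin")
tensor = description_to_tensor(model, "red cotton t-shirt")
print(tensor.shape)  # torch.Size([3, 300]) -- one 300-dim row per token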
Example #14
    def get_one_sentence_vector(self, m, text):
        vs = min(m.vector_size, 150)
        zeros = np.zeros(vs)
        # use each token's vector, truncated to vs dimensions, or zeros for out-of-vocabulary tokens
        result = [m[t][:vs] if t in m else zeros for t in fasttext.tokenize(text)]
        return torch.tensor(result, dtype=float)