Example #1
 def get_entity_pymorphy(self, q_text):
     """
     Look for (capitalized) entities in q_text.
     For this specific application pymorphy2 tagging is enough.
     """
     forbidden = [
         'ВВП', 'HDI', 'ISO', 'ООН', 'UN', 'UTC', 'Utc-Поправка'
     ]
     words = fix_hyphens(tokenize_words(q_text))
     phrase = []
     for i, w in enumerate(words[1:]):
         if w in forbidden:
             continue
         if w[0] == w[0].upper():
             w_parsed = morph.parse(w.strip(' ?'))[0]
             w_lemma = w_parsed.normal_form
             if w_lemma in self.lem_dict:
                 if 'ADJF' in w_parsed.tag:
                     phrase.append(gender_agree(w_parsed).title())
                     # words[i + 2] is the word immediately after w, because
                     # the enumeration starts at words[1]
                     phrase.append(
                         morph.parse(words[i + 2].strip(' ?'))[0].normal_form)
                     return ' '.join(phrase).title()
                 elif 'NOUN' in w_parsed.tag or 'UNKN' in w_parsed.tag:
                     return w_lemma.title()
             matches = get_close_matches(w_lemma.title(),
                                         list(self.disamb_dict.keys()))
             if matches:
                 return matches[0]
             else:
                 continue
     return None
Example #2
    def org_form(self):
        if self.short_name.strip():
            tokenized = tokenize_words(self.short_name)
            if tokenized:
                return tokenized[0].upper()

        return "NONE"
Example #3
def get_unique_delimiters(texts):
    delimiters = set()
    for text in texts:
        for word in tokenize_uk.tokenize_words(text):
            if (len(word) == 1 and word not in delimiters
                    and not word.isalpha() and not word.isdigit()):
                delimiters.add(word)
    return delimiters
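
A quick usage sketch, assuming get_unique_delimiters above and its tokenize_uk import are in scope; the sample texts and the set shown in the comment are illustrative only:

sample_texts = [
    "Київ — столиця України!",
    "Ціна: 100 грн (зі знижкою).",
]
# single-character, non-alphanumeric tokens, e.g. {'—', '!', ':', '(', ')', '.'}
print(get_unique_delimiters(sample_texts))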
Example #4
 def tokenize_words(raw, is_tokenize_uk=False):
     """
     uses nltk by default
     if 'is_tokenize_uk' is True, then uses tokenize_uk
     """
     if is_tokenize_uk:
         return tokenize_uk.tokenize_words(raw)
     else:
         return nltk.word_tokenize(raw)
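
A minimal usage sketch, assuming the wrapper above is in scope and that nltk (with the 'punkt' tokenizer models) and tokenize_uk are both installed:

import nltk

nltk.download('punkt', quiet=True)  # one-time download required by nltk.word_tokenize

print(tokenize_words("Це просте речення."))                        # nltk backend
print(tokenize_words("Це просте речення.", is_tokenize_uk=True))   # tokenize_uk backend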
Example #5
def clean_texts(texts):
    dashes = {'–', '—', '―', '~'}  # replace with '-'
    special_symbols = {'№', '_', '<', '>', '|', ']', '*', '[', '^',
                       '&'}  # replace with ''
    apostrophes = {'’', '‘'}  # replace with "'"
    direct_speech = {'“', '»', '«'}  # replace with '"'
    three_dots = {'…'}  # replace with '.'

    counter = 0  # counts tokens that contained an ellipsis
    for i, text in enumerate(texts):
        print("Processing text:", i)
        words = []
        for word in tokenize_uk.tokenize_words(text):
            # apply every replacement to the same token so a word with
            # several special characters is cleaned exactly once
            for dash in dashes:
                word = word.replace(dash, "-")
            for special_symbol in special_symbols:
                word = word.replace(special_symbol, "")
            for apostrophe in apostrophes:
                word = word.replace(apostrophe, "'")
            for direct in direct_speech:
                word = word.replace(direct, '"')
            for dots in three_dots:
                if dots in word:
                    counter += 1
                word = word.replace(dots, '.')
            words.append(word)
        texts[i] = " ".join(words)
    return texts
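
A short sketch of the cleaning on a made-up input, assuming clean_texts above is in scope; the output in the comment is approximate, because re-joining tokens with single spaces also changes spacing around punctuation:

texts = ['Він сказав: «Привіт»… Ціна — 100 грн!']
print(clean_texts(texts)[0])
# roughly: Він сказав : " Привіт " . Ціна - 100 грн !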
Example #6
    def transform_sentences(self, sentences):
        result = []
        for sentence in sentences:
            words = tk.tokenize_words(sentence)
            words = [re.sub(self.reg, '', x) for x in words]
            words = [x for x in words if x.strip()]
            words = [x.lower() for x in words]
            words = lemmatize(words, morph=self.morph)
            words = delete_stop_words(words, stop_words=self.stop_words)
            result.append(words)

        return result
Example #7
def sentiment_features(comment, stars, check_stars=True, lemmatization=True):
    features = {}
    for word in tokenize_words(comment):
        if lemmatization:
            word = normalize_word(word)
        if word.lower() in uk_sentiment_dict:
            features['sentiment'] = word.lower()
    if 'sentiment' not in features:
        features['sentiment'] = None
    if check_stars:
        if stars == '5':
            features['stars'] = '5'
    return features
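
A usage sketch with toy stand-ins for the project's uk_sentiment_dict and normalize_word, which are defined elsewhere; the lexicon entries below are invented for illustration:

# toy stand-ins, not the project's real lexicon or lemmatizer
uk_sentiment_dict = {'добрий': '1', 'поганий': '-1'}

def normalize_word(word):
    return word  # the real helper lemmatizes the word

print(sentiment_features("Дуже добрий сервіс", stars='5'))
# e.g. {'sentiment': 'добрий', 'stars': '5'}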
Example #8
 def preprocess_sent(self, s):
     s = str(s).lower()
     words = tokenize_uk.tokenize_words(s)
     words = [word for word in words if word not in self.stop_words]
     words = [self.ld.get(word, word) for word in words]
     words = [
         self.emb[self.word2id[word]] for word in words
         if word in self.word2id
     ]
     words = np.array(words)
     if words.shape[0] > self.max_words:
         words = np.array([])
     return words
Example #9
 def get_features_sklearn(self, ent, sent):
     features = dict()
     words = fix_hyphens(tokenize_words(sent))
     bigrams = ['_'.join(b) for b in nltk.bigrams(words)]
     n = 3
     char_trigrams = [sent[i:i + n] for i in range(len(sent) - n + 1)]
     for w in words:
         features[w] = 1
     for b in bigrams:
         features[b] = 1
     for c in char_trigrams:
         features[c] = 1
     return ent, features
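
The dictionaries returned above are one-hot feature bags, so they plug directly into scikit-learn's DictVectorizer; a small sketch with made-up feature dicts:

from sklearn.feature_extraction import DictVectorizer

toy_features = [
    {'Де': 1, 'столиця': 1, 'Де_столиця': 1},
    {'Хто': 1, 'автор': 1, 'Хто_автор': 1},
]
vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(toy_features)
print(X.shape)  # (2, 6): two samples, six distinct features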
Example #10
def process_sentence(s):
    tok = tokenize_uk.tokenize_words(s)
    if len(tok) <= 1:
        return
    for (i, token) in enumerate(tok):
        m = morph.parse(token)
        p = m[0]
        if 'NOUN' in p.tag and 'anim' in p.tag:
            left = i - 1
            # right = i+1
            if left >= 0:
                p_left = find_adjective(morph.parse(tok[left]))
                if p_left:
                    add_to_result(p_left, p)
Example #11
def lemmatize_phrase(phrase):
    """
    Also we can stem instead of lemmatizing...
    """
    words = fix_hyphens(tokenize_words(phrase))
    if len(words) == 1:
        wparsed = morph.parse(phrase)[0]
        if not wparsed:
            return phrase
        return wparsed.normal_form
    else:
        new_phrase = ''
        for w in words:
            new_phrase += morph.parse(w)[0].normal_form + ' '
        return new_phrase.strip()
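
A usage sketch, assuming the module-level morph analyzer and the fix_hyphens/tokenize_words helpers this function relies on are in scope; the lemmas in the comments are what pymorphy2 typically returns, not guaranteed output:

print(lemmatize_phrase("морях"))         # single-word path, e.g. 'море'
print(lemmatize_phrase("Чорного моря"))  # multi-word path, e.g. 'чорний море'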
Example #12
def transform_lines_into_rythm(lines, accent_vocab):
    rythm = ''
    for line in lines.split('\n'):
        sent_tokens = tokenize_sents(line)
        for sent in sent_tokens:
            for word in filter(lambda w: w in accent_vocab,
                               tokenize_words(sent)):
                accent_options = accent_vocab[word]

                # use only the first accent option
                word, index = accent_options[0]
                rythm_map = transform_into_rythm_map(word, index)
                rythm += rythm_map
        rythm += '\n'
    return rythm
Example #13
def mask_token_in_sentence(sentence):
    tokenized_sentence = tokenize_uk.tokenize_words(sentence)
    number_of_tokens = len(tokenized_sentence)
    masked_token = False
    while not masked_token:
        # print(number_of_tokens)
        index = randrange(number_of_tokens)
        # skip punctuation tokens such as . , ; : so they never get masked
        delims = {",", ".", "!", ":", "?", "'", ";"}
        if tokenized_sentence[index] not in delims:
            tokenized_sentence[index] = "[MASK]"
            masked_token = True
    # print(tokenized_sentence)
    reconstructed_sentence = " ".join(tokenized_sentence)
    return reconstructed_sentence
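
A usage sketch; the masked position is chosen at random, so seeding makes the demo repeatable (the exact output still depends on the tokenizer):

import random

random.seed(42)  # randrange in the function uses the same global generator
print(mask_token_in_sentence("Київ є столицею України."))
# e.g. 'Київ є [MASK] України .'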
Example #14
 def get_features_perc(self, ent, sent):
     """
     Given question, get features from it.
     """
     features = {}
     words = fix_hyphens(tokenize_words(sent))
     for i, w in enumerate(words):
         features['word_{i}={w}'.format(i=i, w=w)] = 1
     features['words'] = [('w={w}'.format(w=w), 1) for w in words]
     bigrams = ['_'.join(b) for b in nltk.bigrams(words)]
     features['bigrams'] = [('bg={bg}'.format(bg=bg), 1) for bg in bigrams]
     n = 3
     char_trigrams = [sent[i:i + n] for i in range(len(sent) - n + 1)]
     features['trigrams'] = [('t={t}'.format(t=t), 1)
                             for t in char_trigrams]
     return ent, features
Example #15
def label_data(data, lemmatization=True, check_stars=True):
    for comment in data['results']:
        comment['sentiment'] = 0
        for word in tokenize_words(comment['comment']):
            if lemmatization:
                word = normalize_word(word)
            if word.lower() in uk_sentiment_dict:
                comment['sentiment'] += int(uk_sentiment_dict[word.lower()])
        if comment['sentiment'] > 0:
            comment['sentiment'] = 'positive'
        elif comment['sentiment'] < 0:
            comment['sentiment'] = 'negative'
        else:
            comment['sentiment'] = 'neutral'
        if check_stars:
            if comment['stars'] == '5':
                comment['sentiment'] = 'positive'
Example #16
def main(args):
    lm_files = get_wiki_files(args.wiki_files)
    df = load_files_to_dataframe(lm_files)

    print(df.head().to_string())
    print(df.shape)

    df['text'] = df['text'].apply(split_title_from_text)
    df['len'] = df['text'].apply(lambda x: len(tokenize_uk.tokenize_words(x)))

    print('Overall number of tokens', df['len'].sum())
    print('Decreasing to ~100 million tokens')

    df = df[df['len'] > 600]

    print('New number of tokens', df['len'].sum())

    df['labels'] = 0
    df = df[['labels', 'text']]

    tokens = UKTokenizer().proc_all_mp(partition_by_cores(df['text'].values))
    labels = list(df['labels'].values.astype(np.int64))

    tokens_trn, tokens_val, labels_trn, labels_val = train_test_split(tokens, labels,
                                                                      test_size=0.1,
                                                                      random_state=1234,
                                                                      shuffle=True)

    # limiting vocabulary to ignore rare words
    freq = Counter(p for o in tokens_trn for p in o)

    itos = [o for o, c in freq.most_common(args.max_vocab) if c > args.min_freq]
    itos.insert(0, '_pad_')
    itos.insert(0, '_unk_')
    stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})

    # the token-id lists are ragged, so store them as object arrays
    trn_lm = np.array([[stoi[o] for o in p] for p in tokens_trn], dtype=object)
    val_lm = np.array([[stoi[o] for o in p] for p in tokens_val], dtype=object)

    np.save(os.path.join(args.output_dir, 'trn_ids.npy'), trn_lm)
    np.save(os.path.join(args.output_dir, 'val_ids.npy'), val_lm)

    with open(os.path.join(args.output_dir, 'itos.pkl'), 'wb') as f:
        pickle.dump(itos, f)
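
A toy illustration of the itos/stoi scheme above, showing how out-of-vocabulary tokens fall back to the _unk_ index (the vocabulary here is made up):

import collections

itos = ['_unk_', '_pad_', 'київ', 'україна']
stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})

print([stoi[t] for t in ['україна', 'львів']])  # [3, 0]: 'львів' is OOV and maps to _unk_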
Example #17
 def ner_recognize(self, sent):
     sent = sent.strip(string.punctuation)
     tokens = fix_hyphens(tokenize_words(sent))
     feats = []
     for (i, t) in enumerate(tokens):
         if i == 0:
             prev_word = '.'
         else:
             prev_word = tokens[i - 1]
         if i == len(tokens) - 1:
             next_word = '.'
         else:
             next_word = tokens[i + 1]
         feats.append(self._get_ner_features(t, prev_word, next_word))
     labels = self.ner_model.predict(feats)
     first_res = list(zip(tokens, labels))
     res = []
     for token, label in first_res:
         if token in ['море', "моря", "озеро", "озера", "океан", "океану"]:
             res.append((token, 'LOC'))
         else:
             res.append((token, label))
     return res
Example #18
import pymorphy2
import tokenize_uk

morph_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

for i in range(20):
    with open("sampling" + str(i) + ".txt", "r", encoding="utf-8") as file:
        sents = file.readlines()

    pos = {}
    for sentence in sents:
        tokens = tokenize_uk.tokenize_words(sentence)
        for token in tokens:
            if token.isalnum():
                token_pos = morph_analyzer.parse(token)[0].tag.POS
                pos[token_pos] = pos.get(token_pos, 0) + 1
    print("Sampling ", i + 1)
    for k, v in pos.items():
        print(k, "-", v)
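
The same per-token POS tally can be expressed more compactly with collections.Counter; a sketch over a single in-memory sentence, assuming pymorphy2 (with the Ukrainian dictionaries) and tokenize_uk are installed:

from collections import Counter

import pymorphy2
import tokenize_uk

morph_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

sentence = "Швидкий кіт стрибнув на стіл."
pos_counts = Counter(
    morph_analyzer.parse(token)[0].tag.POS
    for token in tokenize_uk.tokenize_words(sentence)
    if token.isalnum()
)
print(pos_counts.most_common())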
Example #19
    def test_word_tokenization(self):
        assert tokenize_words("Геогра́фія або земле́пис") == [
            "Геогра́фія", "або", "земле́пис"]

        assert tokenize_words("Комп'ютер") == [
            "Комп'ютер"]
Example #20
    with open(src_file, 'rb') as f:  # assumed: src_file is opened in binary mode, since the bytes are decoded below
        data = f.read()

    log('processing file ' + src_file)

    text = data.decode('utf-8')
    tokens_text = tokenize_uk.tokenize_sents(text)

    log('tokenization finished')

    sents_number = int(math.ceil(len(tokens_text) / float(sents_per_chunk)))

    for i in range(0, sents_number):
        sentences = []
        chunk = tokens_text[i * sents_per_chunk:(i + 1) * sents_per_chunk]

        for sentence in chunk:
            sentences.append(tokenize_uk.tokenize_words(sentence))

            if items_processed % log_interval == 0:
                log('items processed {}'.format(items_processed))

            items_processed += 1

        result_file = os.path.basename(src_file) + str(i) + '.msg'
        with open(sents_folder + result_file, 'wb') as f:
            msgpack.pack(sentences, f)

        log('file {} saved'.format(result_file))

log('done', Fore.GREEN)
Example #21
def lemmatize(text):
    tokens = tokenize_uk.tokenize_words(text)
    lemmas = [morph.parse(t)[0].normal_form for t in tokens]
    return ' '.join(lemmas)
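
A usage sketch, assuming lemmatize above is in scope together with its module-level morph analyzer; the exact lemmas depend on the installed pymorphy2 dictionaries:

import pymorphy2

morph = pymorphy2.MorphAnalyzer(lang='uk')  # the analyzer the function expects at module level

print(lemmatize("Студенти читають книжки"))  # e.g. 'студент читати книжка'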
Example #22
 def tokenize(self, x):
     return tokenize_uk.tokenize_words(self.sub_br(x))
Example #23
def check_spelling(sentence):
    tokens = tokenize_uk.tokenize_words(sentence)
    candidates = generate_candidates(tokens)
    return tokens, candidates
Example #24
    def test_word_tokenization(self):
        assert tokenize_words("Геогра́фія або земле́пис") == [
            "Геогра́фія", "або", "земле́пис"
        ]

        assert tokenize_words("Комп'ютер") == ["Комп'ютер"]
Example #25
 def parse_sentence(self, sentence):
     '''
     Tokenizes the sentence into words with tokenize_uk, converts it to CoNLL-U,
     and attaches pymorphy2 info; see the parse_tree method.
     '''
     tokens = tokenize_uk.tokenize_words(sentence)
     return self.parse_tree(tokens)
Example #26
from collections import Counter
import tokenize_uk
import pymorphy2
morph = pymorphy2.MorphAnalyzer(lang='uk')


def get_collocations(text):
    # 'text' is a list of pymorphy2 Parse objects (see the call below)
    collocations = Counter()
    for i in range(1, len(text)):
        prev_word = text[i - 1]
        word = text[i]
        if word.tag.POS == "NOUN" and word.tag.animacy == "anim" and prev_word.tag.POS == "ADJF":
            collocations[(prev_word.normal_form, word.normal_form)] += 1
    return collocations


with open(
        "/home/dasha/Документы/курс/prj-nlp-2020/tasks/02-structural-linguistics/data/tyhrolovy.txt",
        "r", encoding="utf-8") as f:
    text = f.read()
    words = tokenize_uk.tokenize_words(text)
    parsed = [morph.parse(word)[0] for word in words]

    collocations = get_collocations(parsed)
    for c, freq in sorted(collocations.items(),
                          key=lambda x: x[1],
                          reverse=True):
        print('{0}: {1}'.format(freq, ' '.join(c)))
Example #27
    for t in tokens:
        tag = morph.parse(t)[0].tag.POS
        if tag_mapping.get(tag, str(tag)) in useful_tags:
            filtered_words.append(t)

    return filtered_words


### Preprocessing

tokenized_data = []
for item in dataset:
    sents_list = []
    sents = tokenize_uk.tokenize_sents(item[1])
    for s in sents:
        sents_list.append(tokenize_uk.tokenize_words(s))
    tokenized_data.append(sents_list)

tokenized_lengths = [len(t) for t in tokenized_data]
print("tokenized")
lemmatized_data = [[lemmatize_tokens(i) for i in t] for t in tokenized_data]
print("lemmatized")
digits_cleared_data = [[[item for item in i if not item.isdigit()] for i in l]
                       for l in lemmatized_data]
print("digits")
punct_cleared_data = [[[
    item for item in i if item not in f'{string.punctuation}”№«»'
] for i in d] for d in digits_cleared_data]
print("punct")
print(punct_cleared_data[0])
pos_filtered_data = [[
Example #28
def ner_nlp_extracting(text, model, vesum, word2indx, tag2indx, sess, graph):

    X = [
        tokenize_uk.tokenize_words(sentence) for sentence in
        tokenize_uk.tokenize_sents(' '.join(tokenize_uk.tokenize_words(text)))
    ]

    X_tokenized = [list(sentence) for sentence in X]  # ragged lengths, so keep plain lists
    X = [[
        word2indx.get(vesum.get_main_form_from_vesum(word),
                      word2indx['UNKNOWN']) for word in sentence
    ] for sentence in X]
    X = pad_sequences(X,
                      maxlen=70,
                      padding='post',
                      truncating='post',
                      value=word2indx['ENDPAD'])

    with graph.as_default():
        set_session(sess)
        pred = np.argmax(model.predict(X), axis=-1)

    indx2tag = {v: k for k, v in tag2indx.items()}  # invert once instead of scanning per tag
    res = [(sent, [indx2tag[tag] for tag in tags[:len(sent)]])
           for sent, tags in zip(X_tokenized, pred)]

    tokens = list()
    tags = list()

    for tokens_tmp, tags_tmp in res:
        tokens.extend(tokens_tmp)
        tags.extend(tags_tmp)

    find_tags = list()

    start_index = 0
    finish_index = 0

    for ind, tag in enumerate(tags):
        if (ind == 0 or ((ind > 0) and tags[ind - 1] == 'O')) and tag != 'O':
            token = tokens[ind]
            start_index = text.index(token, finish_index)
            finish_index = text.index(token, finish_index) + len(token)
        elif tag != 'O':
            token = tokens[ind]
            finish_index = text.index(token, finish_index) + len(token)
        elif ind > 0 and (tags[ind - 1][0] == 'B'
                          or tags[ind - 1][0] == 'I') and tag == 'O':
            ner = tags[ind - 1][2:]
            ner_dict = dict()
            ner_dict['entity_type'] = ner
            ner_dict['start_index'] = start_index
            ner_dict['finish_index'] = finish_index
            ner_dict['text_entity'] = text[start_index:finish_index]
            find_tags.append(ner_dict)

    # an entity that runs to the very end of the text is flushed here,
    # since the loop above only emits on a transition back to 'O'
    if tags and tags[-1] != 'O':
        find_tags.append({
            'entity_type': tags[-1][2:],
            'start_index': start_index,
            'finish_index': finish_index,
            'text_entity': text[start_index:finish_index],
        })

    return find_tags