Example #1
class Tokenizer(object):
    def __init__(self):
        self.segmenter = Segmenter()
        self.morph_vocab = MorphVocab()
        self.emb = NewsEmbedding()
        self.morph_tagger = NewsMorphTagger(self.emb)
        self.syntax_parser = NewsSyntaxParser(self.emb)
        self.ner_tagger = NewsNERTagger(self.emb)
        self.names_extractor = NamesExtractor(self.morph_vocab)
        self.doc = []
        self.term_extractor = TermExtractor()

    def init_doc(self, text):
        self.doc = Doc(text)
        self.doc.segment(self.segmenter)
        self.doc.tag_ner(self.ner_tagger)

    def get_sentences(self, text):
        self.init_doc(text)
        sentences = []
        for sentence in self.doc.sents:
            sentences.append(sentence.text)
        return sentences

    def get_tokens(self, sentence):
        tokens = []
        for term in self.term_extractor(sentence):
            tokens.append(term.normalized)
        return tokens
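Most of the snippets below refer to module-level objects (segmenter, morph_vocab, emb, morph_tagger, syntax_parser, ner_tagger, names_extractor) that are created outside the excerpts shown here. A minimal setup sketch with the standard Natasha components they appear to assume (TermExtractor in Example #1 comes from the separate rutermextract package):

from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    NamesExtractor,
    Doc,
)

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
names_extractor = NamesExtractor(morph_vocab)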
Example #2
def delete_NER(words):
    nf_words = ' '.join(words)
    per_words = []
    loc_words = []
    doc = Doc(nf_words)

    doc.segment(segmenter)

    doc.tag_ner(ner_tagger)

    # normalize the spans and collect person / location mentions
    for span in doc.spans:
        span.normalize(morph_vocab)
        if span.type == 'PER':
            per_words.append(span.text)
        if span.type == 'LOC':
            loc_words.append(span.text)

    # mask the mentions with placeholder tags
    for word in per_words:
        if word in nf_words:
            nf_words = nf_words.replace(word, ' PER ')
    for word in loc_words:
        if word in nf_words:
            nf_words = nf_words.replace(word, ' LOC ')
    words = nf_words.split()  # split() without an argument drops the extra spaces
    return words
Example #3
def anon_ner(text):
    result = ''
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    result_temp = ''
    last = 0
    for span in doc.spans:
        if span.type == 'PER':
            result_temp += text[last:span.start]
            result_temp += 'ИМЯ'
        if span.type == 'ORG':
            result_temp += text[last:span.start]
            result_temp += 'ОРГАНИЗАЦИЯ'
        if span.type == 'LOC':
            result_temp += text[last:span.start]
            result_temp += 'ЛОКАЦИЯ'
        if span.type == 'PER' or span.type == 'ORG' or span.type == 'LOC':
            last = span.stop
    result_temp += text[last:]
    result = result_temp
    result_temp = ""
    last = 0
    countries = [
        'AZ', 'AM', 'BY', 'KZ', 'KG', 'MD', 'RU', 'TJ', 'TM', 'UZ', 'UA'
    ]
    for country in countries:
        for match in phonenumbers.PhoneNumberMatcher(result, country):
            result_temp += result[last:match.start]
            result_temp += 'ТЕЛЕФОН '
            last = match.end
    result_temp += result[last:]
    result = result_temp
    return result
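A hypothetical call (the input string is made up for illustration), assuming the module-level Natasha components above plus import phonenumbers:

sample = 'Иван Петров из ООО "Ромашка" (Москва), телефон +7 912 345-67-89'
print(anon_ner(sample))
# PER spans are masked as 'ИМЯ', ORG as 'ОРГАНИЗАЦИЯ', LOC as 'ЛОКАЦИЯ',
# and detected phone numbers as 'ТЕЛЕФОН '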
Example #4
    def tag_ner(self, text):
        doc = Doc(text)

        doc.segment(self.segmenter)
        doc.tag_ner(self.ner_tagger)
        return [(sp.start, sp.stop, sp.text.replace("\n", " "), sp.type)
                for sp in doc.spans]
Example #5
def __tag_text(text):
    doc = Doc(text)
    doc.segment(Segmenter())

    ner_tagger = NewsNERTagger(NewsEmbedding())
    doc.tag_ner(ner_tagger)
    return doc
Example #6
def __FuncTokLem(text):
    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    # note: returns the surface form of the first token even though every token was lemmatized
    return doc.tokens[0].text
Example #7
def lemmatize(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return [token.lemma for token in doc.tokens]
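A hypothetical call, assuming the module-level segmenter, morph_tagger and morph_vocab:

print(lemmatize('Мама мыла раму'))
# roughly ['мама', 'мыть', 'рама']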
Example #8
def process_text_file(text_file, mongo=None):
    # nlp = spacy.load('ru_core_news_sm')
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)

    with open(text_file, 'r', encoding='utf-8') as file:
        file_name = file.name[2:]
        line_number = 0
        for line in file:
            line_number += 1
            if line_number % 100 == 0:
                logging.info(f'Processed line {line_number}')
                if line_number >= 100000:
                    return
            sents = [sent.text for sent in sentenize(line)]
            sentence_number = 0
            for sentence in sents:
                doc = Doc(sentence)
                doc.segment(segmenter)
                doc.tag_morph(morph_tagger)
                doc.parse_syntax(syntax_parser)
                sentence_number += 1
                sentence_tokens = doc.tokens

                # sentence_tokens = [
                #     {
                #         'text': token.text,
                #         'lemma': token.lemma_,
                #         'pos': token.pos_,
                #         'tag': token.tag_,
                #         'dep': token.dep_,
                #         'shape': token.shape_,
                #         'is_alpha': token.is_alpha,
                #         'is_stop': token.is_stop
                #     } for token in sentence]
                words = markup_words(doc.syntax)
                deps = token_deps(doc.syntax.tokens)
                html = show_dep_markup(words, deps)
                save_html(
                    html,
                    f'./htmls/dependency_plot_{file_name}_{line_number}_{sentence_number}.html'
                )
                #
                # svg = displacy.render(sentence, style='dep', options={'compact': False, 'bg': '#09a3d5',
                #                                                       'color': 'white', 'font': 'Source Sans Pro'})
                # output_path = Path(f'./images/dependency_plot_{file_name}_{line_number}_{sentence_number}.svg')
                # output_path.open('w', encoding='utf-8').write(svg)
                PatternExtractor.extract_relations(
                    file_name,
                    line_number,
                    sentence_number,
                    sentence,
                    sentence_tokens,
                    # noun_phrases,
                    # mongo=mongo
                )
Example #9
    def respond(self, ctx: Context):
        if not ctx.message_text:
            return Response('привет!')
        doc = Doc(ctx.message_text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        return Response('Леммы: ' + ' '.join([t.lemma for t in doc.tokens]))
Example #10
    def get_extended_lemms(self, str_):
        doc = Doc(str_)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        lemms = list()
        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)
            lemms.append([token.lemma, token.text])
        return lemms
Example #11
    def get_tokens(self, str_):
        lemms = list()
        doc = Doc(str_)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)
            lemms.append(token.text)
        return [lemms]
Example #12
def anonymize():
    entities = request.json['entities']
    raw_text = request.json['raw_text']
    if "DATE" in entities:
        raw_text = anonymize_date(raw_text)
    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    return filter_data(doc.spans, raw_text, entities)
Example #13
def preprocess_ner(text):
    """Удаление именованных сущностей """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    new_text = text
    for entity in doc.spans:
        new_text = new_text.replace(text[entity.start:entity.stop], '')
    return new_text
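A hypothetical call; each detected entity span is simply cut out of the string, so the surrounding whitespace stays behind:

print(preprocess_ner('Анна уехала в Париж'))
# roughly ' уехала в ' (the PER and LOC spans are removed)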
Example #14
def calculate_skills_assessment(text, ca):
    vacancy_key_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.key_skills.all().values_list('title',
                                                              flat=True))))
    vacancy_additional_skills = list(
        map(
            lambda x: x.lower(),
            list(ca.core_vacancy.additional_skills.all().values_list(
                'title', flat=True))))

    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    text = extract_text(ca.cv_file.path)  # note: this overrides the 'text' argument passed to the function

    doc = Doc(text)

    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []

    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        print(token)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
            print(token.lemma)

        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)
            print(token.lemma)

    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent": len(cv_key_skills) / len(vacancy_key_skills)
        },
        "additional_skills": {
            "vacancy_additional_skills":
            vacancy_additional_skills,
            "cv_additional_skills":
            cv_additional_skills,
            "conformity_percent":
            len(cv_additional_skills) / len(vacancy_additional_skills)
        }
    }

    return candidate_conformity
Example #15
    def get_doc(self, text: str) -> Doc:
        doc = Doc(text)

        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        doc.parse_syntax(self.syntax_parser)

        doc.tag_ner(self.ner_tagger)
        return doc
Example #16
def segmentate(text: str, date: typing.Optional[datetime.datetime] = None):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)

    return {_.type: _.normal for _ in doc.spans}
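A hypothetical call; because the result is keyed by span type, only the last entity of each type is kept:

print(segmentate('Владимир Путин прилетел в Минск'))
# roughly {'PER': 'Владимир Путин', 'LOC': 'Минск'}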
Example #17
def process_russian_text(text, type_of_word_to_highlight='VERB'):
    # check out the original source:
    # https://github.com/natasha/natasha
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    return [token.text for token in doc.tokens if token.pos == type_of_word_to_highlight]
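A hypothetical call that keeps only the tokens whose part-of-speech tag matches the requested one:

print(process_russian_text('Мама мыла раму', 'VERB'))
# roughly ['мыла']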
Example #18
def cleaner(text):
    # out = []
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    out = [token.lemma for token in doc.tokens if token.pos != 'PUNCT']
    if len(out) > 2:
        return out
Example #19
    def process(self, text: str) -> Doc:
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)

        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)

        doc.parse_syntax(self.syntax_parser)
        return doc
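A hypothetical usage, assuming pipeline is an instance of the surrounding class with the standard Natasha components as attributes:

doc = pipeline.process('Кошка ловит мышь')
lemmas = [t.lemma for t in doc.tokens]                     # filled in by lemmatize()
heads = [(t.text, t.head_id, t.rel) for t in doc.tokens]   # filled in by parse_syntax()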
Example #20
    def __call__(self, text):
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_ner(self.ner_tagger)

        ner = []
        for span in doc.spans:
            ner.append(span.text)
        #print(ner)
        return ner
Example #21
    def select_corefs(self, text: str) -> Tuple[List, List]:
        '''Extract coreferences from the text based on NER entities.
        '''
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)
        doc.tag_ner(self.ner_tagger)

        # Extract lemmas and look for recurring NER entities
        extracted_lemmas = {}
        for span in doc.spans:
            for token in span.tokens:
                if token.lemma in extracted_lemmas:
                    extracted_lemmas[token.lemma] += 1
                else:
                    extracted_lemmas[token.lemma] = 1
        selected_items = [
            item for item in extracted_lemmas if extracted_lemmas[item] > 1
        ]

        # Select antecedents and mentions
        coref_sequence = []
        for item in selected_items:
            antecedent_found = -100
            for span in doc.spans:
                for token in span.tokens:
                    if token.lemma == item:
                        if antecedent_found == -100:
                            antecedent_found = span.start
                            coref_sequence.append(
                                CorefItem(span.text, token.lemma, span.type,
                                          span.start, span.stop))
                        else:
                            coref_sequence.append(
                                CorefItem(span.text, token.lemma, span.type,
                                          span.start, span.stop,
                                          antecedent_found))

        # Map character offsets to token indexes
        sequence = [token for token in doc.tokens]
        indexes = {}
        for item in coref_sequence:
            for i, token in enumerate(doc.tokens):
                if item.start == token.start:
                    indexes[item.start] = i
                    item.start = i
                if item.stop == token.stop:
                    item.stop = i

        for item in coref_sequence:
            if item.coref != -100:
                item.coref = indexes[item.coref]
        return sequence, coref_sequence
Example #22
def check_in_sent(text, word1, word2):
    # returns 'TRUE' if word1 occurs before word2 within a single sentence
    doc = Doc(text)
    doc.segment(segmenter)
    for sent in doc.sents:
        sent_doc = Doc(sent.text)
        sent_doc.segment(segmenter)
        tokens = sent_doc.tokens
        for k in range(len(tokens)):
            for j in range(k + 1, len(tokens)):
                if tokens[k].text == word1 and tokens[j].text == word2:
                    return 'TRUE'
Example #23
    def __call__(self, text):
        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        for token in doc.tokens:
            token.lemmatize(self.morph_vocab)
        doc.parse_syntax(self.syntax_parser)
        doc.tag_ner(self.ner_tagger)
        for span in doc.spans:
            span.normalize(self.morph_vocab)
        return doc
Example #24
def tag_text(text):
    if text not in tag_text_cache:  # memoise the parsed Doc per input string
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.tag_ner(ner_tagger)
        doc.parse_syntax(syntax_parser)
        for span in doc.spans:
            span.normalize(morph_vocab)
        tag_text_cache[text] = doc

    return tag_text_cache[text]
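Repeated calls with the same (hypothetical) input string are served from the cache, so the document is parsed only once:

doc1 = tag_text('Москва - столица России')
doc2 = tag_text('Москва - столица России')
assert doc1 is doc2  # the second call hits tag_text_cache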
Example #25
def clean_and_tokenize(text):
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = text.lower()  # str.lower() returns a new string, so reassign it
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    words = [token.lemma for token in doc.tokens if token.lemma not in STOP_WORDS]
    return words
Example #26
    def get_tree_structure(self, sentence):
        if self.syntax_model_name == 'natasha':
            doc = Doc(sentence)
            doc.segment(self.segmenter)
            doc.parse_syntax(self.syntax_parser)
            syntax_tree = {}
            for elem in doc.tokens:
                values = [elem.text, re.sub('1_', '', elem.head_id), elem.rel]
                syntax_tree[re.sub('1_', '', elem.id)] = values
        else:
            # any model name other than 'natasha' falls back to the DeepPavlov parser
            tree = self.model_deeppavlov([sentence])
            tree = tree[0]
            tree = re.sub('\\n', '\\t', tree)
            parsed_tree = tree.split('\t')
            counter = 0
            syntax_tree = {}
            tree_elems = []
            for branch in parsed_tree:
                if counter < 10:
                    if branch != '_':
                        tree_elems.append(branch)
                    counter = counter + 1
                else:
                    syntax_tree[str(tree_elems[0])] = tree_elems[1:]
                    tree_elems = [branch]
                    counter = 1

        for i, element in syntax_tree.items():
            if element[1] == '0' and element[2] != 'root':
                syntax_tree[i][2] = 'root'

        return syntax_tree
Example #27
def define_speechs_author(bigoutputdict, charlist, chardi):
    for chap in bigoutputdict:
        speeches = chap['speeches']
        for onespeech in speeches:
            if onespeech['author_text'] is not None:
                texttosearch = onespeech['author_text']

                for i in texttosearch.split():
                    word = i.strip(punctuation).strip()
                    if len(word) != 1:
                        for i in morph.parse(word):
                            if (
                                ("NOUN" in i.tag) and ("anim" in i.tag) and
                                ('nomn' in i.tag) and ('plur' not in i.tag)
                            ) or word == "Николка" or word == "старший" or word == "Най":
                                #print(word)
                                if word in charlist:
                                    onespeech['author_in_text'] = word
                                    for key in chardi:
                                        if word in chardi[key]:
                                            onespeech['authors_name'] = key
                                    if onespeech[
                                            'authors_name'] == 'undefined':
                                        onespeech['authors_name'] = word

                texttosearch = onespeech['author_text']
                natashatext = Doc(texttosearch)
                natashatext.segment(segmenter)
                natashatext.tag_morph(morph_tagger)
                textnames = ''
                for token in natashatext.tokens:
                    if ((token.pos == "NOUN" and 'Animacy' in token.feats
                         and token.feats['Animacy'] == 'Anim') or
                        (token.pos == "PROPN")) and 'Case' in token.feats and (
                            token.feats['Case'] == 'Nom'):
                        textnames += str(token.text) + ' '
                namestoanalize = Doc(textnames)
                namestoanalize.segment(segmenter)
                namestoanalize.tag_ner(ner_tagger)
                if len(namestoanalize.spans) != 0:
                    for span in namestoanalize.spans:
                        if onespeech['author_in_text'] in str(
                                span.text
                        ) and onespeech['author_in_text'] != str(
                                span.text) and str(span.text) in charlist:
                            onespeech['author_in_text'] = str(span.text)

    with open('resultswithauth.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(bigoutputdict, ensure_ascii=False))
Example #28
def preprocess_words(corpus):
    doc = Doc(corpus)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    lemmas = []
    stop_words = get_stop_words('russian')

    for token in doc.tokens:
        if token.lemma not in stop_words and not re.match(r'\W+', token.lemma):
            lemmas.append(token.lemma)
    return lemmas
Example #29
def preprocess_sent(incoming_sent):
    doc = Doc(incoming_sent)

    segmenter = Segmenter()

    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)

    doc.segment(segmenter)

    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    return doc.sents[0]
Example #30
def _text_preprocess(text):
    if text is None:
        return []

    text = text.strip().replace('`', "'")

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    tokens = [t.lemma for t in doc.tokens]
    return tokens
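A hypothetical call; the function returns the lemma of every token, or an empty list for None input:

print(_text_preprocess('Коты спали на диване'))
# roughly ['кот', 'спать', 'на', 'диван']
print(_text_preprocess(None))
# []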