def filter_bad(row):

    original = row['original'].strip('\n')
    paraphrase = row['paraphrase'].strip('\n')

    basic_processing = lambda x: x.lower() \
                                  .replace('ё','е') \
                                  .translate(str.maketrans('', '',
                                             string.punctuation+'—'))

    if any([
            not_cyrillic(text) or '...' in text
            for text in [original, paraphrase]
    ]):
        row['original'], row['paraphrase'] = None, None
        return row

    original = [_.text for _ in razdel.tokenize(basic_processing(original))]
    paraphrase = [
        _.text for _ in razdel.tokenize(basic_processing(paraphrase))
    ]
    n_tokens = [len(original), len(paraphrase)]

    if any([n_tok < 2 or n_tok > 20 for n_tok in n_tokens]) \
        or set(original) == set(paraphrase) \
        or any(len(toks) > 1 and len(set(toks)) == 1
            for toks in [original, paraphrase]):
        row['original'], row['paraphrase'] = None, None

    return row
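A minimal usage sketch, assuming pandas is installed, that string, razdel and a not_cyrillic() helper are importable as filter_bad expects, and that the pair data lives in a DataFrame with hypothetical 'original' and 'paraphrase' columns.

import pandas as pd

pairs = pd.DataFrame({
    "original": ["Он быстро прочитал книгу.", "Слишком короткий"],
    "paraphrase": ["Он быстро изучил эту книгу.", "Коротко"],
})
# mark bad pairs as None row by row, then drop them
pairs = pairs.apply(filter_bad, axis=1)
pairs = pairs.dropna(subset=["original", "paraphrase"])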
def postprocess(ref,
                hyp,
                is_multiple_ref=False,
                detokenize_after=False,
                tokenize_after=True):
    if is_multiple_ref:
        reference_sents = ref.split(" s_s ")
        decoded_sents = hyp.split("s_s")
        hyp = [
            w.replace("<", "&lt;").replace(">", "&gt;").strip()
            for w in decoded_sents
        ]
        ref = [
            w.replace("<", "&lt;").replace(">", "&gt;").strip()
            for w in reference_sents
        ]
        hyp = " ".join(hyp)
        ref = " ".join(ref)
    ref = ref.strip()
    hyp = hyp.strip()
    if detokenize_after:
        hyp = punct_detokenize(hyp)
        ref = punct_detokenize(ref)
    if tokenize_after:
        hyp = hyp.replace("@@UNKNOWN@@", "<unk>")
        hyp = " ".join([token.text for token in razdel.tokenize(hyp)])
        ref = " ".join([token.text for token in razdel.tokenize(ref)])
    return ref, hyp
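A short sketch of how the function above might be called (assuming razdel is imported); with a single reference the multi-reference branch is skipped and both strings are simply stripped and re-tokenized.

ref, hyp = postprocess("Первое предложение. Второе.",
                       "Первое предложение, второе.",
                       is_multiple_ref=False,
                       detokenize_after=False,
                       tokenize_after=True)
# both strings are now whitespace-separated token sequences, ready for metrics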
def calc_extraction_score(text, summary, threshold=2):
    text_tokens = [t.text for t in razdel.tokenize(text.lower())]
    summary_tokens = [t.text for t in razdel.tokenize(summary.lower())]
    acs = find_acs(text_tokens, summary_tokens, threshold)
    answer = 0.0
    for s in acs:
        s = s / len(summary_tokens)
        answer += s * (math.exp(s - 1) - (1 - s) / math.exp(1))
    return answer, acs[0] / len(summary_tokens) if acs else 0.0
Example #4
def postprocess(refs, hyp, tokenize_after, lower):
    refs = [ref.strip() for ref in refs]
    hyp = hyp.strip()
    if tokenize_after:
        hyp = " ".join([token.text for token in razdel.tokenize(hyp)])
        refs = [" ".join([token.text for token in razdel.tokenize(ref)]) for ref in refs]
    if lower:
        hyp = hyp.lower()
        refs = [ref.lower() for ref in refs]
    return refs, hyp
def main(gold_path, predicted_path, metric, tokenize_after):
    refs = []
    hyps = []
    with open(gold_path, "r") as gold, open(predicted_path, "r") as pred:
        for gold_summary, pred_summary in zip(gold, pred):
            gold_summary = "".join(gold_summary.split(" ")[1:-1]).replace("▁", " ").strip()
            pred_summary = "".join(pred_summary.split(" ")[1:-1]).replace("▁", " ").strip()
            if tokenize_after:
                pred_summary = " ".join([token.text for token in razdel.tokenize(pred_summary)])
                pred_summary = pred_summary.replace("@ @ UNKNOWN @ @", "@@UNKNOWN@@")
                gold_summary = " ".join([token.text for token in razdel.tokenize(gold_summary)])
            refs.append(gold_summary)
            hyps.append(pred_summary)
    print_metrics(refs, hyps, metric)
Example #6
def prepare_dataset(df_train, df_val, df_test):
    train_labels = df_train["label"].tolist()
    val_labels = df_val["label"].tolist()
    test_labels = df_test["label"].tolist()

    # text -> list of tokens
    train_texts = [[token.text for token in tokenize(text)]
                   for text in df_train["text"].tolist()]
    val_texts = [[token.text for token in tokenize(text)]
                 for text in df_val["text"].tolist()]
    test_texts = [[token.text for token in tokenize(text)]
                  for text in df_test["text"].tolist()]

    return (train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels)
def tokenize_roles(role_data):
    tokenized = []

    for set_ in role_data:
        data = {'roles1': [], 'roles2': [], 'y': []}
        for datum in set_:
            # label = int(datum['is_duplicate']) if int(datum['is_duplicate']) != -1 else 2
            data['roles1'].append(
                [token.text for token in list(tokenize(datum['roles1']))])
            data['roles2'].append(
                [token.text for token in list(tokenize(datum['roles2']))])
            # -1 -> 0  0 -> 1  1 -> 2
            data['y'].append(int(datum['is_duplicate']) + 1)
        tokenized.append(data)
    return tokenized
Example #8
    def get_texts(dataset):
        texts = []
        for text in dataset["text"]:
            for sentence in sentenize(text):
                texts.append([
                    token.text.lower() for token in tokenize(sentence.text)
                    if token.text not in punctuation
                ])

        for title in dataset["title"]:
            texts.append([
                token.text.lower() for token in tokenize(title)
                if token.text not in punctuation
            ])
        return texts
Example #9
def stop_words_remove(text):
    '''
    Removes stop words from a text:
        [0] - casts non-string input to str
        [1] - tokenizes the text
        [2] - strips a leading "-" from a word
        [3] - drops words with len <= 1
        [4] - looks the word up in the lemma cache
        [5] - lemmatizes the word and caches the result
        [6] - drops stop words

    Output: text cleaned of stop words
    '''
    # [0]
    if not isinstance(text, str):
        text = str(text)

    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-':  # [2]
            w = w[1:]
        if len(w) > 1:  # [3]
            if w in cache:  # [4]
                words_lem.append(cache[w])
            else:  # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)

    words_lem_without_stopwords = [
        i for i in words_lem if not i in stopword_ru
    ]  # [6]

    # join the remaining lemmas back into a plain string
    # (str(list) would return the list repr, not cleaned text)
    words_lem_without_stopwords_string = " ".join(words_lem_without_stopwords)

    return words_lem_without_stopwords_string
def lemmatization(text):
    '''
    Lemmatizes a text:
        [0] - casts non-string input to str
        [1] - tokenizes the text
        [2] - strips a leading "-" from a word
        [3] - drops stop words
        [4] - drops words with len <= 1
        [5] - looks the word up in the lemma cache
        [6] - lemmatizes the word and caches the result
    Output: list of lemmatized words
    '''
    global cache
    # [0]
    if not isinstance(text, str):
        text = str(text)

    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-':  # [2]
            w = w[1:]
        if not w in stopword_ru:  # [3]
            if len(w) > 1:  # [4]
                if w in cache:  # [5]
                    words_lem.append(cache[w])
                else:  # [6]
                    temp_cach = cache[w] = morph.parse(w)[0].normal_form
                    words_lem.append(temp_cach)
    return words_lem
Example #11
    def __call__(self, text: str):
        """Performs tokenization and sentence splitting.

        Args:
            text(str): text.

        Returns:
            Dictionary that contains:
            1. tokens - a list of Token objects.
            2. sentences - a list of Sentence objects.
        """

        ann_tokens = [
            ann.Token(text=token.text, begin=token.start, end=token.stop)
            for token in razdel.tokenize(text)
        ]

        sentences = [
            ProcessorRazdel.offset_to_tokens(offset.start, offset.stop,
                                             ann_tokens)
            for offset in razdel.sentenize(text)
        ]
        ann_sentences = [ann.Sentence(begin, end) for begin, end in sentences]

        return {'tokens': ann_tokens, 'sentences': ann_sentences}
Example #12
    def anaphoras_to_corpus(self, anaphoras: List) -> List[Dict]:
        '''Convert the found anaphoric links into a corpus.
        '''
        corpus: List = []
        for item in anaphoras:
            sequence = list(tokenize(item['text']))
            # locate the antecedent token by its character offset
            for i, s in enumerate(sequence):
                if s.start == item['antecedent']['start']:
                    antecedent = {
                        'token': item['antecedent']['text'],
                        'lemma': item['antecedent']['text'],
                        'start': i,
                        'end': i,
                    }
                    break
            # locate the anaphor token and link it to the antecedent
            for i, s in enumerate(sequence):
                if s.start == item['anaphor']['start']:
                    mentions = [{
                        'token': item['anaphor']['text'],
                        'lemma': item['anaphor']['text'],
                        'start': i,
                        'end': i,
                        'coref': antecedent['start'],
                    }]
                    break
            corpus.append({
                'text': item['text'],
                'sequence': [s.text for s in sequence],
                'coreferences': [{
                    'antecedent': antecedent,
                    'mentions': mentions,
                }],
            })
        return corpus
Example #13
def generate_errors(text):
    tokens = list(tokenize(text))
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]
        token_text = token.text
        if token_text.lower() in {'тоже', 'также'}:
            yield (text[:token.start]
                   + token_text[:-2] + ' же'
                   + text[token.stop:])
            pos += 1
        elif token_text.lower() == 'чтоб':
            yield (text[:token.start]
                   + token_text[:-1] + ' б'
                   + text[token.stop:])
            pos += 1
        elif token_text.lower() == 'чтобы':
            yield (text[:token.start]
                   + token_text[:-2] + ' бы'
                   + text[token.stop:])
            pos += 1
        elif pos + 1 < len(tokens) and token_text.lower() in {'то', 'так'} and tokens[pos + 1].text == 'же':
            yield (text[:token.start]
                   + token_text + 'же'
                   + text[tokens[pos + 1].stop:])
            pos += 2
        elif pos + 1 < len(tokens) and token_text.lower() == 'что' and tokens[pos + 1].text in {'б', 'бы'}:
            yield (text[:token.start]
                   + token_text + tokens[pos + 1].text
                   + text[tokens[pos + 1].stop:])
            pos += 2
        else:
            pos += 1
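A hedged demo of the generator above (it needs razdel's tokenize imported, as in the snippet): each yielded string is a copy of the input with a single particle error injected.

for corrupted in generate_errors("Я тоже хочу, чтобы всё получилось."):
    print(corrupted)
# expected variants: "то же" instead of "тоже", "что бы" instead of "чтобы"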
Example #14
def ru_tokenizer(text: str) -> list:
    """
    Tokenizes texts in Russian
    Args:
        text (str): input text

    Returns: 
        flair Token objects
    """
    all_sentences = []
    for paragraph in split_newline(text):
        sentences = [x.text for x in list(sentenize(paragraph))]
        all_sentences.extend(sentences)
    words = []
    for sentence in all_sentences:
        sentence_tokens = [x.text for x in list(tokenize(sentence))]
        words.extend(sentence_tokens)
    prev_start_position = 0
    tokens = []
    for word in words:
        start_position = text[prev_start_position:].index(word)
        token = Token(text=word,
                      start_position=prev_start_position + start_position,
                      whitespace_after=False)
        tokens.append(token)
        prev_start_position = start_position + prev_start_position + len(word)
    return tokens
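A quick sanity check for ru_tokenizer (hypothetical; it needs flair, razdel and the split_newline helper used above to be importable):

tokens = ru_tokenizer("Привет, мир! Это вторая строка.\nИ третья.")
print([t.text for t in tokens])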
    def _calc_embedding(self,
                        text: List[str],
                        return_numpy=False) -> Union[torch.Tensor, np.ndarray]:
        """ Calculate sentence embedding. 

        Args:
            text (List[str]): list of sentences
            return_numpy (boolean): if True return numpy.ndarray, else torch.tensor 

        Returns:
            np.ndarray: sentence embedding with shape 'bert hidden_size (default: 768)'
        """

        sentences = [[token.text.lower() for token in razdel.tokenize(sent)]
                     for sent in text]

        encoded = self.tokenizer.batch_encode_plus(
            sentences,
            padding="longest",
            is_split_into_words=True,
            truncation="longest_first",
            max_length=256,
        )

        input_ids = torch.tensor(encoded["input_ids"]).to("cuda")
        attention_mask = torch.tensor(encoded["attention_mask"]).to("cuda")
        with torch.no_grad():
            hidden_states = self.bert(input_ids,
                                      attention_mask)["last_hidden_state"]
            sentence_embedding = hidden_states[:, 0, :]

        if return_numpy:
            return sentence_embedding.cpu().numpy()
        return sentence_embedding
Example #16
def predict(text: str) -> List[Dict[str, Union[List[Dict[str, Any]], dict]]]:
    tokens = [t.text for t in tokenize(text)]
    cache = {
        t: {
            "loanword": 0,
            "obscene": 0,
            "expressive": 0
        }
        for t in set(t.lower() for t in tokens)
    }
    for t in cache:
        if is_word(t, min_len=3, max_len=30, s_words=stops):
            cache[t]["emb"] = model[t]
            cache[t]["loanword"] = loanword_clf.predict([cache[t]["emb"]
                                                         ]).item()
            cache[t]["obscene"] = obscene_clf.predict([cache[t]["emb"]]).item()
            cache[t]["expressive"] = expressive_clf.predict([cache[t]["emb"]
                                                             ]).item()
    analysis = [{
        "word": t,
        "loanword": cache[t.lower()]["loanword"],
        "obscene": cache[t.lower()]["obscene"],
        "expressive": cache[t.lower()]["expressive"]
    } for t in tokens]
    a = [{
        "word": d["word"],
        "categories": [k for k, v in d.items() if v and k != "word"]
    } for d in analysis]
    return [{"analysis": a, "statistics": statistics(analysis)}]
Example #17
def _fix_dictionary(original_sentences):
    for i, sentence in enumerate(original_sentences):
        for key, value in substrings_fixes.items():
            if key in sentence or key.capitalize() in sentence:
                original_sentences[i] = sentence.replace(key, value)
                original_sentences[i] = original_sentences[i].replace(key.capitalize(), value.capitalize())

    tokenized_sentences = [(sentence, list(tokenize(sentence))) for sentence in original_sentences]
    fixed_sentences = []
    for sentence, tokens in tokenized_sentences:
        fixed_sentence = ""
        offset = 0
        for i, token in enumerate(tokens):
            tokens[i].start += offset
            tokens[i].stop += offset
            token_text = token.text
            fixed_token_text = tokens_fixes.get(token_text, None)
            if fixed_token_text is not None:
                tokens[i].text = fixed_token_text
                offset += len(fixed_token_text) - len(token_text)
        fixed_sentence = sentence
        for token in tokens:
            fixed_sentence = fixed_sentence[:token.start] + token.text + fixed_sentence[token.stop:]
        fixed_sentences.append(fixed_sentence)

    return fixed_sentences
Example #18
    def __TextTokenize(self, text):
        tokens = sent_tokenize(text)

        for sentence in range(len(tokens)):
            tokens[sentence] = list(tokenize(tokens[sentence]))
            tokens[sentence] = [_.text for _ in tokens[sentence]]

        return tokens
Example #19
def find_jokes(query, index):
    result = Counter()
    for token in tokenize(query):
        lemma = lemmatize(token.text)
        docs = index.get(lemma, [])  # find documents with this word
        for doc in docs:
            result[doc] += 1 / max(1.0, math.log(len(docs) + 1))
    return result
Example #20
def tokenize(string):
    string = re.sub(r"[^A-Za-zА-Яа-я0-9()\-,!?\'\`’:]", " ", string)
    string = re.sub(r"’|`", "'", string)
    string = re.sub(znaks, lambda x: f' {x.group()} ', string)
    string = re.sub(r"\s{2,}", " ", string)
    string = string.strip().lower()
    tokens = [token.text for token in razdel.tokenize(string)]
    return tokens
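An illustrative call of the tokenize() wrapper above; it is hypothetical in that znaks must already be defined elsewhere as a regex of punctuation marks to be padded with spaces.

print(tokenize("Привет, как дела?!"))
# likely output, depending on znaks: ['привет', ',', 'как', 'дела', '?', '!']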
Example #21
def evaluate(test_path, batch_size, metric,
             max_count, report_every, is_multiple_ref=False,
             model_path=None, model_config_path=None, baseline=None,
             reader_config_path=None, detokenize_after=False,
             tokenize_after=False):
    reader_params = get_reader_params(reader_config_path, model_config_path, model_path)
    is_subwords = "tokenizer" in reader_params and reader_params["tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(reader_params)
    run_model = get_model_runner(model_path, reader) if not baseline else None

    hyps = []
    refs = []
    for batch in get_batches(reader, test_path, batch_size):
        batch_refs, batch_hyps = run_model(batch) if not baseline else run_baseline(batch, baseline)
        for ref, hyp in zip(batch_refs, batch_hyps):
            hyp = hyp if not is_subwords else "".join(hyp.split(" ")).replace("▁", " ")
            if is_multiple_ref:
                reference_sents = ref.split(" s_s ")
                decoded_sents = hyp.split("s_s")
                hyp = [w.replace("<", "&lt;").replace(">", "&gt;").strip() for w in decoded_sents]
                ref = [w.replace("<", "&lt;").replace(">", "&gt;").strip() for w in reference_sents]
                hyp = " ".join(hyp)
                ref = " ".join(ref)
            ref = ref.strip()
            hyp = hyp.strip()
            if detokenize_after:
                hyp = detokenize(hyp)
                ref = detokenize(ref)
            if tokenize_after:
                hyp = " ".join([token.text for token in razdel.tokenize(hyp)])
                hyp = hyp.replace("@ @ UNKNOWN @ @", "@@UNKNOWN@@")
                ref = " ".join([token.text for token in razdel.tokenize(ref)])
            if isinstance(ref, str) and len(ref) <= 1:
                ref = "some content"
                print("Empty ref")
            if isinstance(hyp, str) and len(hyp) <= 1:
                hyp = "some content"
                print("Empty hyp. Ref: ", ref)

            refs.append(ref)
            hyps.append(hyp)
            if len(hyps) % report_every == 0:
                calc_metrics(refs, hyps, metric)
            if max_count and len(hyps) >= max_count:
                break
    calc_metrics(refs, hyps, metric)
Example #22
def parse(refs_path, hyps_path, num_refs, lng='en'):
    logging.info('STARTING TO PARSE INPUTS...')
    print('STARTING TO PARSE INPUTS...')
    # references
    references = []
    for i in range(num_refs):
        fname = refs_path + str(i) if num_refs > 1 else refs_path
        with codecs.open(fname, 'r', 'utf-8') as f:
            texts = f.read().split('\n')
            for j, text in enumerate(texts):
                if len(references) <= j:
                    references.append([text])
                else:
                    references[j].append(text)

    # references tokenized
    references_tok = copy.copy(references)
    for i, refs in enumerate(references_tok):
        if lng == 'ru':
            references_tok[i] = [
                ' '.join([_.text for _ in tokenize(ref)]) for ref in refs
            ]
        else:
            references_tok[i] = [
                ' '.join(nltk.word_tokenize(ref)) for ref in refs
            ]

    # hypothesis
    with codecs.open(hyps_path, 'r', 'utf-8') as f:
        hypothesis = f.read().split('\n')

    # hypothesis tokenized
    hypothesis_tok = copy.copy(hypothesis)
    if lng == 'ru':
        hypothesis_tok = [
            ' '.join([_.text for _ in tokenize(hyp)]) for hyp in hypothesis_tok
        ]
    else:
        hypothesis_tok = [
            ' '.join(nltk.word_tokenize(hyp)) for hyp in hypothesis_tok
        ]

    logging.info('FINISHING TO PARSE INPUTS...')
    print('FINISHING TO PARSE INPUTS...')
    return references, references_tok, hypothesis, hypothesis_tok
def my_preprocess(text: str):
    text = str(text)
    text = text.replace("\n", " ").replace('/', ' ')
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokenized_text = list(tokenize(text))
    lemm = [morph.parse(i.text)[0].normal_form for i in tokenized_text]
    words = [i for i in lemm if i not in stop]
    return " ".join(words)
Example #24
    def preprocess_for_conds(self, text):
        '''
        Preprocess a string for the rules
        '''

        text = text.lower()
        tokens = list(razdel.tokenize(text))

        return ' '.join([self.lemmatize(t.text) for t in tokens if regex.search(r'^[\w\d]*$', t.text)])
Example #25
    def new_model(self, arr: list):
        if arr == []:
            return 1
        fil = open('dict.json')
        self.dict0 = json.loads(fil.read())
        fil.close()

        fil = open('dialog.txt')
        dialog = fil.read().split("\n")
        fil.close()
        fil = open('text.txt', "w")
        fil.write(" ".join(dialog))
        fil.close()
        a = []
        b = []
        for i in range(len(dialog)):
            if i % 2:
                b += [dialog[i]]
            else:
                a += [dialog[i]]
        fil = open('train.a', 'w')
        fil.write(json.dumps(a, ensure_ascii=False))
        fil.close()
        fil = open('train.b', 'w')
        fil.write(json.dumps(b, ensure_ascii=False))
        fil.close()

        fil = open('text.txt')
        text = fil.read()
        fil.close()
        self.dict0 = list(set([_.text for _ in list(tokenize(text.lower()))]))
        fil = open('dict.json', 'w')
        fil.write(json.dumps(self.dict0, ensure_ascii=False))
        fil.close()

        fil = open('train.a')
        a = json.loads(fil.read())
        fil.close()
        fil = open('train.b')
        b = json.loads(fil.read())
        fil.close()
        for i in range(len(a)):
            a[i] = self.text2dict1(a[i])
            b[i] = self.text2dict1(b[i])
        self.x = np.asarray(a)
        self.y = np.asarray(b)

        self.model = models.Sequential()
        self.model.add(
            layers.Dense(arr[0], input_dim=len(self.dict0), activation='tanh'))
        for i in arr[1:]:
            self.model.add(layers.Dense(i, activation='tanh'))
        self.model.add(layers.Dense(len(self.dict0), activation='tanh'))
        self.model.compile(optimizer=tf.train.AdamOptimizer(0.001),
                           loss='mse',
                           metrics=['mae'])
Example #26
def generate_errors(text):
    for token in tokenize(text):
        token_text = token.text
        if token_text.endswith('тся'):
            yield (text[:token.start] + token_text[:-3] + 'ться' +
                   text[token.stop:])
        if token_text.endswith('ться'):
            yield (text[:token.start] + token_text[:-4] + 'тся' +
                   text[token.stop:])
Example #27
    def words(self, fileids=None, categories=None):
        """
        Tokenizes all sentences of each news item and
        yields the tokenized words across all news items.
        """
        for sentence in self.sents(fileids, categories):
            for word in sentence:
                if len(word) == 0:
                    continue
                tokens = list(tokenize(word))
                yield [_.text for _ in tokens]
Example #28
    def transform_token_answer_words(self, data):
        question, answer = data.question, data.answer
        # razdel.tokenize returns a generator, so materialize it for indexing
        words = list(razdel.tokenize(answer))
        answer_list = list(answer)
        # walk the tokens right-to-left so insertions don't shift later offsets
        for j in range(len(words) - 1, -1, -1):
            word = words[j]
            text = word.text
            if text not in string.punctuation:
                start, end = word.start, word.stop
                answer_list.insert(start, self.token_unk)
        return f'{question}{self.token_unk}{"".join(answer_list)}'
Example #29
    def save_syntax_analysis_by_text(self, text, file, is_many_sentences=False):
        f = open(file, 'a')
        sys.stdout = f
        print('-' * 100)
        if text != 'None':
            if not is_many_sentences:
                chunk = list()
                for sent in sentenize(text):
                    tokens = [_.text for _ in tokenize(sent.text)]
                    chunk.append(tokens)
                markup = next(self.syntax.map(chunk))
                words, deps = list(), list()
                for token in markup.tokens:
                    words.append(token.text)
                    source = int(token.head_id) - 1
                    target = int(token.id) - 1
                    if source > 0 and source != target:
                        deps.append([source, target, token.rel])
                show_markup(words, deps)
            else:
                for sentence in text.split('.'):
                    if len(sentence.split()) > 5:
                        chunk = list()
                        for sent in sentenize(sentence):
                            tokens = [_.text for _ in tokenize(sent.text)]
                            chunk.append(tokens)
                        markup = next(self.syntax.map(chunk))
                        words, deps = list(), list()
                        for token in markup.tokens:
                            words.append(token.text)
                            source = int(token.head_id) - 1
                            target = int(token.id) - 1
                            if source > 0 and source != target:
                                deps.append([source, target, token.rel])
                        show_markup(words, deps)
        else:
            print('None')
        print('-' * 100)
        # restore stdout and release the file handle
        sys.stdout = sys.__stdout__
        f.close()
Example #30
    def __TextTokenize(self, text):
        punct = string.punctuation
        punct += '—–...«»***\n '
        tokens = sent_tokenize(text)

        for sentence in range(len(tokens)):
            tokens[sentence] = list(tokenize(tokens[sentence].lower()))
            tokens[sentence] = [
                _.text for _ in tokens[sentence] if _.text not in punct
            ]

        return tokens