def show_matches(rule, *lines):
    """Parse every text in *lines* with *rule* and render the matched spans."""
    parser = Parser(rule)
    for text in lines:
        found = parser.findall(text)
        show_markup(text, [match.span for match in found])
# Example #2
def test_samples(rules: Union[NamedRule, List[NamedRule]],
                 texts: List[str],
                 num: int = 20,
                 seed: Union[int, None] = None,
                 markup=None,
                 fact=False):
    """Run one or more rules over a random sample of *texts* and render matches.

    Parameters
    ----------
    rules : a single NamedRule, or a list/tuple of them.
    texts : candidate texts; at most *num* are sampled without replacement.
    num : upper bound on how many texts to process.
    seed : RNG seed for reproducible sampling (None leaves it unseeded).
    markup : ipymarkup renderer class; defaults to BoxLabelMarkup.
    fact : when True, also display each match's extracted fact.
    """
    import random

    random.seed(seed)
    # Sample without replacement, but keep everything when there are
    # fewer texts than requested (the original inverted ternary did the same).
    if len(texts) >= num:
        texts = random.sample(texts, num)
    num = len(texts)

    if not isinstance(rules, (list, tuple)):
        rules = [rules]

    # results[text_idx][rule_idx] -> list of matches for that rule on that text
    results: Dict[int, Dict[int, List]] = defaultdict(dict)
    for rule_idx, rule in enumerate(rules):
        parser = Parser(rule)
        for text_idx in range(num):
            results[text_idx][rule_idx] = list(parser.findall(texts[text_idx]))

    for text_idx, rule_matches in results.items():
        # Label every span with the name of the rule that produced it.
        spans = [(m.span[0], m.span[1], str(rules[rule_idx].name))
                 for rule_idx, matches in rule_matches.items()
                 for m in matches]

        show_markup(texts[text_idx], spans, markup or BoxLabelMarkup)

        if fact:
            for matches in rule_matches.values():
                for m in matches:
                    display(m.fact)
# Example #3
def test(rule, *lines, tree=False, facts=False):
    """Parse each line and highlight matches against expected substrings.

    Each element of *lines* is either a plain string (no expectations) or a
    tuple ``(text, substr, ...)`` whose substrings mark the expected spans.
    Matched-and-expected spans render blue (#aec7e8), unexpected matches
    red (#ff9896), and expected-but-missed spans grey (#ccc).
    """
    def is_at(span, spans):
        # True when *span* exactly equals one of *spans*.
        # (Renamed parameter: the original shadowed the builtin `set`.)
        return any(span == s for s in spans)

    parser = Parser(rule)

    for line in lines:
        if isinstance(line, str):
            text, expected = line, []
        else:
            text = line[0]
            expected = [find(text, substr) for substr in line[1:]]

        matches = sorted(parser.findall(text), key=lambda m: m.span)
        matched_spans = [m.span for m in matches]
        spans = ([(s[0], s[1], '#aec7e8' if is_at(s, expected) else '#ff9896')
                  for s in matched_spans]
                 + [(s[0], s[1], '#ccc')
                    for s in expected
                    if not is_at((s[0], s[1]), matched_spans)])

        # Drop empty spans (start == end) before rendering.
        show_markup(text, [s for s in spans if s[0] < s[1]], LineMarkup)

        for m in matches:
            if tree:
                # BUG FIX: the original displayed matches[0].tree.as_dot here,
                # re-rendering the FIRST match's tree once per match instead
                # of showing each match's own tree.
                display(m.tree.as_dot)
            if facts:
                display(m.fact)
# Example #4
def test_on_k_random_records(K):
    """Pick K record indices at random (with replacement, via random.choices)
    and print the NER markup of each corresponding Lenta record."""
    records = load_lenta(lenta_path)
    chosen_records_num = random.choices(list(range(N)), k=K)
    my_records = [get_k_record(records, idx) for idx in chosen_records_num]

    print(f'This is ' + tp.BOLD + tp.RED + f'{chosen_records_num}' + tp.END +
          ' records\n')

    for pos in range(K):
        print(tp.BOLD + tp.RED + f'{chosen_records_num[pos]}' + tp.END + '\t')
        markup = ner(my_records[pos].text)
        show_markup(markup.text, markup.spans)
        print('\n--------------------------\n\n')
# Example #5
 def save_syntax_analysis_by_text(self,
                                  text,
                                  file,
                                  is_many_sentences=False):
     """Append a syntax-dependency analysis of *text* to *file*.

     Output is produced through print() with stdout temporarily redirected
     into the target file; stdout is always restored afterwards.

     :param text: text to analyse; the literal string 'None' prints 'None'.
     :param file: path of the file to append the analysis to.
     :param is_many_sentences: when True, analyse each '.'-separated
         sentence longer than five words separately.
     """
     def analyse(fragment):
         # Tokenize sentence-by-sentence, run the syntax model on the
         # chunk, then print word/dependency markup.
         chunk = [[token.text for token in tokenize(sent.text)]
                  for sent in sentenize(fragment)]
         markup = next(self.syntax.map(chunk))
         words, deps = [], []
         for token in markup.tokens:
             words.append(token.text)
             source = int(token.head_id) - 1
             target = int(token.id) - 1
             # Skip the root arc (source <= 0) and self-loops.
             if source > 0 and source != target:
                 deps.append([source, target, token.rel])
         show_markup(words, deps)

     original_stdout = sys.stdout
     # BUG FIX: the original leaked the file handle (open without close)
     # and left sys.stdout pointing at the file permanently; use a context
     # manager and restore stdout in a finally block.
     with open(file, 'a') as out:
         sys.stdout = out
         try:
             print('-' * 100)
             if text != 'None':
                 if not is_many_sentences:
                     analyse(text)
                 else:
                     for sentence in text.split('.'):
                         if len(sentence.split()) > 5:
                             analyse(sentence)
             else:
                 print('None')
             print('-' * 100)
         finally:
             sys.stdout = original_stdout
# Example #6
# print(df)

from razdel import tokenize, sentenize
from navec import Navec
from slovnet import Morph, Syntax, NER
from ipymarkup import show_span_ascii_markup as show_markup

df = pd.read_csv('categories.csv')
categories = df.to_records(index=False)

# Load pretrained embeddings and attach them to the NER model.
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
ner = NER.load('slovnet_ner_news_v1.tar')
ner.navec(navec)

# Render NER markup for roughly the first 20 sentences of the corpus.
with open('sport_texts_clear.txt', 'r', encoding='UTF-8') as f:
    i = 1
    for raw_line in f:
        for sentence in (s.text for s in sentenize(raw_line)):
            i += 1
            result = ner(sentence)
            show_markup(result.text, result.spans)
        if i > 20:
            break
# Example #7
    'packages')

from yargy import Parser
from ipymarkup import show_markup
from random import sample
from IPython.display import display
from dataloader import DataLoader
from fact import CONTENT_TYPE_PARSER, ContentTypeFact

lines = DataLoader().exemple_offers()
parser = Parser(CONTENT_TYPE_PARSER)

# Parse the first 10 offers, printing every extracted content type and a
# de-duplicated summary, then render the matched spans.
for offer in lines[:10]:
    found = list(parser.findall(offer))
    print(
        '----------------------------------------------------------------------'
    )
    print("Content: ")
    result_list = []
    for m in found:
        if m is not None:
            try:
                result_list.append(m.fact.contenttype)
                print("\t", m.fact.contenttype)
            except KeyError:
                print("Content: KeyError")
    # De-duplicate while preserving first-seen order.
    result_list = list(dict.fromkeys(result_list))
    print("Result:")
    print(result_list)
    show_markup(offer, [m.span for m in found])
# Example #8
def test_on_random_record():
    """Fetch one random Lenta record and print its NER markup."""
    all_records = load_lenta(lenta_path)
    rec, k = get_random_record(all_records)
    annotated = ner(rec.text)
    print('This is ' + tp.BOLD + tp.RED + f'{k}' + tp.END + ' record\n')
    show_markup(annotated.text, annotated.spans)