from yargy import Parser
from ipymarkup import show_markup


def show_matches(rule, *lines):
    parser = Parser(rule)
    for line in lines:
        matches = parser.findall(line)
        spans = [_.span for _ in matches]
        show_markup(line, spans)
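# Usage sketch for show_matches (illustrative only): MONTH is a toy rule,
# not part of the original code; it assumes yargy's dictionary predicate.
from yargy import rule
from yargy.predicates import dictionary

MONTH = rule(dictionary({'январь', 'февраль', 'март'}))
show_matches(MONTH, 'Сначала январь, потом март.')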
from collections import defaultdict
from typing import Dict, List, Optional, Union

from IPython.display import display
from ipymarkup import BoxLabelMarkup, show_markup  # older ipymarkup API
from yargy import Parser


def test_samples(rules: Union['NamedRule', List['NamedRule']],
                 texts: List[str],
                 num: int = 20,
                 seed: Optional[int] = None,
                 markup=None,
                 fact=False):
    # NamedRule is yargy's named-rule type; rules must be named so that
    # spans can be labelled with rule.name below.
    from random import sample, seed as set_seed

    set_seed(seed)
    if len(texts) < num:
        num = len(texts)
    else:
        texts = sample(texts, num)
    if not isinstance(rules, (list, tuple)):
        rules = [rules]

    # matches per text, per rule: {text_idx: {rule_idx: [Match, ...]}}
    results: Dict[int, Dict[int, List]] = defaultdict(dict)
    for rule_idx, rule in enumerate(rules):
        parser = Parser(rule)
        for text_idx in range(num):
            matches = parser.findall(texts[text_idx])
            results[text_idx][rule_idx] = list(matches)

    for text_idx, rule_matches in results.items():
        spans = [
            (m.span[0], m.span[1], str(rules[rule_idx].name))
            for rule_idx, matches in rule_matches.items()
            for m in matches
        ]
        show_markup(texts[text_idx], spans, markup or BoxLabelMarkup)
        if fact:
            for rule_idx, matches in rule_matches.items():
                for m in matches:
                    display(m.fact)
from IPython.display import display
from ipymarkup import LineMarkup, show_markup  # older ipymarkup API
from yargy import Parser


def find(text, substr):
    # Referenced below but not defined in the original snippet; a minimal
    # assumed implementation: span of the first occurrence of substr.
    start = text.find(substr)
    return start, start + len(substr)


def test(rule, *lines, tree=False, facts=False):
    is_at = lambda span, spans: any(span == s for s in spans)
    parser = Parser(rule)
    for line in lines:
        if isinstance(line, str):
            text, expected = line, []
        else:
            text = line[0]
            expected = [find(text, substr) for substr in line[1:]]
        matches = sorted(parser.findall(text), key=lambda _: _.span)
        # display(matches)
        matched_spans = [_.span for _ in matches]
        # blue: expected and matched, red: unexpected match, grey: expected but missed
        spans = (
            [(s[0], s[1], '#aec7e8' if is_at(s, expected) else '#ff9896')
             for s in matched_spans]
            + [(s[0], s[1], '#ccc')
               for s in expected if not is_at((s[0], s[1]), matched_spans)]
        )
        show_markup(text, [s for s in spans if s[0] < s[1]], LineMarkup)
        for _ in matches:
            if tree:
                display(_.tree.as_dot)  # show this match's parse tree
            if facts:
                display(_.fact)
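# Usage sketch for test (illustrative only): a bare string means "expect no
# matches"; a tuple lists the text first, then every substring that should match.
test(
    MONTH,                             # toy rule from the sketch above
    'Здесь совпадений нет.',           # expect no matches
    ('Встретимся в марте.', 'марте'),  # expect a single match on 'марте'
)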
import random

# Assumptions: load_lenta comes from corus (the natasha corpora loader);
# lenta_path, N (total record count), ner and get_k_record are defined
# elsewhere in the project; a minimal sketch of `tp` follows below.
from corus import load_lenta


def test_on_k_random_records(K):
    records = load_lenta(lenta_path)
    # random.choices samples with replacement, so indices may repeat
    chosen_records_num = random.choices(range(N), k=K)
    my_records = []
    for i in chosen_records_num:
        my_records.append(get_k_record(records, i))
    print('This is ' + tp.BOLD + tp.RED + f'{chosen_records_num}' + tp.END + ' records\n')
    for i in range(K):
        print(tp.BOLD + tp.RED + f'{chosen_records_num[i]}' + tp.END + '\t')
        markup = ner(my_records[i].text)
        show_markup(markup.text, markup.spans)
        print('\n--------------------------\n\n')
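# `tp` is not defined in these snippets and is used only for terminal styling;
# a minimal assumed implementation with standard ANSI escape codes:
class tp:
    BOLD = '\033[1m'
    RED = '\033[91m'
    END = '\033[0m'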
from contextlib import redirect_stdout


def save_syntax_analysis_by_text(self, text, file, is_many_sentences=False):

    def analyze(fragment):
        # sentenize, tokenize, run the syntax parser, then draw the tree
        chunk = []
        for sent in sentenize(fragment):
            chunk.append([_.text for _ in tokenize(sent.text)])
        # note: next() consumes only the markup of the first sentence in chunk
        markup = next(self.syntax.map(chunk))
        words, deps = [], []
        for token in markup.tokens:
            words.append(token.text)
            source = int(token.head_id) - 1
            target = int(token.id) - 1
            if source >= 0 and source != target:  # skip the root and self-loops
                deps.append([source, target, token.rel])
        show_markup(words, deps)

    # append to the file and restore stdout afterwards
    with open(file, 'a') as f, redirect_stdout(f):
        print('-' * 100)
        if text != 'None':
            if not is_many_sentences:
                analyze(text)
            else:
                for sentence in text.split('.'):
                    if len(sentence.split()) > 5:
                        analyze(sentence)
        else:
            print('None')
        print('-' * 100)
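# Setup this method appears to rely on (assumptions: `self.syntax` is a slovnet
# Syntax model, and `show_markup` aliases ipymarkup's dependency-tree renderer;
# the model files are the ones loaded in the NER snippet below):
from razdel import sentenize, tokenize
from navec import Navec
from slovnet import Syntax
from ipymarkup import show_dep_ascii_markup as show_markup

navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
syntax = Syntax.load('slovnet_syntax_news_v1.tar')
syntax.navec(navec)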
import pandas as pd

from razdel import tokenize, sentenize
from navec import Navec
from slovnet import Morph, Syntax, NER
from ipymarkup import show_span_ascii_markup as show_markup

df = pd.read_csv('categories.csv')
# print(df)
categories = df.to_records(index=False)

navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
# morph = Morph.load('slovnet_morph_news_v1.tar')
# syntax = Syntax.load('slovnet_syntax_news_v1.tar')
ner = NER.load('slovnet_ner_news_v1.tar')
# morph.navec(navec)
# syntax.navec(navec)
ner.navec(navec)

with open('sport_texts_clear.txt', 'r', encoding='UTF-8') as f:
    i = 1
    for line in f:
        sents = [sent.text for sent in sentenize(line)]
        for sent in sents:
            i += 1
            # tokens = tokenize(sent)
            # print(list(tokens))
            markup = ner(sent)
            show_markup(markup.text, markup.spans)
            if i > 20:
                break
        if i > 20:
            break  # also stop the outer loop, so only ~20 sentences are shown
import sys

# Assumption: the first line of this snippet was truncated; it appears to end
# a sys.path call that adds a local 'packages' directory.
sys.path.append('packages')

from yargy import Parser
from ipymarkup import show_markup
from random import sample
from IPython.display import display
from dataloader import DataLoader
from fact import CONTENT_TYPE_PARSER, ContentTypeFact

lines = DataLoader().exemple_offers()
parser = Parser(CONTENT_TYPE_PARSER)

for line in lines[:10]:
    result_list = []
    matches = list(parser.findall(line))
    print(
        '----------------------------------------------------------------------'
    )
    print("Content: ")
    for match in matches:
        if match is not None:
            try:
                result_list.append(match.fact.contenttype)
                print("\t", match.fact.contenttype)
            except KeyError:
                print("Content: KeyError")
    result_list = list(dict.fromkeys(result_list))  # dedupe, preserving order
    print("Result:")
    print(result_list)
    show_markup(line, [_.span for _ in matches])
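# Sketch of what fact.py might export (assumptions: yargy's interpretation API;
# the real content-type vocabulary is unknown, 'фильм'/'сериал' are placeholders):
from yargy import rule, or_
from yargy.interpretation import fact
from yargy.predicates import caseless

ContentTypeFact = fact('ContentTypeFact', ['contenttype'])

CONTENT_TYPE_PARSER = rule(
    or_(
        caseless('фильм'),
        caseless('сериал'),
    ).interpretation(ContentTypeFact.contenttype)
).interpretation(ContentTypeFact)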
def test_on_random_record():
    records = load_lenta(lenta_path)
    record, k = get_random_record(records)
    markup = ner(record.text)
    print('This is ' + tp.BOLD + tp.RED + f'{k}' + tp.END + ' record\n')
    show_markup(markup.text, markup.spans)
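# get_random_record / get_k_record are not defined in these snippets; plausible
# minimal versions (assumptions: N is the total record count, `records` is a
# fresh iterator as returned by load_lenta):
import random
from itertools import islice


def get_k_record(records, k):
    # k-th record of a fresh records iterator
    return next(islice(records, k, None))


def get_random_record(records):
    k = random.randrange(N)
    return get_k_record(records, k), k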