        #       .replace('ё', 'ё') \
        #       .strip()
        line = utils.norm_text2(re1.sub('', line))
        if line:
            lines.append(' '.join(line.split()))

    if len(lines) >= _utils.MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            # file layout: link, then header, then the text lines
            # (a loader sketch follows this script's tail)
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
            with open(text_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                print(header, file=f)
                f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
        #exit()

if need_enter:
    print()

'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links)

'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links, isdialog=False)
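# The text files written above follow a fixed layout: the source link on the
# first line, the page header on the second, then the normalized text lines.
# A minimal loader sketch for that layout (load_text is hypothetical, not
# part of utils):
def load_text(text_fn):
    with open(text_fn, 'rt', encoding='utf-8') as f:
        link = f.readline().rstrip('\n')
        header = f.readline().rstrip('\n')
        lines = f.read().splitlines()
    return link, header, lines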
            text = None
            break
    if not res:
        if not SILENT:
            if not text:
                print('no text')
                #if nop:
                #    exit()
            else:
                print('text beyond limits:')
                print(text)
        continue
    texts_total += 1
    with open(text_fn, 'wt', encoding='utf-8') as f:
        print(link, file=f)
        f.write(text)
    print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
          end='')
    need_enter = True
    #exit()

if need_enter:
    print()

'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(utils.TEXTS_FOR_SOURCE)

'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(utils.TEXTS_FOR_SOURCE, isdialog=False)
    #text = unescape(text).replace('\u200b', '') \
    #                     .replace('\ufeff', '') \
    #                     .replace('й', 'й').replace('ё', 'ё') \
    #                     .replace('\n\n', '\n').strip()
    # norm_text2 replaces the commented-out chain above (sketch below)
    text = utils.norm_text2(text).replace('\n\n', '\n')
    if text:
        texts_total += 1
        with open(page_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(page)
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(text)
        print('\r{} (of {})'.format(
            texts_total, min(utils.TEXTS_FOR_SOURCE, num_page_links)
        ), end='')
        need_enter = True
        #exit()

if driver:
    driver.quit()
if need_enter:
    print()

'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_page_links)

'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_page_links, isdialog=False)
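# utils.norm_text2 is project-internal; judging by the commented-out chain it
# replaced, it unescapes HTML entities, strips zero-width and BOM characters,
# and recomposes decomposed Cyrillic letters. A hypothetical sketch under
# those assumptions (NFC normalization covers the й and ё replacements):
from html import unescape
import unicodedata

def norm_text2_sketch(text):
    text = unescape(text)                      # &amp; -> &, etc.
    text = text.replace('\u200b', '')          # zero-width space
    text = text.replace('\ufeff', '')          # BOM
    text = unicodedata.normalize('NFC', text)  # и + U+0306 -> й, etc.
    return text.strip()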
            # dialog lines are '<speaker>\t<sentence>' (see the sketch below)
            sent = speaker + '\t' + ' '.join(sent.split())
            lines.append(sent)
            issent = False
        if speaker:
            prev_speaker, prev_strong = speaker, strong
        curr_speaker = None

    if key_lines >= _utils.MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
            with open(text_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
        #exit()

if need_enter:
    print()

'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links)  #, moderator=SPEAKER_A)

'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links)
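# Dialog lines are stored as '<speaker>\t<sentence>' (see how sent is built
# above). A hypothetical helper, not part of the project, that groups
# consecutive lines of one speaker into turns:
def iter_turns(lines):
    prev_speaker, turn = None, []
    for line in lines:
        speaker, _, sent = line.partition('\t')
        if turn and speaker != prev_speaker:
            yield prev_speaker, ' '.join(turn)
            turn = []
        prev_speaker = speaker
        turn.append(sent)
    if turn:
        yield prev_speaker, ' '.join(turn)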
                    print('{}\t{}'.format(author_id, author), file=f)
                authors_ignore[author_id] = author
                texts_total += 1
                need_enter = True
                break
        if texts_total > utils.TEXTS_FOR_SOURCE:
            raise OverflowError()
except OverflowError:
    pass
if need_enter:
    print()

if os.path.isfile(utils.get_data_path(utils.CHUNKS_DIR, MAX_FILES, 1)):
    print('WARNING: Chunks already exist. '
          'Delete them if you want to recreate them')
    exit()
page_fns = utils.get_file_list(utils.PAGES_DIR, MAX_FILES)
text_fns = utils.get_file_list(utils.TEXTS_DIR, MAX_FILES)
assert len(page_fns) == len(text_fns)
#new_order = utils.shuffle_file_list(page_fns)
utils.shuffle_file_list(text_fns, new_order=None)

'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(MAX_FILES)

'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(MAX_FILES, isdialog=False)
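# utils.shuffle_file_list is project-internal. The commented-out call above
# hints that it returns the permutation it applied so that a second, parallel
# file list can be shuffled into the same order via new_order. A hypothetical
# sketch under that assumption:
import os
import random

def shuffle_file_list_sketch(fns, new_order=None):
    if new_order is None:
        new_order = list(range(len(fns)))
        random.shuffle(new_order)
    # move everything aside first, then rename back in the shuffled order
    tmp_fns = [fn + '.tmp' for fn in fns]
    for fn, tmp_fn in zip(fns, tmp_fns):
        os.rename(fn, tmp_fn)
    for tgt_fn, src_idx in zip(fns, new_order):
        os.rename(tmp_fns[src_idx], tgt_fn)
    return new_order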
        if not text:
            print('no text')
            #if nop:
            #    exit()
        else:
            print('text beyond limits:')
            print(text)
        continue
    texts_total += 1
    if link_no > start_link_idx:
        with open(page_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(page)
        with open(text_fn, 'wt', encoding='utf-8') as f:
            print(link, file=f)
            f.write(text)
    print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
          end='')
    need_enter = True
    #exit()

if need_enter:
    print()

'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(MAX_PAGE)

'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(MAX_PAGE, isdialog=False)
            text = None
            break
    if not res:
        if not SILENT:
            if not text:
                print('no text')
                #if nop:
                #    exit()
            else:
                print('text beyond limits:')
                print(text)
        continue
    texts_total += 1
    with open(text_fn, 'wt', encoding='utf-8') as f:
        print(link, file=f)
        f.write(text)
    print('\r{} (of {})'.format(texts_total, utils.TEXTS_FOR_SOURCE),
          end='')
    need_enter = True
    #exit()

if need_enter:
    print()

'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(MAX_LINKS)

'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(MAX_LINKS, isdialog=False, norm_punct=False)
            lines.append(line)

    if key_lines >= MIN_TEXT_LINES:
        texts_total += 1
        if link_no > start_link_idx:
            with open(page_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write(page)
            # a leading '\t' means an empty speaker field; drop a dangling
            # final one (see the merge sketch below)
            if lines[-1][0] == '\t':
                lines = lines[:-1]
            with open(text_fn, 'wt', encoding='utf-8') as f:
                print(link, file=f)
                f.write('\n'.join(lines))
        print('\r{} (of {})'.format(texts_total,
                                    min(utils.TEXTS_FOR_SOURCE, num_links)),
              end='')
        need_enter = True
        #exit()

if need_enter:
    print()

'''===========================================================================
Chunks creation
==========================================================================='''
_utils.make_chunks(num_links, trim_ending=False, moderator=SPEAKER_A,
                   min_chunk_lines=MIN_CHUNK_LINES)

'''===========================================================================
Tokenization
==========================================================================='''
utils.tokenize(num_links)
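# In this scraper a stored line may start with '\t' (an empty speaker field);
# the code above drops such a line only when it ends the dialog. A
# hypothetical alternative that merges every speakerless line into the
# preceding turn instead:
def merge_continuations(lines):
    merged = []
    for line in lines:
        if line.startswith('\t') and merged:
            merged[-1] += ' ' + line.lstrip('\t')
        else:
            merged.append(line)
    return merged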