def assemble_text(doc, ordered=None):
    """Recursively flatten a parsed document tree into plain text.

    ``doc`` is either a list of child nodes or a single node dict with an
    element-type key ``'e'`` (plus ``'c'`` children / ``'t'`` text / ``'u'``
    url / ``'o'`` ordered-flag, depending on the type).

    :param doc: node dict or list of nodes (schema inferred from the
        branches below — TODO confirm against the producer of this tree)
    :param ordered: when flattening a list's children, True renders items
        as "1. ", "2. ", …; falsy renders them as "- "
    :return: the assembled text; for an ``'li'`` node a tuple
        ``(text, True)`` is returned instead, so that the parent list
        branch can recognize list items via ``isinstance(..., tuple)``
    """
    res, isli = None, None
    if isinstance(doc, list):
        lines, li_no = [], 1
        for line in doc:
            line = assemble_text(line)
            if line:
                # A tuple result marks a list item (see the 'li' branch
                # below): unwrap it and attach the bullet/number prefix.
                if isinstance(line, tuple):
                    line = line[0]
                    if line:
                        if ordered:
                            prefix = str(li_no) + '.'
                            li_no += 1
                        else:
                            prefix = '-'
                        line = prefix + ' ' + line
                if line:
                    lines.append(line)
        # Join fragments, inserting a single space only between two
        # fragments where the previous one did not end with a newline.
        res, need_space = '', False
        for line in lines:
            need_space_ = line[-1] != '\n'
            # bool * ' ' yields '' or ' ' — space only when both the
            # previous and the current fragment need one.
            res += ' ' * (need_space and need_space_) + line
            need_space = need_space_
    else:
        e = doc['e']
        if e in ['blockquote', 'code', 'spoilertext']:
            # Quoted/code/spoiler content is intentionally dropped.
            res = ''
        elif e in ['br', 'hr', 'table']:
            res = '\n'
        elif e == 'li':
            res = utils.norm_text2(assemble_text(doc['c'])) + '\n'
            isli = True
        elif e == 'link':
            res = utils.norm_text2(doc['t'])
            link = doc['u']
            # Append the URL only if it is not already part of the text.
            if res.find(link) < 0:
                res += ' (' + link + ')'
        elif e == 'list':
            res = assemble_text(doc['c'], ordered=doc['o'])
        elif e in ['par', 'h']:
            res = utils.norm_text2(assemble_text(doc['c'])) + '\n'
        elif e in ['text', 'raw'] \
                or (len(e) == 2 and e[1] == '/'
                    and e[0] >= 'a' and e[0] <= 'z'):
            # NOTE(review): the two-character "x/" types look like
            # single-letter tag markers carrying raw text — confirm
            # against the tree producer.
            res = utils.norm_text2(doc['t'])
        else:
            # Unknown node type: dump the node to a debug file, then fail.
            # NOTE(review): `assert 0` is stripped under `python -O`;
            # consider raising explicitly.
            from pprint import pprint
            with open('1111', 'wt', encoding='utf-8') as f:
                pprint(doc, stream=f)
            assert 0, 'ERROR: Unknown type "{}"'.format(e)
    if res:
        # re3 matches are replaced with '\n', then re2 matches with ' '
        # (module-level compiled patterns — semantics defined elsewhere).
        res = re2.sub(' ', re3.sub('\n', res))
    return (res, isli) if isli is not None else res
def norm_(text):
    """Normalize *text*: flatten original newlines, turn ``<br>`` variants
    into line breaks, strip matches of the module-level ``re6`` pattern,
    collapse internal whitespace, and drop empty lines.
    """
    # Original newlines become plain spaces; only <br> tags survive as
    # genuine line breaks.
    flattened = text.replace('\n', ' ')
    for br_tag in ('<br>', '<br/>', '<br />'):
        flattened = flattened.replace(br_tag, '\n')
    flattened = re6.sub('', flattened)
    # Collapse runs of whitespace inside each line, then drop blank lines.
    collapsed = (' '.join(part.split()).strip()
                 for part in utils.norm_text2(flattened).split('\n'))
    return '\n'.join(line for line in collapsed if line)
def parse_page(page):
    """Convert an HTML *page* to plain text.

    ``<br>`` tags become line breaks, every other tag is removed, each
    line is normalized via ``utils.norm_text2`` with internal whitespace
    collapsed, and empty lines are dropped.
    """
    plain = re.sub(r'<[^>]*>', '', re.sub(r'<br>', '\n', page))
    kept = []
    for raw_line in plain.split('\n'):
        normalized = utils.norm_text2(raw_line)
        if normalized:
            kept.append(re.sub(r'\s+', ' ', normalized))
    return '\n'.join(kept)
f.write(page) if page.find('<title>Пожалуйста, войдите под своим именем пользователя') > 0 \ or page.find('<h1 class="not-found-title">') > 0: continue else: if not os.path.isfile(page_fn): continue if os.path.isfile(text_fn): texts_total += 1 continue with open(page_fn, 'rt', encoding='utf-8') as f: link = f.readline().rstrip() page = f.read() match = re10a.search(page) assert match, "ERROR: Can't find header1 on page {}".format(link) header = utils.norm_text2(match.group(1)) match = re10.search(page) assert match, "ERROR: Can't find header2 on page {}".format(link) header += '\n' + utils.norm_text2(match.group(1)) match = re11.search(page) assert match, "ERROR: Can't find review on page {}".format(link) text = match.group(1) text = text.replace('\n', '') \ .replace('<em>', '').replace('</em>', '') \ .replace('<strong>', '').replace('</strong>', '') text = re12a.sub(' ', text) if DUMP: with open('12a.html', 'wt', encoding='utf-8') as f: f.write(text) text = re12b.sub('</p>', text).replace('<p></p>', '') \ .replace('<ul>', '') \
if len(page_fns) > 0 else \ 0 texts_total = 0 re0a = re.compile(r'<div class="article__summary article__summary_article-page' r'[^">]*">(.+?)</div>') re0 = re.compile(r'<div class="article__text article__text_article-page' r'[^">]*">(.+?)</div>') re1 = re.compile(r'<p>((?:.|\n)*?)</p>') re2 = re.compile(r'<.*?>') need_enter = False for link_no, link in enumerate(links, start=1): link, header = link.split('\t') #header = unescape(header).replace('\u200b', '') \ # .replace('\ufeff', '').strip() header = utils.norm_text2(header) if texts_total >= utils.TEXTS_FOR_SOURCE: break #link = 'https://www.interfax.ru/interview/374150' page_fn = utils.get_data_path(utils.PAGES_DIR, num_links, link_no) text_fn = utils.get_data_path(utils.TEXTS_DIR, num_links, link_no) page = None if link_no > start_link_idx: time.sleep(1) res = utils.get_url(link) page = res.text else: if not os.path.isfile(page_fn): continue if os.path.isfile(text_fn): texts_total += 1
if pos > 0: res = res[:pos] res = res.replace('\n', ' ') res = re1.sub('{img}', res) res = re2.sub('', res) res = re3.sub( lambda x: re3a.sub(' ', x.group(1).upper()) + ':', res ) res = res.replace('\r', '') \ .replace('<br>', '\n').replace('</p>', '\n') res = re4.sub(' ', '<' + res) #txt = unescape(res).replace('\u200b', '') \ # .replace('\ufeff', '') \ # .replace('й', 'й').replace('ё', 'ё') txt = utils.norm_text2(res) lines = [] maybe_caption = False for line in [x.strip() for x in txt.split('\n')]: if '{img}' in line: maybe_caption = True continue if line and (not line.isupper() or '.' in line) \ and not (len(line) >= 2 and ((line[0] == '(' and line[-1] == ')') or (line[0] == '[' and line[-1] == ']') or (line[0] == '«' and line[-1] == '»'))) \ and (not maybe_caption or not line[-1].isalnum()): lines.append(line.split()) maybe_caption = False lines = normalize_text(lines)
continue if os.path.isfile(text_fn): texts_total += 1 continue with open(page_fn, 'rt', encoding='utf-8') as f: link = f.readline().rstrip() page = f.read() res = re0.findall(page) lines, key_lines = [], 0 issent = False prev_speaker, prev_strong, curr_speaker = None, None, None for line in res: #line = unescape(line).replace('\u200b', '').replace('\ufeff', '') \ # .replace('й', 'й').replace('ё', 'ё') \ # .replace('</strong><strong>', '') line = utils.norm_text2(line).replace('</strong><strong>', '') line = re1.sub(r'{\g<1>}', line) line = re2.sub('', line) line = re2a.sub(' ', line).strip() sents = [ x.strip() for x in line.split('{strong') for x in x.split('/strong}') ] for sent in sents: if sent.startswith('}') and sent.endswith('{'): sent = sent[1:-1].strip() speaker, strong = SPEAKER_A, True else: speaker, strong = SPEAKER_B, False if curr_speaker: speaker = curr_speaker
print(res, file=f) print('===', file=f) print(lines, file=f) #exit() break res = res[pos:] pos = res.find('>') attr = res[:pos] res = res[pos + 1:] pos = res.find(end_token) lines_ = res[:pos] if pos >= 0 else res if 'right' in attr or 'center' in attr: continue lines_ = [x for x in lines_.split('<br />') for x in x.split('<br>')] for line_no, line in enumerate(lines_): if not utils.norm_text2(line): continue #print(line) if line.startswith('<') and line.endswith('>'): if not lines: isbold = True elif not isbold: lines = [] #print('== delete ==') continue elif isbold: lines = [] #print('== delete ==') isbold = False line = line.replace('<strong>', '').replace('</strong>', '') \ .replace('<em>', '').replace('</em>', '') \
book_url = book[:pos] book = book[pos:] token = "<a class='uline' href='/authors/" pos = book.find(token) assert pos >= 0, \ 'ERROR: Not found: {}\n{}\n{}'.format(url, token, book) book = book[pos + len(token):] pos = book.find("'>") assert pos >= 0, \ 'ERROR: Not found: {}\n{}\n{}'.format(url, token, book) author_url = '/authors/' + book[:pos] book = book[pos + 2:] pos = book.find('<') assert pos >= 0, \ 'ERROR: Not found: {}\n{}\n{}'.format(url, token, book) author_name = utils.norm_text2(book[:pos]).strip() book = book[pos:] token = '<div class="desc2">' pos = book.find(token) assert pos >= 0, \ 'ERROR: Not found: {}\n{}\n{}'.format(url, token, book) book = book[pos + len(token):] pos = book.find('<') assert pos >= 0, \ 'ERROR: Not found: {}\n{}\n{}'.format(url, token, book) genre = book[:pos] book = book[pos:] #pos = genre.find(',') #if pos > 0: # genre = genre[:pos] token = "<div class='desc2'>Язык оригинала: "
page_fns = utils.get_file_list(utils.PAGES_DIR, num_links) start_link_idx = int(os.path.split(sorted(page_fns)[-1])[-1] .replace(utils.DATA_EXT, '')) \ if len(page_fns) > 0 else \ 0 texts_total = 0 re0 = re.compile(r'<div class="article__text">((?:.|\n)*?)</div>') re1 = re.compile(r'<p>((?:.|\n)*?)</p>') re2 = re.compile(r'<.*?>|\(.*?\)') need_enter = False for link_no, link in enumerate(links, start=1): link, header = link.split('\t') #header = unescape(header).replace('\u200b', '').replace('\ufeff', '') \ # .replace('й', 'й').replace('ё', 'ё').strip() header = utils.norm_text2(header) if texts_total >= utils.TEXTS_FOR_SOURCE: break #link = 'https://www.interfax.ru/interview/374150' page_fn = utils.get_data_path(utils.PAGES_DIR, num_links, link_no) text_fn = utils.get_data_path(utils.TEXTS_DIR, num_links, link_no) page = None if link_no > start_link_idx: res = utils.get_url(link) page = res.text else: if not os.path.isfile(page_fn): continue if os.path.isfile(text_fn): texts_total += 1 continue
res = res[:pos].strip() assert res.endswith(';'), \ 'ERROR: No state end on page {}'.format(link) res = res[:-1] if DUMP: with open('1111.json', 'wt', encoding='utf-8') as f: f.write(res) state = json.loads(res) if DUMP: from pprint import pprint with open('1111.json', 'wt', encoding='utf-8') as f: pprint(state, stream=f) products = state.get('entities', {}).get('products') assert products, 'ERROR: No products in state on page {}'.format(link) product = products[0] header = utils.norm_text2(product['name']) text = utils.norm_text2(product['description']) if DUMP: with open('1111.txt', 'at', encoding='utf-8') as f: f.write(text) lines = [header ] + [x for x in (x.strip() for x in text.split('\n')) if x] res, text = False, None while len(lines) >= _utils.MIN_TEXT_LINES: text = '\n'.join(lines) text0 = re0.sub('', text) text1 = re1.sub('', text0) if any(x in 'ЀЂЃЄЅІЇЈЉЊЋЌЍЎЏѐђѓєѕіїјљњћќѝўџѠѡѢѣѤѥѦѧѨѩѪѫѬѭѮѯѰѱѲѳѴѵ' 'ѶѷѸѹѺѻѼѽѾѿҀҁ҂҃҄҅҆҇҈҉ҊҋҌҍҎҏҐґҒғҔҕҖҗҘҙҚқҜҝҞҟҠҡҢңҤҥҦҧҨҩ' 'ҪҫҬҭҮүҰұҲҳҴҵҶҷҸҹҺһҼҽҾҿӀӁӂӃӄӅӆӇӈӉӊӋӌӍӎӏӐӑӒӓӔӕӖӗӘәӚӛӜӝ' 'ӞӟӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯӰӱӲӳӴӵӶӷӸӹӺӻӼӽӾӿ' for x in text0):
continue with open(page_fn, 'rt', encoding='utf-8') as f: link, p_link = f.readline().rstrip().split() p_link = p_link[1:-1] page = f.read() if page: text = re.sub(r'<br>', '\n', page) text = re.sub(r'<[^>]*>', '', text) text0 = [] for line in text.split('\n'): #line = unescape(line).replace('\u00a0', ' ') \ # .replace('\u200b', '') \ # .replace('\ufeff', '') \ # .replace('й', 'й').replace('ё', 'ё') \ # .strip() line = utils.norm_text2(line) if line: text0.append(re.sub(r'\s+', ' ', line)) text = '\n'.join(text0) if text: texts_total += 1 with open(page_fn, 'wt', encoding='utf-8') as f: print('{} ({})'.format(link, p_link), file=f) f.write(page) with open(text_fn, 'wt', encoding='utf-8') as f: print('{} ({})'.format(link, p_link), file=f) f.write(text) print('\r{} (of {})'.format( texts_total, min(utils.TEXTS_FOR_SOURCE, num_page_links)), end='') need_enter = True
page = f.read() check_ignore = False if DUMP: with open('1111.html', 'wt', encoding='utf-8') as f: f.write(page) if page.find('class="item-closed-warning"') > 0 \ or page.find('<title>Ошибка 404') > 0: continue token = '<span class="title-info-title-text" itemprop="name">' pos = page.find(token) if pos < 0: print('WARNING: No author on page {} (a list?)"'.format(link)) continue res = page[pos + len(token):] pos = res.find('<') header = utils.norm_text2(res[:pos]) token = '<div class="item-description-text" itemprop="description">' pos = res.find(token) if pos < 0: token = '<div class="item-description-html" itemprop="description">' pos = res.find(token) assert pos > 0, "ERROR: Can't find text on page {}".format(link) res = res[pos + len(token):] pos = res.find('</div>') text = res[:pos] res = res[pos:] if DUMP: with open('1111.txt', 'wt', encoding='utf-8') as f: f.write(text) text = '\n'.join([ x for i, x in enumerate(x for x in text.split('<p>')
start_link_idx = int(os.path.split(sorted(page_fns)[-1])[-1] .replace(utils.DATA_EXT, '')) \ if len(page_fns) > 0 else \ 0 texts_total = 0 re2 = re.compile(r'<div itemprop="articleBody" class="article-text-body">' r'((?:.|\n)+?)</div>') re0 = re.compile(r'<p>((?:.|\n)*?)</p>') re1 = re.compile(r'<.*?>') need_enter = False for link_no, link in enumerate(links, start=1): link, header = link.split('\t') #header = unescape(header).replace('\u200b', '').replace('\ufeff', '') \ # .replace('й', 'й').replace('ё', 'ё').strip() header = utils.norm_text2(header) if texts_total >= utils.TEXTS_FOR_SOURCE: break #link = 'https://www.interfax.ru/interview/374150' page_fn = utils.get_data_path(utils.PAGES_DIR, num_links, link_no) text_fn = utils.get_data_path(utils.TEXTS_DIR, num_links, link_no) page = None if link_no > start_link_idx: res = utils.get_url(link) page = res.text else: if not os.path.isfile(page_fn): continue if os.path.isfile(text_fn): texts_total += 1 continue
r'\n\g<1>\n', text0) text0 = re.sub( r'<div class="\w+ cxmmr5t8 oygrvhab hcukyx3x c1et5uql ii04i59q">([^>]*)</div>', r'\n\g<1>\n', text0) text0 = re.sub(r'<div[^>]*>([^>]*)</div>', r'\g<1>', text0) text0 = [] for line in text.split('\n'): line = line.strip() if line: text0.append(re.sub(r'\s+', ' ', line)) text = '\n'.join(text0) #text = unescape(text).replace('\u200b', '') \ # .replace('\ufeff', '') \ # .replace('й', 'й').replace('ё', 'ё') \ # .replace('\n\n', '\n').strip() text = utils.norm_text2(text).replace('\n\n', '\n') if text: texts_total += 1 with open(page_fn, 'wt', encoding='utf-8') as f: print(link, file=f) f.write(page) with open(text_fn, 'wt', encoding='utf-8') as f: print(link, file=f) f.write(text) print('\r{} (of {})'.format( texts_total, min(utils.TEXTS_FOR_SOURCE, num_page_links)), end='') need_enter = True #exit() if driver: driver.quit()
===========================================================================''' page_fns = utils.get_file_list(utils.PAGES_DIR, num_links) start_link_idx = int(os.path.split(sorted(page_fns)[-1])[-1] .replace(utils.DATA_EXT, '')) \ if len(page_fns) > 0 else \ 0 texts_total = 0 re0 = re.compile(r'<p>((?:.|\n)*?)</p>') re1 = re.compile(r'<.*?>') need_enter = False for link_no, link in enumerate(links, start=1): link, header = link.split('\t') #header = unescape(header).replace('\u200b', '') \ # .replace('\ufeff', '').strip() header = utils.norm_text2(header) if texts_total >= utils.TEXTS_FOR_SOURCE: break #link = 'https://www.interfax.ru/interview/374150' page_fn = utils.get_data_path(utils.PAGES_DIR, num_links, link_no) text_fn = utils.get_data_path(utils.TEXTS_DIR, num_links, link_no) page = None if link_no > start_link_idx: res = utils.get_url(link) page = res.text else: if not os.path.isfile(page_fn): continue if os.path.isfile(text_fn): texts_total += 1 continue
wikipedia_utils.Wikipedia().articles()): file_no = article_nos.get(article_no) if file_no or file_nos: if file_no: file_nos.append(file_no) id_, title, page = article if page: lines = page.split('\n') text_lines = [] for line in lines: if line and (line[-1] != '.' or line == 'См. также:'): break text_lines.append(line) res = False while True: text = utils.norm_text2('\n'.join(text_lines).strip()) text0 = re0.sub('', text) text1 = re1.sub('', text0) if text0 and len(text1) / len(text0) >= .9: num_words = len( [x for x in text.split() if re5.sub('', x)]) if num_words > MAX_CHUNK_WORDS: text_lines = text_lines[:-1] continue if num_words >= MIN_CHUNK_WORDS: res = True break if res: if file_no: file_nos.pop() else: