def extract_html_rec(start_url, max_depth=99, url=None, max_links=None):
    """
    Recursively extract all HTMLs from a start URL.

    :param start_url: URL of the initial webpage.
    :param max_depth: Depth of crawling; 0 stops the recursion.
    :param url: URL of the webpage. Used internally for the recursive
        calls. Always use None when calling this function from another
        function.
    :param max_links: Optional upper bound on the total number of pages
        downloaded during one crawl.
    :return: List of raw HTML strings, one per downloaded page.
    """
    global visited
    if url is None:
        # Top-level call: start from the root and reset the crawl state.
        url = start_url
        visited = []
    if max_depth == 0:
        return []
    if max_links is not None and len(visited) >= max_links:
        return []
    url = clean_url(url)
    # Skip URLs already downloaded in this crawl; without this check,
    # circular links cause the same page to be fetched repeatedly.
    if url in visited:
        return []
    visited.append(url)
    html = download(url)
    if html is None:
        return []
    text = [html]
    if not text_extraction.is_pdf(html):
        for link in extract_links(get_crawler_io().get_redirect(url),
                                  text_utils.str2unicode(html)):
            link = clean_url(link)
            if not filter_url(start_url, link):
                # extend() keeps accumulation linear; repeated
                # `text = text + ...` was quadratic in page count.
                text.extend(extract_html_rec(
                    start_url, max_depth - 1, link, max_links))
    return text
def extract_text(doc, title_weight=None, header_weights=None, use_pdf=True, use_stemmer=False, ukkonen_len=0):
    """
    Extracts cleaned text from an HTML or PDF document.

    :param doc: Raw document contents (HTML markup or PDF data).
    :param title_weight: Repetition weight for title text, forwarded to
        extract_text_html.
    :param header_weights: Repetition weights for header tags, forwarded
        to extract_text_html.
    :param use_pdf: If False, PDF documents yield an empty string.
    :param use_stemmer: If True, apply a stemmer to the cleaned text.
    :param ukkonen_len: If non-zero, remove long repeated substrings of
        at least this length.
    :return: Lower-cased, cleaned plain text.
    """
    if is_pdf(doc):
        if not use_pdf:
            return ''
        try:
            text = extract_text_pdf(doc)
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed.  TODO: Nice error handling
        except Exception:
            return ''
    else:
        text = extract_text_html(
            str2unicode(doc), title_weight=title_weight,
            header_weights=header_weights)
    # Replace newlines etc. (raw string avoids invalid-escape warnings)
    text = re.sub(r'\s+', ' ', text)
    # Replace Umlaute and apply 'unidecode'
    text = clean_text(text)
    # Remove punctuation
    replace_punctuation = string.maketrans(
        string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    # Remove multiple spaces
    text = re.sub(' +', ' ', text)
    # Strip
    text = text.strip()
    # Split camel case words
    text = ' '.join([split_camel_case(word) for word in text.split(' ')])
    # Remove digits
    text = ''.join([c for c in text if not c.isdigit()])
    # Remove single characters
    text = ' '.join([word for word in text.split(' ') if len(word) > 1])
    # Remove multiple spaces
    text = re.sub(' +', ' ', text)
    # Stem
    if use_stemmer:
        text = apply_stemmer(text)
    # Remove long repeated strings
    if ukkonen_len:
        text = remove_repeated_long_strings(text, ukkonen_len)
    return text.lower()
def extract_text_pdf(s):
    """
    Extracts text from a PDF given as a byte string.

    :param s: Raw PDF file contents.
    :return: Extracted text as unicode.
    """
    # Initialize the buffer directly instead of write()-ing into it.
    fp = StringIO.StringIO(s)
    outfp = StringIO.StringIO()
    laparams = LAParams()
    imagewriter = None
    pagenos = set()
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    maxpages = 0
    rotation = 0
    password = ''
    device = TextConverter(
        rsrcmgr, outfp, laparams=laparams, imagewriter=imagewriter)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        # The converter must be closed to flush its output; the original
        # leaked it (and the buffers) and skipped cleanup on errors.
        device.close()
        fp.close()
    text = outfp.getvalue()
    outfp.close()
    return str2unicode(text)