# Code example #1 (score: 0)
# File: crawl.py -- project: neocortex/cuisifier
def extract_html_rec(start_url, max_depth=99, url=None, max_links=None):
    """ Recursively extract all HTMLs reachable from a start URL.

        :param start_url: URL of the initial webpage.
        :param max_depth: Maximum recursion depth of the crawl.
        :param url: URL of the current page. Used internally by the
                    recursive calls; always pass None from outside.
        :param max_links: Optional cap on the number of visited URLs.
        :returns: List of downloaded HTML (or PDF) documents.
    """
    global visited
    # Top-level call: begin at the start URL and reset the shared
    # module-level list of visited URLs.
    if url is None:
        url = start_url
        visited = []
    # Stop conditions: depth exhausted or link budget spent.
    if max_depth == 0:
        return []
    if max_links is not None and len(visited) >= max_links:
        return []
    url = clean_url(url)
    visited.append(url)
    html = download(url)
    if html is None:
        return []
    collected = [html]
    # PDFs are leaves: only HTML pages are scanned for further links.
    if not text_extraction.is_pdf(html):
        redirect = get_crawler_io().get_redirect(url)
        for link in extract_links(redirect, text_utils.str2unicode(html)):
            link = clean_url(link)
            if filter_url(start_url, link):
                continue
            collected += extract_html_rec(
                start_url, max_depth - 1, link, max_links)
    return collected
# Code example #2 (score: 0)
def extract_text(doc, title_weight=None, header_weights=None, use_pdf=True,
                 use_stemmer=False, ukkonen_len=0):
    """ Extracts cleaned, lower-cased plain text from an HTML or PDF.

        :param doc: Raw document contents (HTML markup or PDF bytes).
        :param title_weight: Forwarded to ``extract_text_html`` to weight
                             title text.
        :param header_weights: Forwarded to ``extract_text_html`` to weight
                               header text.
        :param use_pdf: If False, PDF documents yield an empty string.
        :param use_stemmer: If True, run ``apply_stemmer`` on the result.
        :param ukkonen_len: If non-zero, remove repeated substrings of at
                            least this length.
        :returns: Cleaned lower-case text ('' when a PDF cannot be parsed
                  or PDFs are disabled).
    """
    if is_pdf(doc):
        if not use_pdf:
            return ''
        try:
            text = extract_text_pdf(doc)
        except Exception:
            # PDF parsing is best-effort. Catch Exception (not a bare
            # except) so KeyboardInterrupt/SystemExit still propagate.
            # TODO: Nice error handling
            return ''
    else:
        text = extract_text_html(
            str2unicode(doc),
            title_weight=title_weight,
            header_weights=header_weights)
    # Collapse all whitespace (newlines, tabs, ...) into single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Replace Umlaute and apply 'unidecode'.
    text = clean_text(text)
    # Map every punctuation character to a space.
    # NOTE(review): string.maketrans is Python 2 only and its table applies
    # to byte strings; confirm `text` is a str (not unicode) at this point.
    replace_punctuation = string.maketrans(
        string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    # Squash any multiple spaces introduced by the punctuation pass.
    text = re.sub(' +', ' ', text)
    text = text.strip()
    # Split camel case words ('fooBar' -> 'foo Bar').
    text = ' '.join(split_camel_case(word) for word in text.split(' '))
    # Remove digits.
    text = ''.join(c for c in text if not c.isdigit())
    # Remove single characters.
    text = ' '.join(word for word in text.split(' ') if len(word) > 1)
    # Remove multiple spaces again (digit removal may leave gaps).
    text = re.sub(' +', ' ', text)
    if use_stemmer:
        text = apply_stemmer(text)
    # Remove long repeated strings.
    if ukkonen_len:
        text = remove_repeated_long_strings(text, ukkonen_len)
    return text.lower()
# Code example #3 (score: 0)
def extract_text_pdf(s):
    """ Extracts text from a PDF given as a byte string.

        :param s: Raw PDF file contents.
        :returns: The extracted text, converted via ``str2unicode``.
        :raises: Whatever pdfminer raises on malformed input; the caller
                 (``extract_text``) treats any failure as empty text.
    """
    # Wrap the raw bytes directly instead of write()-ing into an empty
    # buffer; StringIO starts positioned at 0 this way.
    fp = StringIO.StringIO(s)
    outfp = StringIO.StringIO()
    laparams = LAParams()
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    rotation = 0
    device = TextConverter(
        rsrcmgr, outfp,
        laparams=laparams, imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # pagenos=set() together with maxpages=0 means "all pages".
        for page in PDFPage.get_pages(fp, set(),
                                      maxpages=0, password='',
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        text = outfp.getvalue()
    finally:
        # Fix: the converter device and both buffers were previously
        # leaked (device/outfp never closed, fp leaked on exceptions).
        device.close()
        outfp.close()
        fp.close()
    return str2unicode(text)