Example #1
def url_processing_pipeline(args, input_urls, sleeptime):
    '''Aggregated functions to show a list and download and process an input list'''
    input_urls = url_processing_checks(args.blacklist, input_urls)
    # print list without further processing
    if args.list:
        for url in input_urls:
            write_result(url, args)  # print('\n'.join(input_urls))
        return None
    # build domain-aware processing list
    domain_dict = dict()
    while len(input_urls) > 0:
        url = input_urls.pop()
        domain_name = extract_domain(url)
        if domain_name not in domain_dict:
            domain_dict[domain_name] = []
        domain_dict[domain_name].append(url)
    # initialize file counter if necessary
    # note: input_urls has been emptied by the loop above, so count the
    # URLs collected in domain_dict instead
    if sum(len(urls) for urls in domain_dict.values()) > MAX_FILES_PER_DIRECTORY:
        counter = 0
    else:
        counter = None
    if len(domain_dict) <= 5:
        backoff_dict = dict()
        single_threaded_processing(domain_dict, backoff_dict, args, sleeptime,
                                   counter)
    else:
        multi_threaded_processing(domain_dict, args, sleeptime, counter)
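The domain bucketing step above can be shown in isolation. The following is a minimal sketch, assuming extract_domain is imported from the courlan package (as Example #7 suggests) and using made-up URLs; the exact hostname normalisation depends on that function.

from collections import defaultdict
from courlan import extract_domain  # assumed source of extract_domain

# hypothetical input list, for illustration only
urls = [
    'https://www.example.org/page1',
    'https://www.example.org/page2',
    'https://blog.example.net/post',
]

# group URLs by their domain, as the pipeline above does with a plain dict
domain_dict = defaultdict(list)
for url in urls:
    domain_dict[extract_domain(url)].append(url)

print(dict(domain_dict))  # roughly {'example.org': [...], 'example.net': [...]}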
Example #2
def multi_threaded_processing(domain_dict, args, sleeptime, counter):
    '''Implement a multi-threaded processing algorithm'''
    i = 0
    backoff_dict = dict()
    if args.parallel is not None:
        download_threads = args.parallel
    else:
        download_threads = DOWNLOAD_THREADS
    while len(domain_dict) > 0:
        # the remaining list is too small, process it differently
        if len({x for v in domain_dict.values() for x in v}) < download_threads:
            single_threaded_processing(domain_dict, backoff_dict, args,
                                       sleeptime, counter)
            return
        # populate buffer
        bufferlist, bufferdomains = list(), set()
        while len(bufferlist) < download_threads:
            domain = random.choice(list(domain_dict.keys()))
            if (domain not in backoff_dict or
                    (datetime.now() - backoff_dict[domain]).total_seconds() > sleeptime):
                bufferlist.append(domain_dict[domain].pop())
                bufferdomains.add(domain)
                backoff_dict[domain] = datetime.now()
            # safeguard
            else:
                i += 1
                if i > len(domain_dict) * 3:
                    LOGGER.debug('spacing request for domain name %s', domain)
                    sleep(sleeptime)
                    i = 0
        # start several threads
        with ThreadPoolExecutor(max_workers=download_threads) as executor:
            future_to_url = {
                executor.submit(fetch_url, url): url
                for url in bufferlist
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                # register in backoff dictionary to ensure time between requests
                domain = extract_domain(url)
                backoff_dict[domain] = datetime.now()
                # handle result
                counter = process_result(future.result(), args, url, counter)
        # clean registries
        for domain in bufferdomains:
            if not domain_dict[domain]:
                del domain_dict[domain]
                del backoff_dict[domain]
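The threading part of the example relies on a standard concurrent.futures pattern: map each submitted future back to its URL and consume results with as_completed. A minimal self-contained sketch, with a stand-in fetch_url instead of the real downloader:

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_url(url):
    '''Stand-in for the real download step.'''
    return 'fetched ' + url

urls = ['https://example.org/a', 'https://example.org/b', 'https://example.org/c']

with ThreadPoolExecutor(max_workers=2) as executor:
    # keep a mapping from future to URL so each result can be attributed
    future_to_url = {executor.submit(fetch_url, url): url for url in urls}
    for future in as_completed(future_to_url):
        print(future_to_url[future], future.result())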
Example #3
def sitemap_search(url):
    '''Look for sitemaps for the given URL and gather links'''
    domain = extract_domain(url)
    sitemapurl = url.rstrip('/') + '/sitemap.xml'
    sitemapurls, linklist = process_sitemap(sitemapurl, domain)
    if sitemapurls == [] and len(linklist) > 0:
        return linklist
    if sitemapurls == [] and linklist == []:
        for sitemapurl in find_robots_sitemaps(url):
            tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurl, domain)
            sitemapurls.extend(tmp_sitemapurls)
            linklist.extend(tmp_linklist)
        while sitemapurls:
            tmp_sitemapurls, tmp_linklist = process_sitemap(
                sitemapurls.pop(), domain)
            sitemapurls.extend(tmp_sitemapurls)
            linklist.extend(tmp_linklist)
    return linklist
Example #4
def extract_metadata(filecontent, default_url=None, date_config=None):
    '''Main process for metadata extraction'''
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # correction: author not a name
    if metadata['author'] is not None:
        if ' ' not in metadata['author'] or metadata['author'].startswith(
                'http'):
            metadata['author'] = None
    # fix: try json-ld metadata and override
    metadata = extract_json(tree, metadata)
    # try with x-paths
    # title
    if metadata['title'] is None:
        metadata['title'] = extract_title(tree)
    # author
    if metadata['author'] is None:
        metadata['author'] = extract_author(tree)
    # url
    if metadata['url'] is None:
        metadata['url'] = extract_url(tree, default_url)
    # hostname
    if metadata['url'] is not None:
        metadata['hostname'] = extract_domain(metadata['url'])
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = metadata['url']
    try:
        metadata['date'] = find_date(tree, **date_config)
    # temporary fix for htmldate bug
    except UnicodeError:
        pass
    # sitename
    if metadata['sitename'] is None:
        metadata['sitename'] = extract_sitename(tree)
    if metadata['sitename'] is not None:
        if metadata['sitename'].startswith('@'):
            # strip the Twitter @ prefix
            metadata['sitename'] = re.sub(r'^@', '', metadata['sitename'])
        # capitalize
        try:
            if ('.' not in metadata['sitename']
                    and not metadata['sitename'][0].isupper()):
                metadata['sitename'] = metadata['sitename'].title()
        # fix for empty name
        except IndexError:
            pass
    else:
        # use URL
        if metadata['url']:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)',
                               metadata['url'])
            if mymatch:
                metadata['sitename'] = mymatch.group(1)
    # categories
    if not metadata['categories']:
        metadata['categories'] = extract_catstags('category', tree)
    # tags
    if not metadata['tags']:
        metadata['tags'] = extract_catstags('tags', tree)
    # for safety: length check
    for key, value in metadata.items():
        if value is not None and len(value) > 10000:
            metadata[key] = value[:9999] + '…'
    # return
    return metadata
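HTMLDATE_CONFIG itself is not shown in these examples. The sketch below assumes a plausible shape for such a dict, built from htmldate's find_date() keyword arguments; the exact parameter set should be checked against the htmldate documentation.

from htmldate import find_date

# assumed configuration dict; keys mirror find_date() keyword arguments
date_config = {
    'extensive_search': False,   # stick to fast heuristics (assumption)
    'original_date': True,       # prefer the original publication date (assumption)
    'outputformat': '%Y-%m-%d',
}
date_config['url'] = 'https://www.example.org/post'  # set per document, as above

html = ('<html><head><meta property="article:published_time" '
        'content="2021-06-01"/></head><body></body></html>')
print(find_date(html, **date_config))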
Example #5
def extract_metadata(filecontent, default_url=None, date_config=None):
    """Main process for metadata extraction.

    Args:
        filecontent: HTML code as string.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().

    Returns:
        A dict() containing the extracted metadata information or None.

    """
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # correction: author not a name
    if metadata['author'] is not None:
        if ' ' not in metadata['author'] or metadata['author'].startswith(
                'http'):
            metadata['author'] = None
    # fix: try json-ld metadata and override
    metadata = extract_json(tree, metadata)
    # try with x-paths
    # title
    if metadata['title'] is None:
        metadata['title'] = extract_title(tree)
    # author
    if metadata['author'] is None:
        metadata['author'] = extract_author(tree)
    # url
    if metadata['url'] is None:
        metadata['url'] = extract_url(tree, default_url)
    # hostname
    if metadata['url'] is not None:
        metadata['hostname'] = extract_domain(metadata['url'])
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = metadata['url']
    try:
        metadata['date'] = find_date(tree, **date_config)
    # temporary fixes for htmldate bugs # todo: remove later
    except (TypeError, UnicodeError):
        pass
    # sitename
    if metadata['sitename'] is None:
        metadata['sitename'] = extract_sitename(tree)
    if metadata['sitename'] is not None:
        if metadata['sitename'].startswith('@'):
            # strip the Twitter @ prefix
            metadata['sitename'] = re.sub(r'^@', '', metadata['sitename'])
        # capitalize
        try:
            if ('.' not in metadata['sitename']
                    and not metadata['sitename'][0].isupper()):
                metadata['sitename'] = metadata['sitename'].title()
        # fix for empty name
        except IndexError:
            pass
    else:
        # use URL
        if metadata['url']:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)',
                               metadata['url'])
            if mymatch:
                metadata['sitename'] = mymatch.group(1)
    # categories
    if not metadata['categories']:
        metadata['categories'] = extract_catstags('category', tree)
    # tags
    if not metadata['tags']:
        metadata['tags'] = extract_catstags('tags', tree)
    # license
    for element in tree.xpath('//a[@rel="license"]'):
        if element.text is not None:
            metadata['license'] = trim(element.text)
            break
    # for safety: length check
    for key, value in metadata.items():
        if value is not None and len(value) > 10000:
            metadata[key] = value[:9999] + '…'
    # remove spaces and control characters
    for item in metadata:
        if metadata[item] is not None and isinstance(metadata[item], str):
            metadata[item] = line_processing(metadata[item])
    # return
    return metadata
Example #6
def extract_metadata(filecontent, default_url=None, date_config=None, fastmode=False, author_blacklist=None):
    """Main process for metadata extraction.

    Args:
        filecontent: HTML code as string.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().
        fastmode: Use the faster htmldate configuration (skip the extensive date search).
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.

    Returns:
        A dict() containing the extracted metadata information or None.

    """
    # init
    if author_blacklist is None:
        author_blacklist = set()
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # to check: remove it and replace with author_blacklist in test case
    if metadata.author is not None and ' ' not in metadata.author:
        metadata.author = None
    # fix: try json-ld metadata and override
    try:
        metadata = extract_meta_json(tree, metadata)
    # todo: fix bugs in json_metadata.py
    except TypeError as err:
        LOGGER.warning('error in JSON metadata extraction: %s', err)
    # try with x-paths
    # title
    if metadata.title is None:
        metadata.title = extract_title(tree)
    # check author in blacklist
    if metadata.author is not None and len(author_blacklist) > 0:
        metadata.author = check_authors(metadata.author, author_blacklist)
    # author
    if metadata.author is None:
        metadata.author = extract_author(tree)
    # recheck author in blacklist
    if metadata.author is not None and len(author_blacklist) > 0:
        metadata.author = check_authors(metadata.author, author_blacklist)
    # url
    if metadata.url is None:
        metadata.url = extract_url(tree, default_url)
    # hostname
    if metadata.url is not None:
        metadata.hostname = extract_domain(metadata.url)
    # extract date with external module htmldate
    if date_config is None:
        # decide on fast mode
        if fastmode is False:
            date_config = HTMLDATE_CONFIG_EXTENSIVE
        else:
            date_config = HTMLDATE_CONFIG_FAST
    date_config['url'] = metadata.url
    metadata.date = find_date(tree, **date_config)
    # sitename
    if metadata.sitename is None:
        metadata.sitename = extract_sitename(tree)
    if metadata.sitename is not None:
        if metadata.sitename.startswith('@'):
            # strip the Twitter @ prefix
            metadata.sitename = re.sub(r'^@', '', metadata.sitename)
        # capitalize
        try:
            if (
                '.' not in metadata.sitename
                and not metadata.sitename[0].isupper()
            ):
                metadata.sitename = metadata.sitename.title()
        # fix for empty name
        except IndexError as err:
            LOGGER.warning('error in sitename extraction: %s', err)
    # use URL
    elif metadata.url:
        mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', metadata.url)
        if mymatch:
            metadata.sitename = mymatch.group(1)
    # categories
    if not metadata.categories:
        metadata.categories = extract_catstags('category', tree)
    # tags
    if not metadata.tags:
        metadata.tags = extract_catstags('tag', tree)
    # license
    metadata.license = extract_license(tree)
    # safety checks
    metadata.clean_and_trim()
    # return result
    return metadata
Example #7
def determine_domain(url):
    '''Extraction of domain/host name from URL via courlan module'''
    domain = extract_domain(url)
    return domain
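All of the examples above ultimately rely on extract_domain. A direct usage sketch, assuming the function is imported from the courlan package as the docstring states; the example URL is made up:

from courlan import extract_domain

# returns the domain/host name, e.g. 'example.org' for a www URL
print(extract_domain('https://www.example.org/category/page.html'))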