def url_processing_pipeline(args, input_urls, sleeptime):
    '''Aggregated functions to show a list and download and process an input list'''
    input_urls = url_processing_checks(args.blacklist, input_urls)
    # print list without further processing
    if args.list:
        for url in input_urls:
            write_result(url, args)  # print('\n'.join(input_urls))
        return None
    # initialize file counter if necessary
    # (checked before the loop below empties the input list)
    if len(input_urls) > MAX_FILES_PER_DIRECTORY:
        counter = 0
    else:
        counter = None
    # build domain-aware processing list
    domain_dict = dict()
    while len(input_urls) > 0:
        url = input_urls.pop()
        domain_name = extract_domain(url)
        if domain_name not in domain_dict:
            domain_dict[domain_name] = []
        domain_dict[domain_name].append(url)
    # process sequentially if only a few domains are involved
    if len(domain_dict) <= 5:
        backoff_dict = dict()
        single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, counter)
    else:
        multi_threaded_processing(domain_dict, args, sleeptime, counter)
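# Stand-alone sketch of the domain-aware bucketing step used above, written with
# collections.defaultdict instead of the manual membership check; extract_domain comes
# from the courlan package, as elsewhere in this code. The helper name is illustrative.
from collections import defaultdict
from courlan import extract_domain

def _demo_bucket_by_domain(urls):
    '''Group URLs by domain name so that requests can be spread politely across hosts.'''
    buckets = defaultdict(list)
    for url in urls:
        buckets[extract_domain(url)].append(url)
    return dict(buckets)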
def multi_threaded_processing(domain_dict, args, sleeptime, counter):
    '''Implement a multi-threaded processing algorithm'''
    i = 0
    backoff_dict = dict()
    if args.parallel is not None:
        download_threads = args.parallel
    else:
        download_threads = DOWNLOAD_THREADS
    while len(domain_dict) > 0:
        # the remaining list is too small, process it differently
        if len({x for v in domain_dict.values() for x in v}) < download_threads:
            single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, counter)
            return
        # populate buffer
        bufferlist, bufferdomains = list(), set()
        while len(bufferlist) < download_threads:
            domain = random.choice(list(domain_dict.keys()))
            if domain not in backoff_dict or \
               (datetime.now() - backoff_dict[domain]).total_seconds() > sleeptime:
                bufferlist.append(domain_dict[domain].pop())
                bufferdomains.add(domain)
                backoff_dict[domain] = datetime.now()
            # safeguard
            else:
                i += 1
                if i > len(domain_dict) * 3:
                    LOGGER.debug('spacing request for domain name %s', domain)
                    sleep(sleeptime)
                    i = 0
        # start several threads
        with ThreadPoolExecutor(max_workers=download_threads) as executor:
            future_to_url = {executor.submit(fetch_url, url): url for url in bufferlist}
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                # register in backoff dictionary to ensure time between requests
                domain = extract_domain(url)
                backoff_dict[domain] = datetime.now()
                # handle result
                counter = process_result(future.result(), args, url, counter)
        # clean registries
        for domain in bufferdomains:
            if not domain_dict[domain]:
                del domain_dict[domain]
                del backoff_dict[domain]
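# Stand-alone sketch of the fan-out pattern used above: one job per URL submitted to a
# ThreadPoolExecutor, results consumed as they complete. The fetch function below is a
# placeholder for illustration, not trafilatura's fetch_url.
from concurrent.futures import ThreadPoolExecutor, as_completed

def _demo_fetch(url):
    '''Placeholder download function standing in for a real HTTP request.'''
    return 'content of ' + url

def _demo_parallel_fetch(urls, max_workers=4):
    '''Fetch a batch of URLs in parallel and yield (url, result) pairs as they arrive.'''
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(_demo_fetch, url): url for url in urls}
        for future in as_completed(future_to_url):
            yield future_to_url[future], future.result()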
def sitemap_search(url):
    '''Look for sitemaps of the given URL and gather the links they contain'''
    domain = extract_domain(url)
    sitemapurl = url.rstrip('/') + '/sitemap.xml'
    sitemapurls, linklist = process_sitemap(sitemapurl, domain)
    if sitemapurls == [] and len(linklist) > 0:
        return linklist
    # fall back on sitemaps declared in robots.txt
    if sitemapurls == [] and linklist == []:
        for sitemapurl in find_robots_sitemaps(url):
            tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurl, domain)
            sitemapurls.extend(tmp_sitemapurls)
            linklist.extend(tmp_linklist)
    # process nested sitemaps until the list is exhausted
    while sitemapurls:
        tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurls.pop(), domain)
        sitemapurls.extend(tmp_sitemapurls)
        linklist.extend(tmp_linklist)
    return linklist
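# Stand-alone sketch of the robots.txt convention the fallback above relies on: sitemap
# locations are declared through 'Sitemap:' lines. This only illustrates the underlying
# idea and is not trafilatura's find_robots_sitemaps.
def _demo_sitemaps_from_robots(robots_txt):
    '''Return the sitemap URLs declared in a robots.txt document.'''
    sitemap_urls = []
    for line in robots_txt.splitlines():
        if line.lower().startswith('sitemap:'):
            sitemap_urls.append(line.split(':', 1)[1].strip())
    return sitemap_urls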
def extract_metadata(filecontent, default_url=None, date_config=None):
    '''Main process for metadata extraction'''
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # correction: author not a name
    if metadata['author'] is not None:
        if ' ' not in metadata['author'] or metadata['author'].startswith('http'):
            metadata['author'] = None
    # fix: try json-ld metadata and override
    metadata = extract_json(tree, metadata)
    # try with x-paths
    # title
    if metadata['title'] is None:
        metadata['title'] = extract_title(tree)
    # author
    if metadata['author'] is None:
        metadata['author'] = extract_author(tree)
    # url
    if metadata['url'] is None:
        metadata['url'] = extract_url(tree, default_url)
    # hostname
    if metadata['url'] is not None:
        metadata['hostname'] = extract_domain(metadata['url'])
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = metadata['url']
    try:
        metadata['date'] = find_date(tree, **date_config)
    # temporary fix for htmldate bug
    except UnicodeError:
        pass
    # sitename
    if metadata['sitename'] is None:
        metadata['sitename'] = extract_sitename(tree)
    if metadata['sitename'] is not None:
        if metadata['sitename'].startswith('@'):
            # scrap Twitter ID
            metadata['sitename'] = re.sub(r'^@', '', metadata['sitename'])
        # capitalize
        try:
            if '.' not in metadata['sitename'] and not metadata['sitename'][0].isupper():
                metadata['sitename'] = metadata['sitename'].title()
        # fix for empty name
        except IndexError:
            pass
    else:
        # use URL
        if metadata['url']:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', metadata['url'])
            if mymatch:
                metadata['sitename'] = mymatch.group(1)
    # categories
    if not metadata['categories']:
        metadata['categories'] = extract_catstags('category', tree)
    # tags
    if not metadata['tags']:
        metadata['tags'] = extract_catstags('tags', tree)
    # for safety: length check
    for key, value in metadata.items():
        if value is not None and len(value) > 10000:
            metadata[key] = value[:9999] + '…'
    # return
    return metadata
def extract_metadata(filecontent, default_url=None, date_config=None):
    """Main process for metadata extraction.

    Args:
        filecontent: HTML code as string.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().

    Returns:
        A dict() containing the extracted metadata information or None.

    """
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # correction: author not a name
    if metadata['author'] is not None:
        if ' ' not in metadata['author'] or metadata['author'].startswith('http'):
            metadata['author'] = None
    # fix: try json-ld metadata and override
    metadata = extract_json(tree, metadata)
    # try with x-paths
    # title
    if metadata['title'] is None:
        metadata['title'] = extract_title(tree)
    # author
    if metadata['author'] is None:
        metadata['author'] = extract_author(tree)
    # url
    if metadata['url'] is None:
        metadata['url'] = extract_url(tree, default_url)
    # hostname
    if metadata['url'] is not None:
        metadata['hostname'] = extract_domain(metadata['url'])
    # extract date with external module htmldate
    if date_config is None:
        date_config = HTMLDATE_CONFIG
    date_config['url'] = metadata['url']
    try:
        metadata['date'] = find_date(tree, **date_config)
    # temporary fixes for htmldate bugs
    # todo: remove later
    except (TypeError, UnicodeError):
        pass
    # sitename
    if metadata['sitename'] is None:
        metadata['sitename'] = extract_sitename(tree)
    if metadata['sitename'] is not None:
        if metadata['sitename'].startswith('@'):
            # scrap Twitter ID
            metadata['sitename'] = re.sub(r'^@', '', metadata['sitename'])
        # capitalize
        try:
            if '.' not in metadata['sitename'] and not metadata['sitename'][0].isupper():
                metadata['sitename'] = metadata['sitename'].title()
        # fix for empty name
        except IndexError:
            pass
    else:
        # use URL
        if metadata['url']:
            mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', metadata['url'])
            if mymatch:
                metadata['sitename'] = mymatch.group(1)
    # categories
    if not metadata['categories']:
        metadata['categories'] = extract_catstags('category', tree)
    # tags
    if not metadata['tags']:
        metadata['tags'] = extract_catstags('tags', tree)
    # license
    for element in tree.xpath('//a[@rel="license"]'):
        if element.text is not None:
            metadata['license'] = trim(element.text)
            break
    # for safety: length check
    for key, value in metadata.items():
        if value is not None and len(value) > 10000:
            metadata[key] = value[:9999] + '…'
    # remove spaces and control characters
    for item in metadata:
        if metadata[item] is not None and isinstance(metadata[item], str):
            metadata[item] = line_processing(metadata[item])
    # return
    return metadata
def extract_metadata(filecontent, default_url=None, date_config=None, fastmode=False, author_blacklist=None):
    """Main process for metadata extraction.

    Args:
        filecontent: HTML code as string.
        default_url: Previously known URL of the downloaded document.
        date_config: Provide extraction parameters to htmldate as dict().
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.

    Returns:
        A dict() containing the extracted metadata information or None.

    """
    # init
    if author_blacklist is None:
        author_blacklist = set()
    # load contents
    tree = load_html(filecontent)
    if tree is None:
        return None
    # initialize dict and try to strip meta tags
    metadata = examine_meta(tree)
    # to check: remove it and replace with author_blacklist in test case
    if metadata.author is not None and ' ' not in metadata.author:
        metadata.author = None
    # fix: try json-ld metadata and override
    try:
        metadata = extract_meta_json(tree, metadata)
    # todo: fix bugs in json_metadata.py
    except TypeError as err:
        LOGGER.warning('error in JSON metadata extraction: %s', err)
    # try with x-paths
    # title
    if metadata.title is None:
        metadata.title = extract_title(tree)
    # check author in blacklist
    if metadata.author is not None and len(author_blacklist) > 0:
        metadata.author = check_authors(metadata.author, author_blacklist)
    # author
    if metadata.author is None:
        metadata.author = extract_author(tree)
    # recheck author in blacklist
    if metadata.author is not None and len(author_blacklist) > 0:
        metadata.author = check_authors(metadata.author, author_blacklist)
    # url
    if metadata.url is None:
        metadata.url = extract_url(tree, default_url)
    # hostname
    if metadata.url is not None:
        metadata.hostname = extract_domain(metadata.url)
    # extract date with external module htmldate
    if date_config is None:
        # decide on fast mode
        if fastmode is False:
            date_config = HTMLDATE_CONFIG_EXTENSIVE
        else:
            date_config = HTMLDATE_CONFIG_FAST
    date_config['url'] = metadata.url
    metadata.date = find_date(tree, **date_config)
    # sitename
    if metadata.sitename is None:
        metadata.sitename = extract_sitename(tree)
    if metadata.sitename is not None:
        if metadata.sitename.startswith('@'):
            # scrap Twitter ID
            metadata.sitename = re.sub(r'^@', '', metadata.sitename)
        # capitalize
        try:
            if '.' not in metadata.sitename and not metadata.sitename[0].isupper():
                metadata.sitename = metadata.sitename.title()
        # fix for empty name
        except IndexError as err:
            LOGGER.warning('error in sitename extraction: %s', err)
    # use URL
    elif metadata.url:
        mymatch = re.match(r'https?://(?:www\.|w[0-9]+\.)?([^/]+)', metadata.url)
        if mymatch:
            metadata.sitename = mymatch.group(1)
    # categories
    if not metadata.categories:
        metadata.categories = extract_catstags('category', tree)
    # tags
    if not metadata.tags:
        metadata.tags = extract_catstags('tag', tree)
    # license
    metadata.license = extract_license(tree)
    # safety checks
    metadata.clean_and_trim()
    # return result
    return metadata
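# Usage sketch for the extractor above: run it on an HTML string and pick out a few
# fields. The attribute names mirror those accessed in the function; the wrapper itself
# is illustrative and not part of the library.
def _demo_extract(html_string, url=None):
    '''Extract metadata from an HTML string and return selected fields as a dict.'''
    document = extract_metadata(html_string, default_url=url)
    if document is None:
        return None
    return {'title': document.title, 'author': document.author,
            'date': document.date, 'sitename': document.sitename}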
def determine_domain(url):
    '''Extraction of domain/host name from URL via courlan module'''
    domain = extract_domain(url)
    return domain
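# Usage note: determine_domain('https://www.example.org/path') delegates to courlan's
# extract_domain; the exact output (e.g. whether the 'www.' prefix is kept) depends on
# the courlan version in use.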