Example 1
# Imports assumed from the Lookyloo project layout; adjust if the module path differs.
import argparse

from lookyloo.lookyloo import Indexing, Lookyloo


def main():
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true', help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()
    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    indexing.clear_indexes()
    # Reindex every capture; captures whose tree cannot be loaded are reported and skipped.
    for capture_uuid in lookyloo.capture_uuids:
        index = True
        try:
            tree = lookyloo.get_crawled_tree(capture_uuid)
        except Exception as e:
            print(capture_uuid, e)
            continue

        # On a public instance, honour the capture's no_index flag.
        if lookyloo.is_public_instance:
            cache = lookyloo.capture_cache(capture_uuid)
            if not cache:
                continue
            if cache.no_index is not None:
                index = False

        # NOTE: these methods do nothing if we just generated the pickle when calling lookyloo.get_crawled_tree
        if index:
            indexing.index_cookies_capture(tree)
            indexing.index_body_hashes_capture(tree)
            indexing.index_url_capture(tree)
            categories = list(lookyloo.categories_capture(capture_uuid).keys())
            indexing.index_categories_capture(capture_uuid, categories)


if __name__ == '__main__':
    main()
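With the imports and entry point added above, the script is self-contained. Assuming it is saved as rebuild_caches.py (a hypothetical file name), it can be run as `python3 rebuild_caches.py`, or with `--rebuild_pickles` to also delete and regenerate every pickle, at roughly 20s per pickle.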
Example 2
# Setup assumed so the excerpt runs standalone: the import path, the Lookyloo
# instance and the empty `stats` dict are not shown in the original snippet.
import datetime

from typing import Any, Dict, Set, Union
from urllib.parse import urlparse

from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()
stats: Dict[int, Dict[int, Dict[str, Any]]] = {}

today = datetime.date.today()
calendar_week = today.isocalendar()[1]
# Per-week counters for the previous and the current ISO calendar week.
weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}


def uniq_domains(uniq_urls):
    """Reduce a set of URLs to the set of distinct hostnames."""
    domains = set()
    for url in uniq_urls:
        parsed = urlparse(url)
        if parsed.hostname:  # hostname is None for scheme-less or malformed entries
            domains.add(parsed.hostname)
    return domains


# Walk every capture and aggregate per-month statistics.
for uuid in lookyloo.capture_uuids:
    cache = lookyloo.capture_cache(uuid)
    if not cache or not hasattr(cache, 'timestamp'):
        continue
    date = cache.timestamp
    if date.year not in stats:
        stats[date.year] = {}
    if date.month not in stats[date.year]:
        stats[date.year][date.month] = {
            'analysis': 0,
            'analysis_with_redirects': 0,
            'redirects': 0,
            'uniq_urls': set()
        }
    stats[date.year][date.month]['analysis'] += 1
    if hasattr(cache, 'redirects') and cache.redirects:
        stats[date.year][date.month]['analysis_with_redirects'] += 1
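        # The excerpt stops here in the original; the lines below are a
        # plausible completion inferred from the counters initialised above
        # (the full script may bookkeep differently).
        stats[date.year][date.month]['redirects'] += len(cache.redirects)
        stats[date.year][date.month]['uniq_urls'].update(cache.redirects)
    # `cache.url` is assumed to hold the capture's submitted URL.
    stats[date.year][date.month]['uniq_urls'].add(cache.url)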