from datetime import datetime

import gridfs
import pymongo
from bson.objectid import ObjectId
from bs4 import BeautifulSoup
from langdetect import detect

# Local helper modules (urls, document, mongodb, validator, link_extractor,
# log, ...) are assumed to come from the surrounding crawler project.


def exists_url(db, url):
    """Return True if url exists in db"""
    url_hash = urls.hash(url)

    result = db['Urls'].find_one({'_id': url_hash})

    return result is not None
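
# Example usage (a minimal sketch; the client setup and the 'Crawler'
# database name are assumptions, not part of this module):
#
#   client = pymongo.MongoClient('mongodb://localhost:27017')
#   db = client['Crawler']
#   if not exists_url(db, 'http://example.com/'):
#       pass  # url has not been discovered yet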

def set_canonical_group_to_alias(db, original_url, canonical_group):
    """If there was a redirect, set the canonical group
    on the original alias url"""

    modification = {'canonical_group': canonical_group}
    return db['Urls'].find_one_and_update({'_id': urls.hash(original_url)},
                                          {'$set': modification})

def set_visited_url(db, url, response, soup, noindex, original_url=None):
    """Try to mark url as visited and update other important information"""
    url_hash = urls.hash(url)

    is_redirect, is_permanent_redirect = _determine_type_of_redirect(response)

    url_addition = {}

    # Remove script tags so they do not pollute the extracted text
    for script in soup('script'):
        script.extract()

    text = soup.getText(separator='\n')

    try:
        url_addition['language'] = detect(text)
    except Exception:
        # Language detection fails on empty or unrecognizable text;
        # fall back to Czech as the default
        url_addition['language'] = 'cs'

    # Pair url with its canonical group via the document text hash
    text_hash = document.hash_document(
        document.extract_document_text_for_hash(soup))
    url_addition['canonical_group'] = get_or_create_canonical_group(
        db, text_hash)

    url_addition['visited'] = True
    url_addition['queued'] = False
    url_addition['indexed'] = False
    url_addition['noindex'] = noindex

    url_addition['progress.last_visited'] = str(datetime.utcnow())

    url_addition['content.binary'] = response.content
    url_addition['content.hashes.text'] = text_hash
    url_addition['content.encoding'] = response.encoding

    url_addition['response.elapsed'] = str(response.elapsed)
    url_addition['response.is_redirect'] = is_redirect
    url_addition['response.is_permanent_redirect'] = is_permanent_redirect
    url_addition['response.status_code'] = response.status_code
    url_addition['response.reason'] = response.reason

    url_addition = _format_response_header(response, url_addition)

    # find_one_and_update returns the matched (pre-update) document,
    # or None when nothing matched the filter
    result = db['Urls'].find_one_and_update({'_id': url_hash},
                                            {'$set': url_addition})

    # If there was a redirect, set the canonical group on the original alias url
    if original_url is not None:
        set_canonical_group_to_alias(db, original_url,
                                     url_addition['canonical_group'])

    # If the update matched a document, refresh the canonical group representative
    if result is not None:
        _update_representatives_of_canonical_groups(
            db, url_addition['canonical_group'])

    return result is not None
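
# Example usage (a sketch; assumes the `requests` library, which this
# module's response objects appear to come from):
#
#   response = requests.get('http://example.com/')
#   soup = BeautifulSoup(response.content, 'html5lib')
#   noindex = link_extractor.has_noindex(soup)
#   set_visited_url(db, 'http://example.com/', response, soup, noindex)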

def set_url_for_recrawl(db, url):
    """Mark url to be crawled again later"""
    url_hash = urls.hash(url)

    result = db['Urls'].find_one_and_update(
        {'_id': url_hash}, {'$set': {
            'queued': False,
            'visited': False
        }})

    return result is not None

def batch_insert_pagerank_outlinks(db, from_url, to_urls):
    """Insert a batch of outlink edges into the database"""

    url_documents = []

    for to_url in to_urls:
        to_url = to_url.get('url')
        url_object = {
            'from_hash': urls.hash(from_url),
            'to_hash': urls.hash(to_url)
        }

        url_documents.append(url_object)

    try:
        # ordered=False lets insert_many continue past duplicate-key
        # failures; the BulkWriteError raised afterwards is swallowed so
        # that re-discovered edges do not abort the whole batch
        result = db['PageRank'].insert_many(url_documents, ordered=False)
    except pymongo.errors.BulkWriteError:
        result = None

    return result
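
# Duplicate-key failures above presuppose a unique index on the edge pair;
# it would be created roughly like this (an assumption, since the index
# setup is not part of this module):
#
#   db['PageRank'].create_index(
#       [('from_hash', pymongo.ASCENDING), ('to_hash', pymongo.ASCENDING)],
#       unique=True)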

def set_timeout_url(db, url):
    """Try to mark url as timed out"""
    url_hash = urls.hash(url)

    result = db['Urls'].find_one_and_update({'_id': url_hash}, {
        '$set': {
            'queued': False,
            'timeout.timeout': True,
            'timeout.last_timeout': str(datetime.utcnow())
        }
    })

    return result is not None

def set_alias_visited_url(db, url):
    """Mark url as a visited alias (the source of a redirect)"""
    url_hash = urls.hash(url)

    url_addition = {}

    url_addition['visited'] = True
    url_addition['queued'] = False
    url_addition['alias'] = True
    url_addition['progress.last_visited'] = str(datetime.utcnow())

    result = db['Urls'].find_one_and_update({'_id': url_hash},
                                            {'$set': url_addition})

    return result is not None

def set_visited_invalid_url(db, url, response, reason, is_file=False):
    """Mark url as visited but invalid, recording the reason"""
    url_hash = urls.hash(url)

    url_addition = {}

    url_addition['visited'] = True
    url_addition['queued'] = False
    url_addition['invalid'] = True
    url_addition['file'] = is_file
    url_addition['invalid_reason'] = reason
    url_addition['progress.last_visited'] = str(datetime.utcnow())

    result = db['Urls'].find_one_and_update({'_id': url_hash},
                                            {'$set': url_addition})

    return result is not None

def _prepare_url_object(url, visited, queued, depth):
    """Prepare a url object before inserting it into the database"""
    url_object = {
        '_id': urls.hash(url),
        'url': url,
        'domain': urls.domain(url),
        'depth': depth,
        'visited': visited,
        'queued': queued,
        'alias': False,
        'invalid': False,
        'file': False,
        'progress': {
            'discovered': str(datetime.utcnow())
        }
    }

    return url_object

def select_representative_for_canonical_group(db, canonical_group):
    """Return the id of the URL which is most suitable
    as representative of the canonical group"""

    urls_representatives = db['Urls'].find({
        'canonical_group': ObjectId(canonical_group),
        'alias': False,
        'invalid': False
    })

    representatives = [url.get('url') for url in urls_representatives]

    # Return hash of the shortest url
    return urls.hash(min(representatives, key=len))
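
# Note: min() raises ValueError when the canonical group has no non-alias,
# non-invalid urls; callers are assumed to invoke this only for groups
# that still contain at least one such url.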


def _handle_response(database,
                     url,
                     original_url,
                     redirected,
                     response,
                     depth,
                     max_depth,
                     limit_domain,
                     blacklist,
                     ignore_blacklist=False):
    try:
        url_document = mongodb.get_url(database, url)
        regex = urls.generate_regex(limit_domain)

        # Redirect handling
        if original_url != url:
            log.info('Redirect: {0} (original: {1})'.format(url, original_url))

            # Check if redirected url is valid
            is_valid_redirect, reason = validator.validate(
                url, regex, blacklist)

            if (is_valid_redirect is False) and (
                    reason == 'UrlIsBlacklisted') and ignore_blacklist:
                is_valid_redirect = True

            if is_valid_redirect:
                mongodb.set_alias_visited_url(database, original_url)

                url_document = mongodb.get_url(database, url)

                if url_document is not None:
                    if url_document.get('visited') and not url_document.get('alias'):
                        canonical_group = url_document.get('canonical_group')
                        mongodb.set_canonical_group_to_alias(
                            database, original_url, canonical_group)

                        log.info('Already visited redirect: {0} (original: {1})'
                                 .format(url, original_url))

                        return
                else:
                    if not urls.is_same_domain(url, original_url):
                        depth = max_depth

                    mongodb.insert_url(database, url, False, False, depth)

            else:
                mongodb.set_visited_invalid_url(database, original_url,
                                                response, "invalid_redirect")
                mongodb.delete_pagerank_edge_to(database,
                                                urls.hash(original_url))

                log.info('Not Valid Redirect: {0} (original: {1})'.format(
                    url, original_url))

                return
        else:
            # Check if url is already visited
            if url_document is not None:
                if url_document.get('visited'):
                    log.info('Already visited: {0}'.format(url))
                    return

        # File handling
        content_type = response.headers.get('Content-Type', '')

        is_content_type_file = test_content_type_file(content_type)
        is_file_valid_type = test_file_valid_type(content_type)

        if is_content_type_file:
            if not is_file_valid_type:
                mongodb.delete_pagerank_edge_to(database, urls.hash(url))
                mongodb.set_visited_invalid_url(database, url, response,
                                                "invalid_file", True)

                log.info('Not valid file: {0}'.format(url))
                return
            else:
                if original_url != url:
                    mongodb.set_visited_file_url(database, url, response,
                                                 original_url)
                else:
                    mongodb.set_visited_file_url(database, url, response)
                log.info('Done (file) [{0}]: {1}'.format(response.reason, url))
        else:
            # Handle normal page
            soup = BeautifulSoup(response.content, 'html5lib')
            no_index = link_extractor.has_noindex(soup)
            validated_urls_on_page, not_valid_urls = link_extractor.validated_page_urls(
                soup, url, regex, blacklist)

            urls_for_insert = []

            for page_url in validated_urls_on_page:
                insert_url = {'url': page_url}

                if urls.is_same_domain(url, page_url):
                    # Same-domain links consume the remaining depth budget;
                    # skip them once it is exhausted
                    if depth - 1 <= 0:
                        continue
                    insert_url['depth'] = depth - 1
                else:
                    # Cross-domain links start over with the full depth budget
                    insert_url['depth'] = max_depth

                urls_for_insert.append(insert_url)

            if urls_for_insert:
                mongodb.batch_insert_url(database, urls_for_insert, False,
                                         False)
                mongodb.batch_insert_pagerank_outlinks(database, url,
                                                       urls_for_insert)

            if original_url != url:
                mongodb.set_visited_url(database, url, response, soup,
                                        no_index, original_url)
            else:
                mongodb.set_visited_url(database, url, response, soup,
                                        no_index)

            log.info('Done [{0}]: {1}'.format(response.reason, url))

            return
    except Exception as e:
        # On any failure, remove the url record before re-raising so that
        # a later attempt starts from a clean state
        mongodb.delete_url(database, url)
        log.exception('Exception: {0} {1}'.format(url, e))
        raise

def set_visited_file_url(db, url, response, original_url=None):
    """Save the file into the database and mark it as visited"""

    content_type = response.headers.get('Content-Type', '')

    if 'application/pdf' in content_type:
        file_type = 'pdf'
    elif 'text/plain' in content_type:
        file_type = 'txt'
    else:
        file_type = None

    url_hash = urls.hash(url)

    is_redirect, is_permanent_redirect = _determine_type_of_redirect(response)

    url_addition = {}

    # Pair url with its canonical group via the content hash
    content_hash = urls.hash_document(response.content)
    url_addition['canonical_group'] = get_or_create_canonical_group(
        db, content_hash)

    url_addition['visited'] = True
    url_addition['queued'] = False
    url_addition['indexed'] = False
    url_addition['noindex'] = False
    url_addition['file'] = True
    url_addition['file_type'] = file_type

    url_addition['progress.last_visited'] = str(datetime.utcnow())

    # Store the raw bytes in GridFS (the fs.files / fs.chunks collections
    # of this database); only the returned file id is kept on the url document
    fs = gridfs.GridFS(db)
    file_id = fs.put(response.content)

    url_addition['content.binary'] = file_id

    url_addition['content.hashes.content'] = content_hash

    url_addition['response.elapsed'] = str(response.elapsed)
    url_addition['response.is_redirect'] = is_redirect
    url_addition['response.is_permanent_redirect'] = is_permanent_redirect
    url_addition['response.status_code'] = response.status_code
    url_addition['response.reason'] = response.reason

    url_addition = _format_response_header(response, url_addition)

    result = db['Urls'].find_one_and_update({'_id': url_hash},
                                            {'$set': url_addition})

    # If there was a redirect, set the canonical group on the original alias url
    if original_url is not None:
        set_canonical_group_to_alias(db, original_url,
                                     url_addition['canonical_group'])

    # If the update matched a document, refresh the canonical group representative
    if result is not None:
        _update_representatives_of_canonical_groups(
            db, url_addition['canonical_group'])

    return result is not None
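
# The stored bytes can later be read back through the same GridFS bucket
# (sketch):
#
#   fs = gridfs.GridFS(db)
#   data = fs.get(file_id).read()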

def get_url(db, url):
    """Return the url document from db, or None if it does not exist"""
    document = db['Urls'].find_one({'_id': urls.hash(url)})

    return document

def delete_url(db, url):
    """Try to delete url from db; returns True in case of success"""
    result = db['Urls'].delete_one({'_id': urls.hash(url)})

    return result.deleted_count > 0