Exemple #1
0
def process_get_tags_by_filter(filename, tags, yield_text, debug):
    """
    Load an HTTP response from a temp file, collect the tags which match the
    filter and write them to a temp file.

    :param filename: Passed to load_http_response_from_temp_file()
    :param tags: Tag filter forwarded to the parser's get_tags_by_filter()
    :param yield_text: Forwarded as the yield_text keyword argument
    :param debug: Not used by this function
    :return: Whatever write_tags_to_temp_file() returns for the collected tags
    """
    response = load_http_response_from_temp_file(filename)
    parser = DocumentParser(response).get_parser()

    # Some parsers do not expose tags at all: store an empty result
    if not hasattr(parser, 'get_tags_by_filter'):
        return write_tags_to_temp_file([])

    matching_tags = list(parser.get_tags_by_filter(tags,
                                                   yield_text=yield_text))

    debug_args = (len(matching_tags), response.get_uri(), tags)
    om.out.debug(('Returned %s Tag instances at get_tags_by_filter() for URL %s'
                  ' and tags filter %r') % debug_args)

    return write_tags_to_temp_file(matching_tags)
def process_get_tags_by_filter(http_resp, tags, yield_text, debug):
    """
    Collect the tags of an HTTP response which match the given filter.

    :param http_resp: The HTTP response handed to DocumentParser
    :param tags: Tag filter forwarded to get_tags_by_filter()
    :param yield_text: Forwarded as the yield_text keyword argument
    :param debug: Not used by this function
    :return: A list with the matching tags; an empty list when the document
             parser has no get_tags_by_filter attribute
    """
    doc_parser = DocumentParser(http_resp)

    if hasattr(doc_parser, 'get_tags_by_filter'):
        # Consume the whole iterable so the caller receives a plain list
        return list(doc_parser.get_tags_by_filter(tags, yield_text=yield_text))

    # Not all parsers have tags
    return []
Exemple #3
0
    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        The answer is stored in self._can_parse_cache using the response id
        as key, so asking again for the same response id does not need to
        call DocumentParser.can_parse() a second time.

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        response_id = http_response.get_id()
        cached_can_parse = self._can_parse_cache.get(response_id, default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        #
        # We need to verify if we can parse this HTTP response
        #
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except Exception:
            # We catch all the regular exceptions here and just return False
            # because the real parsing procedure will (most likely) fail to
            # parse this response too. KeyboardInterrupt and SystemExit are
            # not swallowed.
            can_parse = False

        # The key must be the response id: that is what the lookup at the top
        # of this method uses. Keying on the boolean result would never hit.
        self._can_parse_cache[response_id] = can_parse
        return can_parse
Exemple #4
0
def process_get_tags_by_filter(http_resp, tags, yield_text,
                               processes, hash_string, debug):
    """
    Record the current process id in the shared `processes` mapping and then
    collect the tags of an HTTP response which match the given filter.

    :param http_resp: The HTTP response handed to DocumentParser
    :param tags: Tag filter forwarded to get_tags_by_filter()
    :param yield_text: Forwarded as the yield_text keyword argument
    :param processes: Mapping where the pid is stored under hash_string
    :param hash_string: Key used to store the pid in processes
    :param debug: Not used by this function
    :return: A list with the matching tags; an empty list when the document
             parser has no get_tags_by_filter attribute
    """
    # Register the pid before parsing, so it is available for tracking
    processes[hash_string] = multiprocessing.current_process().pid

    doc_parser = DocumentParser(http_resp, False)

    if hasattr(doc_parser, 'get_tags_by_filter'):
        return list(doc_parser.get_tags_by_filter(tags, yield_text=yield_text))

    # Not all parsers have tags
    return []
Exemple #5
0
def process_get_tags_by_filter(http_resp, tags, yield_text, debug):
    """
    Collect the tags of an HTTP response which match the given filter and
    write a debug message with the number of tags found.

    :param http_resp: The HTTP response handed to DocumentParser
    :param tags: Tag filter forwarded to the parser's get_tags_by_filter()
    :param yield_text: Forwarded as the yield_text keyword argument
    :param debug: Not used by this function
    :return: A list with the matching tags; an empty list when the parser has
             no get_tags_by_filter attribute
    """
    parser = DocumentParser(http_resp).get_parser()

    # Some parsers do not expose tags at all
    if not hasattr(parser, 'get_tags_by_filter'):
        return []

    matching_tags = list(parser.get_tags_by_filter(tags,
                                                   yield_text=yield_text))

    debug_args = (len(matching_tags), http_resp.get_uri(), tags)
    om.out.debug(('Returned %s Tag instances at get_tags_by_filter() for URL %s'
                  ' and tags filter %r') % debug_args)

    return matching_tags
Exemple #6
0
def process_document_parser(http_resp, debug):
    """
    Build a DocumentParser for an HTTP response, writing debug messages with
    the current process id when requested.

    :param http_resp: The HTTP response handed to DocumentParser
    :param debug: When true, debug messages are written before parsing and
                  when parsing raises an exception
    :raise: Any exception raised by DocumentParser is re-raised unchanged
    """
    pid = multiprocessing.current_process().pid

    if debug:
        msg = '[mp_document_parser] PID %s is starting to parse %s'
        args = (pid, http_resp.get_url())
        om.out.debug(msg % args)

    try:
        # Parse
        document_parser = DocumentParser(http_resp)
    except Exception as e:
        # "except ... as" is valid in Python 2.6+ and Python 3, the older
        # "except Exception, e" form is a syntax error in Python 3
        if debug:
            msg = ('[mp_document_parser] PID %s finished parsing %s with'
                   ' exception: "%s"')
            args = (pid, http_resp.get_url(), e)
            om.out.debug(msg % args)
        raise
    # NOTE(review): no return statement is visible after the try block, so
    # document_parser is discarded here - confirm the function is complete
Exemple #7
0
 def _test_parse_http_response(self, http_response, *args):
     """
     Testing helper: build a DocumentParser for the HTTP response.

     :param http_response: The HTTP response handed to DocumentParser
     :param args: Accepted and ignored
     :return: The new DocumentParser instance
     """
     parsed_document = DocumentParser(http_response)
     return parsed_document