def process_cdx_url(connection, url, batch_size=100, source='cc', **kwargs):
    '''
    download captures of url from a cdx index and bulk insert their extracted
    metadata into the db

    NOTE:
    ideally, this function would be wrapped in a transaction;
    but this causes deadlocks when it is run concurrently with other instances of itself
    '''
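    # cdx_toolkit supports source='cc' (Common Crawl) and source='ia'
    # (the Internet Archive's wayback machine)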
    cdx = cdx_toolkit.CDXFetcher(source)

    # create a new entry in the source table for this bulk insertion
    name = 'process_cdx_url(url="' + str(url) + '", source="' + str(
        source) + '", **kwargs=' + str(kwargs) + ')'
    log.info("name=" + str(name.replace('"', r'\"')))
    try:
        sql = sqlalchemy.sql.text('''
        INSERT INTO source (name) VALUES (:name) RETURNING id;
        ''')
        res = connection.execute(sql, {'name': name})
        id_source = res.first()['id']
        log.info('id_source=' + str(id_source))

    # if an entry already exists in source,
    # then this bulk insertion has already happened (although may not be complete),
    # and so we skip this insertion
    except sqlalchemy.exc.IntegrityError:
        log.warning('skipping name=' + name)
        return
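
    # NOTE:
    # the skip above relies on a UNIQUE constraint on source.name;
    # a hypothetical sketch of the schema this function assumes:
    #     CREATE TABLE source (
    #         id SERIAL PRIMARY KEY,
    #         name TEXT UNIQUE NOT NULL,
    #         finished_at TIMESTAMPTZ
    #     );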

    # ensure that we search all records, and not just records from the last year
    if 'from_ts' not in kwargs:
        kwargs['from_ts'] = '19000101000000'

    # the cc archive supports filtering by status code, but the ia archive does not;
    # since we only care about status=200, add this filter if possible
    if 'filter' not in kwargs and source == 'cc':
        kwargs['filter'] = 'status:200'

    # estimate the total number of matching urls
    estimated_urls = cdx.get_size_estimate(url, kwargs)
    log.info("estimated_urls=" + str(estimated_urls))

    # loop through each matching url
    # and add it to the batch
    batch = []
    for i, result in enumerate(cdx.iter(url, **kwargs)):

        # process only urls with 200 status code (i.e. successful)
        if result['status'] == '200':
            log.info('fetching result; progress=' + str(i) + '/' +
                     str(estimated_urls) +
                     '={:10.4f}'.format(i / max(estimated_urls, 1)) + ' url=' +
                     result['url'])

            # fetch the full warc record for this capture
            record = result.fetch_warc_record()

            # extract the information from the warc record;
            # cdx timestamps are 14 digit strings in YYYYMMDDHHMMSS format
            url = result['url']
            accessed_at = datetime.strptime(result['timestamp'],
                                            '%Y%m%d%H%M%S')
            html = result.content
            log.debug("url=" + url)

            # extract the metainfo using the metahtml library
            try:
                meta = metahtml.parse(html, url)
                try:
                    pspacy_title = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['title']['best']['value'])
                    pspacy_content = pspacy.lemmatize(
                        meta['language']['best']['value'],
                        meta['content']['best']['value'])
                except TypeError:
                    pspacy_title = None
                    pspacy_content = None

            # if there was an error in metahtml, log it
            except Exception as e:
                log.warning('url=' + url + ' exception=' + str(e))
                meta = {
                    'exception': {
                        'str(e)': str(e),
                        'type': type(e).__name__,
                        'location': 'metahtml',
                        'traceback': traceback.format_exc()
                    }
                }
                pspacy_title = None
                pspacy_content = None

            # append to the batch
            batch.append({
                'accessed_at': accessed_at,
                'id_source': id_source,
                'url': url,
                'jsonb': json.dumps(meta, default=str),
                'pspacy_title': pspacy_title,
                'pspacy_content': pspacy_content
            })

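        # bulk insert the batch once it is full, then start a new batch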
        if len(batch) >= batch_size:
            bulk_insert(connection, batch)
            batch = []

    # finished loading urls,
    # so insert the last batch and update the source table
    if len(batch) > 0:
        bulk_insert(connection, batch)
        batch = []
    sql = sqlalchemy.sql.text('''
    UPDATE source SET finished_at=now() WHERE id=:id;
    ''')
    connection.execute(sql, {'id': id_source})
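

# A minimal usage sketch for process_cdx_url; the connection string and url
# pattern below are hypothetical:
#
#     engine = sqlalchemy.create_engine('postgresql:///metahtml')
#     with engine.connect() as connection:
#         process_cdx_url(connection, 'example.com/*', source='cc', limit=10)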


def process_warc_from_disk(connection, warc_path, id_source, batch_size=100):
    '''
    read the warc file at warc_path and bulk insert the metahtml metadata of
    its http 200 responses into the db, attributed to the given id_source
    '''
    with open(warc_path, 'rb') as stream:

        # for efficiency, we will not insert items into the db one at a time;
        # instead, we add them to the batch list,
        # and then bulk insert the batch list when it reaches len(batch)==batch_size
        batch = []

        for record in ArchiveIterator(stream):

            # WARC files contain many types of entries;
            # we only care about responses with an http 200 status code
            if (record.rec_type == 'response'
                    and record.http_headers is not None
                    and record.http_headers.get_statuscode() == '200'):

                # extract the information from the warc archive;
                # WARC-Date is an iso 8601 timestamp string, which postgres
                # parses directly on insert
                url = record.rec_headers.get_header('WARC-Target-URI')
                accessed_at = record.rec_headers.get_header('WARC-Date')
                html = record.content_stream().read()
                log.debug("url=" + url)

                # extract the metainfo using the metahtml library
                try:
                    meta = metahtml.parse(html, url)
                    try:
                        pspacy_title = pspacy.lemmatize(
                            meta['language']['best']['value'],
                            meta['title']['best']['value'])
                        pspacy_content = pspacy.lemmatize(
                            meta['language']['best']['value'],
                            meta['content']['best']['value'])
                    except TypeError:
                        pspacy_title = None
                        pspacy_content = None

                # if there was an error in metahtml, log it
                except Exception as e:
                    log.warning('url=' + url + ' exception=' + str(e))
                    meta = {
                        'exception': {
                            'str(e)': str(e),
                            'type': type(e).__name__,
                            'location': 'metahtml',
                            'traceback': traceback.format_exc()
                        }
                    }
                    pspacy_title = None
                    pspacy_content = None

                # add the results to the batch
                meta_json = json.dumps(meta, default=str)
                batch.append({
                    'accessed_at': accessed_at,
                    'id_source': id_source,
                    'url': url,
                    'jsonb': meta_json,
                    'pspacy_title': pspacy_title,
                    'pspacy_content': pspacy_content
                })

            # bulk insert the batch once it is full, then start a new batch
            if len(batch) >= batch_size:
                bulk_insert(connection, batch)
                batch = []

        # we have finished looping over the archive;
        # we should bulk insert everything in the batch list that hasn't been inserted
        if len(batch) > 0:
            bulk_insert(connection, batch)
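

# A minimal usage sketch for process_warc_from_disk; the warc path and
# id_source value below are hypothetical:
#
#     engine = sqlalchemy.create_engine('postgresql:///metahtml')
#     with engine.connect() as connection:
#         process_warc_from_disk(connection, 'crawl.warc.gz', id_source=1)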