Ejemplo n.º 1
0
def process_instrument(row=None,
                       db=None,
                       refresh=False,
                       tree=None,
                       latest=False,
                       strategy={'links': process_instrument_links}):
    """Process one document row and persist the processed result.

    The document tree is run through extra formatting, link population
    (``strategy['links']``), definition population and definition marking.
    The resulting tree is serialised as HTML into
    ``documents.processed_document`` and the rendered definitions are
    written to the ``definitions`` table, after which the transaction is
    committed.

    Parameters:
        row: mapping for the document; the keys read here are ``title``,
            ``document``, ``latest``, ``version`` and ``id``.
        db: database connection; when falsy, ``get_db()`` is used for the
            writes. The value is passed through unchanged (possibly
            ``None``) to the links strategy and ``add_parent_definitions``.
        refresh: when true, ``select update_views()`` is executed after
            the writes.
        tree: already-parsed document tree; when ``None`` it is parsed from
            ``row['document']`` with ``large_parser``.
        latest: not read by this function; the ``old-version`` flag is
            driven by ``row.get('latest', True)`` instead.
        strategy: mapping with a ``'links'`` callable taking ``(tree, db)``
            and returning the tree. The default dict is shared between
            calls, so it is treated as read-only here.

    Returns:
        A ``(tree, definitions)`` tuple holding the processed tree and the
        definitions object returned by ``process_definitions``.
    """
    logger = current_app.logger
    logger.info('Processing: %s', row.get('title'))

    # Compare against None explicitly: an lxml element without children is
    # falsy, so ``not tree`` would throw away a caller-supplied tree.
    if tree is None:
        tree = etree.fromstring(row.get('document'), parser=large_parser)
    if not row.get('latest', True):
        tree.attrib['old-version'] = 'true'

    document_id = row.get('id')
    definitions = Definitions()

    extra_formatting(tree, row.get('version'))

    start = time.time()
    tree = strategy['links'](tree, db)
    logger.debug('Populated Links in %.2f seconds', time.time() - start)

    title = unicode(row.get('title').decode('utf-8'))
    start = time.time()
    tree, definitions = populate_definitions(tree,
                                             definitions=definitions,
                                             title=title,
                                             expire=True,
                                             document_id=document_id)
    logger.debug('Populated Definitions in %.2f seconds',
                 time.time() - start)

    definitions = add_parent_definitions(row,
                                         definitions=definitions,
                                         db=db,
                                         strategy=strategy)

    # Now mark the definitions inside the tree.
    start = time.time()
    tree, definitions = process_definitions(tree, definitions)
    logger.debug('Found Definitions in %.2f seconds', time.time() - start)

    # Resolve the connection once so the cursor and the commit are
    # guaranteed to use the same connection object.
    connection = db or get_db()
    with connection.cursor() as cur:
        query = """UPDATE documents d SET processed_document =  %(doc)s
                    WHERE  d.id =  %(id)s """
        cur.execute(
            query, {
                'id': document_id,
                'doc': etree.tostring(tree, encoding='UTF-8', method="html"),
            })

        defs = definitions.render(document_id=document_id)
        def_count = len(defs)
        # NOTE(review): existing rows are only deleted when new definitions
        # were rendered, so a document that no longer yields any keeps its
        # old rows -- confirm this is intended.
        if def_count:
            logger.info('New Definitions: %d', def_count)
            # mogrify escapes each value tuple, so the multi-row VALUES
            # list can be appended to the INSERT statement below.
            args_str = ','.join(
                cur.mogrify("(%s,%s,%s,%s,%s,%s,%s)", (
                    document_id, x['id'], x['full_word'], x['keys'],
                    x['html'], x['expiry_tags'], x['priority'])) for x in defs)
            cur.execute("DELETE FROM definitions where document_id = %(id)s",
                        {'id': document_id})
            cur.execute(
                "INSERT INTO definitions (document_id, id, full_word, words, html, expiry_tags, priority) VALUES "
                + args_str)
        if refresh:
            cur.execute("select update_views()")
    connection.commit()
    return tree, definitions