Example #1
0
def classify_tweets(rules):
    regexen = [d.get('regex') for (a, d) in rules.items()]
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    fields = [status_tbl.c.id, status_tbl.c.text, user_tbl.c.id, user_tbl.c.name, user_tbl.c.screen_name]
    q = sql.select(fields, from_obj=q, use_labels=True)
    dt = datetime.utcnow() - timedelta(days=28)
    q = q.where(sql.and_(status_tbl.c.lang == 'de',
                         status_tbl.c.id >= min(offsets.values()),
                         status_tbl.c.created_at > dt))
    q = q.order_by(status_tbl.c.id.asc())

    offset = 0
    while True:
        engine.begin()
        lq = q.limit(PAGE_SIZE).offset(offset)
        offset += PAGE_SIZE
        print offset, PAGE_SIZE
        has_records = False
        for i, status in enumerate(engine.query(lq)):
            has_records = True
            max_id = max(max_id, status.get('status_id'))
            handle_status(status, rules, offsets)
        if not has_records:
            break
        for regex in regexen:
            offset_table.upsert({'regex': regex, 'status_id': max_id}, ['regex'])
        engine.commit()
    dedup_tags()
Example #2
0
def classify_tweets():
    rules, regexen = get_rules()
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    engine.begin()
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True)
    q = q.where(sql.and_(status_tbl.c.lang == 'de',
                         status_tbl.c.id >= min(offsets.values())))
    q = q.order_by(status_tbl.c.id.desc())
    for i, status in enumerate(engine.query(q)):
        max_id = max(max_id, status.get('status_id'))
        for (field, rule), data in rules.items():
            if offsets.get(data.get('regex')) > status.get('status_id'):
                continue
            m = rule.search(unicode(status.get(field)).lower())
            #print [field,data.get('regex'), m]
            if m is not None:
                #print [field, data.get('regex'), m]
                data['status_id'] = status['status_id']
                tag_table.insert(data)
        if i % 1000 == 0:
            print 'Processed: ', i
    for regex in regexen:
        offset_table.upsert({'regex': regex, 'status_id': max_id}, ['regex'])
    engine.commit()
    dedup_tags()
Example #3
0
def delete_old_tags(rules):
    """Delete tags whose regex no longer appears in ``rules``.

    Within one transaction, every distinct ``regex`` value found in
    ``tag_table`` is compared with the ``'regex'`` entries of the rule dicts;
    rows for regexes that are not among them are deleted.

    :param rules: mapping whose values are dicts providing a ``'regex'``
        entry.
    """
    engine.begin()
    active = [spec.get('regex') for (_, spec) in rules.items()]
    for row in tag_table.distinct('regex'):
        if row.get('regex') in active:
            continue
        tag_table.delete(regex=row.get('regex'))
    engine.commit()
Example #4
0
def dump_batches():
    """Move one batch of raw tweets from ``raw_table`` into a JSON dump file.

    When fewer than ``BATCH_SIZE`` rows remain, nothing is done.  Otherwise
    the first ``BATCH_SIZE`` rows (ordered by ``id``) are decoded from their
    ``json`` column, deleted from the table and written as one JSON array to
    ``dumps/raw_<id of the first row>.json``.  The deletions are committed
    only after the file has been written; if anything fails before that, the
    transaction is rolled back instead of being left open with uncommitted
    deletions, and the error is re-raised.

    :returns: ``True`` if a batch was dumped, ``False`` if there were not
        enough rows.
    """
    if len(raw_table) < BATCH_SIZE:
        log.info("Not enough entries remaining.")
        return False
    data, min_id = [], None
    log.info("Fetching %s raw tweets...", BATCH_SIZE)
    engine.begin()
    try:
        for row in raw_table.find(_limit=BATCH_SIZE, order_by=['id']):
            if min_id is None:
                # Rows arrive ordered by id, so the first one names the file.
                min_id = row['id']
            data.append(json.loads(row['json']))
            raw_table.delete(id=row['id'])
        log.info("Saving file...")
        # The context manager closes the file even when serialisation fails.
        with open('dumps/raw_%s.json' % min_id, 'wb') as fh:
            json.dump(data, fh)
    except Exception:
        engine.rollback()
        raise
    engine.commit()
    return True
Example #5
0
def dump_batches():
    """Move one batch of raw tweets from ``raw_table`` into a dump file.

    When fewer than ``BATCH_SIZE`` rows remain, nothing is done.  Otherwise
    the ``json`` column values of the first ``BATCH_SIZE`` rows (ordered by
    ``id``) are collected, the rows are deleted, and the values are written
    newline-separated and UTF-8 encoded to
    ``dumps/raw_<id of the first row>.json``.  The deletions are committed
    only after the file has been written; if anything fails before that, the
    transaction is rolled back instead of being left open with uncommitted
    deletions, and the error is re-raised.

    :returns: ``True`` if a batch was dumped, ``False`` if there were not
        enough rows.
    """
    if len(raw_table) < BATCH_SIZE:
        log.info("Not enough entries remaining.")
        return False
    data, min_id = [], None
    log.info("Fetching %s raw tweets...", BATCH_SIZE)
    engine.begin()
    try:
        for row in raw_table.find(_limit=BATCH_SIZE, order_by=['id']):
            if min_id is None:
                # Rows arrive ordered by id, so the first one names the file.
                min_id = row['id']
            data.append(row['json'])
            raw_table.delete(id=row['id'])
        log.info("Saving file...")
        payload = '\n'.join(data).encode('utf-8')
        # The context manager closes the file even when the write fails.
        with open('dumps/raw_%s.json' % min_id, 'wb') as fh:
            fh.write(payload)
    except Exception:
        engine.rollback()
        raise
    engine.commit()
    return True
Example #6
0
def classify_tweets(rules):
    regexen = [d.get('regex') for (a, d) in rules.items()]
    offsets = get_offsets(regexen)
    delete_old_tags(regexen)
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    max_id = 0
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    fields = [
        status_tbl.c.id, status_tbl.c.text, user_tbl.c.id, user_tbl.c.name,
        user_tbl.c.screen_name
    ]
    q = sql.select(fields, from_obj=q, use_labels=True)
    dt = datetime.utcnow() - timedelta(days=28)
    q = q.where(
        sql.and_(status_tbl.c.lang == 'de',
                 status_tbl.c.id >= min(offsets.values()),
                 status_tbl.c.created_at > dt))
    q = q.order_by(status_tbl.c.id.asc())

    offset = 0
    while True:
        engine.begin()
        lq = q.limit(PAGE_SIZE).offset(offset)
        offset += PAGE_SIZE
        print offset, PAGE_SIZE
        has_records = False
        for i, status in enumerate(engine.query(lq)):
            has_records = True
            max_id = max(max_id, status.get('status_id'))
            handle_status(status, rules, offsets)
        if not has_records:
            break
        for regex in regexen:
            offset_table.upsert({
                'regex': regex,
                'status_id': max_id
            }, ['regex'])
        engine.commit()
    dedup_tags()
Example #7
0
def classify_tweets(rules):
    regexen = [d.get('regex') for (a, d) in rules.items()]
    offsets = get_offsets(regexen)
    delete_old_tags(rules)

    q = text("""
        INSERT INTO tag (category, tag, status_id, classified_at, regex) 
        SELECT :category, :tag, s.id, NOW(), :regex
            FROM status s
            LEFT JOIN tag_offset tgo ON tgo.regex = :regex
            LEFT JOIN "user" u ON s.user_id = u.id
            WHERE
                (s.id > tgo.status_id OR tgo.status_id IS NULL) AND
                (s.text ~* :regex
                 OR u.name ~* :regex
                 OR u.screen_name ~* :regex)
                AND s.lang = 'de'
                AND s.created_at > NOW() - INTERVAL '28 days'
        """)

    offsets_q = text("""
        INSERT INTO tag_offset (regex, status_id)
            SELECT :regex, t.status_id
                FROM tag t
                WHERE t.regex = :regex 
                ORDER BY t.status_id DESC
                LIMIT 1
        """)

    for rule in rules.values():
        print rule
        engine.begin()
        engine.query(q, **rule)
        offset_table.delete(regex=rule['regex'])
        engine.query(offsets_q, regex=rule['regex'])
        engine.commit()

    dedup_tags()
Example #8
0
def delete_old_tags(regexen):
    """Delete all tags whose regex is not listed in ``regexen``.

    Runs in a single transaction: each distinct ``regex`` value present in
    ``tag_table`` is checked against ``regexen`` and its rows are deleted
    when it is not contained.

    :param regexen: collection of regex values that are still in use.
    """
    engine.begin()
    for row in tag_table.distinct('regex'):
        if row.get('regex') in regexen:
            continue
        tag_table.delete(regex=row.get('regex'))
    engine.commit()
Example #9
0
def delete_old_tags(regexen):
    """Purge tags that belong to regexes missing from ``regexen``.

    Opens a transaction, walks the distinct ``regex`` values stored in
    ``tag_table`` and deletes the rows of every value that ``regexen`` does
    not contain, then commits.

    :param regexen: collection of regex values that should be kept.
    """
    engine.begin()
    distinct_rows = tag_table.distinct('regex')
    for entry in distinct_rows:
        is_current = entry.get('regex') in regexen
        if not is_current:
            tag_table.delete(regex=entry.get('regex'))
    engine.commit()
Example #10
0
def parse(filename, file_content):
    """Parse one notice XML document and store its contents in the database.

    The ``FORM_SECTION`` element is detached from the tree first; the
    remaining tree is read through an ``Extractor`` into a flat document
    dict plus lists of CPV codes and referenced notice numbers.  Paths that
    are deliberately not read are registered via ``ignore`` before
    ``audit()`` is called.  When the form selected for the document's
    original language has a tag starting with ``CONTRACT_AWARD_``, it is
    parsed into contract records.  Finally, inside one transaction, all rows
    previously stored for the document number are deleted from the four
    tables and the fresh rows are inserted.

    :param filename: not used by the function body.
    :param file_content: the XML document as a string.
    """
    # Turn the default-namespace declaration into an ordinary attribute
    # before parsing (presumably so the un-namespaced paths below match).
    markup = file_content.replace('xmlns="', 'xmlns_="')
    tree = etree.fromstring(markup)

    form_section = tree.find('.//FORM_SECTION')
    form_section.getparent().remove(form_section)

    ext = Extractor(tree)
    text_at = ext.text
    attr_at = ext.attr

    cpvs = []
    for node in tree.findall('.//NOTICE_DATA/ORIGINAL_CPV'):
        cpvs.append({'code': node.get('CODE'), 'text': node.text})
    ext.ignore('./CODED_DATA_SECTION/NOTICE_DATA/ORIGINAL_CPV')

    refs = []
    for node in tree.findall('.//NOTICE_DATA/REF_NOTICE/NO_DOC_OJS'):
        refs.append(node.text)
    ext.ignore('./CODED_DATA_SECTION/NOTICE_DATA/REF_NOTICE/NO_DOC_OJS')

    # Entries are evaluated top to bottom, i.e. in the same order as before.
    document = {
        'technical_reception_id': text_at('./TECHNICAL_SECTION/RECEPTION_ID'),
        'technical_comments': text_at('./TECHNICAL_SECTION/COMMENTS'),
        'technical_deletion_date': text_at('./TECHNICAL_SECTION/DELETION_DATE'),
        'technical_form_lang': text_at('./TECHNICAL_SECTION/FORM_LG_LIST'),
        'reception_id': text_at('./TECHNICAL_SECTION/RECEPTION_ID'),
        'oj_collection': text_at('.//REF_OJS/COLL_OJ'),
        'oj_number': text_at('.//REF_OJS/NO_OJ'),
        'oj_date': text_at('.//REF_OJS/DATE_PUB'),
        'doc_no': text_at('.//NOTICE_DATA/NO_DOC_OJS'),
        # Prefer the English document URI, fall back to any URI_DOC.
        'doc_url': (text_at('.//NOTICE_DATA//URI_DOC[@LG="EN"]')
                    or text_at('.//NOTICE_DATA//URI_DOC')),
        'info_url': text_at('.//NOTICE_DATA/IA_URL_GENERAL'),
        'etendering_url': text_at('.//NOTICE_DATA/IA_URL_ETENDERING'),
        'orig_language': text_at('.//NOTICE_DATA/LG_ORIG'),
        'orig_nuts': text_at('.//NOTICE_DATA/ORIGINAL_NUTS'),
        'orig_nuts_code': attr_at('.//NOTICE_DATA/ORIGINAL_NUTS', 'CODE'),
        'iso_country': attr_at('.//NOTICE_DATA/ISO_COUNTRY', 'VALUE'),
        'original_cpv': cpvs,
        'references': refs,
        'dispatch_date': text_at('.//CODIF_DATA/DS_DATE_DISPATCH'),
        'request_document_date': text_at('.//CODIF_DATA/DD_DATE_REQUEST_DOCUMENT'),
        'submission_date': text_at('.//CODIF_DATA/DT_DATE_FOR_SUBMISSION'),
        'heading': text_at('.//CODIF_DATA/HEADING'),
        'directive': attr_at('.//CODIF_DATA/DIRECTIVE', 'VALUE'),
        'authority_type_code': attr_at('.//CODIF_DATA/AA_AUTHORITY_TYPE', 'CODE'),
        'authority_type': text_at('.//CODIF_DATA/AA_AUTHORITY_TYPE'),
        'document_type_code': attr_at('.//CODIF_DATA/TD_DOCUMENT_TYPE', 'CODE'),
        'document_type': text_at('.//CODIF_DATA/TD_DOCUMENT_TYPE'),
        'contract_nature_code': attr_at('.//CODIF_DATA/NC_CONTRACT_NATURE', 'CODE'),
        'contract_nature': text_at('.//CODIF_DATA/NC_CONTRACT_NATURE'),
        'procedure_code': attr_at('.//CODIF_DATA/PR_PROC', 'CODE'),
        'procedure': text_at('.//CODIF_DATA/PR_PROC'),
        'regulation_code': attr_at('.//CODIF_DATA/RP_REGULATION', 'CODE'),
        'regulation': text_at('.//CODIF_DATA/RP_REGULATION'),
        'bid_type_code': attr_at('.//CODIF_DATA/TY_TYPE_BID', 'CODE'),
        'bid_type': text_at('.//CODIF_DATA/TY_TYPE_BID'),
        'award_criteria_code': attr_at('.//CODIF_DATA/AC_AWARD_CRIT', 'CODE'),
        'award_criteria': text_at('.//CODIF_DATA/AC_AWARD_CRIT'),
        'main_activities_code': attr_at('.//CODIF_DATA/MA_MAIN_ACTIVITIES', 'CODE'),
        'main_activities': text_at('.//CODIF_DATA/MA_MAIN_ACTIVITIES'),
        'title_text': text_at('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_TEXT'),
        'title_town': text_at('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_TOWN'),
        'title_country': text_at('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_CY'),
        'authority_name': text_at('./TRANSLATION_SECTION/ML_AA_NAMES/AA_NAME'),
    }

    # TODO: Figure out if we need any of the VALUES_LIST entries, even with
    # the forms.
    ignored_paths = (
        './LINKS_SECTION/FORMS_LABELS_LINK',
        './LINKS_SECTION/OFFICIAL_FORMS_LINK',
        './LINKS_SECTION/ORIGINAL_NUTS_LINK',
        './LINKS_SECTION/ORIGINAL_CPV_LINK',
        './LINKS_SECTION/XML_SCHEMA_DEFINITION_LINK',
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST/VALUES/SINGLE_VALUE/VALUE',
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST',
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST/VALUES/RANGE_VALUE/VALUE',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/TOWN',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/POSTAL_CODE',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/PHONE',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ORGANISATION/OFFICIALNAME',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/FAX',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/COUNTRY',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/CONTACT_POINT',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ATTENTION',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ADDRESS',
    )
    for path in ignored_paths:
        ext.ignore(path)
    ext.audit()

    selected_form = select_form(form_section, document['orig_language'])
    if selected_form.tag.startswith('CONTRACT_AWARD_'):
        from forms.contract_award import parse_form
        contracts = parse_form(selected_form)
    else:
        contracts = []

    # save to DB: replace everything stored for this document number
    doc_no = document['doc_no']
    engine.begin()
    for table in (cpvs_table, references_table, contracts_table,
                  documents_table):
        table.delete(doc_no=doc_no)

    # The list-valued entries go to their own tables, not the documents row.
    for cpv in document.pop('original_cpv'):
        cpv['doc_no'] = doc_no
        cpvs_table.insert(cpv)

    for ref in document.pop('references'):
        references_table.insert({'doc_no': doc_no, 'ref': ref})

    for contract in contracts:
        contract['doc_no'] = doc_no
        contracts_table.insert(contract)

    documents_table.insert(document)
    engine.commit()
Example #11
0
def parse(filename, file_content):
    """Extract a notice from its XML source and persist it.

    Steps, in order: the XML is parsed after its ``xmlns="`` declarations
    have been renamed to ``xmlns_="``; the ``FORM_SECTION`` element is cut
    out of the tree; CPV codes, referenced notice numbers and the flat
    document fields are read via an ``Extractor``; paths that are not read
    are marked with ``ignore`` and ``audit()`` is called; the form chosen by
    ``select_form`` for the original language is parsed into contracts when
    its tag starts with ``CONTRACT_AWARD_``; and, in a single transaction,
    the rows stored for the document number are replaced in all four tables.

    :param filename: not used by the function body.
    :param file_content: the XML document as a string.
    """
    # The renamed declaration becomes a plain attribute (presumably so that
    # the un-namespaced element paths used below match).
    root = etree.fromstring(file_content.replace('xmlns="', 'xmlns_="'))

    form = root.find('.//FORM_SECTION')
    parent = form.getparent()
    parent.remove(form)

    extractor = Extractor(root)

    cpv_nodes = root.findall('.//NOTICE_DATA/ORIGINAL_CPV')
    cpvs = [{'code': node.get('CODE'), 'text': node.text}
            for node in cpv_nodes]
    extractor.ignore('./CODED_DATA_SECTION/NOTICE_DATA/ORIGINAL_CPV')

    ref_nodes = root.findall('.//NOTICE_DATA/REF_NOTICE/NO_DOC_OJS')
    refs = [node.text for node in ref_nodes]
    extractor.ignore('./CODED_DATA_SECTION/NOTICE_DATA/REF_NOTICE/NO_DOC_OJS')

    # English document URI if there is one, otherwise any URI_DOC.
    doc_url = extractor.text('.//NOTICE_DATA//URI_DOC[@LG="EN"]')

    record = {
        'technical_reception_id':
            extractor.text('./TECHNICAL_SECTION/RECEPTION_ID'),
        'technical_comments':
            extractor.text('./TECHNICAL_SECTION/COMMENTS'),
        'technical_deletion_date':
            extractor.text('./TECHNICAL_SECTION/DELETION_DATE'),
        'technical_form_lang':
            extractor.text('./TECHNICAL_SECTION/FORM_LG_LIST'),
        'reception_id':
            extractor.text('./TECHNICAL_SECTION/RECEPTION_ID'),
        'oj_collection':
            extractor.text('.//REF_OJS/COLL_OJ'),
        'oj_number':
            extractor.text('.//REF_OJS/NO_OJ'),
        'oj_date':
            extractor.text('.//REF_OJS/DATE_PUB'),
        'doc_no':
            extractor.text('.//NOTICE_DATA/NO_DOC_OJS'),
        'doc_url':
            doc_url or extractor.text('.//NOTICE_DATA//URI_DOC'),
        'info_url':
            extractor.text('.//NOTICE_DATA/IA_URL_GENERAL'),
        'etendering_url':
            extractor.text('.//NOTICE_DATA/IA_URL_ETENDERING'),
        'orig_language':
            extractor.text('.//NOTICE_DATA/LG_ORIG'),
        'orig_nuts':
            extractor.text('.//NOTICE_DATA/ORIGINAL_NUTS'),
        'orig_nuts_code':
            extractor.attr('.//NOTICE_DATA/ORIGINAL_NUTS', 'CODE'),
        'iso_country':
            extractor.attr('.//NOTICE_DATA/ISO_COUNTRY', 'VALUE'),
        'original_cpv':
            cpvs,
        'references':
            refs,
        'dispatch_date':
            extractor.text('.//CODIF_DATA/DS_DATE_DISPATCH'),
        'request_document_date':
            extractor.text('.//CODIF_DATA/DD_DATE_REQUEST_DOCUMENT'),
        'submission_date':
            extractor.text('.//CODIF_DATA/DT_DATE_FOR_SUBMISSION'),
        'heading':
            extractor.text('.//CODIF_DATA/HEADING'),
        'directive':
            extractor.attr('.//CODIF_DATA/DIRECTIVE', 'VALUE'),
        'authority_type_code':
            extractor.attr('.//CODIF_DATA/AA_AUTHORITY_TYPE', 'CODE'),
        'authority_type':
            extractor.text('.//CODIF_DATA/AA_AUTHORITY_TYPE'),
        'document_type_code':
            extractor.attr('.//CODIF_DATA/TD_DOCUMENT_TYPE', 'CODE'),
        'document_type':
            extractor.text('.//CODIF_DATA/TD_DOCUMENT_TYPE'),
        'contract_nature_code':
            extractor.attr('.//CODIF_DATA/NC_CONTRACT_NATURE', 'CODE'),
        'contract_nature':
            extractor.text('.//CODIF_DATA/NC_CONTRACT_NATURE'),
        'procedure_code':
            extractor.attr('.//CODIF_DATA/PR_PROC', 'CODE'),
        'procedure':
            extractor.text('.//CODIF_DATA/PR_PROC'),
        'regulation_code':
            extractor.attr('.//CODIF_DATA/RP_REGULATION', 'CODE'),
        'regulation':
            extractor.text('.//CODIF_DATA/RP_REGULATION'),
        'bid_type_code':
            extractor.attr('.//CODIF_DATA/TY_TYPE_BID', 'CODE'),
        'bid_type':
            extractor.text('.//CODIF_DATA/TY_TYPE_BID'),
        'award_criteria_code':
            extractor.attr('.//CODIF_DATA/AC_AWARD_CRIT', 'CODE'),
        'award_criteria':
            extractor.text('.//CODIF_DATA/AC_AWARD_CRIT'),
        'main_activities_code':
            extractor.attr('.//CODIF_DATA/MA_MAIN_ACTIVITIES', 'CODE'),
        'main_activities':
            extractor.text('.//CODIF_DATA/MA_MAIN_ACTIVITIES'),
        'title_text':
            extractor.text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_TEXT'),
        'title_town':
            extractor.text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_TOWN'),
        'title_country':
            extractor.text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_CY'),
        'authority_name':
            extractor.text('./TRANSLATION_SECTION/ML_AA_NAMES/AA_NAME'),
    }

    link_paths = [
        './LINKS_SECTION/FORMS_LABELS_LINK',
        './LINKS_SECTION/OFFICIAL_FORMS_LINK',
        './LINKS_SECTION/ORIGINAL_NUTS_LINK',
        './LINKS_SECTION/ORIGINAL_CPV_LINK',
        './LINKS_SECTION/XML_SCHEMA_DEFINITION_LINK',
    ]
    # TODO: Figure out if we need any of this, even with the forms.
    value_paths = [
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST/VALUES/SINGLE_VALUE/VALUE',
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST',
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST/VALUES/RANGE_VALUE/VALUE',
    ]
    transliteration_paths = [
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/TOWN',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/POSTAL_CODE',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/PHONE',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ORGANISATION/OFFICIALNAME',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/FAX',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/COUNTRY',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/CONTACT_POINT',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ATTENTION',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ADDRESS',
    ]
    for path in link_paths + value_paths + transliteration_paths:
        extractor.ignore(path)
    extractor.audit()

    chosen_form = select_form(form, record['orig_language'])
    contracts = []
    is_award = chosen_form.tag.startswith('CONTRACT_AWARD_')
    if is_award:
        from forms.contract_award import parse_form
        contracts = parse_form(chosen_form)

    # save to DB
    doc_no = record['doc_no']
    engine.begin()

    # Clear whatever was stored for this document number before.
    cpvs_table.delete(doc_no=doc_no)
    references_table.delete(doc_no=doc_no)
    contracts_table.delete(doc_no=doc_no)
    documents_table.delete(doc_no=doc_no)

    # The two list-valued entries are taken out of the record and stored in
    # their own tables.
    stored_cpvs = record.pop('original_cpv')
    for cpv in stored_cpvs:
        cpv['doc_no'] = doc_no
        cpvs_table.insert(cpv)

    stored_refs = record.pop('references')
    for ref in stored_refs:
        references_table.insert({'doc_no': doc_no, 'ref': ref})

    for contract in contracts:
        contract['doc_no'] = doc_no
        contracts_table.insert(contract)

    documents_table.insert(record)
    engine.commit()