Beispiel #1
0
def parse_form(root):
    """Flatten a contract award form element into one dict per award.

    The form flavour is derived from ``root.tag``: ``'mil'`` when the tag
    contains ``DEFENCE``, ``'util'`` when it contains ``UTILITIES`` and
    ``'std'`` otherwise. Flavour-specific paths are resolved through
    ``_lookup(form_type, key)``.

    :param root: form element; must expose ``tag`` and ``findall`` and be
        accepted by ``Extractor``.
    :returns: a list with one entry per element matched by
        ``lookup('award_dest')``. Each entry is the result of
        ``parse_award`` updated with the form-level fields, so form-level
        values win when a key exists in both.
    """
    form_type = 'std'
    if 'DEFENCE' in root.tag:
        form_type = 'mil'
    elif 'UTILITIES' in root.tag:
        form_type = 'util'

    def lookup(k):
        # Resolve a logical field name for the detected form flavour.
        return _lookup(form_type, k)

    # NOTE(review): Extractor is defined elsewhere; text()/attr()/html()
    # presumably also record the path as consumed for a later audit() --
    # confirm against its implementation.
    ext = Extractor(root)
    form = {
        'file_reference': ext.text(lookup('reference')),
        'relates_to_eu_project': ext.text('.//RELATES_TO_EU_PROJECT_YES/P'),
        'notice_dispatch_day': ext.text('.//NOTICE_DISPATCH_DATE/DAY'),
        'notice_dispatch_month': ext.text('.//NOTICE_DISPATCH_DATE/MONTH'),
        'notice_dispatch_year': ext.text('.//NOTICE_DISPATCH_DATE/YEAR'),
        'appeal_procedure': ext.text('.//PROCEDURES_FOR_APPEAL//LODGING_OF_APPEALS_PRECISION/P'),
        # Prefer the nested <P> text; fall back to the LOCATION element itself.
        'location': ext.text(lookup('award_description')+'/LOCATION_NUTS/LOCATION/P') or ext.text(lookup('award_description')+'/LOCATION_NUTS/LOCATION'),
        'location_nuts': ext.attr(lookup('award_description')+'/LOCATION_NUTS/NUTS', 'CODE'),
        'type_contract': ext.attr(lookup('award_description')+'//TYPE_CONTRACT', 'VALUE'),
        'gpa_covered': ext.attr(lookup('award_description')+'/CONTRACT_COVERED_GPA', 'VALUE'),
        'electronic_auction': ext.attr(lookup('electronic_auction'), 'VALUE'),
        'cpv_code': ext.attr(lookup('award_description')+'/CPV/CPV_MAIN/CPV_CODE', 'CODE'),
        #'reason_lawful': ext.html('.//REASON_CONTRACT_LAWFUL'),
        #'cpv_additional_code': ext.attr('.//DESCRIPTION_AWARD_NOTICE_INFORMATION/CPV/CPV_ADDITIONAL/CPV_CODE', 'CODE'),
        #'authority_type': ext.text(lookup('authority_type'), 'VALUE'),
        #'authority_type_other': ext.text(lookup('authority_type_other'), 'VALUE'),
        'activity_type': ext.text(lookup('activity_type')),
        'activity_type_other': ext.text(lookup('activity_type_other')),
        'activity_contractor': ext.attr('.//ACTIVITIES_OF_CONTRACTING_ENTITY/ACTIVITY_OF_CONTRACTING_ENTITY', 'VALUE'),
        'concessionaire_email': ext.text('.//CA_CE_CONCESSIONAIRE_PROFILE/E_MAILS/E_MAIL'),
        'concessionaire_nationalid': ext.text('.//CA_CE_CONCESSIONAIRE_PROFILE/ORGANISATION/NATIONALID'),
        'concessionaire_contact': ext.text('.//CA_CE_CONCESSIONAIRE_PROFILE/CONTACT_POINT'),
        'contract_award_title': ext.text(lookup('award_description')+'/TITLE_CONTRACT/P'),
        'contract_description': ext.html(lookup('short_desc')),
        'additional_information': ext.html(lookup('additional_info')),
        'contract_type_supply': ext.attr('.//TYPE_CONTRACT_LOCATION_W_PUB/TYPE_SUPPLIES_CONTRACT', 'VALUE')
    }

    form.update(extract_address(ext, 'authority', lookup('authority')))
    form.update(extract_address(ext, 'appeal_body', lookup('appeal_body')))
    form.update(extract_address(ext, 'on_behalf', './/TYPE_AND_ACTIVITIES_AND_PURCHASING_ON_BEHALF/PURCHASING_ON_BEHALF//'))
    #form.update(extract_address(ext, 'lodging_info', './/PROCEDURES_FOR_APPEAL/LODGING_INFORMATION_FOR_SERVICE//'))

    # Paths that are deliberately not extracted into the form record.
    ext.ignore('.//PROCEDURES_FOR_APPEAL/MEDIATION_PROCEDURE_BODY_RESPONSIBLE/*')
    ext.ignore('.//PROCEDURES_FOR_APPEAL/LODGING_INFORMATION_FOR_SERVICE/*')
    ext.ignore('./FD_CONTRACT_AWARD_DEFENCE/COMPLEMENTARY_INFORMATION_CONTRACT_AWARD/PROCEDURES_FOR_APPEAL/LODGING_INFORMATION_FOR_SERVICE/*')
    ext.ignore('./FD_CONTRACT_AWARD_UTILITIES/CONTRACTING_ENTITY_CONTRACT_AWARD_UTILITIES/NAME_ADDRESSES_CONTACT_CONTRACT_AWARD_UTILITIES/INTERNET_ADDRESSES_CONTRACT_AWARD_UTILITIES/URL_GENERAL')
    ext.ignore('./FD_CONTRACT_AWARD_UTILITIES/COMPLEMENTARY_INFORMATION_CONTRACT_AWARD_UTILITIES/APPEAL_PROCEDURES/SERVICE_FROM_INFORMATION/*')
    ext.ignore('./FD_CONTRACT_AWARD_UTILITIES/PROCEDURES_CONTRACT_AWARD_UTILITIES/ADMINISTRATIVE_INFO_CONTRACT_AWARD_UTILITIES/PREVIOUS_PUBLICATION_INFORMATION_NOTICE_F6/*')

    # Make awards criteria their own table.
    ext.ignore('./FD_CONTRACT_AWARD/PROCEDURE_DEFINITION_CONTRACT_AWARD_NOTICE/AWARD_CRITERIA_CONTRACT_AWARD_NOTICE_INFORMATION/AWARD_CRITERIA_DETAIL_F03/*')
    ext.ignore('./FD_CONTRACT_AWARD_UTILITIES/PROCEDURES_CONTRACT_AWARD_UTILITIES/F06_AWARD_CRITERIA_CONTRACT_UTILITIES_INFORMATION/*')
    ext.ignore('./FD_CONTRACT_AWARD_DEFENCE/PROCEDURE_DEFINITION_CONTRACT_AWARD_NOTICE_DEFENCE/AWARD_CRITERIA_CONTRACT_AWARD_NOTICE_INFORMATION_DEFENCE/AWARD_CRITERIA_DETAIL_F18/*')
    # This path previously started with '.FD_...' (missing '/'), which made
    # the first step a literal tag name and so could never match.
    ext.ignore('./FD_CONTRACT_AWARD_UTILITIES/PROCEDURES_CONTRACT_AWARD_UTILITIES/F06_AWARD_CRITERIA_CONTRACT_UTILITIES_INFORMATION/PRICE_AWARD_CRITERIA/*')
    ext.ignore('./FD_CONTRACT_AWARD_DEFENCE/PROCEDURE_DEFINITION_CONTRACT_AWARD_NOTICE_DEFENCE/ADMINISTRATIVE_INFORMATION_CONTRACT_AWARD_DEFENCE/PREVIOUS_PUBLICATION_INFORMATION_NOTICE_F18/*')
    ext.ignore('./FD_CONTRACT_AWARD/AWARD_OF_CONTRACT/*')
    ext.ignore('./FD_CONTRACT_AWARD_DEFENCE/AWARD_OF_CONTRACT_DEFENCE/*')
    ext.ignore('./FD_CONTRACT_AWARD_UTILITIES/AWARD_CONTRACT_CONTRACT_AWARD_UTILITIES/*')
    ext.ignore('./FD_CONTRACT_AWARD_UTILITIES/OBJECT_CONTRACT_AWARD_UTILITIES/DESCRIPTION_CONTRACT_AWARD_UTILITIES/SHORT_DESCRIPTION/*')
    ext.ignore('./FD_CONTRACT_AWARD/PROCEDURE_DEFINITION_CONTRACT_AWARD_NOTICE/ADMINISTRATIVE_INFORMATION_CONTRACT_AWARD/PREVIOUS_PUBLICATION_INFORMATION_NOTICE_F3/*')

    # Results are intentionally discarded here.
    # NOTE(review): presumably called only to mark these paths as seen -- confirm.
    ext.text('.//TYPE_CONTRACT_LOCATION_W_PUB/SERVICE_CATEGORY_PUB')
    ext.text('.//CPV/CPV_ADDITIONAL/CPV_CODE')

    # First day of the dispatch month, formatted as '<year>-<month>-01'; handed
    # to the value extraction helpers below.
    conversion_date = '%s-%s-01' % (form['notice_dispatch_year'], form['notice_dispatch_month'])
    form.update(extract_values(ext, conversion_date, 'total_value', lookup('total_value')))

    contracts = []
    for award in root.findall(lookup('award_dest')):
        contract = parse_award(award, lookup, conversion_date)
        # Form-level fields overwrite same-named award-level fields.
        contract.update(form)
        contracts.append(contract)
    return contracts
Beispiel #2
0
def parse(filename, file_content):
    """Parse one notice document and store it in the database tables.

    The XML in ``file_content`` is parsed, document-level metadata is read
    from everything outside ``FORM_SECTION``, and, when the selected form's
    tag starts with ``CONTRACT_AWARD``, the form is expanded into contracts
    via ``parse_form``. If a document with the same ``doc_no`` is already in
    ``documents_table`` nothing is written.

    :param filename: not used by the function body.
    :param file_content: XML source of the notice as a string.
    :returns: ``None`` in every case.
    """
    # Rename the default namespace declaration before parsing.
    # NOTE(review): presumably so the un-prefixed paths below match -- confirm.
    cleaned_xml = file_content.replace('xmlns="', 'xmlns_="')
    root = etree.fromstring(cleaned_xml)

    # Detach the form section; it is handled separately further down.
    form_section = root.find('.//FORM_SECTION')
    form_section.getparent().remove(form_section)

    extractor = Extractor(root)
    text = extractor.text
    attr = extractor.attr

    cpv_entries = []
    for node in root.findall('.//NOTICE_DATA/ORIGINAL_CPV'):
        cpv_entries.append({'code': node.get('CODE'), 'text': node.text})
    extractor.ignore('./CODED_DATA_SECTION/NOTICE_DATA/ORIGINAL_CPV')

    reference_numbers = []
    for node in root.findall('.//NOTICE_DATA/REF_NOTICE/NO_DOC_OJS'):
        reference_numbers.append(node.text)
    extractor.ignore('./CODED_DATA_SECTION/NOTICE_DATA/REF_NOTICE/NO_DOC_OJS')

    record = {
        'technical_reception_id': text('./TECHNICAL_SECTION/RECEPTION_ID'),
        'technical_comments': text('./TECHNICAL_SECTION/COMMENTS'),
        'technical_deletion_date': text('./TECHNICAL_SECTION/DELETION_DATE'),
        'technical_form_lang': text('./TECHNICAL_SECTION/FORM_LG_LIST'),
        'reception_id': text('./TECHNICAL_SECTION/RECEPTION_ID'),
        'oj_collection': text('.//REF_OJS/COLL_OJ'),
        'oj_number': text('.//REF_OJS/NO_OJ'),
        'oj_date': text('.//REF_OJS/DATE_PUB'),
        'doc_no': text('.//NOTICE_DATA/NO_DOC_OJS'),
        # English document URI when available, otherwise any URI_DOC.
        'doc_url': (text('.//NOTICE_DATA//URI_DOC[@LG="EN"]')
                    or text('.//NOTICE_DATA//URI_DOC')),
        'info_url': text('.//NOTICE_DATA/IA_URL_GENERAL'),
        'etendering_url': text('.//NOTICE_DATA/IA_URL_ETENDERING'),
        'orig_language': text('.//NOTICE_DATA/LG_ORIG'),
        'orig_nuts': text('.//NOTICE_DATA/ORIGINAL_NUTS'),
        'orig_nuts_code': attr('.//NOTICE_DATA/ORIGINAL_NUTS', 'CODE'),
        'iso_country': attr('.//NOTICE_DATA/ISO_COUNTRY', 'VALUE'),
        'original_cpv': cpv_entries,
        'references': reference_numbers,
        'dispatch_date': text('.//CODIF_DATA/DS_DATE_DISPATCH'),
        'request_document_date': text('.//CODIF_DATA/DD_DATE_REQUEST_DOCUMENT'),
        'submission_date': text('.//CODIF_DATA/DT_DATE_FOR_SUBMISSION'),
        'heading': text('.//CODIF_DATA/HEADING'),
        'directive': attr('.//CODIF_DATA/DIRECTIVE', 'VALUE'),
        'authority_type_code': attr('.//CODIF_DATA/AA_AUTHORITY_TYPE', 'CODE'),
        'authority_type': text('.//CODIF_DATA/AA_AUTHORITY_TYPE'),
        'document_type_code': attr('.//CODIF_DATA/TD_DOCUMENT_TYPE', 'CODE'),
        'document_type': text('.//CODIF_DATA/TD_DOCUMENT_TYPE'),
        'contract_nature_code': attr('.//CODIF_DATA/NC_CONTRACT_NATURE', 'CODE'),
        'contract_nature': text('.//CODIF_DATA/NC_CONTRACT_NATURE'),
        'procedure_code': attr('.//CODIF_DATA/PR_PROC', 'CODE'),
        'procedure': text('.//CODIF_DATA/PR_PROC'),
        'regulation_code': attr('.//CODIF_DATA/RP_REGULATION', 'CODE'),
        'regulation': text('.//CODIF_DATA/RP_REGULATION'),
        'bid_type_code': attr('.//CODIF_DATA/TY_TYPE_BID', 'CODE'),
        'bid_type': text('.//CODIF_DATA/TY_TYPE_BID'),
        'award_criteria_code': attr('.//CODIF_DATA/AC_AWARD_CRIT', 'CODE'),
        'award_criteria': text('.//CODIF_DATA/AC_AWARD_CRIT'),
        'main_activities_code': attr('.//CODIF_DATA/MA_MAIN_ACTIVITIES', 'CODE'),
        'main_activities': text('.//CODIF_DATA/MA_MAIN_ACTIVITIES'),
        'title_text': text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_TEXT'),
        'title_town': text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_TOWN'),
        'title_country': text('.//ML_TITLES/ML_TI_DOC[@LG="EN"]/TI_CY'),
        'authority_name': text('./TRANSLATION_SECTION/ML_AA_NAMES/AA_NAME'),
    }

    link_paths = (
        './LINKS_SECTION/FORMS_LABELS_LINK',
        './LINKS_SECTION/OFFICIAL_FORMS_LINK',
        './LINKS_SECTION/ORIGINAL_NUTS_LINK',
        './LINKS_SECTION/ORIGINAL_CPV_LINK',
        './LINKS_SECTION/XML_SCHEMA_DEFINITION_LINK',
    )
    # TODO: Figure out if we need any of this, even with the forms.
    value_list_paths = (
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST/VALUES/SINGLE_VALUE/VALUE',
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST',
        './CODED_DATA_SECTION/NOTICE_DATA/VALUES_LIST/VALUES/RANGE_VALUE/VALUE',
    )
    transliteration_paths = (
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/TOWN',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/POSTAL_CODE',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/PHONE',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ORGANISATION/OFFICIALNAME',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/FAX',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/COUNTRY',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/CONTACT_POINT',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ATTENTION',
        './TRANSLATION_SECTION/TRANSLITERATIONS/TRANSLITERATED_ADDR/ADDRESS',
    )
    for skipped_path in link_paths + value_list_paths + transliteration_paths:
        extractor.ignore(skipped_path)
    extractor.audit()

    # Only contract award forms are expanded into individual contracts.
    selected_form = select_form(form_section, record['orig_language'])
    is_award = selected_form.tag.startswith('CONTRACT_AWARD')
    contracts = parse_form(selected_form) if is_award else []

    # save to DB
    doc_no = record['doc_no']

    if documents_table.find_one(doc_no=doc_no):
        log.info('Skipping: %s', doc_no)
        return

    log.info('Parsed: %s, %s (%s)', doc_no, selected_form.tag, len(contracts))

    # Clear any rows for this document before inserting.
    for table in (cpvs_table, references_table, contracts_table, documents_table):
        table.delete(doc_no=doc_no)

    # The list-valued entries are popped so they go to their own tables and
    # are not part of the row inserted into documents_table.
    for cpv_row in record.pop('original_cpv'):
        cpv_row['doc_no'] = doc_no
        cpvs_table.insert(cpv_row)

    for reference in record.pop('references'):
        references_table.insert({'doc_no': doc_no, 'ref': reference})

    for position, contract in enumerate(contracts):
        contract['doc_no'] = doc_no
        contract['index'] = position
        contracts_table.insert(contract)

    documents_table.insert(record)