def parse_madrid_international_filing_record():
     dbc = Db()
     mifr_elements = case.findall('madrid-international-filing-requests/madrid-international-filing-record')
     for child in mifr_elements:
         madrid_international_filing_record = {'serial_number': doc_id}
         madrid_international_filing_record_items = (
             'entry_number', 'reference_number', 'original_filing_date_uspto',
             'international_registration_number',
             'international_registration_date', 'international_status_code',
             'international_status_date', 'irregularity_reply_by_date', 'international_renewal_date')
         for hitem in madrid_international_filing_record_items:
             search_term = hitem.replace('_', '-') + '/text()'
             madrid_international_filing_record[hitem] = get_text_or_none(child, search_term)
         mifr_id = dbc.insert_dict(madrid_international_filing_record,
                                   'trademark_app_madrid_international_filing_record')
         mhe_elements = child.findall('madrid-history-events/madrid-history-event')
         lst = []
         for subchild in mhe_elements:
             madrid_history_events = {'serial_number': doc_id,
                                      'madrid_international_filing_record_id': mifr_id}
             madrid_history_events_items = (
                 'code', 'date', 'description_text', 'entry_number')
             for hitem in madrid_history_events_items:
                 search_term = hitem.replace('_', '-') + '/text()'
                 madrid_history_events[hitem] = get_text_or_none(subchild, search_term)
             lst.append(madrid_history_events)
         result = dbc.insert_listdict(lst, 'trademark_app_madrid_history_events')
 def parse_case_files():
     dbc = Db()
     case_files = {'serial_number': doc_id,
                   'file_id': file_id,
                   'registration_number': get_text_or_none(case, 'registration-number/text()'),
                   'transaction_date': get_text_or_none(case, 'transaction-date/text()')}
     return dbc.insert_dict(case_files, 'trademark_app_case_files')
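
# get_text_or_none() is called by every parser in these snippets but is not
# shown. A minimal sketch, assuming an lxml element and an XPath expression
# ending in /text() (an assumption, not the original helper):
def get_text_or_none(element, xpath_expression):
    # xpath() returns a list of matching text nodes; take the first one or None
    result = element.xpath(xpath_expression)
    return str(result[0]) if result else None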
 def parse_headers():
     dbc = Db()
     case_file_headers = {'serial_number': doc_id}
     case_file_header_items = (
         'filing_date', 'status_code', 'status_date', 'mark_identification', 'mark_drawing_code',
         'attorney_docket_number', 'attorney_name', 'principal_register_amended_in',
         'supplemental_register_amended_in', 'trademark_in', 'collective_trademark_in', 'service_mark_in',
         'collective_service_mark_in', 'collective_membership_mark_in', 'certification_mark_in',
         'cancellation_pending_in', 'published_concurrent_in', 'concurrent_use_in',
         'concurrent_use_proceeding_in', 'interference_pending_in', 'opposition_pending_in', 'section_12c_in',
         'section_2f_in', 'section_2f_in_part_in', 'renewal_filed_in', 'section_8_filed_in',
         'section_8_partial_accept_in', 'section_8_accepted_in', 'section_15_acknowledged_in',
         'section_15_filed_in', 'supplemental_register_in', 'foreign_priority_in', 'change_registration_in',
         'intent_to_use_in', 'intent_to_use_current_in', 'filed_as_use_application_in',
         'amended_to_use_application_in', 'use_application_currently_in', 'amended_to_itu_application_in',
         'filing_basis_filed_as_44d_in', 'amended_to_44d_application_in', 'filing_basis_current_44d_in',
         'filing_basis_filed_as_44e_in', 'filing_basis_current_44e_in', 'amended_to_44e_application_in',
         'without_basis_currently_in', 'filing_current_no_basis_in', 'color_drawing_filed_in',
         'color_drawing_current_in', 'drawing_3d_filed_in', 'drawing_3d_current_in',
         'standard_characters_claimed_in', 'filing_basis_filed_as_66a_in', 'filing_basis_current_66a_in',
         'current_location', 'location_date', 'employee_name', 'registration_date',
         'published_for_opposition_date', 'amend_to_register_date', 'abandonment_date', 'cancellation_code',
         'cancellation_date', 'republished_12c_date', 'domestic_representative_name', 'renewal_date',
         'law_office_assigned_location_code')
     for hitem in case_file_header_items:
         search_term = 'case-file-header/' + hitem.replace('_', '-') + '/text()'
         case_file_headers[hitem] = get_text_or_none(case, search_term)
     return dbc.insert_dict(case_file_headers, 'trademark_app_case_file_headers')
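
# Db() is instantiated in every helper but not defined in these snippets.
# A rough sketch of the two insert methods used above (an assumption built on a
# MySQL-style connection exposed as self.cnx, not the original class):
class Db:
    def insert_dict(self, row, table):
        # build "INSERT INTO table (cols...) VALUES (%s, ...)" from the dict keys
        cols = ', '.join(row)
        marks = ', '.join(['%s'] * len(row))
        cur = self.cnx.cursor()
        cur.execute('INSERT INTO {} ({}) VALUES ({})'.format(table, cols, marks),
                    list(row.values()))
        self.cnx.commit()
        return cur.lastrowid  # callers use this as the parent id (e.g. mifr_id)

    def insert_listdict(self, rows, table):
        # bulk variant used for child rows; assumes all dicts share the same keys
        if not rows:
            return None
        cols = ', '.join(rows[0])
        marks = ', '.join(['%s'] * len(rows[0]))
        cur = self.cnx.cursor()
        cur.executemany('INSERT INTO {} ({}) VALUES ({})'.format(table, cols, marks),
                        [list(r.values()) for r in rows])
        self.cnx.commit()
        return cur.rowcount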
def main_worker(file):
    dbc = Db()
    file_check = dbc.file_check(file)
    if file_check is None:
        xml_filename = download_file(file['url'])
        if xml_filename is not None:
            inserted_id = dbc.file_insert(file, os.path.basename(xml_filename))
            try:
                parse_file(xml_filename, inserted_id)
            except Exception:
                logger.exception('Failed to parse file %s', xml_filename)
                raise
    elif file_check['status'] in ['new', 'reparsing'] or args.force:
        logger.warning('File %s already exists in the database. Going to process it again',
                       file_check['filename'])
        if not os.path.isfile(os.path.join(WORK_DIR, file_check['filename'])):
            xml_filename = download_file(file['url'])
        else:
            xml_filename = file_check['filename']
        try:
            parse_file(xml_filename, file_check['id'])
        except Exception:
            logger.exception('Failed to parse file %s', xml_filename)
            raise
    else:
        logger.info('File %s is already in the database. Skipping it',
                    file_check['filename'])
        if args.parse:
            logger.info('Nothing to do. Exiting.')
            exit()
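
# main_worker() handles a single file record; a typical driver (assumed, not
# part of these snippets) would fan the file list out over a process pool:
#
#     from multiprocessing import Pool
#
#     files_tuple = get_urls(MAIN_URL)
#     with Pool(processes=4) as pool:
#         pool.map(main_worker, files_tuple)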
 def parse_design_searches():
     dbc = Db()
     cfds_elements = case.findall('design-searches/design-search')
     lst = []
     for child in cfds_elements:
         case_file_design_searches = {'serial_number': doc_id,
                                      'code': get_text_or_none(child, 'code/text()')}
         lst.append(case_file_design_searches)
     result = dbc.insert_listdict(lst, 'trademark_app_design_searches')
 def parse_statements():
     dbc = Db()
     cfs_elements = case.findall('case-file-statements/case-file-statement')
     lst = []
     for child in cfs_elements:
         case_file_statements = {'serial_number': doc_id,
                                 'type_code': get_text_or_none(child, 'type-code/text()'),
                                 'text': get_text_or_none(child, 'text/text()')}
         lst.append(case_file_statements)
     result = dbc.insert_listdict(lst, 'trademark_app_case_file_statements')
 def parse_correspondents():
     dbc = Db()
     correspondent_elements = case.findall('correspondent')
     lst = []
     for child in correspondent_elements:
         case_file_correspondent = {'serial_number': doc_id}
         case_file_correspondent_items = (
             'address_1', 'address_2', 'address_3', 'address_4', 'address_5')
         for hitem in case_file_correspondent_items:
             search_term = hitem.replace('_', '-') + '/text()'
             case_file_correspondent[hitem] = get_text_or_none(child, search_term)
         lst.append(case_file_correspondent)
     result = dbc.insert_listdict(lst, 'trademark_app_correspondents')
 def parse_prior_registration_applications():
     dbc = Db()
     pra_elements = case.findall('prior-registration-applications/prior-registration-application')
     other_related_in = get_text_or_none(case, 'prior-registration-applications/other-related-in/text()')
     lst = []
     for child in pra_elements:
         prior_registration_applications = {'serial_number': doc_id,
                                            'other_related_in': other_related_in,
                                            'relationship_type': get_text_or_none(child,
                                                                                  'relationship-type/text()'),
                                            'number': get_text_or_none(child, 'number/text()')}
         lst.append(prior_registration_applications)
     result = dbc.insert_listdict(lst, 'trademark_app_prior_registration_applications')
 def parse_foreign_applications():
     dbc = Db()
     fa_elements = case.findall('foreign-applications/foreign-application')
     lst = []
     for child in fa_elements:
         foreign_applications = {'serial_number': doc_id}
         foreign_applications_items = (
             'filing_date', 'registration_date', 'registration_expiration_date', 'registration_renewal_date',
             'registration_renewal_expiration_date', 'entry_number', 'application_number', 'country',
             'other', 'registration_number', 'renewal_number', 'foreign_priority_claim_in')
         for hitem in foreign_applications_items:
             search_term = hitem.replace('_', '-') + '/text()'
             foreign_applications[hitem] = get_text_or_none(child, search_term)
         lst.append(foreign_applications)
     result = dbc.insert_listdict(lst, 'trademark_app_foreign_applications')
 def parse_international_registration():
     dbc = Db()
     cfir_elements = case.findall('international-registration')
     lst = []
     for child in cfir_elements:
         case_file_international_registration = {'serial_number': doc_id}
         case_file_international_registration_items = (
             'international_registration_number', 'international_registration_date',
             'international_publication_date', 'international_renewal_date', 'auto_protection_date',
             'international_death_date', 'international_status_code', 'international_status_date',
             'priority_claimed_in', 'priority_claimed_date', 'first_refusal_in')
         for hitem in case_file_international_registration_items:
             search_term = hitem.replace('_', '-') + '/text()'
             case_file_international_registration[hitem] = get_text_or_none(child, search_term)
         lst.append(case_file_international_registration)
     result = dbc.insert_listdict(lst, 'trademark_app_international_registration')
 def parse_owners():
     dbc = Db()
     cfo_elements = case.findall('case-file-owners/case-file-owner')
     lst = []
     for child in cfo_elements:
         case_file_owners = {'serial_number': doc_id}
         case_file_owners_items = (
             'entry_number', 'party_type', 'legal_entity_type_code', 'entity_statement', 'party_name',
             'address_1', 'address_2', 'city', 'state', 'country', 'other', 'postcode', 'dba_aka_text',
             'composed_of_statement', 'name_change_explanation')
         for hitem in case_file_owners_items:
             search_term = hitem.replace('_', '-') + '/text()'
             case_file_owners[hitem] = get_text_or_none(child, search_term)
         case_file_owners['nationality'] = get_text_or_none(child, 'nationality/country/text()')
         lst.append(case_file_owners)
     result = dbc.insert_listdict(lst, 'trademark_app_case_file_owners')
def parse_file(filename, file_id):
    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(WORK_DIR, filename)
    with open(filename, 'rb') as inputfile:
        file_start_time = time.time()
        logger.info('Parsing file %s', filename)
        context = etree.iterparse(inputfile, events=('end',), tag='case-file')
        for event, case in context:
            doc_id = int(get_text_or_none(case, 'serial-number/text()'))
            serial_db = dbc.serial_get(doc_id, file_id)
            if serial_db is not None:
                new_file_date = int(re.sub(r"\D", "", filename))
                db_file_date = int(re.sub(r"\D", "", serial_db['filename']))
                if new_file_date > db_file_date \
                        or serial_db['status'] is False \
                        or (new_file_date >= db_file_date and args.parseall and args.force):
                    for t in ('trademark_app_case_files', 'trademark_app_case_file_event_statements',
                              'trademark_app_case_file_headers', 'trademark_app_case_file_owners',
                              'trademark_app_case_file_statements', 'trademark_app_classifications',
                              'trademark_app_correspondents', 'trademark_app_design_searches',
                              'trademark_app_foreign_applications', 'trademark_app_international_registration',
                              'trademark_app_madrid_history_events', 'trademark_app_madrid_international_filing_record',
                              'trademark_app_prior_registration_applications', 'trademark_app_us_codes'):
                        dbc.delete_serial(doc_id, t)
                    logger.info('Processing existing serial number %s', doc_id)
                    parse_case(case, doc_id, file_id)
            else:
                logger.info('Processing new serial number %s', doc_id)
                parse_case(case, doc_id, file_id)
            case.clear()
    dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('Finished parsing file %s in [%s sec]', filename, time.time() - file_start_time)
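
# Note on memory: etree.iterparse(..., tag='case-file') together with
# case.clear() keeps memory flat on large USPTO dumps. clear() alone still
# leaves empty ancestors behind; an optional extra step (not in the original
# snippet) is to drop already-processed siblings as well:
#
#     while case.getprevious() is not None:
#         del case.getparent()[0]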
 def parse_description():
     dbc = Db()
     description_element = case.find('description')
     description_text_full = etree.tostring(description_element).decode()
     # strip XML tags, collapse whitespace, and drop any leading "N. " numbering
     text = re.sub(r'<.*?>|</.*?>', '', description_text_full)
     text = re.sub(r'[\n\t\r\f]+', ' ', text)
     text = re.sub(r'^\d+\.\s+', '', text)
     text = re.sub(r'\s+', ' ', text)
     description = {'app_id': app_id, 'uuid': id_generator(), 'text': text}
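     # Illustrative input/output for the cleanup chain above (hypothetical text):
     #   '<p id="p-0001">1.  A method\n of parsing</p>'  ->  'A method of parsing'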
def parse_file(filename, file_id):
    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(settings.APP_XMLDIR, filename)
    with open(filename, 'rb') as inputfile:
        file_start_time = time.time()
        logger.info('Parsing file %s', filename)
        context = etree.iterparse(inputfile,
                                  events=('end', ),
                                  tag='us-patent-application')
        app_counter = 0
        for event, case in context:
            data_application = case.find('us-bibliographic-data-application')
            app_ref = data_application.find('application-reference')
            app_id = int(
                get_text_or_none(app_ref, 'document-id/doc-number/text()'))
            app_id_db = dbc.app_id_get(app_id, file_id)
            if app_id_db is not None:
                logger.info(
                    'APP_id %s already exists in the database; checking whether to reparse',
                    app_id)
                new_file_date = int(re.sub(r"\D", "", filename))
                db_file_date = int(re.sub(r"\D", "", app_id_db['filename']))
                logger.debug('new_file_date=%s db_file_date=%s', new_file_date, db_file_date)
                if new_file_date > db_file_date \
                        or app_id_db['status'] is False \
                        or (new_file_date >= db_file_date and args.parseall and args.force):
                    for t in (
                            'trademark_app_case_files',
                            'trademark_app_case_file_event_statements',
                            'trademark_app_case_file_headers',
                            'trademark_app_case_file_owners',
                            'trademark_app_case_file_statements',
                            'trademark_app_classifications',
                            'trademark_app_correspondents',
                            'trademark_app_design_searches',
                            'trademark_app_foreign_applications',
                            'trademark_app_international_registration',
                            'trademark_app_madrid_history_events',
                            'trademark_app_madrid_international_filing_record',
                            'trademark_app_prior_registration_applications',
                            'trademark_app_us_codes'):
                        dbc.delete_serial(app_id, t)
                    logger.info('Processing existing serial number %s', app_id)
                    parse_case(case, app_id, file_id)
            else:
                logger.info('Processing new app_id %s', app_id)
                parse_app(case, app_id, filename)
            app_counter += 1
            case.clear()
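            # NOTE: development limiter in the snippet - processing stops after five applications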
            if app_counter == 5:
                sys.exit()
    dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('Finished parsing file %s in [%s sec]', filename,
                time.time() - file_start_time)
 def parse_classifications():
     dbc = Db()
     classification_elements = case.findall('classifications/classification')
     for child in classification_elements:
         classifications = {'serial_number': doc_id}
         classifications_items = (
             'international_code_total_no', 'us_code_total_no', 'international_code', 'status_code',
             'status_date', 'first_use_anywhere_date', 'first_use_in_commerce_date', 'primary_code')
         for hitem in classifications_items:
             search_term = hitem.replace('_', '-') + '/text()'
             classifications[hitem] = get_text_or_none(child, search_term)
         classification_id = dbc.insert_dict(classifications, 'trademark_app_classifications')
         code_elements = child.findall('us-code')
         lst = []
         for subchild in code_elements:
             case_file_us_codes = {'serial_number': doc_id,
                                   'classification_id': classification_id,
                                   'us_code': subchild.text}
             lst.append(case_file_us_codes)
         result = dbc.insert_listdict(lst, 'trademark_app_us_codes')
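     # The classification row is inserted first so its generated id can be used
     # as the classification_id foreign key on each related us-code row.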
def sub_main(args):
    files_tuple = get_urls(MAIN_URL)
    dbc = Db()
    for file in files_tuple:
        file_check = dbc.file_check(file)
        if file_check is None:
            xml_filename = download_file(file['url'])
            if xml_filename is not None:
                inserted_id = dbc.file_insert(file, xml_filename)
                parse_file(xml_filename, inserted_id)
        elif file_check['status'] == 'new':
            logger.warning('File %s already exists in the database. Going to process it again', file_check['filename'])
            if not os.path.isfile(os.path.join(WORK_DIR, file_check['filename'])):
                xml_filename = download_file(file['url'])
            else:
                xml_filename = file_check['filename']
            parse_file(xml_filename, file_check['id'])
        else:
            logger.info('File %s is already in the database.', file_check['filename'])
            if args.parse:
                logger.info('Nothing to do. Exiting.')
                sys.exit()
    def parse_application():
        dbc = Db()

        pub_date = get_text_or_none(pub_ref, 'document-id/date/text()')
        if pub_date[6:] != "00":
            pub_date = pub_date[:4] + '-' + pub_date[4:6] + '-' + pub_date[6:]
            year = pub_date[:4]
        else:
            pub_date = pub_date[:4] + '-' + pub_date[4:6] + '-' + '01'
            year = pub_date[:4]

        abstract_p_list = case.findall('abstract/p')
        abstract = ''
        for p in abstract_p_list:
            abstract += get_text_or_none(p, 'text()')

        application = {
            'id': year + '/' + get_text_or_none(pub_ref, 'document-id/doc-number/text()'),
            'type': app_ref.attrib['appl-type'],
            'number': get_text_or_none(pub_ref, 'document-id/doc-number/text()'),
            'app_id': app_id,
            'country': get_text_or_none(app_ref, 'document-id/country/text()'),
            'date': pub_date,
            'abstract': abstract,
            'title': get_text_or_none(case,
                                      'us-bibliographic-data-application/invention-title/text()'),
            'granted': None,
            'num_claims': len(claims_element_list),
            'filename': filename.split('\\')[-1],
        }
 def parse_claims():
     dbc = Db()
     claims_list = []
     for claim_element in claims_element_list:
         sequence = claim_element.attrib['num']
         claim_text_full = etree.tostring(claim_element).decode()
         dependent = re.search(r'<claim-ref idref="CLM-(\d+)">', claim_text_full)
         dependent = int(dependent.group(1)) if dependent is not None else None
         text = re.sub(r'<.*?>|</.*?>', '', claim_text_full)
         text = re.sub(r'[\n\t\r\f]+', '', text)
         text = re.sub(r'^\d+\.\s+', '', text)
         text = re.sub(r'\s+', ' ', text)
         claim = {
             'uuid': id_generator(),
             'application_id': application['id'],
             'app_id': application['app_id'],
             'text': text,
             'dependent': dependent,
             'sequence': sequence
         }
         claims_list.append(claim)
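
# id_generator() is used above (claim uuid, description uuid) but is not part
# of these snippets. A minimal stand-in (an assumption, not the original
# helper) that returns a unique string id:
import uuid

def id_generator():
    return uuid.uuid4().hex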
def main_worker(file):
    dbc = Db()
    file_check = dbc.file_check(file)
    if file_check is None:
        xml_filename = download_file(file['url'])
        if xml_filename is not None:
            inserted_id = dbc.file_insert(file, xml_filename)
            parse_file(xml_filename, inserted_id)
    elif file_check['status'] in ['new', ''] or file_check['status'] is None:
        logger.warning('File %s already exists in the database. Going to process it again',
                       file_check['filename'])
        if not os.path.isfile(os.path.join(WORK_DIR, file_check['filename'])):
            xml_filename = download_file(file['url'])
        else:
            xml_filename = file_check['filename']
        if settings.APP_XMLDIR not in xml_filename:
            xml_filename = os.path.join(settings.APP_XMLDIR, xml_filename)
        parse_file(xml_filename, file_check['id'])
    else:
        logger.info('File %s is already in the database.',
                    file_check['filename'])
        if args.parse:
            logger.info('Nothing to do. Exiting.')
            sys.exit()
                'city':
                get_text_or_none(applicant_element,
                                 'addressbook/address/city/text()'),
                'state':
                get_text_or_none(applicant_element,
                                 'addressbook/address/state/text()'),
                'country':
                get_text_or_none(applicant_element,
                                 'addressbook/address/country/text()'),
                'country_transformed':
                get_text_or_none(applicant_element,
                                 'addressbook/address/country/text()'),
            }
            rawlocation_list.append(rawlocation)

    dbc = Db()
    start_time = time.time()

    application = parse_application()
    # print(application)
    parse_claims()
    parse_description()
    parse_assignees()
    parse_inventors()
    parse_applicants()

    # with cf.ThreadPoolExecutor(max_workers=12) as executor:
    #     executor.submit(parse_case_files)
    #     executor.submit(parse_headers)
    #     executor.submit(parse_statements)
    #     executor.submit(parse_event_statements)
def parse_file(filename, file_id):
    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(WORK_DIR, filename)
    with open(filename, 'rb') as inputfile:
        file_start_time = time.time()
        logger.info('Parsing file %s', filename)
        context = etree.iterparse(inputfile, events=('end', ), tag='case-file')
        for event, case in context:
            doc_id = int(get_text_or_none(case, 'serial-number/text()'))
            serial_db = dbc.serial_get(doc_id, file_id)
            if serial_db is not None:
                transaction_date_string = get_text_or_none(
                    case, 'transaction-date/text()')
                if transaction_date_string:
                    transaction_date = datetime.strptime(
                        transaction_date_string, '%Y%m%d').date()
                else:
                    logger.warning('Missing transaction date in XML')
                    transaction_date = None
                if serial_db['transaction_date'] is not None and serial_db[
                        'transaction_date'] != '':
                    db_transaction_date = datetime.strptime(
                        serial_db['transaction_date'], '%Y%m%d').date()
                else:
                    logger.warning('Missing transaction date in database')
                    db_transaction_date = None
                # either date may be None (see the warnings above); guard before comparing
                dates_known = transaction_date is not None and db_transaction_date is not None
                if (dates_known and transaction_date > db_transaction_date) \
                        or (serial_db['status'] is False and args.force) \
                        or (dates_known and transaction_date >= db_transaction_date
                            and args.parseall and args.force):
                    for t in (
                            'trademark_app_case_files',
                            'trademark_app_case_file_event_statements',
                            'trademark_app_case_file_headers',
                            'trademark_app_case_file_owners',
                            'trademark_app_case_file_statements',
                            'trademark_app_classifications',
                            'trademark_app_correspondents',
                            'trademark_app_design_searches',
                            'trademark_app_foreign_applications',
                            'trademark_app_international_registration',
                            'trademark_app_madrid_history_events',
                            'trademark_app_madrid_international_filing_record',
                            'trademark_app_prior_registration_applications',
                            'trademark_app_us_codes'):
                        dbc.delete_serial(doc_id, t)
                        dbc.cnx.commit()
                    logger.info('[%s] Deleted serial %s from all tables',
                                os.path.basename(filename), doc_id)
                    logger.info('[%s] Processing existing serial number %s',
                                os.path.basename(filename), doc_id)
                    parse_case(case, doc_id, file_id, dbc)
            else:
                logger.info('[%s] Processing new serial number %s',
                            os.path.basename(filename), doc_id)
                parse_case(case, doc_id, file_id, dbc)
            case.clear()
    dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('[%s] Finished parsing file in [%s sec]',
                os.path.basename(filename),
                time.time() - file_start_time)

    dbc = Db()
    start_time = time.time()

    with cf.ThreadPoolExecutor(max_workers=12) as executor:
        executor.submit(parse_case_files)
        executor.submit(parse_headers)
        executor.submit(parse_statements)
        executor.submit(parse_event_statements)
        executor.submit(parse_prior_registration_applications)
        executor.submit(parse_foreign_applications)
        executor.submit(parse_classifications)
        executor.submit(parse_correspondents)
        executor.submit(parse_owners)
        executor.submit(parse_design_searches)
        executor.submit(parse_international_registration)
        executor.submit(parse_madrid_international_filing_record)
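    # Design note: each nested parse_* helper above opens its own Db()
    # connection, which is what makes it reasonable to submit them to this
    # thread pool; the executor context manager blocks until every submitted
    # task has completed before the enclosing parse function returns.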