def parse_madrid_international_filing_record():
    """Insert madrid-international-filing-record rows for the current case,
    plus their nested madrid-history-event child rows."""
    dbc = Db()
    record_fields = (
        'entry_number', 'reference_number', 'original_filing_date_uspto',
        'international_registration_number', 'international_registration_date',
        'international_status_code', 'international_status_date',
        'irregularity_reply_by_date', 'international_renewal_date')
    event_fields = ('code', 'date', 'description_text', 'entry_number')
    for record_el in case.findall(
            'madrid-international-filing-requests/'
            'madrid-international-filing-record'):
        # XML tags use dashes where the DB columns use underscores.
        record = {'serial_number': doc_id}
        record.update(
            (f, get_text_or_none(record_el, f.replace('_', '-') + '/text()'))
            for f in record_fields)
        mifr_id = dbc.insert_dict(
            record, 'trademark_app_madrid_international_filing_record')
        events = []
        for event_el in record_el.findall(
                'madrid-history-events/madrid-history-event'):
            # Each history event links back to the parent record's DB id.
            event = {'serial_number': doc_id,
                     'madrid_international_filing_record_id': mifr_id}
            event.update(
                (f, get_text_or_none(event_el, f.replace('_', '-') + '/text()'))
                for f in event_fields)
            events.append(event)
        result = dbc.insert_listdict(
            events, 'trademark_app_madrid_history_events')
def parse_case_files():
    """Insert the top-level case-file row and return its DB id."""
    dbc = Db()
    row = {
        'serial_number': doc_id,
        'file_id': file_id,
        'registration_number': get_text_or_none(
            case, 'registration-number/text()'),
        'transaction_date': get_text_or_none(
            case, 'transaction-date/text()'),
    }
    return dbc.insert_dict(row, 'trademark_app_case_files')
def parse_headers():
    """Insert the case-file-header row for the current case and return its
    DB id.  Every column shares one naming rule: underscore column name ->
    dash-separated XML tag under case-file-header."""
    dbc = Db()
    header_fields = (
        'filing_date', 'status_code', 'status_date', 'mark_identification',
        'mark_drawing_code', 'attorney_docket_number', 'attorney_name',
        'principal_register_amended_in', 'supplemental_register_amended_in',
        'trademark_in', 'collective_trademark_in', 'service_mark_in',
        'collective_service_mark_in', 'collective_membership_mark_in',
        'certification_mark_in', 'cancellation_pending_in',
        'published_concurrent_in', 'concurrent_use_in',
        'concurrent_use_proceeding_in', 'interference_pending_in',
        'opposition_pending_in', 'section_12c_in', 'section_2f_in',
        'section_2f_in_part_in', 'renewal_filed_in', 'section_8_filed_in',
        'section_8_partial_accept_in', 'section_8_accepted_in',
        'section_15_acknowledged_in', 'section_15_filed_in',
        'supplemental_register_in', 'foreign_priority_in',
        'change_registration_in', 'intent_to_use_in',
        'intent_to_use_current_in', 'filed_as_use_application_in',
        'amended_to_use_application_in', 'use_application_currently_in',
        'amended_to_itu_application_in', 'filing_basis_filed_as_44d_in',
        'amended_to_44d_application_in', 'filing_basis_current_44d_in',
        'filing_basis_filed_as_44e_in', 'filing_basis_current_44e_in',
        'amended_to_44e_application_in', 'without_basis_currently_in',
        'filing_current_no_basis_in', 'color_drawing_filed_in',
        'color_drawing_current_in', 'drawing_3d_filed_in',
        'drawing_3d_current_in', 'standard_characters_claimed_in',
        'filing_basis_filed_as_66a_in', 'filing_basis_current_66a_in',
        'current_location', 'location_date', 'employee_name',
        'registration_date', 'published_for_opposition_date',
        'amend_to_register_date', 'abandonment_date', 'cancellation_code',
        'cancellation_date', 'republished_12c_date',
        'domestic_representative_name', 'renewal_date',
        'law_office_assigned_location_code')
    headers = {'serial_number': doc_id}
    headers.update(
        (field, get_text_or_none(
            case, 'case-file-header/' + field.replace('_', '-') + '/text()'))
        for field in header_fields)
    return dbc.insert_dict(headers, 'trademark_app_case_file_headers')
def main_worker(file):
    """Download (if needed) and parse a single file record.

    Already-parsed files are skipped unless their status is 'new' or
    'reparsing', or --force was given.

    Fixes: use sys.exit() instead of the site-builtin exit() (consistent
    with the rest of the file, and exit() may be absent under -S);
    replace the placeholder 'message' in logger.exception with a useful
    message; fix the "Skiping" typo.
    """
    dbc = Db()
    file_check = dbc.file_check(file)
    if file_check is None:
        # Unknown file: download, register it in the DB, then parse.
        xml_filename = download_file(file['url'])
        if xml_filename is not None:
            inserted_id = dbc.file_insert(file, os.path.basename(xml_filename))
            try:
                parse_file(xml_filename, inserted_id)
            except Exception:
                logger.exception('Failed parsing file %s', xml_filename)
                raise
    elif file_check['status'] in ['new', 'reparsing'] or args.force:
        logger.warning('File %s exists into database. Going to process again',
                       file_check['filename'])
        if not os.path.isfile(os.path.join(WORK_DIR, file_check['filename'])):
            xml_filename = download_file(file['url'])
        else:
            xml_filename = file_check['filename']
        try:
            parse_file(xml_filename, file_check['id'])
        except Exception:
            logger.exception('Failed parsing file %s', xml_filename)
            raise
    else:
        logger.info('File %s is already inserted into database. Skipping it',
                    file_check['filename'])
        if args.parse:
            logger.info('Nothing to work. Exiting.')
            # sys is already imported at module level (used by sub_main etc.).
            sys.exit()
def parse_design_searches():
    """Insert one design-search code row per design-search element."""
    dbc = Db()
    rows = [
        {'serial_number': doc_id,
         'code': get_text_or_none(el, 'code/text()')}
        for el in case.findall('design-searches/design-search')
    ]
    result = dbc.insert_listdict(rows, 'trademark_app_design_searches')
def parse_statements():
    """Insert case-file-statement rows (type code + text) for the case."""
    dbc = Db()
    rows = [
        {'serial_number': doc_id,
         'type_code': get_text_or_none(el, 'type-code/text()'),
         'text': get_text_or_none(el, 'text/text()')}
        for el in case.findall('case-file-statements/case-file-statement')
    ]
    result = dbc.insert_listdict(rows, 'trademark_app_case_file_statements')
def parse_correspondents():
    """Insert correspondent address rows (address_1..address_5) for the case."""
    dbc = Db()
    address_fields = (
        'address_1', 'address_2', 'address_3', 'address_4', 'address_5')
    rows = []
    for el in case.findall('correspondent'):
        row = {'serial_number': doc_id}
        for field in address_fields:
            row[field] = get_text_or_none(
                el, field.replace('_', '-') + '/text()')
        rows.append(row)
    result = dbc.insert_listdict(rows, 'trademark_app_correspondents')
def parse_prior_registration_applications():
    """Insert prior-registration-application rows for the current case."""
    dbc = Db()
    # other-related-in lives on the parent element; repeat it on every row.
    shared_flag = get_text_or_none(
        case, 'prior-registration-applications/other-related-in/text()')
    rows = [
        {'serial_number': doc_id,
         'other_related_in': shared_flag,
         'relationship_type': get_text_or_none(el, 'relationship-type/text()'),
         'number': get_text_or_none(el, 'number/text()')}
        for el in case.findall(
            'prior-registration-applications/prior-registration-application')
    ]
    result = dbc.insert_listdict(
        rows, 'trademark_app_prior_registration_applications')
def parse_foreign_applications():
    """Insert foreign-application rows for the current case."""
    dbc = Db()
    fields = (
        'filing_date', 'registration_date', 'registration_expiration_date',
        'registration_renewal_date', 'registration_renewal_expiration_date',
        'entry_number', 'application_number', 'country', 'other',
        'registration_number', 'renewal_number', 'foreign_priority_claim_in')
    rows = []
    for el in case.findall('foreign-applications/foreign-application'):
        row = {'serial_number': doc_id}
        for field in fields:
            row[field] = get_text_or_none(
                el, field.replace('_', '-') + '/text()')
        rows.append(row)
    result = dbc.insert_listdict(rows, 'trademark_app_foreign_applications')
def parse_international_registration():
    """Insert international-registration rows for the current case."""
    dbc = Db()
    fields = (
        'international_registration_number',
        'international_registration_date',
        'international_publication_date', 'international_renewal_date',
        'auto_protection_date', 'international_death_date',
        'international_status_code', 'international_status_date',
        'priority_claimed_in', 'priority_claimed_date', 'first_refusal_in')
    rows = []
    for el in case.findall('international-registration'):
        row = {'serial_number': doc_id}
        for field in fields:
            row[field] = get_text_or_none(
                el, field.replace('_', '-') + '/text()')
        rows.append(row)
    result = dbc.insert_listdict(
        rows, 'trademark_app_international_registration')
def parse_owners():
    """Insert case-file-owner rows for the current case."""
    dbc = Db()
    fields = (
        'entry_number', 'party_type', 'legal_entity_type_code',
        'entity_statement', 'party_name', 'address_1', 'address_2', 'city',
        'state', 'country', 'other', 'postcode', 'dba_aka_text',
        'composed_of_statement', 'name_change_explanation')
    rows = []
    for el in case.findall('case-file-owners/case-file-owner'):
        row = {'serial_number': doc_id}
        for field in fields:
            row[field] = get_text_or_none(
                el, field.replace('_', '-') + '/text()')
        # nationality is nested one level deeper than the other fields.
        row['nationality'] = get_text_or_none(
            el, 'nationality/country/text()')
        rows.append(row)
    result = dbc.insert_listdict(rows, 'trademark_app_case_file_owners')
def parse_file(filename, file_id):
    # Parse one trademark XML file: stream <case-file> elements and insert
    # (or re-insert) each serial number's rows via parse_case().
    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(WORK_DIR, filename)
    with open(filename, 'rb') as inputfile:
        file_start_time = time.time()
        logger.info('Parsing file %s' % filename)
        # iterparse streams the file so large inputs never load fully
        # into memory; case.clear() below releases each parsed element.
        context = etree.iterparse(inputfile, events=('end',),
                                  tag='case-file')
        for event, case in context:
            doc_id = int(get_text_or_none(case, 'serial-number/text()'))
            serial_db = dbc.serial_get(doc_id, file_id)
            if serial_db is not None:
                # Strip non-digits from both filenames to compare the
                # embedded dates and decide whether this file is newer
                # than the one the serial was originally parsed from.
                new_file_date = int(re.sub(r"\D", "", filename))
                db_file_date = int(re.sub(r"\D", "", serial_db['filename']))
                if new_file_date > db_file_date \
                        or serial_db['status'] is False \
                        or (new_file_date >= db_file_date
                            and args.parseall and args.force):
                    # Purge every per-case table before re-parsing so the
                    # serial's rows are not duplicated.
                    for t in ('trademark_app_case_files',
                              'trademark_app_case_file_event_statements',
                              'trademark_app_case_file_headers',
                              'trademark_app_case_file_owners',
                              'trademark_app_case_file_statements',
                              'trademark_app_classifications',
                              'trademark_app_correspondents',
                              'trademark_app_design_searches',
                              'trademark_app_foreign_applications',
                              'trademark_app_international_registration',
                              'trademark_app_madrid_history_events',
                              'trademark_app_madrid_international_filing_record',
                              'trademark_app_prior_registration_applications',
                              'trademark_app_us_codes'):
                        dbc.delete_serial(doc_id, t)
                    logger.info('Processing existing serial number %s',
                                doc_id)
                    parse_case(case, doc_id, file_id)
                # NOTE(review): an existing serial from an older/equal file
                # falls through here untouched (intentional skip, it seems).
            else:
                logger.info('Processing new serial number %s', doc_id)
                parse_case(case, doc_id, file_id)
            case.clear()
        dbc.file_update_status(file_id, 'finished')
    # The source XML is deleted once fully parsed.
    os.remove(filename)
    logger.info('Finished parsing file %s in [%s sec]', filename,
                time.time() - file_start_time)
def parse_description():
    r"""Extract the plain-text description of the current application.

    Serializes the <description> element, strips all markup, collapses
    whitespace and removes a leading "1. " style numbering, then builds
    the row dict.

    Fix: regex patterns now use raw strings ('\d', '\s' in plain string
    literals are invalid escape sequences in Python 3).
    NOTE(review): the built `description` dict is never inserted or
    returned here -- confirm the insert happens elsewhere.
    """
    dbc = Db()
    description_element = case.find('description')
    description_text_full = etree.tostring(description_element).decode()
    # Remove tags, then normalize all whitespace to single spaces.
    text = re.sub(r'<.*?>|</.*?>', '', description_text_full)
    text = re.sub(r'[\n\t\r\f]+', ' ', text)
    text = re.sub(r'^\d+\.\s+', '', text)
    text = re.sub(r'\s+', ' ', text)
    description = {'app_id': app_id, 'uuid': id_generator(), 'text': text}
def parse_file(filename, file_id):
    """Parse a patent-application XML file, streaming <us-patent-application>
    elements and inserting or refreshing each application.

    Fixes: removed a leftover debug print of the file dates; corrected the
    log message in the app_id_db branch (it fired when the app DOES exist
    in the database, but claimed the opposite).
    NOTE(review): processing stops after 5 applications via sys.exit() --
    this looks like a debugging limiter; confirm before removing.
    """
    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(settings.APP_XMLDIR, filename)
    with open(filename, 'rb') as inputfile:
        file_start_time = time.time()
        logger.info('Parsing file %s' % filename)
        context = etree.iterparse(inputfile, events=('end', ),
                                  tag='us-patent-application')
        app_counter = 0
        for event, case in context:
            data_application = case.find('us-bibliographic-data-application')
            app_ref = data_application.find('application-reference')
            app_id = int(
                get_text_or_none(app_ref, 'document-id/doc-number/text()'))
            app_id_db = dbc.app_id_get(app_id, file_id)
            if app_id_db is not None:
                logger.info(
                    'APP_id %s already exists in database; '
                    'checking whether to reparse', app_id)
                # Compare the digits (dates) embedded in the filenames.
                new_file_date = int(re.sub(r"\D", "", filename))
                db_file_date = int(re.sub(r"\D", "", app_id_db['filename']))
                if new_file_date > db_file_date \
                        or app_id_db['status'] is False \
                        or (new_file_date >= db_file_date
                            and args.parseall and args.force):
                    # Purge all per-case tables before re-parsing.
                    for t in (
                            'trademark_app_case_files',
                            'trademark_app_case_file_event_statements',
                            'trademark_app_case_file_headers',
                            'trademark_app_case_file_owners',
                            'trademark_app_case_file_statements',
                            'trademark_app_classifications',
                            'trademark_app_correspondents',
                            'trademark_app_design_searches',
                            'trademark_app_foreign_applications',
                            'trademark_app_international_registration',
                            'trademark_app_madrid_history_events',
                            'trademark_app_madrid_international_filing_record',
                            'trademark_app_prior_registration_applications',
                            'trademark_app_us_codes'):
                        dbc.delete_serial(app_id, t)
                    logger.info('Processing existing serial number %s', app_id)
                    parse_case(case, app_id, file_id)
            else:
                logger.info('Processing new app_id %s', app_id)
                parse_app(case, app_id, filename)
            app_counter += 1
            case.clear()
            if app_counter == 5:
                sys.exit()
        dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('Finished parsing file %s in [%s sec]', filename,
                time.time() - file_start_time)
def parse_classifications():
    """Insert classification rows and their nested us-code child rows."""
    dbc = Db()
    fields = (
        'international_code_total_no', 'us_code_total_no',
        'international_code', 'status_code', 'status_date',
        'first_use_anywhere_date', 'first_use_in_commerce_date',
        'primary_code')
    for class_el in case.findall('classifications/classification'):
        row = {'serial_number': doc_id}
        for field in fields:
            row[field] = get_text_or_none(
                class_el, field.replace('_', '-') + '/text()')
        classification_id = dbc.insert_dict(
            row, 'trademark_app_classifications')
        # us-code children reference the classification row just inserted.
        us_rows = [
            {'serial_number': doc_id,
             'classification_id': classification_id,
             'us_code': code_el.text}
            for code_el in class_el.findall('us-code')
        ]
        result = dbc.insert_listdict(us_rows, 'trademark_app_us_codes')
def sub_main(args):
    """Fetch the remote file list and download/parse each pending file."""
    files_tuple = get_urls(MAIN_URL)
    dbc = Db()
    for file in files_tuple:
        file_check = dbc.file_check(file)
        if file_check is None:
            # Never seen before: download, register, parse.
            xml_filename = download_file(file['url'])
            if xml_filename is not None:
                inserted_id = dbc.file_insert(file, xml_filename)
                parse_file(xml_filename, inserted_id)
        elif file_check['status'] == 'new':
            logger.warning(
                'File %s exists into database. Going to process again',
                file_check['filename'])
            # Re-download only if the local copy is gone.
            if os.path.isfile(
                    os.path.join(WORK_DIR, file_check['filename'])):
                xml_filename = file_check['filename']
            else:
                xml_filename = download_file(file['url'])
            parse_file(xml_filename, file_check['id'])
        else:
            logger.info('File %s is already inserted into database.',
                        file_check['filename'])
            if args.parse:
                logger.info('Nothing to work. Exiting.')
                sys.exit()
def parse_application():
    """Build and return the application row dict for the current case.

    Bug fix: the dict was built but never returned, so callers doing
    ``application = parse_application()`` received None and crashed on
    the first subscript.
    """
    dbc = Db()
    pub_date = get_text_or_none(pub_ref, 'document-id/date/text()')
    # A day component of "00" means the exact day is unknown; default to
    # the 1st of the month.
    if pub_date[6:] != "00":
        pub_date = pub_date[:4] + '-' + pub_date[4:6] + '-' + pub_date[6:]
        year = pub_date[:4]
    else:
        pub_date = pub_date[:4] + '-' + pub_date[4:6] + '-' + '01'
        year = pub_date[:4]
    abstract_p_list = case.findall('abstract/p')
    abstract = ''
    for p in abstract_p_list:
        abstract += get_text_or_none(p, 'text()')
    application = {
        'id': year + '/' + get_text_or_none(
            pub_ref, 'document-id/doc-number/text()'),
        'type': app_ref.attrib['appl-type'],
        'number': get_text_or_none(pub_ref, 'document-id/doc-number/text()'),
        'app_id': app_id,
        'country': get_text_or_none(app_ref, 'document-id/country/text()'),
        'date': pub_date,
        'abstract': abstract,
        'title': get_text_or_none(
            case, 'us-bibliographic-data-application/invention-title/text()'),
        'granted': None,
        'num_claims': len(claims_element_list),
        # NOTE(review): backslash split assumes Windows-style paths;
        # os.path.basename would be portable -- confirm runtime platform.
        'filename': filename.split('\\')[-1],
    }
    return application
def parse_claims():
    r"""Build the list of claim row dicts for the current application.

    A <claim-ref idref="CLM-n"> inside the claim text marks the claim as
    dependent on claim n.

    Fix: regex patterns now use raw strings ('\d', '\s' in plain string
    literals are invalid escape sequences in Python 3).
    NOTE(review): claims_list is built but neither returned nor inserted
    here -- confirm the insert happens elsewhere.
    """
    dbc = Db()
    claims_list = []
    for claim_element in claims_element_list:
        sequence = claim_element.attrib['num']
        claim_text_full = etree.tostring(claim_element).decode()
        dependent = re.search(r'<claim-ref idref="CLM-(\d+)">',
                              claim_text_full)
        dependent = int(
            dependent.group(1)) if dependent is not None else None
        # Strip markup, normalize whitespace, drop a leading "1. " number.
        text = re.sub(r'<.*?>|</.*?>', '', claim_text_full)
        text = re.sub(r'[\n\t\r\f]+', '', text)
        text = re.sub(r'^\d+\.\s+', '', text)
        text = re.sub(r'\s+', ' ', text)
        claim = {
            'uuid': id_generator(),
            'application_id': application['id'],
            'app_id': application['app_id'],
            'text': text,
            'dependent': dependent,
            'sequence': sequence
        }
        claims_list.append(claim)
def main_worker(file):
    """Download (when necessary) and parse one file record."""
    dbc = Db()
    file_check = dbc.file_check(file)
    if file_check is None:
        # Unknown file: download, register, parse.
        xml_filename = download_file(file['url'])
        if xml_filename is not None:
            inserted_id = dbc.file_insert(file, xml_filename)
            parse_file(xml_filename, inserted_id)
    elif file_check['status'] in ['new', ''] or file_check['status'] is None:
        logger.warning('File %s exists into database. Going to process again',
                       file_check['filename'])
        # Re-download only when the local copy is missing.
        if os.path.isfile(os.path.join(WORK_DIR, file_check['filename'])):
            xml_filename = file_check['filename']
        else:
            xml_filename = download_file(file['url'])
        # Hand parse_file a path rooted under the XML working directory.
        if settings.APP_XMLDIR not in xml_filename:
            xml_filename = os.path.join(settings.APP_XMLDIR, xml_filename)
        parse_file(xml_filename, file_check['id'])
    else:
        logger.info('File %s is already inserted into database.',
                    file_check['filename'])
        if args.parse:
            logger.info('Nothing to work. Exiting.')
            sys.exit()
'city': get_text_or_none(applicant_element, 'addressbook/address/city/text()'), 'state': get_text_or_none(applicant_element, 'addressbook/address/state/text()'), 'country': get_text_or_none(applicant_element, 'addressbook/address/country/text()'), 'country_transformed': get_text_or_none(applicant_element, 'addressbook/address/country/text()'), } rawlocation_list.append(rawlocation) dbc = Db() start_time = time.time() application = parse_application() # print(application) parse_claims() parse_description() parse_assignees() parse_inventors() parse_applicants() # with cf.ThreadPoolExecutor(max_workers=12) as executor: # executor.submit(parse_case_files) # executor.submit(parse_headers) # executor.submit(parse_statements) # executor.submit(parse_event_statements)
def parse_file(filename, file_id):
    """Parse a trademark XML file, streaming <case-file> elements.

    A serial already in the DB is purged and re-parsed when the XML's
    transaction-date is newer than the stored one, or when its previous
    parse failed and --force was given; otherwise it is left untouched.

    Bug fix: when either transaction date was missing, the code set it to
    None and then compared with ``>``, raising TypeError.  The comparison
    now only happens when both dates are present.  The redundant third
    or-clause (same date test AND parseall AND force) was folded into a
    single ``is_newer`` flag.
    """
    dbc = Db()
    if WORK_DIR not in filename:
        filename = os.path.join(WORK_DIR, filename)
    with open(filename, 'rb') as inputfile:
        file_start_time = time.time()
        logger.info('Parsing file %s' % filename)
        context = etree.iterparse(inputfile, events=('end', ),
                                  tag='case-file')
        for event, case in context:
            doc_id = int(get_text_or_none(case, 'serial-number/text()'))
            serial_db = dbc.serial_get(doc_id, file_id)
            if serial_db is not None:
                transaction_date_string = get_text_or_none(
                    case, 'transaction-date/text()')
                if transaction_date_string:
                    transaction_date = datetime.strptime(
                        transaction_date_string, '%Y%m%d').date()
                else:
                    logger.warning('Missing transaction date in XML')
                    transaction_date = None
                if serial_db['transaction_date'] is not None and serial_db[
                        'transaction_date'] != '':
                    db_transaction_date = datetime.strptime(
                        serial_db['transaction_date'], '%Y%m%d').date()
                else:
                    logger.warning('Missing transaction date in database')
                    db_transaction_date = None
                # Only compare when both dates exist; a missing date is
                # treated as "not newer".
                is_newer = (transaction_date is not None
                            and db_transaction_date is not None
                            and transaction_date > db_transaction_date)
                if is_newer or (serial_db['status'] is False and args.force):
                    # Purge all per-case tables before re-parsing so the
                    # serial's rows are not duplicated.
                    for t in (
                            'trademark_app_case_files',
                            'trademark_app_case_file_event_statements',
                            'trademark_app_case_file_headers',
                            'trademark_app_case_file_owners',
                            'trademark_app_case_file_statements',
                            'trademark_app_classifications',
                            'trademark_app_correspondents',
                            'trademark_app_design_searches',
                            'trademark_app_foreign_applications',
                            'trademark_app_international_registration',
                            'trademark_app_madrid_history_events',
                            'trademark_app_madrid_international_filing_record',
                            'trademark_app_prior_registration_applications',
                            'trademark_app_us_codes'):
                        dbc.delete_serial(doc_id, t)
                    dbc.cnx.commit()
                    logger.info('[%s] Deleted serial %s from all tables',
                                os.path.basename(filename), doc_id)
                    logger.info('[%s] Processing existing serial number %s',
                                os.path.basename(filename), doc_id)
                    parse_case(case, doc_id, file_id, dbc)
            else:
                logger.info('[%s] Processing new serial number %s',
                            os.path.basename(filename), doc_id)
                parse_case(case, doc_id, file_id, dbc)
            case.clear()
        dbc.file_update_status(file_id, 'finished')
    os.remove(filename)
    logger.info('[%s] Finished parsing file in [%s sec]',
                os.path.basename(filename),
                time.time() - file_start_time)
mifr_id = dbc.insert_dict(madrid_international_filing_record, 'trademark_app_madrid_international_filing_record') mhe_elements = child.findall('madrid-history-events/madrid-history-event') lst = [] for subchild in mhe_elements: madrid_history_events = {'serial_number': doc_id, 'madrid_international_filing_record_id': mifr_id} madrid_history_events_items = ( 'code', 'date', 'description_text', 'entry_number') for hitem in madrid_history_events_items: search_term = hitem.replace('_', '-') + '/text()' madrid_history_events[hitem] = get_text_or_none(subchild, search_term) lst.append(madrid_history_events) result = dbc.insert_listdict(lst, 'trademark_app_madrid_history_events') dbc = Db() start_time = time.time() with cf.ThreadPoolExecutor(max_workers=12) as executor: executor.submit(parse_case_files) executor.submit(parse_headers) executor.submit(parse_statements) executor.submit(parse_event_statements) executor.submit(parse_prior_registration_applications) executor.submit(parse_foreign_applications) executor.submit(parse_classifications) executor.submit(parse_correspondents) executor.submit(parse_owners) executor.submit(parse_design_searches) executor.submit(parse_international_registration) executor.submit(parse_madrid_international_filing_record)