def build_sql_insert_query(self, insert_data_array, args_array):
    """Build a complete SQL INSERT statement for one record.

    Args:
        insert_data_array: dict of column name -> value.  Must contain a
            'table_name' key, which is removed here (the dict is mutated,
            preserving the original contract).
        args_array: runtime config; 'database_type' selects the value
            quoting style ('postgresql' -> $$value$$, 'mysql' -> "value").

    Returns:
        The full INSERT query string, terminated with ';'.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")
    uspto_xml_format = args_array['uspto_xml_format']

    # Pull the destination table off the record; the remaining keys are columns
    table_name = insert_data_array['table_name']
    del insert_data_array['table_name']

    column_names = []
    value_literals = []
    for key, value in list(insert_data_array.items()):
        # Don't escape values that are None (NULL) or ints (need no quoting)
        if value is not None and not isinstance(value, int):
            # BUGFIX: previously str(value.encode('utf-8')) was passed in,
            # which under Python 3 wraps every value in a bytes repr
            # ("b'...'") and corrupts the stored data.
            value = USPTOSanitizer.escape_value_for_sql(str(value))
            # Since PostgreSQL uses `$` as delimiter, strip it from the first
            # and last char and collapse runs the escaper may have produced
            value = value.strip("$").replace("$$$", "$").replace("$$", "$")
        column_names.append(key)
        if value is None:
            value_literals.append('NULL')
        elif args_array['database_type'] == 'postgresql':
            # PostgreSQL strings are dollar-quoted
            value_literals.append("$$" + str(value) + "$$")
        elif args_array['database_type'] == 'mysql':
            value_literals.append('"' + str(value) + '"')

    # Assemble the final query (separators match the original output format)
    sql_query_string = (
        "INSERT INTO " + table_name + " "
        + "(" + ", ".join(column_names) + ") "
        + " VALUES (" + ",".join(value_literals) + ");"
    )
    logger.info(sql_query_string)
    return sql_query_string
def extract_CPC_class_dict(line):
    """Build a CPC classification record dict from one raw CSV line.

    `line[0]` holds the raw CPC classification string and `line[1]` the
    class title.  Returns a dict keyed for the uspto.CPCClASS_C table.
    """
    # Split the raw classification string into its component parts
    parts = USPTOSanitizer.return_CPC_class_application(line[0])
    # Title column: drop embedded double quotes and surrounding whitespace
    title = line[1].replace('"', "").strip()
    return {
        "table_name" : "uspto.CPCClASS_C",
        "extraction_type" : "cpcclass",
        "Section" : parts[0],
        "Class" : parts[1],
        "SubClass" : parts[2],
        "MainGroup" : parts[3],
        "SubGroup" : parts[4],
        "Title" : title
    }
def extract_csv_line(args_array, line):
    """Convert one PAIR CSV line into a dict ready for database storage.

    The layout of `line` depends on args_array['extraction_type']
    ("correspondence", "continuityparent" or "continuitychild").
    Returns the populated record dict.
    """
    # Local aliases for the sanitizer helpers used on every field
    clean = USPTOSanitizer.clean_PAIR_csv_item
    strip_zeros = USPTOSanitizer.strip_leading_zeros
    extraction_type = args_array['extraction_type']

    # Common fields present on every record
    processed_array = {
        "table_name": set_table_name_from_type(extraction_type),
        "FileName": args_array['file_name'],
        "extraction_type": extraction_type
    }

    if extraction_type == "correspondence":
        processed_array['ApplicationID'] = strip_zeros(clean(line[0]))
        processed_array['Name1'] = clean(line[1])
        processed_array['Name2'] = clean(line[2])
        # Address is stored as the two raw address fields joined by a space
        processed_array['Address'] = clean(line[3]) + " " + clean(line[4])
        processed_array['City'] = clean(line[5])
        processed_array['PostalCode'] = clean(line[6])
        processed_array['RegionCode'] = clean(line[7])
        processed_array['RegionName'] = clean(line[8])
        processed_array['CountryCode'] = clean(line[9])
        processed_array['CountryName'] = clean(line[10])
        processed_array['CustomerNum'] = clean(line[11])
    elif extraction_type == "continuityparent":
        processed_array['ApplicationID'] = strip_zeros(clean(line[0]))
        processed_array['ParentApplicationID'] = strip_zeros(clean(line[1]))
        processed_array['FileDate'] = clean(line[2])
        processed_array['ContinuationType'] = clean(line[3])
    elif extraction_type == "continuitychild":
        processed_array['ApplicationID'] = strip_zeros(clean(line[0]))
        processed_array['ChildApplicationID'] = strip_zeros(clean(line[1]))
        processed_array['FileDate'] = clean(line[2])
        processed_array['ContinuationType'] = clean(line[3])

    # Return the populated record for storage
    return processed_array
def extract_XML4_grant(raw_data, args_array):
    """Parse one XML4-format patent grant document into SQL-ready dicts.

    Args:
        raw_data: raw XML string for a single patent grant document.
        args_array: runtime config dict; reads 'url_link', 'file_name',
            'uspto_xml_format' and the date settings used by USPTOSanitizer.

    Returns:
        A dict mapping "processed_<entity>" keys to lists of column dicts,
        one list per destination table.  Every list is empty when the
        'us-bibliographic-data-grant' root tag is missing.
    """
    # Start process timer
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays to hold the data (one per destination table)
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []
    processed_foreignpriority = []

    # Pass the raw_data data into Element Tree
    document_root = ET.fromstring(raw_data)

    # Start the extraction of XML data
    r = document_root.find('us-bibliographic-data-grant')
    if r is not None:

        # NOTE: the bare `except:` clauses below are deliberate best-effort
        # parsing -- any missing or malformed field degrades to None.

        # Find the main patent grant data
        for pr in r.findall('publication-reference'):
            for di in pr.findall('document-id'):
                try: pub_country = di.findtext('country').strip()
                except: pub_country = None
                try:
                    document_id = di.findtext('doc-number').strip()
                    document_id = USPTOSanitizer.fix_patent_number(document_id)[:20]
                except:
                    document_id = None
                    logger.error("No Patent Number was found for: " + url_link)
                try: kind = di.findtext('kind').strip()[:2]
                except: kind = None
                try: pub_date = USPTOSanitizer.return_formatted_date(di.findtext('date'), args_array, document_id)
                except: pub_date = None

        # Find the main application data
        for ar in r.findall('application-reference'):
            try: app_type = ar.attrib['appl-type'][:45].strip()
            except: app_type = None
            for di in ar.findall('document-id'):
                try: app_country = di.findtext('country').strip()
                except: app_country = None
                try: app_no = di.findtext('doc-number')[:20].strip()
                except: app_no = None
                try: app_date = USPTOSanitizer.return_formatted_date(di.findtext('date'), args_array, document_id)
                except: app_date = None

        # Get the series code
        try: series_code = r.findtext('us-application-series-code')[:2].strip()
        except: series_code = None

        # Get the length of grant
        try: grant_length = r.find("us-term-of-grant").findtext("length-of-grant").strip()
        except: grant_length = None

        # Find all international (IPCR) classifications
        ic = r.find('classifications-ipcr')
        position = 1
        if ic is not None:
            for icc in ic.findall('classification-ipcr'):
                # BUGFIX: reset every part for each classification; previously
                # a missing child tag left the variable undefined (NameError on
                # the first record) or carried over the previous record's value.
                i_class_sec = None
                i_class_cls = None
                i_class_sub = None
                i_class_mgr = None
                i_class_sgr = None
                # BUGFIX: Element.getchildren() was removed in Python 3.9;
                # iterating the element directly is equivalent.
                for x in icc:
                    if USPTOSanitizer.check_tag_exists(x, 'section'):
                        try: i_class_sec = x.text.strip()[:15]
                        except: i_class_sec = None
                    if USPTOSanitizer.check_tag_exists(x, 'class'):
                        try: i_class_cls = x.text.strip()[:15]
                        except: i_class_cls = None
                    if USPTOSanitizer.check_tag_exists(x, 'subclass'):
                        try: i_class_sub = x.text.strip()[:15]
                        except: i_class_sub = None
                    if USPTOSanitizer.check_tag_exists(x, 'main-group'):
                        try: i_class_mgr = x.text.strip()[:15]
                        except: i_class_mgr = None
                    if USPTOSanitizer.check_tag_exists(x, 'subgroup'):
                        try: i_class_sgr = x.text.strip()[:15]
                        except: i_class_sgr = None
                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name" : "uspto.INTCLASS_G",
                    "GrantID" : document_id,
                    "Position" : position,
                    "Section" : i_class_sec,
                    "Class" : i_class_cls,
                    "SubClass" : i_class_sub,
                    "MainGroup" : i_class_mgr,
                    "SubGroup" : i_class_sgr,
                    "FileName" : args_array['file_name']
                })
                position += 1

        # Init positions for CPC and US classifications
        cpc_position = 1
        nc_position = 1

        # BUGFIX: a dead block that re-parsed r.find('classifications-cpc')
        # was removed here.  It referenced an undefined `cpc_item` variable
        # and was terminated by an unbalanced triple-quote delimiter; per the
        # original in-line comment, the same data is extracted below from the
        # 'classification-cpc-text' tags.

        # Find all US classifications if they are embedded in a
        # "field-of-search" tag (XML4 2005 files)
        foc = r.find('field-of-search')
        if foc is not None:
            nc_position = 1
            for nc in foc.findall('classification-national'):
                ncm = nc.find('main-classification')
                if ncm is not None:
                    n_class_main = None
                    n_subclass = None
                    n_malformed = None
                    try:
                        n_class_main, n_subclass = USPTOSanitizer.return_class_XML4_grant(ncm.text)
                    except Exception as e:
                        traceback.print_exc()
                        n_class_main = None
                        n_subclass = None
                        n_malformed = 1
                    # Some are labelled as "None"
                    if n_class_main != None or n_subclass != None:
                        # Append SQL data into dictionary to be written later
                        processed_usclass.append({
                            "table_name" : "uspto.USCLASS_G",
                            "GrantID" : document_id,
                            "Position" : nc_position,
                            "Class" : n_class_main,
                            "SubClass" : n_subclass,
                            "Malformed" : n_malformed,
                            "FileName" : args_array['file_name']
                        })
                        nc_position += 1

        # Find CPC and US classifications in the
        # 'us-field-of-classification-search' tag
        foc = r.find('us-field-of-classification-search')
        if foc is not None:
            for cpc in foc.findall('classification-cpc-text'):
                cpc_section = None
                cpc_class = None
                cpc_subclass = None
                cpc_class_mgr = None
                cpc_class_sgr = None
                try:
                    # Text is "<class-string> <group>/<subgroup>", e.g. "A01B 33/08"
                    cpc_text = cpc.text
                    cpc_class_string, cpc_group_string = cpc_text.split(" ")
                    cpc_section = cpc_text.strip()[0]
                    cpc_class = cpc_class_string.strip()[1:3]
                    cpc_subclass = cpc_class_string.strip()[3]
                    cpc_class_mgr, cpc_class_sgr = cpc_group_string.rsplit("/", 1)
                    cpc_class_mgr = cpc_class_mgr.strip()[:15]
                    cpc_class_sgr = cpc_class_sgr.strip()[:15]
                except:
                    cpc_section = None
                    cpc_class = None
                    cpc_subclass = None
                    cpc_class_mgr = None
                    cpc_class_sgr = None
                    logger.warning("There was an error parsing the cpc class for Grant ID: " + str(document_id) + " in file: " + url_link)
                    logger.warning("Traceback: " + traceback.format_exc())
                # Append SQL data into dictionary to be written later
                processed_cpcclass.append({
                    "table_name" : "uspto.CPCCLASS_G",
                    "GrantID" : document_id,
                    "Position" : cpc_position,
                    "Section" : cpc_section,
                    "Class" : cpc_class,
                    "SubClass" : cpc_subclass,
                    "MainGroup" : cpc_class_mgr,
                    "SubGroup" : cpc_class_sgr,
                    "FileName" : args_array['file_name']
                })
                cpc_position += 1

            # Find all US classifications
            nc_position = 1
            for nc in foc.findall('classification-national'):
                ncm = nc.find('main-classification')
                if ncm is not None:
                    n_class_main = None
                    n_subclass = None
                    n_malformed = None
                    try:
                        n_class_main, n_subclass = USPTOSanitizer.return_class_XML4_grant(ncm.text)
                    except Exception as e:
                        # BUGFIX: removed an exit() call that aborted the whole
                        # process on one malformed class and made the fallback
                        # assignments below unreachable.
                        traceback.print_exc()
                        n_class_main = None
                        n_subclass = None
                        n_malformed = 1
                    # Some are labelled as "None"
                    if n_class_main != None or n_subclass != None:
                        processed_usclass.append({
                            "table_name" : "uspto.USCLASS_G",
                            "GrantID" : document_id,
                            "Position" : nc_position,
                            "Class" : n_class_main,
                            "SubClass" : n_subclass,
                            "Malformed" : n_malformed,
                            "FileName" : args_array['file_name']
                        })
                        nc_position += 1
                # Collect further US classes
                ncf = nc.find('further-classification')
                if ncf is not None:
                    n_class_main = None
                    n_subclass = None
                    n_malformed = None
                    try:
                        n_class_main, n_subclass = USPTOSanitizer.return_class_XML4_grant(ncf.text)
                    except Exception as e:
                        # BUGFIX: removed exit() here as well (see above).
                        traceback.print_exc()
                        n_class_main = None
                        n_subclass = None
                        n_malformed = 1
                    if n_class_main != None or n_subclass != None:
                        processed_usclass.append({
                            "table_name" : "uspto.USCLASS_G",
                            "GrantID" : document_id,
                            # BUGFIX: was the stale intclass `position` counter,
                            # which produced wrong Position values here.
                            "Position" : nc_position,
                            "Class" : n_class_main,
                            "SubClass" : n_subclass,
                            "Malformed" : n_malformed,
                            "FileName" : args_array['file_name']
                        })
                        nc_position += 1

        # Find the title of the patent
        try: title = USPTOSanitizer.strip_for_csv(r.findtext('invention-title')[:500])
        except: title = None

        # Find all references cited in the grant.
        # Check if the XML format is using 'us-references-cited' or 'references-cited'
        if r.find('us-references-cited') != None:
            ref_cited_id_string = "us-references-cited"
        elif r.find('references-cited') != None:
            ref_cited_id_string = "references-cited"
        else:
            ref_cited_id_string = "references"
        rf = r.find(ref_cited_id_string)
        if rf != None:
            # Check if the XML format is using 'citation' or 'us-citation'
            if rf.find('citation') != None:
                citation_id_string = "citation"
            elif rf.find('us-citation') != None:
                citation_id_string = "us-citation"
            else:
                citation_id_string = "us-citation"
            uspatcit_position = 1
            forpatcit_position = 1
            nptc_position = 1
            for rfc in rf.findall(citation_id_string):
                # If the patent citation child is found it must be a patent citation
                if rfc.find('patcit') != None:
                    x = rfc.find('patcit')
                    try: citation_country = x.find('document-id').findtext('country').strip()[:5]
                    except: citation_country = None
                    try: citation_grant_id = x.find('document-id').findtext('doc-number').strip()[:20]
                    except: citation_grant_id = None
                    try: citation_kind = x.find('document-id').findtext('kind').strip()[:10]
                    except: citation_kind = None
                    try: citation_name = x.find('document-id').findtext('name').strip()[:100]
                    except: citation_name = None
                    try: citation_date = USPTOSanitizer.return_formatted_date(x.find('document-id').findtext('date'), args_array, document_id)
                    except: citation_date = None
                    try: citation_category = rfc.findtext('category').strip().upper()[:20]
                    except Exception as e: citation_category = None
                    # BUGFIX: guard against a missing country tag -- previously
                    # citation_country.strip() raised an uncaught AttributeError
                    # on None.  Citations without a country go to the foreign table.
                    if citation_country is not None and citation_country.strip().upper() == 'US':
                        # US patent citations
                        processed_gracit.append({
                            "table_name" : "uspto.GRACIT_G",
                            "GrantID" : document_id,
                            "Position" : uspatcit_position,
                            "CitedID" : citation_grant_id,
                            "Kind" : citation_kind,
                            "Name" : citation_name,
                            "Date" : citation_date,
                            "Country" : citation_country,
                            "Category" : citation_category,
                            "FileName" : args_array['file_name']
                        })
                        uspatcit_position += 1
                    else:
                        # Foreign patent citations
                        processed_forpatcit.append({
                            "table_name" : "uspto.FORPATCIT_G",
                            "GrantID" : document_id,
                            "Position" : forpatcit_position,
                            "CitedID" : citation_grant_id,
                            "Kind" : citation_kind,
                            "Name" : citation_name,
                            "Date" : citation_date,
                            "Country" : citation_country,
                            "Category" : citation_category,
                            "FileName" : args_array['file_name']
                        })
                        forpatcit_position += 1
                # If the non-patent citations are found
                elif rfc.find('nplcit') != None:
                    x = rfc.find('nplcit')
                    # The reference string may contain markup (<i>, <sup>, etc.)
                    try: npatcit_text = USPTOSanitizer.strip_for_csv(x.findtext('othercit'))
                    except: npatcit_text = None
                    try: citation_category = rfc.findtext('category').strip().upper()[:20]
                    except: citation_category = None
                    processed_nonpatcit.append({
                        "table_name" : "uspto.NONPATCIT_G",
                        "GrantID" : document_id,
                        "Position" : nptc_position,
                        "Citation" : npatcit_text,
                        "Category" : citation_category,
                        "FileName" : args_array['file_name']
                    })
                    nptc_position += 1

        # Find number of claims
        try: claims_num = r.findtext('number-of-claims').strip()
        except: claims_num = None

        # Find the number of figures and number of drawings
        nof = r.find('figures')
        try:
            number_of_drawings = nof.findtext('number-of-drawing-sheets').strip()
            number_of_drawings = number_of_drawings.split("/")[0].strip()
        except:
            number_of_drawings = None
        try: number_of_figures = nof.findtext('number-of-figures').strip()
        except: number_of_figures = None

        # Find the parties; check if XML format uses 'us-parties' or 'parties'
        if r.find('us-parties') != None:
            parties_id_string = "us-parties"
        elif r.find('parties') != None:
            parties_id_string = "parties"
        else:
            parties_id_string = "parties"
        prt = r.find(parties_id_string)
        if prt != None:
            appl_position = 1
            invt_position = 1
            # Check if the XML format uses 'applicants' or 'us-applicants'
            if prt.find('us-applicants') != None:
                applicants_id_string = 'us-applicants'
            elif prt.find('applicants') != None:
                applicants_id_string = 'applicants'
            else:
                applicants_id_string = 'applicants'
            apts = prt.find(applicants_id_string)
            if apts != None:
                # Check if the XML format uses 'applicant' or 'us-applicant'
                if apts.find('us-applicant') != None:
                    applicant_id_string = 'us-applicant'
                elif apts.find('applicant') != None:
                    applicant_id_string = 'applicant'
                else:
                    applicant_id_string = 'applicant'
                for apt in apts.findall(applicant_id_string):
                    # Get the inventor status of the applicant
                    try: inventor_status = apt.attrib['app-type']
                    except: inventor_status = None
                    if apt.find('addressbook') != None:
                        try: applicant_orgname = apt.find('addressbook').findtext('orgname')[:300].strip()
                        except: applicant_orgname = None
                        try: applicant_first_name = apt.find('addressbook').findtext('first-name')[:100].strip()
                        except: applicant_first_name = None
                        try: applicant_last_name = apt.find('addressbook').findtext('last-name')[:100].strip()
                        except: applicant_last_name = None
                        try: applicant_city = apt.find('addressbook').find('address').findtext('city')[:100].strip()
                        except: applicant_city = None
                        try: applicant_state = apt.find('addressbook').find('address').findtext('state')[:25].strip()
                        except: applicant_state = None
                        try: applicant_country = apt.find('addressbook').find('address').findtext('country')[:5].strip()
                        except: applicant_country = None
                        try: inventor_residence = apt.findtext('residence')[:100].strip()
                        except: inventor_residence = None
                        # Append SQL data into dictionary to be written later
                        processed_applicant.append({
                            "table_name" : "uspto.APPLICANT_G",
                            "GrantID" : document_id,
                            "OrgName" : applicant_orgname,
                            "Position" : appl_position,
                            "FirstName" : applicant_first_name,
                            "LastName" : applicant_last_name,
                            "City" : applicant_city,
                            "State" : applicant_state,
                            "Country" : applicant_country,
                            "FileName" : args_array['file_name']
                        })
                        appl_position += 1
                        # Check if the applicant is also an inventor.
                        # BUGFIX: guard None -- `in` on a None inventor_status
                        # raised an uncaught TypeError.
                        if inventor_status is not None and "inventor" in inventor_status:
                            processed_inventor.append({
                                "table_name" : "uspto.INVENTOR_G",
                                "GrantID" : document_id,
                                "Position" : invt_position,
                                "FirstName" : applicant_first_name,
                                "LastName" : applicant_last_name,
                                "City" : applicant_city,
                                "State" : applicant_state,
                                "Country" : applicant_country,
                                "Residence" : inventor_residence,
                                "FileName" : args_array['file_name']
                            })
                            invt_position += 1

            # Find all inventor data
            for invts in prt.findall('inventors'):
                for inv in invts.findall('inventor'):
                    try: inventor_sequence = USPTOSanitizer.strip_leading_zeros(inv.attrib['sequence'])
                    except: inventor_sequence = position
                    if inv.find('addressbook') != None:
                        try: inventor_first_name = inv.find('addressbook').findtext('first-name')[:100].strip()
                        except: inventor_first_name = None
                        try: inventor_last_name = inv.find('addressbook').findtext('last-name')[:100].strip()
                        except: inventor_last_name = None
                        try: inventor_city = inv.find('addressbook').find('address').findtext('city')[:100].strip()
                        except: inventor_city = None
                        try: inventor_state = inv.find('addressbook').find('address').findtext('state')[:100].strip()
                        except: inventor_state = None
                        try: inventor_country = inv.find('addressbook').find('address').findtext('country')[:5].strip()
                        except: inventor_country = None
                        # NOTE(review): residence is read from the 'country' tag,
                        # which looks like a copy-paste slip -- confirm against
                        # the XML4 schema before changing.
                        try: inventor_residence = inv.find('addressbook').find('address').findtext('country')[:5].strip()
                        except: inventor_residence = None
                        processed_inventor.append({
                            "table_name" : "uspto.INVENTOR_G",
                            "GrantID" : document_id,
                            "Position" : invt_position,
                            "FirstName" : inventor_first_name,
                            "LastName" : inventor_last_name,
                            "City" : inventor_city,
                            "State" : inventor_state,
                            "Country" : inventor_country,
                            "Residence" : inventor_residence,
                            "FileName" : args_array['file_name']
                        })
                        invt_position += 1

            # Find all agent data
            for agns in prt.findall('agents'):
                position = 1
                for agn in agns.findall('agent'):
                    try: agent_sequence = USPTOSanitizer.strip_leading_zeros(agn.attrib['sequence'])
                    except: agent_sequence = position
                    if agn.find('addressbook') != None:
                        try: agent_orgname = agn.find('addressbook').findtext('orgname')[:300].strip()
                        except: agent_orgname = None
                        try: agent_last_name = agn.find('addressbook').findtext('last-name')[:100].strip()
                        except: agent_last_name = None
                        try: agent_first_name = agn.find('addressbook').findtext('first-name')[:100].strip()
                        except: agent_first_name = None
                        try: agent_country = agn.find('addressbook').find('address').findtext('country')[:3].strip()
                        except: agent_country = None
                        processed_agent.append({
                            "table_name" : "uspto.AGENT_G",
                            "GrantID" : document_id,
                            "Position" : agent_sequence,
                            "OrgName" : agent_orgname,
                            "LastName" : agent_last_name,
                            "FirstName" : agent_first_name,
                            "Country" : agent_country,
                            "FileName" : args_array['file_name']
                        })
                        position += 1

        # Find all assignee data
        for asn in r.findall('assignees'):
            position = 1
            for x in asn.findall('assignee'):
                if x.find('addressbook') != None:
                    try: asn_orgname = x.find('addressbook').findtext('orgname')[:500].strip()
                    except: asn_orgname = None
                    try: asn_role = x.find('addressbook').findtext('role')[:45].strip()
                    except: asn_role = None
                    try: asn_city = x.find('addressbook').find('address').findtext('city')[:100].strip()
                    except: asn_city = None
                    try: asn_state = x.find('addressbook').find('address').findtext('state')[:100].strip()
                    except: asn_state = None
                    try: asn_country = x.find('addressbook').find('address').findtext('country')[:5].strip()
                    except: asn_country = None
                    processed_assignee.append({
                        "table_name" : "uspto.ASSIGNEE_G",
                        "GrantID" : document_id,
                        "Position" : position,
                        "OrgName" : asn_orgname,
                        "Role" : asn_role,
                        "City" : asn_city,
                        "State" : asn_state,
                        "Country" : asn_country,
                        "FileName" : args_array['file_name']
                    })
                    position += 1

        # Find all examiner data (primary and assistant share one counter)
        for exm in r.findall('examiners'):
            position = 1
            for x in exm.findall('primary-examiner'):
                try: exm_last_name = x.findtext('last-name')[:50].strip()
                except: exm_last_name = None
                try: exm_first_name = x.findtext('first-name')[:50].strip()
                except: exm_first_name = None
                try: exm_department = x.findtext('department')[:100].strip()
                except: exm_department = None
                processed_examiner.append({
                    "table_name" : "uspto.EXAMINER_G",
                    "GrantID" : document_id,
                    "Position" : position,
                    "LastName" : exm_last_name,
                    "FirstName" : exm_first_name,
                    "Department" : exm_department,
                    "FileName" : args_array['file_name']
                })
                position += 1
            for x in exm.findall('assistant-examiner'):
                try: exm_last_name = x.findtext('last-name')[:50].strip()
                except: exm_last_name = None
                try: exm_first_name = x.findtext('first-name')[:50].strip()
                except: exm_first_name = None
                try: exm_department = x.findtext('department')[:100].strip()
                except: exm_department = None
                processed_examiner.append({
                    "table_name" : "uspto.EXAMINER_G",
                    "GrantID" : document_id,
                    "Position" : position,
                    "LastName" : exm_last_name,
                    "FirstName" : exm_first_name,
                    "Department" : exm_department,
                    "FileName" : args_array['file_name']
                })
                position += 1

        # Find main priority claims tag
        pcs = r.find('priority-claims')
        position = 1
        if pcs is not None:
            # Find all priority claims in main tag
            for pc in pcs.findall('priority-claim'):
                try: pc_country = pc.findtext('country')[:5].strip()
                except: pc_country = None
                try: pc_kind = pc.attrib['kind'][:45].strip()
                except: pc_kind = None
                try: pc_doc_num = pc.findtext('doc-number')[:45].strip()
                except: pc_doc_num = None
                try: pc_date = USPTOSanitizer.return_formatted_date(pc.findtext('date'), args_array, document_id)
                except: pc_date = None
                processed_foreignpriority.append({
                    "table_name" : "uspto.FOREIGNPRIORITY_G",
                    "GrantID" : document_id,
                    "Position" : position,
                    "Kind" : pc_kind,
                    "Country" : pc_country,
                    "DocumentID" : pc_doc_num,
                    "PriorityDate" : pc_date,
                    "FileName" : args_array['file_name']
                })
                position += 1

        # Find the abstract
        try:
            a_elem = document_root.find('abstract')
            if a_elem is not None:
                abstract = USPTOSanitizer.strip_for_csv(USPTOSanitizer.return_element_text(a_elem))
            else:
                abstract = None
        except Exception as e:
            abstract = None

        # Find the description
        try:
            d_elem = document_root.find('description')
            if d_elem is not None:
                description = USPTOSanitizer.strip_for_csv(' '.join(d_elem.itertext()))
            else:
                description = None
        except Exception as e:
            description = None

        # Find the claims
        try:
            c_elem = document_root.find('claims')
            if c_elem is not None:
                claims = USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
            else:
                claims = None
        except Exception as e:
            claims = None

        # Append the main grant record; any still-unset field (tag missing
        # entirely from the document) surfaces here as a NameError and is
        # logged rather than aborting the whole file.
        try:
            processed_grant.append({
                "table_name" : "uspto.GRANT",
                "GrantID" : document_id,
                "Title" : title,
                "IssueDate" : pub_date,
                "Kind" : kind,
                "USSeriesCode" : series_code,
                "Abstract" : abstract,
                "ClaimsNum" : claims_num,
                "DrawingsNum" : number_of_drawings,
                "FiguresNum" : number_of_figures,
                "ApplicationID" : app_no,
                "Description" : description,
                "Claims" : claims,
                "FileDate" : app_date,
                "AppType" : app_type,
                "GrantLength" : grant_length,
                "FileName" : args_array['file_name']
            })
        except Exception as e:
            traceback.print_exc()
            # BUGFIX: str() the id -- concatenating a None document_id raised
            # a TypeError inside this handler.
            logger.warning("Could not append patent data to array for patent number: " + str(document_id) + " Traceback: " + traceback.format_exc())

    # Return a dictionary of the processed data arrays
    return {
        "processed_grant" : processed_grant,
        "processed_applicant" : processed_applicant,
        "processed_examiner" : processed_examiner,
        "processed_assignee" : processed_assignee,
        "processed_agent" : processed_agent,
        "processed_inventor" : processed_inventor,
        "processed_usclass" : processed_usclass,
        "processed_intclass" : processed_intclass,
        "processed_cpcclass" : processed_cpcclass,
        "processed_gracit" : processed_gracit,
        "processed_forpatcit" : processed_forpatcit,
        "processed_nonpatcit" : processed_nonpatcit,
        "processed_foreignpriority" : processed_foreignpriority
    }
def extract_XML4_application_tag_counts(args_array):
    """Count occurrences of XML4 application tags in a zipped XML file.

    Extracts the XML file named in args_array from its zip archive, scans it
    line by line, and tallies how many lines contain a marker tag for each
    destination table.

    Returns:
        dict of per-table counts (plus 'file_name'), or False when the XML
        file could not be extracted from the zip.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pull the XML file contents out of the zip archive
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    if xml_file_contents == None or xml_file_contents == False:
        return False

    # Marker substrings that identify each table's tag(s)
    tags_dict = {
        "APPLICATION" : ["<us-patent-application"],
        "INTCLASS_A" : ["<classification-ipcr"],
        "USCLASS_A" : ["<main-classification", "<further-classification"],
        "CPCCLASS_A" : ["<classification-cpc"],
        "FOREIGNPRIORITY_A" : ["<priority-claim>", "<priority-claim "],
        "AGENT_A" : ["<agent>", "<agent "],
        "ASSIGNEE_A" : ["<assignee>", "<assignee "],
        "INVENTOR_A" : ["<inventor>", "<inventor "],
        "APPLICANT_A" : ["<us-applicant>", "<applicant>", "<us-applicant ", "<applicant "]
    }

    # Per-table counters
    counts_dict = {
        "file_name" : args_array['file_name'],
        "APPLICATION" : 0,
        "INTCLASS_A" : 0,
        "USCLASS_A" : 0,
        "CPCCLASS_A" : 0,
        "FOREIGNPRIORITY_A" : 0,
        "AGENT_A" : 0,
        "ASSIGNEE_A" : 0,
        "INVENTOR_A" : 0,
        "APPLICANT_A" : 0
    }

    # Announce start on stdout and in the log
    print("-- Starting the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # Scan the file; a line increments a table's counter at most once,
    # even when it contains several of that table's markers.
    for raw_line in xml_file_contents:
        decoded_line = USPTOSanitizer.decode_line(raw_line)
        for table_key, markers in tags_dict.items():
            if isinstance(markers, list) and any(m in decoded_line for m in markers):
                counts_dict[table_key] += 1

    # Announce completion on stdout and in the log
    print("-- Finished the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Optionally echo the counts to stdout before returning them
    if args_array['stdout_level'] == 1:
        pprint(counts_dict)
    return counts_dict
def extract_XML1_application_tag_counts(args_array):
    """Count per-table XML tags in a USPTO XML1 (pre-2005) application file.

    First scans the extracted XML line by line for the opening tags mapped
    to each database table, then rebuilds each
    <patent-application-publication> segment and parses it with ElementTree
    to count assignee and inventor elements that cannot be counted reliably
    from raw lines. Returns a dictionary of counts keyed by table name, or
    False when the XML contents cannot be extracted from the ZIP file.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents == None or xml_file_contents == False:
        return False

    # Opening tags that identify a record for each table.
    # CPCCLASS_A and APPLICANT_A tags are not included in XML1 applications.
    tags_dict = {
        "APPLICATION" : ["<patent-application-publication"],
        "INTCLASS_A" : ["<classification-ipc-primary>", "<classification-ipc-secondary>"],
        "USCLASS_A" : ["<classification-us-primary>", "<classification-us-secondary>"],
        "FOREIGNPRIORITY_A" : ["<priority-application-number"],
        "AGENT_A" : ["<correspondence-address>"],
        "INVENTOR_A" : ["<first-named-inventor", "<inventor>"],
    }

    # Declare a dictionary to hold counts by table
    counts_dict = {
        "file_name" : args_array['file_name'],
        "APPLICATION" : 0,
        "INTCLASS_A" : 0,
        "USCLASS_A" : 0,
        "CPCCLASS_A" : 0,
        "FOREIGNPRIORITY_A" : 0,
        "AGENT_A" : 0,
        "ASSIGNEE_A" : 0,
        "INVENTOR_A" : 0,
        "APPLICANT_A" : 0
    }

    # Print to stdout and log
    print("-- Starting the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # First pass: count the simple line-level tags
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # Loop through tags_dict items and look for XML tag
        for table, tag in tags_dict.items():
            item_found = False
            if isinstance(tag, list):
                for item in tag:
                    if item in line:
                        item_found = True
            if item_found == True:
                # Increment the count for appropriate table
                counts_dict[table] += 1

    # Second pass: rebuild each well-formed XML segment and parse it to
    # count the tags that need real XML parsing (assignee / inventor)
    xml_string = ''
    patent_xml_started = False
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # Start of a well-formed XML segment for one application record
        if "<patent-application-publication" in line:
            patent_xml_started = True
            xml_string += "<patent-application-publication>"
        # End of the well-formed XML segment for a single application
        elif "</patent-application-publication" in line:
            patent_xml_started = False
            xml_string += "</patent-application-publication>"
            # Pass the accumulated segment into Element Tree
            document_root = ET.fromstring(xml_string)
            r = document_root.find('subdoc-bibliographic-information')
            # FIX: guard against a malformed segment; previously a missing
            # bibliographic element raised AttributeError and aborted the
            # entire counting process for the file
            if r is not None:
                # Count the number of assignee tags
                counts_dict['ASSIGNEE_A'] += len(r.findall('assignee'))
                # Count the number of inventor tags
                # NOTE(review): <inventor> lines are also matched by the
                # line scan above, so these may be double-counted — confirm
                # against the expected-counts verification logic
                counts_dict['INVENTOR_A'] += len(r.findall('inventor'))
            # Reset the xml string for the next segment
            xml_string = ''
        # Accumulate lines while inside a single application segment
        elif patent_xml_started == True:
            # Fix old HTML-style character entities before parsing
            xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Print to stdout and log (typo "appication" fixed)
    print("-- Finished the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Optionally dump the counts to stdout
    if args_array['stdout_level'] == 1:
        pprint(counts_dict)
    # Return the dictionary of counts for found tags
    return counts_dict
def extract_XML4_grant_tag_counts(args_array):
    """Count per-table XML tags in a USPTO XML4 grant bulk file.

    Scans the extracted XML line by line for the opening tags mapped to each
    database table, then rebuilds each <us-patent-grant> segment and parses
    it with ElementTree to count CPC/US classifications and to split patent
    citations into US (GRACIT_G) vs foreign (FORPATCIT_G). Returns a
    dictionary of counts keyed by table name, or False when the XML contents
    cannot be extracted from the ZIP file.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")
    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents == None or xml_file_contents == False:
        return False
    # Opening-tag variants that identify a record destined for each table
    tags_dict = {
        "GRANT" : ["<us-patent-grant"],
        "INTCLASS_G" : ["<classification-ipcr"],
        "AGENT_G" : ["<agent>", "<agent "],
        "ASSIGNEE_G" : ["<assignee>", "<assignee "],
        "APPLICANT_G" : ["<us-applicant>", "<us-applicant ", "<applicant", "<applicant>"],
        "INVENTOR_G" : ["<inventor>", "<inventor ", "applicant-inventor"],
        "NONPATCIT_G" : ["<nplcit"],
        "EXAMINER_G" : ["<primary-examiner", "<assistant-examiner"],
        "FOREIGNPRIORITY_G" : ["<priority-claim>", "<priority-claim "]
    }
    # Declare a dictionary to hold counts by table
    counts_dict = {
        "file_name" : args_array['file_name'],
        "GRANT" : 0,
        "INTCLASS_G" : 0,
        "CPCCLASS_G" : 0,
        "USCLASS_G" : 0,
        "INVENTOR_G" : 0,
        "AGENT_G" : 0,
        "ASSIGNEE_G" : 0,
        "APPLICANT_G" : 0,
        "NONPATCIT_G" : 0,
        "EXAMINER_G" : 0,
        "GRACIT_G" : 0,
        "FORPATCIT_G" : 0,
        "FOREIGNPRIORITY_G" : 0
    }
    # Print to stdout and log
    print("-- Starting the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    # First pass: count the simple line-level tags.
    # A line increments a table's count at most once (item_found is boolean).
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # Loop through tags_dict items and look for XML tag
        for table, tag in tags_dict.items():
            item_found = False
            # If list is provided
            if isinstance(tag, list):
                for item in tag:
                    # Look for field tag
                    if item in line:
                        item_found = True
            if item_found == True:
                # Increment the count for appropriate table
                counts_dict[table] += 1
    # Second pass: parse the tags that need to be XML parsed.
    # Create variables needed to parse the file.
    xml_string = ''
    patent_xml_started = False
    # Loop through all lines in the xml file
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # This identifies the start of well formed XML segment for patent
        # grant bibliographic information (tag is normalized — attributes dropped)
        if "<us-patent-grant" in line:
            patent_xml_started = True
            xml_string += "<us-patent-grant>"
        # This identifies end of well-formed XML segement for single patent
        # grant bibliographic information
        elif "</us-patent-grant" in line:
            patent_xml_started = False
            xml_string += "</us-patent-grant>"
            # Pass the raw_data data into Element Tree
            document_root = ET.fromstring(xml_string)
            # Extract the bibliographic root tag
            r = document_root.find('us-bibliographic-data-grant')
            # Get the patent CPC class count from the field-of-classification-search
            foc = r.find('us-field-of-classification-search')
            if foc is not None:
                counts_dict["CPCCLASS_G"] += len(foc.findall('classification-cpc-text'))
                counts_dict["USCLASS_G"] += len(foc.findall('classification-national'))
            # Get USCLASS_G count if file format uses field-of-search
            foc = r.find('field-of-search')
            if foc is not None:
                counts_dict["USCLASS_G"] += len(foc.findall('classification-national'))
            # Count the citation / reference tags — the element name varies
            # between format revisions, so probe each candidate in turn
            if r.find('us-references-cited') != None:
                ref_cited_id_string = "us-references-cited"
            elif r.find('references-cited') != None:
                ref_cited_id_string = "references-cited"
            else:
                ref_cited_id_string = "references"
            rf = r.find(ref_cited_id_string)
            if rf != None:
                # Check if the XML format is using 'citation' or 'us-citation'
                # (the fallback default is 'us-citation')
                if rf.find('citation') != None:
                    citation_id_string = "citation"
                elif rf.find('us-citation') != None:
                    citation_id_string = "us-citation"
                else:
                    citation_id_string = "us-citation"
                all_rfc = rf.findall(citation_id_string)
                for rfc in all_rfc:
                    # If the patent citation child is found must be a patent citation
                    if rfc.find('patcit') != None:
                        x = rfc.find('patcit')
                        try:
                            citation_country = x.find('document-id').findtext('country').strip()
                        except:
                            # Missing country element — treated as foreign below
                            citation_country = None
                        # Check if US or foreign patent citation
                        if(citation_country == 'US'):
                            counts_dict["GRACIT_G"] += 1
                        else:
                            counts_dict["FORPATCIT_G"] += 1
            # Reset the xml string for the next segment
            xml_string = ''
        # This is used to append lines of file when inside single patent grant
        elif patent_xml_started == True:
            xml_string += line
    # Print to stdout and log
    print("-- Finished the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    # Optionally dump the counts to stdout
    if args_array['stdout_level'] == 1:
        pprint(counts_dict)
    # Return the dictionary of counts for found tags
    return counts_dict
def extract_XML1_application(raw_data, args_array):
    """Extract bibliographic data from one USPTO XML1 (pre-2005) application.

    Parses the raw XML string of a single <patent-application-publication>
    record and collects application, foreign-priority, assignee, agent,
    inventor and classification data into per-table lists of dictionaries
    (each carrying its destination "table_name") ready for SQL insertion.

    Args:
        raw_data: string of well-formed XML for one application record.
        args_array: runtime arguments; 'url_link', 'uspto_xml_format' and
            'file_name' are read here.

    Returns:
        Dictionary mapping "processed_*" keys to the per-table lists.
    """
    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define required output arrays
    processed_application = []
    processed_foreignpriority = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []

    # Set process start time
    start_time = time.time()

    # FIX: default all top-level fields so a record missing its optional
    # <document-id> or <domestic-filing-data> elements cannot raise
    # NameError when these names are referenced in the appends below
    document_id = None
    kind = None
    pub_date = None
    app_no = None
    app_date = None
    series_code = None

    # Pass the xml into Element tree object
    document_root = ET.fromstring(raw_data)
    r = document_root.find('subdoc-bibliographic-information')

    # Get and fix the document_id data
    di = r.find('document-id')
    if di is not None:
        try:
            # This document ID is NOT the application number
            document_id = di.findtext('doc-number')
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try:
            kind = di.findtext('kind-code')[:2]
        except:
            kind = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(di.findtext('document-date'), args_array, document_id)
        except:
            pub_date = None
    try:
        app_type = r.findtext('publication-filing-type')[:45]
    except:
        app_type = None

    # Get application filing data
    ar = r.find('domestic-filing-data')
    if ar is not None:
        try:
            app_no = ar.find('application-number').findtext('doc-number')[:20]
        except:
            app_no = None
        try:
            app_date = USPTOSanitizer.return_formatted_date(ar.findtext('filing-date'), args_array, document_id)
        except:
            app_date = None
        try:
            series_code = ar.findtext('application-number-series-code')[:2]
        except:
            series_code = None

    technical_information_element = r.find('technical-information')
    # Init position for international classifications
    position = 1
    if technical_information_element is not None:
        # Get international classification data
        ic = technical_information_element.find('classification-ipc')
        if ic is not None:
            # Process the primary international class
            icm = ic.find('classification-ipc-primary')
            # TODO: regex the class found into class, subclass and other
            # TODO: find out what maingroup and subgroup are found in this file format
            try:
                i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = USPTOSanitizer.return_international_class(icm.findtext('ipc'))
                i_class_sec = i_class_sec[:15]
                i_class = i_class[:15]
                i_subclass = i_subclass[:15]
                i_class_mgr = i_class_mgr[:15]
                i_class_sgr = i_class_sgr[:15]
            except:
                i_class_sec = None
                i_class = None
                i_subclass = None
                i_class_mgr = None
                i_class_sgr = None
                # FIX: str() guards against document_id being None, which
                # would raise TypeError inside this handler
                logger.warning("Malformed international class found in application ID: " + str(document_id) + " in file: " + url_link)
            # Append SQL data into dictionary to be written later
            processed_intclass.append({
                "table_name": "uspto.INTCLASS_A",
                "ApplicationID": app_no,
                "Position": position,
                "Section": i_class_sec,
                "Class": i_class,
                "SubClass": i_subclass,
                "MainGroup": i_class_mgr,
                "SubGroup": i_class_sgr,
                "FileName": args_array['file_name']
            })
            # Increment Position
            position += 1

            # Process any secondary international classes
            ics = ic.findall('classification-ipc-secondary')
            if ics is not None:
                for ics_item in ics:
                    try:
                        i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = USPTOSanitizer.return_international_class(ics_item.findtext('ipc'))
                        i_class_sec = i_class_sec[:15]
                        i_class = i_class[:15]
                        i_subclass = i_subclass[:15]
                        i_class_mgr = i_class_mgr[:15]
                        i_class_sgr = i_class_sgr[:15]
                    except:
                        i_class_sec = None
                        i_class = None
                        i_subclass = None
                        i_class_mgr = None
                        i_class_sgr = None
                        logger.warning("Malformed international class found in application ID: " + str(document_id) + " in file: " + url_link)
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_A",
                        "ApplicationID": app_no,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "FileName": args_array['file_name']
                    })
                    # Increment position
                    position += 1

        # Get US classification data
        nc = technical_information_element.find('classification-us')
        # Init position
        position = 1
        if nc is not None:
            uspc = nc.find('classification-us-primary').find('uspc')
            try:
                n_class_main = uspc.findtext('class')[:5]
            except:
                n_class_main = None
            try:
                n_subclass = uspc.findtext('subclass')[:15]
            except:
                n_subclass = None
            # Append SQL data into dictionary to be written later
            processed_usclass.append({
                "table_name": "uspto.USCLASS_A",
                "ApplicationID": app_no,
                "Position": position,
                "Class": n_class_main,
                "SubClass": n_subclass,
                "FileName": args_array['file_name']
            })
            # Increment position
            position += 1
            us_classification_secondary_element = nc.find('classification-us-secondary')
            if us_classification_secondary_element is not None:
                uspc = us_classification_secondary_element.find('uspc')
                try:
                    n_class_main = uspc.findtext('class')[:5]
                except:
                    n_class_main = None
                try:
                    # FIX: truncate to 15 to match the primary class above
                    # (was inconsistently truncated to 5)
                    n_subclass = uspc.findtext('subclass')[:15]
                except:
                    n_subclass = None
                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "FileName": args_array['file_name']
                })
                # Increment position
                position += 1

    # Get priority claims
    position = 1
    # XML1 priority claims carry no kind attribute
    pc_kind = None
    for pc in r.findall('foreign-priority-data'):
        try:
            pc_country = pc.findtext('country-code')[:100]
        except:
            pc_country = None
        try:
            pc_doc_num = pc.find('priority-application-number').findtext('doc-number')[:100]
        except:
            pc_doc_num = None
        try:
            pc_date = USPTOSanitizer.return_formatted_date(pc.findtext('filing-date'), args_array, document_id)
        except:
            pc_date = None
        # Append SQL data into dictionary to be written later
        processed_foreignpriority.append({
            "table_name": "uspto.FOREIGNPRIORITY_A",
            "ApplicationID": app_no,
            "Position": position,
            "Kind": pc_kind,
            "Country": pc_country,
            "DocumentID": pc_doc_num,
            "PriorityDate": pc_date,
            "FileName": args_array['file_name']
        })
        position += 1

    # Get invention title (technical_information_element may be None;
    # the bare except covers that case and leaves title as None)
    try:
        title = technical_information_element.findtext('title-of-invention')[:500]
    except:
        title = None

    # Get inventor data
    iv = r.find('inventors')
    if iv is not None:
        # Init position
        position = 1
        for inventor in iv.findall('first-named-inventor'):
            # FIX: reset residence fields for every inventor so values from
            # a previous iteration cannot leak into this record
            inventor_city = None
            inventor_state = None
            inventor_country = None
            n = inventor.find('name')
            try:
                inventor_first_name = n.findtext('given-name')[:100]
            except:
                inventor_first_name = None
            try:
                inventor_last_name = n.findtext('family-name')[:100]
            except:
                inventor_last_name = None
            res = inventor.find('residence')
            if res is not None:
                residence_us = res.find('residence-us')
                if residence_us is not None:
                    try:
                        inventor_city = residence_us.findtext('city')[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_us.findtext('state')[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_us.findtext('country-code')[:100]
                    except:
                        inventor_country = None
                residence_non_us = res.find('residence-non-us')
                if residence_non_us is not None:
                    try:
                        inventor_city = residence_non_us.findtext('city')[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_non_us.findtext('state')[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_non_us.findtext('country-code')[:100]
                    except:
                        inventor_country = None
            # Append SQL data into dictionary to be written later
            processed_inventor.append({
                "table_name": "uspto.INVENTOR_A",
                "ApplicationID": app_no,
                "Position": position,
                "FirstName": inventor_first_name,
                "LastName": inventor_last_name,
                "City": inventor_city,
                "State": inventor_state,
                "Country": inventor_country,
                "FileName": args_array['file_name']
            })
            # Increment position
            position += 1

        # For all secondary inventors
        for inventor in iv.findall('inventor'):
            # FIX: reset all fields per inventor (previously values could
            # carry over from the prior loop iteration when elements were
            # missing)
            inventor_first_name = None
            inventor_last_name = None
            inventor_city = None
            inventor_state = None
            inventor_country = None
            n = inventor.find('name')
            if n is not None:
                try:
                    inventor_first_name = n.findtext('given-name')[:100]
                except:
                    inventor_first_name = None
                try:
                    inventor_last_name = n.findtext('family-name')[:100]
                except:
                    inventor_last_name = None
            res = inventor.find('residence')
            if res is not None:
                residence_us = res.find('residence-us')
                if residence_us is not None:
                    try:
                        inventor_city = residence_us.findtext('city')[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_us.findtext('state')[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_us.findtext('country-code')[:100]
                    except:
                        inventor_country = None
                residence_non_us = res.find('residence-non-us')
                if residence_non_us is not None:
                    try:
                        inventor_city = residence_non_us.findtext('city')[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_non_us.findtext('state')[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_non_us.findtext('country-code')[:100]
                    except:
                        inventor_country = None
            # Append SQL data into dictionary to be written later
            processed_inventor.append({
                "table_name": "uspto.INVENTOR_A",
                "ApplicationID": app_no,
                "Position": position,
                "FirstName": inventor_first_name,
                "LastName": inventor_last_name,
                "City": inventor_city,
                "State": inventor_state,
                "Country": inventor_country,
                "FileName": args_array['file_name']
            })
            # Increment position
            position += 1

    # Get assignee data (XML1 carries at most one assignee element here)
    assignee_element = r.find('assignee')
    if assignee_element is not None:
        # Init position
        position = 1
        try:
            asn_role = assignee_element.findtext('assignee-type')[:100]
        except:
            asn_role = None
        try:
            asn_orgname = assignee_element.findtext('organization-name')[:300]
        except:
            asn_orgname = None
        ad = assignee_element.find('address')
        try:
            asn_city = ad.findtext('city')[:100]
        except:
            asn_city = None
        try:
            asn_state = ad.findtext('state')[:100]
        except:
            asn_state = None
        try:
            asn_country = ad.find('country').findtext('country-code')[:100]
        except:
            asn_country = None
        # Append SQL data into dictionary to be written later
        processed_assignee.append({
            "table_name": "uspto.ASSIGNEE_A",
            "ApplicationID": app_no,
            "Position": position,
            "OrgName": asn_orgname,
            "Role": asn_role,
            "City": asn_city,
            "State": asn_state,
            "Country": asn_country,
            "FileName": args_array['file_name']
        })
        # Increment position
        position += 1

    # Find the agent (correspondence address) element
    agent_element = r.find('correspondence-address')
    # Init position
    position = 1
    if agent_element is not None:
        # Default the address parts; the <address> element may be absent
        agent_city = None
        agent_state = None
        agent_country = None
        try:
            agent_orgname = agent_element.findtext('name-1')
        except:
            agent_orgname = None
        try:
            agent_orgname_2 = agent_element.findtext('name-2')
        except:
            agent_orgname_2 = None
        # Combine name-1 and name-2 and shorten if needed
        if agent_orgname != None and agent_orgname_2 != None:
            agent_orgname = agent_orgname + " " + agent_orgname_2
            agent_orgname = agent_orgname[:300]
        # BUG FIX: the element was assigned to 'adresss_element' but read
        # back as 'address_element'; the resulting NameError was swallowed
        # by a bare except, so the agent city/state/country were always None
        address_element = agent_element.find('address')
        if address_element is not None:
            try:
                agent_city = address_element.findtext('city')[:100]
            except:
                agent_city = None
            try:
                agent_state = address_element.findtext('state')[:100]
            except:
                agent_state = None
            try:
                agent_country = address_element.find('country').findtext('country-code')[:100]
            except:
                agent_country = None
        # Append SQL data into dictionary to be written later
        # (NOTE: AGENT_A for XML1 stores only OrgName and Country)
        processed_agent.append({
            "table_name": "uspto.AGENT_A",
            "ApplicationID": app_no,
            "Position": position,
            "OrgName": agent_orgname,
            "Country": agent_country,
            "FileName": args_array['file_name']
        })
        # Increment position
        position += 1

    # Find the abstract of the application
    try:
        abstract = USPTOSanitizer.return_element_text(document_root.find('subdoc-abstract')).strip()
    except:
        abstract = None

    # Append SQL data into dictionary to be written later
    processed_application.append({
        "table_name": "uspto.APPLICATION",
        "ApplicationID": app_no,
        "PublicationID": document_id,
        "AppType": app_type,
        "Title": title,
        "FileDate": app_date,
        "PublishDate": pub_date,
        "Kind": kind,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "FileName": args_array['file_name']
    })

    # Return a dictionary of the processed data arrays
    return {
        "processed_application": processed_application,
        "processed_foreignpriority": processed_foreignpriority,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass
    }
def extract_XML4_application(raw_data, args_array): # Import logger logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction") # Pass the url_link and format into local variables url_link = args_array['url_link'] uspto_xml_format = args_array['uspto_xml_format'] # Define required arrays processed_application = [] processed_priority_claims = [] processed_assignee = [] processed_applicant = [] processed_agent = [] processed_inventor = [] processed_usclass = [] processed_intclass = [] processed_cpcclass = [] # Set process start time start_time = time.time() # Print start message to stdout #print '- Starting to extract xml in USPTO application format ' + uspto_xml_format + " Start time: " + time.strftime("%c") # Pass the raw data into Element tree xml object patent_root = ET.fromstring(raw_data) # Start extract XML data for r in patent_root.findall('us-bibliographic-data-application'): # Get basic document ID information pr = r.find('publication-reference') pub_doc = pr.find('document-id') try: pub_country = pub_doc.findtext('country') except: pub_country = None try: document_id = pub_doc.findtext('doc-number') document_id = USPTOSanitizer.fix_patent_number(document_id) except: document_id = None logger.error("No Patent Number was found for: " + url_link) try: kind = pub_doc.findtext('kind')[:2] except: kind = None try: pub_date = USPTOSanitizer.return_formatted_date( pub_doc.findtext('date'), args_array, document_id) except: pub_date = None # Get application reference data ar = r.find('application-reference') if ar is not None: try: app_type = ar.attrib['appl-type'][:45] except: app_type = None app_doc = ar.find('document-id') try: app_country = app_doc.findtext('country') except: app_country = None try: app_no = app_doc.findtext('doc-number')[:20] except: app_no = None try: app_date = USPTOSanitizer.return_formatted_date( app_doc.findtext('date'), args_array, document_id) except: app_date = None # Get series code try: series_code = 
r.findtext('us-application-series-code')[:2] except: series_code = None # Get priority Claims pcs = r.find('priority-claims') if pcs is not None: for pc in pcs.findall('priority-claim'): try: pc_sequence = USPTOSanitizer.strip_leading_zeros( pc.attrib['sequence']) except: pc_sequence = None try: pc_kind = pc.attrib['kind'][:100] except: pc_kind = None try: pc_country = pc.findtext('country')[:100] except: pc_country = None try: pc_doc_num = pc.findtext('doc-number')[:100] except: pc_doc_num = None try: pc_date = USPTOSanitizer.return_formatted_date( pc.findtext('date'), args_array, document_id) except: pc_date = None # Append SQL data into dictionary to be written later processed_priority_claims.append({ "table_name": "uspto.FOREIGNPRIORITY_A", "ApplicationID": app_no, "Position": pc_sequence, "Kind": pc_kind, "Country": pc_country, "DocumentID": pc_doc_num, "PriorityDate": pc_date, "FileName": args_array['file_name'] }) #print processed_priority_claims # Get International classifcation data ics = r.find('classifications-ipcr') # Init position for int classifications position = 1 if ics is not None: # Get all international classification for icc in ics.findall('classification-ipcr'): for x in icc.getchildren(): if (USPTOSanitizer.check_tag_exists(x, 'section')): i_class_sec = x.text[:100] if (USPTOSanitizer.check_tag_exists(x, 'class')): i_class = x.text[:15] if (USPTOSanitizer.check_tag_exists(x, 'subclass')): i_subclass = x.text[:15] if (USPTOSanitizer.check_tag_exists(x, 'main-group')): i_class_mgr = x.text[:15] if (USPTOSanitizer.check_tag_exists(x, 'subgroup')): i_class_sgr = x.text[:15] # Append SQL data into dictionary to be written later processed_intclass.append({ "table_name": "uspto.INTCLASS_A", "ApplicationID": app_no, "Position": position, "Section": i_class_sec, "Class": i_class, "SubClass": i_subclass, "MainGroup": i_class_mgr, "SubGroup": i_class_sgr, "FileName": args_array['file_name'] }) # Increment position position += 1 #print processed_intclass 
# Get US Classification data nc = r.find('classification-national') # Init position position = 1 if nc is not None: try: n_class_country = nc.findtext('country') except: n_class_country = None try: n_class_info = nc.findtext('main-classification') except: n_class_info = None try: n_class_main, n_subclass = USPTOSanitizer.return_class( n_class_info) n_class_main = n_class_main[:5] n_subclass = n_subclass[:15] except: n_class_main = None n_subclass = None # Append SQL data into dictionary to be written later processed_usclass.append({ "table_name": "uspto.USCLASS_A", "ApplicationID": app_no, "Position": position, "Class": n_class_main, "SubClass": n_subclass, "FileName": args_array['file_name'] }) # Increment position position += 1 # TODO: find an instance of futher classification to parse if nc.findall('further-classification') is not None: nat_class_fur_root = nc.findall('further-classification') for n in nat_class_fur_root: try: n_class_info = n.text except: n_class_info = None try: n_class_main, n_subclass = USPTOSanitizer.return_class( n_class_info) n_class_main = n_class_main[:5] n_subclass = n_subclass[:15] except: n_class_main = None n_subclass = None # Append SQL data into dictionary to be written later processed_usclass.append({ "table_name": "uspto.USCLASS_A", "ApplicationID": app_no, "Position": position, "Class": n_class_main, "SubClass": n_subclass, "FileName": args_array['file_name'] }) # Increment position position += 1 # Get CPC Classification data cpc_class_element = r.find('classifications-cpc') # Init position position = 1 if cpc_class_element is not None: main_cpc_class_element = cpc_class_element.find('main-cpc') if main_cpc_class_element is not None: for cpc_class_item in main_cpc_class_element.findall( 'classification-cpc'): try: cpc_section = cpc_class_item.findtext('section')[:15] except: cpc_section = None try: cpc_class = cpc_class_item.findtext('class')[:15] except: cpc_class = None try: cpc_subclass = 
cpc_class_item.findtext('subclass')[:15] except: cpc_subclass = None try: cpc_mgr = cpc_class_item.findtext('main-group')[:15] except: cpc_mgr = None try: cpc_sgr = cpc_class_item.findtext('subgroup')[:15] except: cpc_sgr = None # Append SQL data into dictionary to be written later processed_cpcclass.append({ "table_name": "uspto.CPCCLASS_A", "ApplicationID": app_no, "Position": position, "Section": cpc_section, "Class": cpc_class, "SubClass": cpc_subclass, "MainGroup": cpc_mgr, "SubGroup": cpc_sgr, "FileName": args_array['file_name'] }) # Increment position position += 1 further_cpc_class = cpc_class_element.find('further-cpc') if further_cpc_class is not None: for cpc_class_item in further_cpc_class.findall( 'classification-cpc'): try: cpc_section = cpc_class_item.findtext('section')[:15] except: cpc_section = None try: cpc_class = cpc_class_item.findtext('class')[:15] except: cpc_class = None try: cpc_subclass = cpc_class_item.findtext('subclass')[:15] except: cpc_subclass = None try: cpc_mgr = cpc_class_item.findtext('main-group')[:15] except: cpc_mgr = None try: cpc_sgr = cpc_class_item.findtext('subgroup')[:15] except: cpc_sgr = None # Append SQL data into dictionary to be written later processed_cpcclass.append({ "table_name": "uspto.CPCCLASS_A", "ApplicationID": app_no, "Position": position, "Section": cpc_section, "Class": cpc_class, "SubClass": cpc_subclass, "MainGroup": cpc_mgr, "SubGroup": cpc_sgr, "FileName": args_array['file_name'] }) # Increment position position += 1 # Get the title of the application try: title = r.findtext('invention-title')[:500] except: title = None logger.error("Title not Found for :" + url_link + " Application ID: " + app_no) # Get number of claims try: claims_num = r.findtext('number-of-claims') except: claims_num = None # Get number of figure, drawings nof = r.find('figures') if nof is not None: try: number_of_drawings = nof.findtext('number-of-drawing-sheets') except: number_of_drawings = None try: number_of_figures = 
nof.findtext('number-of-figures') except: number_of_figures = None else: number_of_drawings = None number_of_figures = None # Increment position position = 1 # Get Associated party data parties_element = r.find('us-parties') if parties_element is not None: applicant_element = parties_element.find('us-applicants') # Get Applicant data for applicant_item in applicant_element.findall('us-applicant'): if (applicant_item.find('addressbook') != None): try: applicant_orgname = applicant_item.find( 'addressbook').findtext('orgname')[:300] except: applicant_orgname = None try: applicant_role = applicant_item.find( 'addressbook').findtext('role') except: applicant_role = None try: applicant_city = applicant_item.find( 'addressbook').find('address').findtext( 'city')[:100] except: applicant_city = None try: applicant_state = applicant_item.find( 'addressbook').find('address').findtext( 'state')[:100] except: applicant_state = None try: applicant_country = applicant_item.find( 'addressbook').find('address').findtext( 'country')[:100] except: applicant_country = None try: applicant_first_name = applicant_item.find( 'addressbook').findtext('first-name')[:100] except: applicant_first_name = None try: applicant_last_name = applicant_item.find( 'addressbook').findtext('last-name')[:100] except: applicant_last_name = None # Append SQL data into dictionary to be written later processed_applicant.append({ "table_name": "uspto.APPLICANT_A", "ApplicationID": app_no, "Position": position, "OrgName": applicant_orgname, "FirstName": applicant_first_name, "LastName": applicant_last_name, "City": applicant_city, "State": applicant_state, "Country": applicant_country, "FileName": args_array['file_name'] }) # Increment position position += 1 #print processed_applicant # Get the inventor data element invs = parties_element.find('inventors') # Init position position = 1 if invs is not None: # Get all inventors for inv in invs.findall("inventor"): if (inv.find('addressbook') != None): try: 
inventor_first_name = inv.find( 'addressbook').findtext('first-name')[:100] except: inventor_first_name = None try: inventor_last_name = inv.find( 'addressbook').findtext('last-name')[:100] except: inventor_last_name = None try: inventor_city = inv.find('addressbook').find( 'address').findtext('city')[:100] except: inventor_city = None try: inventor_state = inv.find('addressbook').find( 'address').findtext('state')[:100] except: inventor_state = None try: inventor_country = inv.find('addressbook').find( 'address').findtext('country')[:100] except: inventor_country = None try: inventor_nationality = inv.find( 'nationality').findtext('country')[:100] except: inventor_nationality = None try: inventor_residence = inv.find( 'residence').findtext('country')[:300] except: inventor_residence = None # Append SQL data into dictionary to be written later processed_inventor.append({ "table_name": "uspto.INVENTOR_A", "ApplicationID": app_no, "Position": position, "FirstName": inventor_first_name, "LastName": inventor_last_name, "City": inventor_city, "State": inventor_state, "Country": inventor_country, "Nationality": inventor_nationality, "Residence": inventor_residence, "FileName": args_array['file_name'] }) # Increment position position += 1 #print processed_inventor # Init position position = 1 # Get agent data #TODO Find if available in application ??? 
Where agents_element = parties_element.find('agents') if agents_element is not None: for agent_item in agents_element.findall('agent'): try: asn_sequence = agent_item.attrib['sequence'] except: asn_sequence = None if (agent_item.find('addressbook') != None): try: atn_orgname = agent_item.find( 'addressbook').findtext('orgname')[:300] except: atn_orgname = None try: atn_last_name = agent_item.find( 'addressbook').findtext('last-name')[:100] except: atn_last_name = None try: atn_first_name = agent_item.find( 'addressbook').findtext('first-name')[:100] except: atn_first_name = None try: atn_country = agent_item.find('addressbook').find( 'address').findtext('country')[:100] except: atn_country = None # Append SQL data into dictionary to be written later processed_agent.append({ "table_name": "uspto.AGENT_A", "ApplicationID": app_no, "Position": position, "OrgName": atn_orgname, "LastName": atn_last_name, "FirstName": atn_first_name, "Country": atn_country, "FileName": args_array['file_name'] }) # Increment position position += 1 #print processed_agent # Get assignee data assignee_element = r.find('assignees') # Init position position += 1 if assignee_element is not None: for assignee_item in assignee_element.findall('assignee'): if (assignee_item.find('addressbook') != None): try: assignee_orgname = assignee_item.find( 'addressbook').findtext('orgname')[:300] except: assignee_orgname = None try: assignee_role = assignee_item.find( 'addressbook').findtext('role')[:45] except: assignee_role = None try: assignee_city = assignee_item.find('addressbook').find( 'address').findtext('city')[:100] except: assignee_city = None try: assignee_state = assignee_item.find( 'addressbook').find('address').findtext( 'state')[:100] except: assignee_state = None try: assignee_country = assignee_item.find( 'addressbook').find('address').findtext( 'country')[:100] except: assignee_country = None # Append SQL data into dictionary to be written later processed_assignee.append({ "table_name": 
"uspto.ASSIGNEE_A", "ApplicationID": app_no, "Position": position, "OrgName": assignee_orgname, "Role": assignee_role, "City": assignee_city, "State": assignee_state, "Country": assignee_country, "FileName": args_array['file_name'] }) # Increment position position += 1 #print processed_assignee # Get abstract data # Find the abstract try: abstract_element = patent_root.find('abstract') if abstract_element is not None: abstract = USPTOSanitizer.return_element_text(abstract_element) except: abstract = None #print abstract # Append SQL data into dictionary to be written later processed_application.append({ "table_name": "uspto.APPLICATION", "ApplicationID": app_no, "PublicationID": document_id, "AppType": app_type, "Title": title, "FileDate": app_date, "PublishDate": pub_date, "Kind": kind, "USSeriesCode": series_code, "Abstract": abstract, "ClaimsNum": claims_num, "DrawingsNum": number_of_drawings, "FiguresNum": number_of_figures, "FileName": args_array['file_name'] }) # Return a dictionary of the processed_ data arrays return { "processed_application": processed_application, "processed_priority_claims": processed_priority_claims, "processed_assignee": processed_assignee, "processed_agent": processed_agent, "processed_inventor": processed_inventor, "processed_usclass": processed_usclass, "processed_intclass": processed_intclass, "processed_cpcclass": processed_cpcclass, }
def extract_XML1_application(raw_data, args_array):
    # Extract the bibliographic data from a USPTO XML version 1
    # patent-application document into per-table arrays of dictionaries
    # that are later written to the database.
    #
    # Parameters:
    #   raw_data: str holding one complete XML application document.
    #   args_array: dict of runtime settings; reads 'url_link',
    #     'uspto_xml_format' and 'file_name'.
    # Returns a dict mapping "processed_*" table groups to lists of row dicts.

    # Set process start time
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define required arrays
    processed_application = []
    processed_foreignpriority = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []

    # Pass the xml into Element tree object
    document_root = ET.fromstring(raw_data)
    r = document_root.find('subdoc-bibliographic-information')

    # Pre-initialize the optional bibliographic fields.  These were
    # previously assigned only inside the `if ... is not None` branches
    # below, so a document missing e.g. <technical-information> raised
    # NameError when the final processed_application row was built.
    document_id = None
    kind = None
    app_type = None
    pub_date = None
    app_no = None
    app_date = None
    series_code = None
    title = None

    # Get and fix the document_id data
    di = r.find('document-id')
    if di is not None:
        # This document ID is NOT application number
        try:
            document_id = di.findtext('doc-number').strip()
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try:
            kind = di.findtext('kind-code').strip()[:2]
            app_type = USPTOSanitizer.return_xml2_app_type(args_array, kind).strip()
        except:
            kind = None
            app_type = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(
                di.findtext('document-date'), args_array, document_id)
        except:
            pub_date = None

    # Get application filing data
    ar = r.find('domestic-filing-data')
    if ar is not None:
        try:
            app_no = ar.find('application-number').findtext(
                'doc-number').strip()[:20]
        except:
            app_no = None
        try:
            app_date = USPTOSanitizer.return_formatted_date(
                ar.findtext('filing-date'), args_array, document_id)
        except:
            app_date = None
        try:
            series_code = ar.findtext(
                'application-number-series-code').strip()[:2]
        except:
            series_code = None

    # Get technical information
    ti = r.find('technical-information')
    if ti is not None:

        # Get invention title
        try:
            title = USPTOSanitizer.strip_for_csv(
                ti.findtext('title-of-invention')[:500])
        except:
            title = None

        # Get international classification data
        ic = ti.find('classification-ipc')
        if ic is not None:
            # Init position
            position = 1
            # Process the primary international class
            icm = ic.find('classification-ipc-primary')
            if icm is not None:
                # Clear variable values
                i_class_sec = None
                i_class = None
                i_subclass = None
                i_class_mgr = None
                i_class_sgr = None
                i_malformed = None
                try:
                    i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = \
                        USPTOSanitizer.return_international_class_XML1_application(
                            icm.findtext('ipc'))
                    i_class_sec = i_class_sec.strip()[:15]
                    i_class = i_class.strip()[:15]
                    i_subclass = i_subclass.strip()[:15]
                    i_class_mgr = i_class_mgr.strip()[:15]
                    i_class_sgr = i_class_sgr.strip()[:15]
                except Exception as e:
                    traceback.print_exc()
                    i_class_sec = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = 1
                    logger.warning(
                        "Malformed international class found in application ID: "
                        + document_id + " in file: " + url_link)
                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class,
                    "SubClass": i_subclass,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "Malformed": i_malformed,
                    "FileName": args_array['file_name']
                })
                position += 1

            # Process any secondary international classes
            ics = ic.findall('classification-ipc-secondary')
            if ics is not None:
                for ics_item in ics:
                    # Clear variable values
                    i_class_sec = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    try:
                        i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = \
                            USPTOSanitizer.return_international_class_XML1_application(
                                ics_item.findtext('ipc'))
                        i_class_sec = i_class_sec.strip()[:15]
                        i_class = i_class.strip()[:15]
                        i_subclass = i_subclass.strip()[:15]
                        i_class_mgr = i_class_mgr.strip()[:15]
                        i_class_sgr = i_class_sgr.strip()[:15]
                    except Exception as e:
                        traceback.print_exc()
                        i_class_sec = None
                        i_class = None
                        i_subclass = None
                        i_class_mgr = None
                        i_class_sgr = None
                        i_malformed = 1
                        logger.warning(
                            "Malformed international class found in application ID: "
                            + document_id + " in file: " + url_link)
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_A",
                        "ApplicationID": app_no,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "Malformed": i_malformed,
                        "FileName": args_array['file_name']
                    })
                    position += 1

        # Get US classification data
        nc = ti.find('classification-us')
        nc_position = 1
        if nc is not None:
            uspc = nc.find('classification-us-primary').find('uspc')
            if uspc is not None:
                n_class_main = None
                n_subclass = None
                try:
                    n_class_main = uspc.findtext('class').strip()[:5]
                except:
                    n_class_main = None
                try:
                    n_subclass = uspc.findtext('subclass').strip()[:15]
                except:
                    n_subclass = None
                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": nc_position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "FileName": args_array['file_name']
                })
                nc_position += 1
            # Collect all Secondary US class
            ncs = nc.findall('classification-us-secondary')
            for ncs_item in ncs:
                n_class_main = None
                n_subclass = None
                uspc = ncs_item.find('uspc')
                if uspc is not None:
                    try:
                        n_class_main = uspc.findtext('class').strip()[:5]
                    except:
                        n_class_main = None
                    # NOTE(review): secondary subclass is truncated to 5
                    # chars while the primary keeps 15 — looks inconsistent;
                    # preserved as-is pending confirmation against schema.
                    try:
                        n_subclass = uspc.findtext('subclass').strip()[:5]
                    except:
                        n_subclass = None
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_A",
                        "ApplicationID": app_no,
                        "Position": nc_position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    nc_position += 1

    # Get priority claims
    pc_position = 1
    pc_kind = None
    for pc in r.findall('foreign-priority-data'):
        try:
            pc_country = pc.findtext('country-code').strip()[:100]
        except:
            pc_country = None
        try:
            pc_doc_num = pc.find('priority-application-number').findtext(
                'doc-number').strip()[:100]
        except:
            pc_doc_num = None
        try:
            pc_date = USPTOSanitizer.return_formatted_date(
                pc.findtext('filing-date'), args_array, document_id)
        except:
            pc_date = None
        # Append SQL data into dictionary to be written later
        processed_foreignpriority.append({
            "table_name": "uspto.FOREIGNPRIORITY_A",
            "ApplicationID": app_no,
            "Position": pc_position,
            "Kind": pc_kind,
            "Country": pc_country,
            "DocumentID": pc_doc_num,
            "PriorityDate": pc_date,
            "FileName": args_array['file_name']
        })
        pc_position += 1

    # Get inventor data
    invs = r.find('inventors')
    if invs is not None:
        # Init position
        inv_position = 1
        for inventor in invs.findall('first-named-inventor'):
            n = inventor.find('name')
            try:
                inventor_first_name = n.findtext('given-name').strip()[:100]
            except:
                inventor_first_name = None
            try:
                inventor_last_name = n.findtext('family-name').strip()[:100]
            except:
                inventor_last_name = None
            # Reset the residence fields per inventor.  Previously they
            # were only assigned when a <residence> element existed, so a
            # missing element raised NameError on the first inventor and
            # leaked the previous inventor's values on later ones.
            inventor_city = None
            inventor_state = None
            inventor_country = None
            # Get the residence tag
            res = inventor.find('residence')
            if res is not None:
                residence_us = res.find('residence-us')
                if residence_us is not None:
                    try:
                        inventor_city = residence_us.findtext('city').strip()[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_us.findtext('state').strip()[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_us.findtext(
                            'country-code').strip()[:100]
                    except:
                        inventor_country = None
                residence_non_us = res.find('residence-non-us')
                if residence_non_us is not None:
                    try:
                        inventor_city = residence_non_us.findtext('city').strip()[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_non_us.findtext('state').strip()[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_non_us.findtext(
                            'country-code').strip()[:100]
                    except:
                        inventor_country = None
            # Append SQL data into dictionary to be written later
            processed_inventor.append({
                "table_name": "uspto.INVENTOR_A",
                "ApplicationID": app_no,
                "Position": inv_position,
                "FirstName": inventor_first_name,
                "LastName": inventor_last_name,
                "City": inventor_city,
                "State": inventor_state,
                "Country": inventor_country,
                "FileName": args_array['file_name']
            })
            inv_position += 1

        # For all secondary inventors
        for inv in invs.findall('inventor'):
            if inv is not None:
                # Reset all per-inventor fields (same stale-value hazard
                # as the first-named-inventor loop above).
                inventor_first_name = None
                inventor_last_name = None
                inventor_city = None
                inventor_state = None
                inventor_country = None
                n = inv.find('name')
                if n is not None:
                    try:
                        inventor_first_name = n.findtext('given-name').strip()[:100]
                    except:
                        inventor_first_name = None
                    try:
                        inventor_last_name = n.findtext('family-name').strip()[:100]
                    except:
                        inventor_last_name = None
                res = inv.find('residence')
                if res is not None:
                    residence_us = res.find('residence-us')
                    if residence_us is not None:
                        try:
                            inventor_city = residence_us.findtext('city').strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_us.findtext('state').strip()[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_us.findtext(
                                'country-code').strip()[:100]
                        except:
                            inventor_country = None
                    residence_non_us = res.find('residence-non-us')
                    if residence_non_us is not None:
                        try:
                            inventor_city = residence_non_us.findtext('city').strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_non_us.findtext('state').strip()[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_non_us.findtext(
                                'country-code').strip()[:100]
                        except:
                            inventor_country = None
                # Append SQL data into dictionary to be written later
                processed_inventor.append({
                    "table_name": "uspto.INVENTOR_A",
                    "ApplicationID": app_no,
                    "Position": inv_position,
                    "FirstName": inventor_first_name,
                    "LastName": inventor_last_name,
                    "City": inventor_city,
                    "State": inventor_state,
                    "Country": inventor_country,
                    "FileName": args_array['file_name']
                })
                inv_position += 1

    # Get assignee data
    # Init position
    asn_position = 1
    for asn in r.findall('assignee'):
        try:
            asn_role = asn.findtext('assignee-type').strip()[:100]
        except:
            asn_role = None
        try:
            asn_orgname = asn.findtext('organization-name').strip()[:300]
        except:
            asn_orgname = None
        adr_elem = asn.find('address')
        try:
            asn_city = adr_elem.findtext('city').strip()[:100]
        except:
            asn_city = None
        try:
            asn_state = adr_elem.findtext('state').strip()[:100]
        except:
            asn_state = None
        try:
            asn_country = adr_elem.find('country').findtext(
                'country-code').strip()[:100]
        except:
            asn_country = None
        # Infer the country for US-state addresses with no country code
        if asn_country == None:
            if USPTOSanitizer.is_US_state(asn_state):
                asn_country = "US"
        # These have not been found in XML1,
        # but a full XML parse should be done
        asn_firstname = None
        asn_lastname = None
        # Append SQL data into dictionary to be written later
        processed_assignee.append({
            "table_name": "uspto.ASSIGNEE_A",
            "ApplicationID": app_no,
            "Position": asn_position,
            "OrgName": asn_orgname,
            "FirstName": asn_firstname,
            "LastName": asn_lastname,
            "Role": asn_role,
            "City": asn_city,
            "State": asn_state,
            "Country": asn_country,
            "FileName": args_array['file_name']
        })
        asn_position += 1

    # Find the agent element
    agn = r.find('correspondence-address')
    # Init position
    agn_position = 1
    if agn is not None:
        try:
            agent_orgname = agn.findtext('name-1').strip()
        except:
            agent_orgname = None
        try:
            agent_orgname_2 = agn.findtext('name-2').strip()
        except:
            agent_orgname_2 = None
        # Combine Orgname 1 and 2 and shorten if needed
        if agent_orgname != None and agent_orgname_2 != None:
            agent_orgname = USPTOSanitizer.strip_for_csv(
                agent_orgname + " " + agent_orgname_2)[:300]
        # Get the address element
        addr_elem = agn.find('address')
        if addr_elem is not None:
            try:
                try:
                    agent_addr_1 = addr_elem.findtext('address-1').strip()[:100]
                except:
                    agent_addr_1 = ""
                try:
                    agent_addr_2 = addr_elem.findtext('address-2').strip()[:100]
                except:
                    agent_addr_2 = ""
                agent_address = USPTOSanitizer.strip_for_csv(agent_addr_1 + agent_addr_2)
            except:
                agent_address = None
            try:
                agent_city = addr_elem.findtext('city').strip()[:50]
            except:
                agent_city = None
            try:
                agent_state = addr_elem.findtext('state').strip()[:3]
            except:
                agent_state = None
            try:
                agent_country = addr_elem.find('country').findtext(
                    'country-code').strip()[:3]
            except:
                # Fall back to "US" when the state is a US state
                if USPTOSanitizer.is_US_state(agent_state):
                    agent_country = "US"
                else:
                    agent_country = None
            # Append SQL data into dictionary to be written later
            processed_agent.append({
                "table_name": "uspto.AGENT_A",
                "ApplicationID": app_no,
                "Position": agn_position,
                "OrgName": agent_orgname,
                "Address": agent_address,
                "City": agent_city,
                "State": agent_state,
                "Country": agent_country,
                "FileName": args_array['file_name']
            })
            agn_position += 1

    # Find the abstract of the application
    try:
        abstract = USPTOSanitizer.strip_for_csv(
            USPTOSanitizer.return_element_text(
                document_root.find('subdoc-abstract')))
    except:
        abstract = None

    # Find the description
    try:
        description = ""
        d_elem = document_root.find('subdoc-description')
        if d_elem is not None:
            description += USPTOSanitizer.strip_for_csv(' '.join(d_elem.itertext()))
        else:
            description = None
    except Exception as e:
        description = None

    # Find the claims
    try:
        claims = ""
        c_elem = document_root.find('subdoc-claims')
        if c_elem is not None:
            claims += USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
        else:
            claims = None
    except Exception as e:
        claims = None

    # Find the number of claims
    try:
        number_of_claims = 0
        for clms in c_elem.findall('claim'):
            number_of_claims += 1
    except Exception as e:
        number_of_claims = None

    # Find the number of drawings and figures
    try:
        number_of_figures = 0
        number_of_drawings = 0
        drw_elem = document_root.find('subdoc-drawings')
        if drw_elem != None:
            for fg in drw_elem.findall('figure'):
                img_type = fg.find('image').attrib['ti'].strip()
                if img_type == "DR":
                    number_of_drawings += 1
                elif img_type == "FG":
                    number_of_figures += 1
        else:
            number_of_figures = None
            number_of_drawings = None
    except Exception as e:
        number_of_figures = None
        number_of_drawings = None

    # Append SQL data into dictionary to be written later
    processed_application.append({
        "table_name": "uspto.APPLICATION",
        "ApplicationID": app_no,
        "PublicationID": document_id,
        "AppType": app_type,
        "Title": title,
        "FileDate": app_date,
        "PublishDate": pub_date,
        "Kind": kind,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": number_of_claims,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "Description": description,
        "Claims": claims,
        "FileName": args_array['file_name']
    })

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_application": processed_application,
        "processed_foreignpriority": processed_foreignpriority,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass
    }
def extract_XML4_grant(raw_data, args_array):
    # Extract the bibliographic data from a USPTO XML version 4 patent
    # grant document into per-table arrays of dictionaries that are later
    # written to the database.
    #
    # Parameters:
    #   raw_data: str holding one complete XML grant document.
    #   args_array: dict of runtime settings; reads 'url_link',
    #     'uspto_xml_format' and 'file_name'.
    # Returns a dict mapping "processed_*" table groups to lists of row dicts.

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []

    # Stat process timer
    start_time = time.time()

    # Pass the raw_data data into Element Tree
    patent_root = ET.fromstring(raw_data)

    # Start the extraction of XML data
    for r in patent_root.findall('us-bibliographic-data-grant'):

        # Find the main patent grant data
        for pr in r.findall('publication-reference'):
            for di in pr.findall('document-id'):
                try:
                    pub_country = di.findtext('country')
                except:
                    pub_country = None
                try:
                    document_id = di.findtext('doc-number')
                    document_id = USPTOSanitizer.fix_patent_number(document_id)[:20]
                except:
                    document_id = None
                    logger.error("No Patent Number was found for: " + url_link)
                try:
                    kind = di.findtext('kind')[:2]
                except:
                    kind = None
                try:
                    pub_date = USPTOSanitizer.return_formatted_date(
                        di.findtext('date'), args_array, document_id)
                except:
                    pub_date = None

        # Find the main application data
        for ar in r.findall('application-reference'):
            try:
                app_type = ar.attrib['appl-type'][:45]
            except:
                app_type = None
            for di in ar.findall('document-id'):
                try:
                    app_country = di.findtext('country')
                except:
                    app_country = None
                try:
                    app_no = di.findtext('doc-number')[:20]
                except:
                    app_no = None
                try:
                    app_date = USPTOSanitizer.return_formatted_date(
                        di.findtext('date'), args_array, document_id)
                except:
                    app_date = None

        # Get the series code
        try:
            series_code = r.findtext('us-application-series-code')[:2]
        except:
            series_code = None

        # Get the length of grant
        try:
            terms_of_grant = r.find("us-term-of-grant").findtext("length-of-grant")
        except:
            terms_of_grant = None

        # Find all international classifications
        ic = r.find('classifications-ipcr')
        position = 1
        if ic is not None:
            for icc in ic.findall('classification-ipcr'):
                # Reset the class parts for every classification record so a
                # record missing a tag cannot inherit the previous record's
                # value (or raise NameError on the first record).
                i_class_sec = None
                i_class_cls = None
                i_class_sub = None
                i_class_mgr = None
                i_class_sgr = None
                # Element.getchildren() was removed in Python 3.9;
                # iterating the element directly is equivalent.
                for x in icc:
                    if (USPTOSanitizer.check_tag_exists(x, 'section')):
                        try:
                            i_class_sec = x.text[:15]
                        except:
                            i_class_sec = None
                    if (USPTOSanitizer.check_tag_exists(x, 'class')):
                        try:
                            i_class_cls = x.text[:15]
                        except:
                            i_class_cls = None
                    if (USPTOSanitizer.check_tag_exists(x, 'subclass')):
                        try:
                            i_class_sub = x.text[:15]
                        except:
                            i_class_sub = None
                    if (USPTOSanitizer.check_tag_exists(x, 'main-group')):
                        try:
                            i_class_mgr = x.text[:15]
                        except:
                            i_class_mgr = None
                    if (USPTOSanitizer.check_tag_exists(x, 'subgroup')):
                        try:
                            i_class_sgr = x.text[:15]
                        except:
                            i_class_sgr = None
                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_G",
                    "GrantID": document_id,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class_cls,
                    "SubClass": i_class_sub,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "FileName": args_array['file_name']
                })
                position += 1

        # Find all CPC classifications
        cpc = r.find('us-field-of-classification-search')
        if cpc is not None:
            position = 1
            for cpcc in cpc.findall('classification-cpc-text'):
                try:
                    cpc_text = cpcc.text
                    cpc_class_string, cpc_group_string = cpc_text.split(" ")
                    cpc_class_sec = cpc_text[0]
                    cpc_class = cpc_class_string[1:3]
                    cpc_subclass = cpc_class_string[3]
                    cpc_class_mgr, cpc_class_sgr = cpc_group_string.rsplit("/", 1)
                    cpc_class_mgr = cpc_class_mgr[:15]
                    cpc_class_sgr = cpc_class_sgr[:15]
                except:
                    cpc_class_sec = None
                    cpc_class = None
                    cpc_subclass = None
                    cpc_class_mgr = None
                    cpc_class_sgr = None
                    logger.warning(
                        "There was an error parsing the cpc class for Grant ID: "
                        + document_id + " in file: " + url_link)
                    logger.warning("Traceback: " + traceback.format_exc())
                # Append SQL data into dictionary to be written later
                processed_cpcclass.append({
                    "table_name": "uspto.CPCCLASS_G",
                    "GrantID": document_id,
                    "Position": position,
                    "Section": cpc_class_sec,
                    "Class": cpc_class,
                    "SubClass": cpc_subclass,
                    "MainGroup": cpc_class_mgr,
                    "SubGroup": cpc_class_sgr,
                    "FileName": args_array['file_name']
                })
                position += 1

        # Find all US classifications
        for nc in r.findall('classification-national'):
            position = 1
            try:
                n_class_info = nc.findtext('main-classification')
                n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                n_class_main = n_class_main[:5]
                n_subclass = n_subclass[:15]
            except:
                n_class_main = None
                n_subclass = None
            # Append SQL data into dictionary to be written later
            processed_usclass.append({
                "table_name": "uspto.USCLASS_G",
                "GrantID": document_id,
                "Position": position,
                "Class": n_class_main,
                "SubClass": n_subclass,
                "FileName": args_array['file_name']
            })
            position += 1
            # Returns a list of all further-classification elements
            n_class_fur_root = nc.findall('further-classification')
            for n in n_class_fur_root:
                try:
                    n_class_info = n.text
                except:
                    n_class_info = None
                try:
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]
                except:
                    n_class_main = None
                    n_subclass = None
                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_G",
                    "GrantID": document_id,
                    "Position": position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "FileName": args_array['file_name']
                })
                position += 1

        # Find the title of the patent
        try:
            title = r.findtext('invention-title')[:500]
        except:
            title = None

        # Find all references cited in the grant
        for rf in r.findall('us-references-cited'):
            for rfc in rf.findall('us-citation'):
                # If the patent citation child is found must be a patent citation
                if (rfc.find('patcit') != None):
                    position = 1
                    try:
                        citation_position = USPTOSanitizer.strip_leading_zeros(
                            rfc.find('patcit').attrib['num'])
                    except:
                        citation_position = position
                    for x in rfc.findall('patcit'):
                        try:
                            citation_country = x.find('document-id').findtext(
                                'country')[:100]
                        except:
                            citation_country = None
                        try:
                            citation_grant_id = x.find('document-id').findtext(
                                'doc-number')[:20]
                        except:
                            citation_grant_id = None
                        try:
                            citation_kind = x.find('document-id').findtext(
                                'kind')[:10]
                        except:
                            citation_kind = None
                        try:
                            citation_name = x.find('document-id').findtext(
                                'name')[:100]
                        except:
                            citation_name = None
                        try:
                            citation_date = USPTOSanitizer.return_formatted_date(
                                x.find('document-id').findtext('date'),
                                args_array, document_id)
                        except:
                            citation_date = None
                        try:
                            if rfc.findtext('category') == "cited by examiner":
                                citation_category = 1
                            else:
                                citation_category = 0
                        except:
                            citation_category = None
                        # US patent citations
                        if (citation_country.strip().upper() == 'US'):
                            # Append SQL data into dictionary to be written later
                            processed_gracit.append({
                                "table_name": "uspto.GRACIT_G",
                                "GrantID": document_id,
                                "Position": citation_position,
                                "CitedID": citation_grant_id,
                                "Kind": citation_kind,
                                "Name": citation_name,
                                "Date": citation_date,
                                "Country": citation_country,
                                "Category": citation_category,
                                "FileName": args_array['file_name']
                            })
                            position += 1
                        elif (citation_country.strip().upper() != 'US'):
                            # Append SQL data into dictionary to be written later
                            processed_forpatcit.append({
                                "table_name": "uspto.FORPATCIT_G",
                                "GrantID": document_id,
                                "Position": citation_position,
                                "CitedID": citation_grant_id,
                                "Kind": citation_kind,
                                "Name": citation_name,
                                "Date": citation_date,
                                "Country": citation_country,
                                "Category": citation_category,
                                "FileName": args_array['file_name']
                            })
                            position += 1
                # If the non patent citations are found
                elif (rfc.find('nplcit') != None):
                    position = 1
                    for x in rfc.findall('nplcit'):
                        try:
                            citation_position = USPTOSanitizer.strip_leading_zeros(
                                rfc.find('nplcit').attrib['num'])
                        except:
                            citation_position = position
                        # Sometimes, there will be '<i> or <sup>, etc.' in the
                        # reference string; we need to remove it
                        try:
                            non_patent_citation_text = x.findtext('othercit')
                        except:
                            non_patent_citation_text = None
                        # TODO: check that strip tags is working
                        try:
                            non_patent_citation_text = re.sub(
                                '<[^>]+>', '', non_patent_citation_text).replace('\n', "")
                        except:
                            non_patent_citation_text = None
                        # TODO: parse the category into boolean for now How many
                        # categories are there and what are they??
                        # TODO: change category to boolean in schema
                        try:
                            if x.findtext('category') == "cited by examiner":
                                citation_category = 1
                            else:
                                citation_category = 0
                        except:
                            citation_category = None
                        # Append SQL data into dictionary to be written later
                        processed_nonpatcit.append({
                            "table_name": "uspto.NONPATCIT_G",
                            "GrantID": document_id,
                            "Position": citation_position,
                            "Citation": non_patent_citation_text,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1

        # Find number of claims
        try:
            claims_num = r.findtext('number-of-claims')
        except:
            claims_num = None

        # Find the number of figures and number of drawings
        nof = r.find('figures')
        try:
            number_of_drawings = nof.findtext('number-of-drawing-sheets')
            number_of_drawings = number_of_drawings.split("/")[0]
        except:
            number_of_drawings = None
        try:
            number_of_figures = nof.findtext('number-of-figures')
        except:
            number_of_figures = None

        # Find the parties
        for prt in r.findall('us-parties'):
            # Find all applicant data
            for apts in prt.findall('us-applicants'):
                position = 1
                for apt in apts.findall('us-applicant'):
                    if (apt.find('addressbook') != None):
                        try:
                            applicant_orgname = apt.find(
                                'addressbook').findtext('orgname')[:300]
                        except:
                            applicant_orgname = None
                        try:
                            applicant_first_name = apt.find(
                                'addressbook').findtext('first-name')[:100]
                        except:
                            applicant_first_name = None
                        try:
                            applicant_last_name = apt.find(
                                'addressbook').findtext('last-name')[:100]
                        except:
                            applicant_last_name = None
                        try:
                            applicant_city = apt.find('addressbook').find(
                                'address').findtext('city')[:100]
                        except:
                            applicant_city = None
                        try:
                            applicant_state = apt.find('addressbook').find(
                                'address').findtext('state')[:100]
                        except:
                            applicant_state = None
                        try:
                            applicant_country = apt.find('addressbook').find(
                                'address').findtext('country')[:100]
                        except:
                            applicant_country = None
                        # Append SQL data into dictionary to be written later
                        processed_applicant.append({
                            "table_name": "uspto.APPLICANT_G",
                            "GrantID": document_id,
                            "OrgName": applicant_orgname,
                            "Position": position,
                            "FirstName": applicant_first_name,
                            "LastName": applicant_last_name,
                            "City": applicant_city,
                            "State": applicant_state,
                            "Country": applicant_country,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # Find all inventor data
            for apts in prt.findall('inventors'):
                position = 1
                for apt in apts.findall('inventor'):
                    try:
                        inventor_sequence = USPTOSanitizer.strip_leading_zeros(
                            apt.attrib['sequence'])
                    except:
                        inventor_sequence = position
                    if (apt.find('addressbook') != None):
                        try:
                            inventor_first_name = apt.find(
                                'addressbook').findtext('first-name')[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = apt.find(
                                'addressbook').findtext('last-name')[:100]
                        except:
                            inventor_last_name = None
                        try:
                            inventor_city = apt.find('addressbook').find(
                                'address').findtext('city')[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = apt.find('addressbook').find(
                                'address').findtext('state')[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = apt.find('addressbook').find(
                                'address').findtext('country')[:100]
                        except:
                            inventor_country = None
                        try:
                            inventor_residence = apt.find('addressbook').find(
                                'address').findtext('country')[:300]
                        except:
                            inventor_residence = None
                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name": "uspto.INVENTOR_G",
                            "GrantID": document_id,
                            "Position": inventor_sequence,
                            "FirstName": inventor_first_name,
                            "LastName": inventor_last_name,
                            "City": inventor_city,
                            "State": inventor_state,
                            "Country": inventor_country,
                            "Residence": inventor_residence,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # Find all agent data
            for agns in prt.findall('agents'):
                position = 1
                for agn in agns.findall('agent'):
                    try:
                        agent_sequence = USPTOSanitizer.strip_leading_zeros(
                            agn.attrib['sequence'])
                    except:
                        agent_sequence = position
                    if (agn.find('addressbook') != None):
                        try:
                            agent_orgname = agn.find('addressbook').findtext(
                                'orgname')[:300]
                        except:
                            agent_orgname = None
                        try:
                            agent_last_name = agn.find('addressbook').findtext(
                                'last-name')[:100]
                        except:
                            agent_last_name = None
                        try:
                            agent_first_name = agn.find(
                                'addressbook').findtext('first-name')[:100]
                        except:
                            agent_first_name = None
                        try:
                            agent_country = agn.find('addressbook').find(
                                'address').findtext('country')[:100]
                        except:
                            agent_country = None
                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name": "uspto.AGENT_G",
                            "GrantID": document_id,
                            "Position": agent_sequence,
                            "OrgName": agent_orgname,
                            "LastName": agent_last_name,
                            "FirstName": agent_first_name,
                            "Country": agent_country,
                            "FileName": args_array['file_name']
                        })
                        position += 1

        # Find all assignee data
        for asn in r.findall('assignees'):
            position = 1
            for x in asn.findall('assignee'):
                if (x.find('addressbook') != None):
                    try:
                        asn_orgname = x.find('addressbook').findtext(
                            'orgname')[:500]
                    except:
                        asn_orgname = None
                    try:
                        asn_role = x.find('addressbook').findtext('role')[:45]
                    except:
                        asn_role = None
                    try:
                        asn_city = x.find('addressbook').find(
                            'address').findtext('city')[:100]
                    except:
                        asn_city = None
                    try:
                        asn_state = x.find('addressbook').find(
                            'address').findtext('state')[:100]
                    except:
                        asn_state = None
                    try:
                        asn_country = x.find('addressbook').find(
                            'address').findtext('country')[:100]
                    except:
                        asn_country = None
                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name": "uspto.ASSIGNEE_G",
                        "GrantID": document_id,
                        "Position": position,
                        "OrgName": asn_orgname,
                        "Role": asn_role,
                        "City": asn_city,
                        "State": asn_state,
                        "Country": asn_country,
                        "FileName": args_array['file_name']
                    })
                    position += 1

        # Find all examiner data
        for exm in r.findall('examiners'):
            position = 1
            for x in exm.findall('primary-examiner'):
                try:
                    exm_last_name = x.findtext('last-name')[:50]
                except:
                    exm_last_name = None
                try:
                    exm_first_name = x.findtext('first-name')[:50]
                except:
                    exm_first_name = None
                try:
                    exm_department = x.findtext('department')[:100]
                except:
                    exm_department = None
                # Append SQL data into dictionary to be written later
                processed_examiner.append({
                    "table_name": "uspto.EXAMINER_G",
                    "GrantID": document_id,
                    "Position": position,
                    "LastName": exm_last_name,
                    "FirstName": exm_first_name,
                    "Department": exm_department,
                    "FileName": args_array['file_name']
                })
                position += 1
            for x in exm.findall('assistant-examiner'):
                try:
                    exm_last_name = x.findtext('last-name')[:50]
                except:
                    exm_last_name = None
                try:
                    exm_first_name = x.findtext('first-name')[:50]
                except:
                    exm_first_name = None
                try:
                    exm_department = x.findtext('department')[:100]
                except:
                    exm_department = None
                # Append SQL data into dictionary to be written later
                processed_examiner.append({
                    "table_name": "uspto.EXAMINER_G",
                    "GrantID": document_id,
                    "Position": position,
                    "LastName": exm_last_name,
                    "FirstName": exm_first_name,
                    "Department": exm_department,
                    "FileName": args_array['file_name']
                })
                position += 1

    # TODO: see if it's claims or description and store accordingly
    try:
        claims = patent_root.findtext('description')
    except:
        claims = None

    # Find the abstract
    try:
        abstract = USPTOSanitizer.return_element_text(patent_root.find('abstract'))
    except:
        traceback.print_exc()
        abstract = None

    # Append SQL data into dictionary to be written later
    try:
        processed_grant.append({
            "table_name": "uspto.GRANT",
            "GrantID": document_id,
            "Title": title,
            "IssueDate": pub_date,
            "Kind": kind,
            "USSeriesCode": series_code,
            "Abstract": abstract,
            "ClaimsNum": claims_num,
            "DrawingsNum": number_of_drawings,
            "FiguresNum": number_of_figures,
            "ApplicationID": app_no,
            "Claims": claims,
            "FileDate": app_date,
            "AppType": app_type,
            "GrantLength": terms_of_grant,
            "FileName": args_array['file_name']
        })
    except Exception as e:
        # Was a Python-2 `print` statement, which is a SyntaxError under
        # the Python 3 this file otherwise targets.
        print("could not append to array")
        traceback.print_exc()
        logger.warning(
            "Could not append patent data to array for patent number: "
            + document_id + " Traceback: " + traceback.format_exc())

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit
    }
def extract_XML4_application(raw_data, args_array):
    """Extract bibliographic data from one XML4-format USPTO patent
    application document and return it as per-table arrays of dictionaries.

    Args:
        raw_data: string holding a single well-formed
            ``<us-patent-application>`` XML document.
        args_array: runtime argument dictionary; this function reads
            'url_link', 'uspto_xml_format' and 'file_name'.

    Returns:
        dict mapping "processed_*" keys to lists of column dictionaries;
        each dictionary carries a "table_name" key naming its destination
        SQL table (e.g. "uspto.APPLICATION", "uspto.INVENTOR_A").

    Note: nearly every field read is wrapped in a bare try/except so that
    a missing or malformed XML element degrades to a NULL column instead
    of aborting the whole document.
    """
    # Set process start time
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")
    # Pass the url_link and format into local variables
    # (uspto_xml_format is bound here but not used below)
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']
    # Define required arrays
    processed_application = []
    processed_foreignpriority = []
    processed_assignee = []
    processed_applicant = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []
    # Pass the raw data into Element tree xml object
    document_root = ET.fromstring(raw_data)
    # Start extract XML data
    for r in document_root.findall('us-bibliographic-data-application'):
        # Get basic document ID information
        pr = r.find('publication-reference')
        pub_doc = pr.find('document-id')
        try: pub_country = pub_doc.findtext('country').strip()
        except: pub_country = None
        try:
            document_id = pub_doc.findtext('doc-number').strip()
            document_id = USPTOSanitizer.fix_patent_number(document_id)
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try: kind = pub_doc.findtext('kind').strip()[:2]
        except: kind = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(
                pub_doc.findtext('date'), args_array, document_id)
        except: pub_date = None
        # Get application reference data
        # NOTE(review): if <application-reference> is absent, app_type /
        # app_no / app_date are never bound and later reads raise
        # NameError (caught by the surrounding try/excepts where present).
        ar = r.find('application-reference')
        if ar is not None:
            try: app_type = ar.attrib['appl-type'].strip()[:45]
            except: app_type = None
            app_doc = ar.find('document-id')
            try: app_country = app_doc.findtext('country').strip()
            except: app_country = None
            try: app_no = app_doc.findtext('doc-number').strip()[:20]
            except: app_no = None
            try:
                app_date = USPTOSanitizer.return_formatted_date(
                    app_doc.findtext('date'), args_array, document_id)
            except: app_date = None
        # Get series code
        try:
            series_code = r.findtext(
                'us-application-series-code').strip()[:2]
        except: series_code = None
        # Get Priority Claims (foreign priority filings)
        pcs = r.find('priority-claims')
        if pcs is not None:
            for pc in pcs.findall('priority-claim'):
                try:
                    pc_sequence = USPTOSanitizer.strip_leading_zeros(
                        pc.attrib['sequence'])
                except: pc_sequence = None
                try: pc_kind = pc.attrib['kind'].strip()[:100]
                except: pc_kind = None
                try: pc_country = pc.findtext('country').strip()[:100]
                except: pc_country = None
                try: pc_doc_num = pc.findtext('doc-number').strip()[:100]
                except: pc_doc_num = None
                try:
                    pc_date = USPTOSanitizer.return_formatted_date(
                        pc.findtext('date'), args_array, document_id)
                except: pc_date = None
                # Append SQL data into dictionary to be written later
                processed_foreignpriority.append({
                    "table_name": "uspto.FOREIGNPRIORITY_A",
                    "ApplicationID": app_no,
                    "Position": pc_sequence,
                    "Kind": pc_kind,
                    "Country": pc_country,
                    "DocumentID": pc_doc_num,
                    "PriorityDate": pc_date,
                    "FileName": args_array['file_name']
                })
        # Find all international (IPCR) classifications
        ic = r.find('classifications-ipcr')
        position = 1
        if ic is not None:
            for icc in ic.findall('classification-ipcr'):
                # NOTE(review): Element.getchildren() was removed in
                # Python 3.9 — list(icc) is the modern equivalent.
                for x in icc.getchildren():
                    if (USPTOSanitizer.check_tag_exists(x, 'section')):
                        try: i_class_sec = x.text.strip()[:15]
                        except: i_class_sec = None
                    if (USPTOSanitizer.check_tag_exists(x, 'class')):
                        try: i_class_cls = x.text.strip()[:15]
                        except: i_class_cls = None
                    if (USPTOSanitizer.check_tag_exists(x, 'subclass')):
                        try: i_class_sub = x.text.strip()[:15]
                        except: i_class_sub = None
                    if (USPTOSanitizer.check_tag_exists(x, 'main-group')):
                        try: i_class_mgr = x.text.strip()[:15]
                        except: i_class_mgr = None
                    if (USPTOSanitizer.check_tag_exists(x, 'subgroup')):
                        try: i_class_sgr = x.text.strip()[:15]
                        except: i_class_sgr = None
                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class_cls,
                    "SubClass": i_class_sub,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "FileName": args_array['file_name']
                })
                position += 1
        # Get US Classification data
        nc = r.find('classification-national')
        position = 1
        if nc is not None:
            ncm = nc.find('main-classification')
            if ncm is not None:
                n_class_main = None
                n_subclass = None
                # Malformed flag stays 1 whether parsing succeeds or fails
                n_malformed = 1
                try: n_class_info = nc.findtext('main-classification')
                except: n_class_info = None
                try:
                    n_class_main, n_subclass = USPTOSanitizer.return_US_class_XML4_application(
                        n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]
                except:
                    n_class_main = None
                    n_subclass = None
                    n_malformed = 1
                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "Malformed": n_malformed,
                    "FileName": args_array['file_name']
                })
                position += 1
            # TODO: find an instance of futher classification to parse
            ncs = nc.findall('further-classification')
            for ncs_item in ncs:
                n_class_main = None
                n_subclass = None
                n_malformed = 1
                try:
                    n_class_info = ncs_item.text
                    n_class_main, n_subclass = USPTOSanitizer.return_US_class_XML4_application(
                        n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]
                except:
                    n_class_main = None
                    n_subclass = None
                    n_malformed = 1
                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "Malformed": n_malformed,
                    "FileName": args_array['file_name']
                })
                position += 1
        # Get CPC Classification data (main-cpc first, then further-cpc;
        # position numbering runs across both groups)
        cpc_class_element = r.find('classifications-cpc')
        # Init position
        position = 1
        if cpc_class_element is not None:
            main_cpc_class_element = cpc_class_element.find('main-cpc')
            if main_cpc_class_element is not None:
                for cpc_class_item in main_cpc_class_element.findall(
                        'classification-cpc'):
                    try:
                        cpc_section = cpc_class_item.findtext(
                            'section').strip()[:15]
                    except: cpc_section = None
                    try:
                        cpc_class = cpc_class_item.findtext(
                            'class').strip()[:15]
                    except: cpc_class = None
                    try:
                        cpc_subclass = cpc_class_item.findtext(
                            'subclass').strip()[:15]
                    except: cpc_subclass = None
                    try:
                        cpc_mgr = cpc_class_item.findtext(
                            'main-group').strip()[:15]
                    except: cpc_mgr = None
                    try:
                        cpc_sgr = cpc_class_item.findtext(
                            'subgroup').strip()[:15]
                    except: cpc_sgr = None
                    # Append SQL data into dictionary to be written later
                    processed_cpcclass.append({
                        "table_name": "uspto.CPCCLASS_A",
                        "ApplicationID": app_no,
                        "Position": position,
                        "Section": cpc_section,
                        "Class": cpc_class,
                        "SubClass": cpc_subclass,
                        "MainGroup": cpc_mgr,
                        "SubGroup": cpc_sgr,
                        "FileName": args_array['file_name']
                    })
                    position += 1
            further_cpc_class = cpc_class_element.find('further-cpc')
            if further_cpc_class is not None:
                for cpc_class_item in further_cpc_class.findall(
                        'classification-cpc'):
                    try:
                        cpc_section = cpc_class_item.findtext(
                            'section').strip()[:15]
                    except: cpc_section = None
                    try:
                        cpc_class = cpc_class_item.findtext(
                            'class').strip()[:15]
                    except: cpc_class = None
                    try:
                        cpc_subclass = cpc_class_item.findtext(
                            'subclass').strip()[:15]
                    except: cpc_subclass = None
                    try:
                        cpc_mgr = cpc_class_item.findtext(
                            'main-group').strip()[:15]
                    except: cpc_mgr = None
                    try:
                        cpc_sgr = cpc_class_item.findtext(
                            'subgroup').strip()[:15]
                    except: cpc_sgr = None
                    # Append SQL data into dictionary to be written later
                    processed_cpcclass.append({
                        "table_name": "uspto.CPCCLASS_A",
                        "ApplicationID": app_no,
                        "Position": position,
                        "Section": cpc_section,
                        "Class": cpc_class,
                        "SubClass": cpc_subclass,
                        "MainGroup": cpc_mgr,
                        "SubGroup": cpc_sgr,
                        "FileName": args_array['file_name']
                    })
                    position += 1
        # Get the title of the application
        try:
            title = USPTOSanitizer.strip_for_csv(
                r.findtext('invention-title')[:500])
        except:
            title = None
            # NOTE(review): app_no may be None here, which would make this
            # string concatenation raise TypeError inside the except block.
            logger.error("Title not Found for :" + url_link +
                         " Application ID: " + app_no)
        # Get number of figure, drawings (may be recomputed from the
        # <drawings> element further below)
        nof = r.find('figures')
        if nof is not None:
            try:
                number_of_drawings = nof.findtext(
                    'number-of-drawing-sheets').strip()
            except: number_of_drawings = None
            try:
                number_of_figures = nof.findtext('number-of-figures').strip()
            except: number_of_figures = None
        else:
            number_of_drawings = None
            number_of_figures = None
        # Check if XML format uses 'us-parties' or 'parties'
        if r.find('us-parties') != None:
            parties_id_string = "us-parties"
        elif r.find('parties') != None:
            parties_id_string = "parties"
        else:
            parties_id_string = "parties"
        prt = r.find(parties_id_string)
        if prt is not None:
            # Increment position
            appl_position = 1
            invt_position = 1
            atn_position = 1
            # Check if the XML format uses 'applicants' or 'us-applicants'
            if prt.find('us-applicants') != None:
                applicants_id_string = 'us-applicants'
            elif prt.find('applicants') != None:
                applicants_id_string = 'applicants'
            else:
                applicants_id_string = 'applicants'
            # Get Applicant data
            # NOTE(review): appl_elem is not None-checked; a parties block
            # with no applicants element would raise AttributeError below.
            appl_elem = prt.find(applicants_id_string)
            # Check if the XML format uses 'applicant' or 'us-applicant'
            if appl_elem.find('us-applicant') != None:
                applicant_id_string = 'us-applicant'
            elif appl_elem.find('applicant') != None:
                applicant_id_string = 'applicant'
            else:
                applicant_id_string = 'applicant'
            for appl in appl_elem.findall(applicant_id_string):
                if (appl.find('addressbook') != None):
                    try:
                        appl_orgname = USPTOSanitizer.strip_for_csv(
                            appl.find('addressbook').findtext('orgname'))[:300]
                    except: appl_orgname = None
                    # appl_role is extracted but not stored in the output row
                    try:
                        appl_role = appl.find('addressbook').findtext('role')
                    except: appl_role = None
                    try:
                        appl_city = appl.find('addressbook').find(
                            'address').findtext('city').strip()[:100]
                    except: appl_city = None
                    try:
                        appl_state = appl.find('addressbook').find(
                            'address').findtext('state').strip()[:100]
                    except: appl_state = None
                    try:
                        appl_country = appl.find('addressbook').find(
                            'address').findtext('country').strip()[:100]
                    except: appl_country = None
                    try:
                        appl_firstname = USPTOSanitizer.strip_for_csv(
                            appl.find('addressbook').findtext(
                                'first-name'))[:100]
                    except: appl_firstname = None
                    try:
                        appl_lastname = USPTOSanitizer.strip_for_csv(
                            appl.find('addressbook').findtext(
                                'last-name'))[:100]
                    except: appl_lastname = None
                    # Append SQL data into dictionary to be written later
                    processed_applicant.append({
                        "table_name": "uspto.APPLICANT_A",
                        "ApplicationID": app_no,
                        "Position": appl_position,
                        "OrgName": appl_orgname,
                        "FirstName": appl_firstname,
                        "LastName": appl_lastname,
                        "City": appl_city,
                        "State": appl_state,
                        "Country": appl_country,
                        "FileName": args_array['file_name']
                    })
                    appl_position += 1
            # Get the inventor data element
            invs = prt.find('inventors')
            # Init position (invt_position, not position, numbers the rows)
            position = 1
            if invs is not None:
                # Get all inventors
                for inv in invs.findall("inventor"):
                    if (inv.find('addressbook') != None):
                        try:
                            inv_first_name = inv.find('addressbook').findtext(
                                'first-name').strip()[:100]
                        except: inv_first_name = None
                        try:
                            inv_last_name = inv.find('addressbook').findtext(
                                'last-name').strip()[:100]
                        except: inv_last_name = None
                        try:
                            inv_city = inv.find('addressbook').find(
                                'address').findtext('city').strip()[:100]
                        except: inv_city = None
                        try:
                            inv_state = inv.find('addressbook').find(
                                'address').findtext('state').strip()[:100]
                        except: inv_state = None
                        try:
                            inv_country = inv.find('addressbook').find(
                                'address').findtext('country').strip()[:100]
                        except: inv_country = None
                        try:
                            inv_nationality = inv.find('nationality').findtext(
                                'country').strip()[:100]
                        except: inv_nationality = None
                        try:
                            inv_residence = inv.find('residence').findtext(
                                'country').strip()[:300]
                        except: inv_residence = None
                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name": "uspto.INVENTOR_A",
                            "ApplicationID": app_no,
                            "Position": invt_position,
                            "FirstName": inv_first_name,
                            "LastName": inv_last_name,
                            "City": inv_city,
                            "State": inv_state,
                            "Country": inv_country,
                            "Nationality": inv_nationality,
                            "Residence": inv_residence,
                            "FileName": args_array['file_name']
                        })
                        invt_position += 1
            # Init position
            position = 1
            # Get agent data
            #TODO Find if available in application ??? Where
            agents_element = prt.find('agents')
            if agents_element is not None:
                for agent_item in agents_element.findall('agent'):
                    # asn_sequence is extracted but not stored in the row
                    try:
                        asn_sequence = agent_item.attrib['sequence']
                    except: asn_sequence = None
                    if (agent_item.find('addressbook') != None):
                        try:
                            atn_orgname = agent_item.find(
                                'addressbook').findtext(
                                    'orgname').strip()[:300]
                        except: atn_orgname = None
                        try:
                            atn_last_name = agent_item.find(
                                'addressbook').findtext(
                                    'last-name').strip()[:100]
                        except: atn_last_name = None
                        try:
                            atn_first_name = agent_item.find(
                                'addressbook').findtext(
                                    'first-name').strip()[:100]
                        except: atn_first_name = None
                        try:
                            atn_country = agent_item.find('addressbook').find(
                                'address').findtext('country').strip()[:100]
                        except: atn_country = None
                        atn_address = None
                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name": "uspto.AGENT_A",
                            "ApplicationID": app_no,
                            "Position": atn_position,
                            "OrgName": atn_orgname,
                            "LastName": atn_last_name,
                            "FirstName": atn_first_name,
                            "Country": atn_country,
                            "FileName": args_array['file_name']
                        })
                        atn_position += 1
        # Get assignee data
        asn_elem = r.find('assignees')
        # Init position
        position = 1
        if asn_elem is not None:
            for asn_item in asn_elem.findall('assignee'):
                if (asn_item.find('addressbook') != None):
                    try:
                        asn_orgname = asn_item.find('addressbook').findtext(
                            'orgname').strip()[:300]
                    except: asn_orgname = None
                    try:
                        asn_firstname = asn_item.find('addressbook').findtext(
                            'first-name').strip()[:100]
                    except: asn_firstname = None
                    try:
                        asn_lastname = asn_item.find('addressbook').findtext(
                            'last-name').strip()[:100]
                    except: asn_lastname = None
                    try:
                        asn_role = asn_item.find('addressbook').findtext(
                            'role').strip()[:5]
                    except: asn_role = None
                    try:
                        asn_city = asn_item.find('addressbook').find(
                            'address').findtext('city').strip()[:50]
                    except: asn_city = None
                    try:
                        asn_state = asn_item.find('addressbook').find(
                            'address').findtext('state').strip()[:10]
                    except: asn_state = None
                    try:
                        asn_country = asn_item.find('addressbook').find(
                            'address').findtext('country').strip()[:3]
                    except: asn_country = None
                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name": "uspto.ASSIGNEE_A",
                        "ApplicationID": app_no,
                        "Position": position,
                        "OrgName": asn_orgname,
                        "FirstName": asn_firstname,
                        "LastName": asn_lastname,
                        "Role": asn_role,
                        "City": asn_city,
                        "State": asn_state,
                        "Country": asn_country,
                        "FileName": args_array['file_name']
                    })
                    position += 1
        # Find the abstract
        try:
            abstract_element = document_root.find('abstract')
            if abstract_element is not None:
                abstract = USPTOSanitizer.strip_for_csv(
                    USPTOSanitizer.return_element_text(abstract_element))
            else:
                abstract = None
        except:
            abstract = None
        # Find the description (all descendant text joined with spaces)
        try:
            description = ""
            d_elem = document_root.find('description')
            if d_elem is not None:
                description += USPTOSanitizer.strip_for_csv(' '.join(
                    d_elem.itertext()))
            else:
                description = None
        except Exception as e:
            description = None
        # Find the claims (all descendant text joined with spaces)
        try:
            claims = ""
            c_elem = document_root.find('claims')
            if c_elem is not None:
                claims += USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
            else:
                claims = None
        except Exception as e:
            claims = None
        # Find the number of claims; if c_elem is None/unbound the except
        # path yields None
        try:
            number_of_claims = 0
            for clms in c_elem.findall('claim'):
                number_of_claims += 1
        except Exception as e:
            number_of_claims = None
        # Find the number of drawings and figures by counting <figure>
        # elements; this overrides the earlier <figures>-based values
        try:
            number_of_figures = 0
            number_of_drawings = 0
            drw_elem = document_root.find('drawings')
            if drw_elem != None:
                for fg in drw_elem.findall('figure'):
                    img_type = fg.find('img').attrib['img-content'].strip()
                    if img_type == "drawing":
                        number_of_drawings += 1
                    elif img_type == "figure":
                        number_of_figures += 1
            else:
                number_of_figures = None
                number_of_drawings = None
        except Exception as e:
            number_of_figures = None
            number_of_drawings = None
        # Append SQL data into dictionary to be written later
        processed_application.append({
            "table_name": "uspto.APPLICATION",
            "ApplicationID": app_no,
            "PublicationID": document_id,
            "AppType": app_type,
            "Title": title,
            "FileDate": app_date,
            "PublishDate": pub_date,
            "Kind": kind,
            "USSeriesCode": series_code,
            "Abstract": abstract,
            "ClaimsNum": number_of_claims,
            "DrawingsNum": number_of_drawings,
            "FiguresNum": number_of_figures,
            "Description": description,
            "Claims": claims,
            "FileName": args_array['file_name']
        })
    # Return a dictionary of the processed_ data arrays
    return {
        "processed_application": processed_application,
        "processed_foreignpriority": processed_foreignpriority,
        "processed_applicant": processed_applicant,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass,
    }
def insert_2005_grant_classifications(self, args_array, json_obj):
    """Insert patched 2005 CPC/IPC classification codes into the database.

    Args:
        args_array: runtime arguments; must contain a live
            'database_connection' object.
        json_obj: iterable of items, each with a 'publication_number'
            (formatted like "US-1234567-B2") plus 'cpc' and 'ipc' lists
            of {'code': ...} dictionaries.

    Exits the process with status 0 on success or 1 on any failure.
    """
    # Start timer
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")
    try:
        # Insert the items in the json object into the appropiate
        # patent grant in the database
        for item in json_obj:
            # Extract the patent grant ID from the document number
            # (middle segment of "US-<number>-<kind>")
            publication_number = item['publication_number'].split("-")[1]
            # Create an array to hold all CPC codes for the patent
            cpc_array = []
            ipc_array = []
            # Loop through all CPC codes for the item and
            # convert to dict
            for code in item['cpc']:
                cpc_dict = USPTOSanitizer.extract_BQ_CPC_string_to_dict(
                    code['code'])
                # Append the CPC dict to array
                cpc_array.append(cpc_dict)
            # Loop through all IPC codes for the item and
            # convert to dict
            # BUG FIX: was `itemp['ipc']` (undefined name, raised
            # NameError on every item with IPC codes) — now `item['ipc']`
            for code in item['ipc']:
                ipc_dict = USPTOSanitizer.extract_BQ_CPC_string_to_dict(
                    code['code'])
                # Append the IPC dict to array
                ipc_array.append(ipc_dict)
            # Pass the publication_number and cpc_array to SQL to be inserted
            args_array['database_connection'].insert_CPC_patched_item(
                publication_number, cpc_array)
            # Pass the publication_number and ipc_array to SQL to be inserted
            args_array['database_connection'].insert_IPC_patched_item(
                publication_number, ipc_array)
        print(
            "-- All 2005 classification codes inserted into database successfully"
        )
        logger.info(
            "-- All 2005 classification codes inserted into database successfully"
        )
        exit(0)
    except Exception as e:
        # If the insertion process failed then exit with status 1
        traceback.print_exc()
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " +
                     str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                     " Traceback: " + traceback.format_exc())
        print(
            "-- Failed to insert all 2005 classification codes inserted into database"
        )
        logger.info(
            "-- Failed to insert all 2005 classification codes inserted into database"
        )
        exit(1)
def extract_csv_line(args_array, line):
    """Convert one PAIR CSV record into a column dictionary for storage.

    Args:
        args_array: runtime arguments; reads 'extraction_type' and
            'file_name'.
        line: the CSV record as an indexable sequence of raw field strings.

    Returns:
        dict of column name -> cleaned value, including the destination
        'table_name', source 'FileName' and the 'extraction_type' itself.
    """
    extraction_type = args_array['extraction_type']
    # Base record shared by every extraction type
    processed_array = {
        "table_name": set_table_name_from_type(extraction_type),
        "FileName": args_array['file_name'],
        "extraction_type": extraction_type
    }
    clean = USPTOSanitizer.clean_PAIR_csv_item
    # Column layout per source file type: ordered (field name, CSV index)
    # pairs, preserving the original insertion order of the output dict.
    layouts = {
        "cases": (
            ("CaseID", 1), ("PacerID", 2), ("CourtTitle", 3),
            ("DistrictID", 4), ("CaseTitle", 5), ("AssignedTo", 6),
            ("ReferredTo", 7), ("Cause", 8), ("JurisdictionBasis", 9),
            ("FileDate", 10), ("CloseDate", 11), ("LastFileDate", 12),
            ("JuryDemand", 13), ("Demand", 14), ("LeadCase", 15),
            ("RelatedCase", 16), ("Settlement", 17), ("CaseIDRaw", 18),
            ("CaseType1", 19), ("CaseType2", 20), ("CaseType3", 21),
            ("CaseTypeNote", 22),
        ),
        "pacercases": (
            ("ApplicationID", 0), ("ParentApplicationID", 1),
            ("FileDate", 2), ("ContinuationType", 3),
        ),
        "names": (
            ("CaseID", 1), ("PartyType", 3), ("Name", 5),
        ),
        "attorneys": (
            ("CaseID", 1), ("CaseIDRaw", 2), ("PartyType", 4),
            ("Name", 6), ("ContactInfo", 7), ("Position", 8),
        ),
        "patents": (
            ("CaseID", 2), ("PacerID", 1), ("NOS", 4),
            ("PatentID", 11), ("PatentDocType", 12),
        ),
    }
    for field, idx in layouts.get(extraction_type, ()):
        value = clean(line[idx])
        # PatentID additionally gets its leading zeros stripped
        if field == "PatentID":
            value = USPTOSanitizer.strip_leading_zeros(value)
        processed_array[field] = value
    # Return the array for storage
    return processed_array
def process_XML_application_content(args_array):
    """Process one zipped USPTO application XML bulk file: split it into
    per-document XML fragments, extract their data, and store to CSV
    and/or the database.

    Args:
        args_array: runtime arguments; reads 'command_args',
            'database_insert_mode', 'document_type', 'file_name',
            'csv_directory', 'uspto_xml_format', 'url_link' and (in bulk
            mode) 'database_connection'. May add 'csv_file_array'.
    """
    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")
    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or (
            "database" in args_array['command_args']
            and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(
            args_array['document_type'], args_array['file_name'],
            args_array['csv_directory'])
    # Process zip file by getting .dat or .txt file and .xml filenames
    start_time = time.time()
    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_zip_to_array(args_array)
    # create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False
    # read through the file and append into groups of string.
    # Send the finished strings to be parsed
    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "aXML4":
        # Loop through all lines in the xml file
        # BUG FIX: was `xml_file.readlines()`, but `xml_file` was never
        # defined — the extracted lines live in `xml_file_contents`
        for line in xml_file_contents:
            # This identifies the start of well formed XML segment for patent
            # application bibliographic information
            if "<us-patent-application" in line:
                patent_xml_started = True
                xml_string += line
            # This identifies end of well-formed XML segement for single patent
            # application bibliographic information
            elif "</us-patent-application" in line:
                patent_xml_started = False
                xml_string += line
                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(
                    xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(
                    processed_data_array, args_array)
                # reset the xml string
                xml_string = ''
            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started == True:
                xml_string += USPTOSanitizer.replace_new_html_characters(line)
    elif args_array['uspto_xml_format'] == "aXML1":
        # Loop through all lines in the xml file
        # (BUG FIX: same undefined `xml_file` as above; the unused
        # `line_count` local was also removed)
        for line in xml_file_contents:
            # This identifies the start of well formed XML segment for patent
            # application bibliographic information
            if "<patent-application-publication" in line:
                patent_xml_started = True
                xml_string += line
            # This identifies end of well-formed XML segement for single patent
            # application bibliographic information
            elif "</patent-application-publication" in line:
                patent_xml_started = False
                xml_string += line
                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(
                    xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(
                    processed_data_array, args_array)
                # reset the xml string
                xml_string = ''
            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started == True:
                xml_string += USPTOSanitizer.replace_old_html_characters(line)
    # (No file handle to close: the XML content is an in-memory array;
    # the original `xml_file.close()` was removed along with the fix.)
    # Close the all the .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)
    # Set a flag file_processed to ensure that the bulk insert succeeds
    file_processed = True
    # If data is to be inserted as bulk csv files, then call the sql function
    if args_array['database_insert_mode'] == 'bulk':
        file_processed = args_array['database_connection'].load_csv_bulk_data(
            args_array, logger)
    # If the file was successfully processed into the database
    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Close all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)
        # Print message to stdout and log
        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        logger.info(
            '[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'
            .format(args_array['document_type'], args_array['url_link'],
                    time.time() - start_time, time.strftime("%c")))
    else:
        # Print message to stdout and log
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        logger.info(
            '[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'
            .format(args_array['document_type'], args_array['url_link'],
                    time.time() - start_time, time.strftime("%c")))
def extract_XML2_grant(raw_data, args_array):
    """Parse a single XML2 (SGML v1.9) patent grant document into SQL-ready dicts.

    Data documentation on the fields in XML2 Grant data can be found in the
    /documents/data_descriptions/PatentGrantSGMLv19-Documentation.pdf file.

    Args:
        raw_data: string containing one well-formed patent grant XML document.
        args_array: dict of runtime configuration; reads 'url_link',
            'uspto_xml_format' and 'file_name'.

    Returns:
        A dict mapping table-group names (e.g. "processed_grant",
        "processed_inventor", ...) to lists of row dicts, each carrying a
        "table_name" key for the insert layer; or None when the raw XML
        cannot be parsed.
    """
    # Start timer
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays needed to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []
    processed_foreignpriority = []

    # BUGFIX: pre-declare every scalar collected below. The original left
    # these undefined when the corresponding element/loop was absent, which
    # raised NameError at the final processed_grant.append().
    document_id = None
    kind = None
    app_type = None
    pub_date = None
    pub_country = None
    app_no = None
    app_date = None
    app_country = None
    series_code = None
    grant_length = None
    title = None
    claims_num = None
    number_of_drawings = None
    number_of_figures = None

    # Pass the raw data into Element tree xml object
    try:
        document_root = ET.fromstring(raw_data)
    except ET.ParseError as e:
        # Dump the numbered document to stdout for debugging, then log.
        print_xml = raw_data.split("\n")
        for num, line in enumerate(print_xml, start=1):
            print(str(num) + ' : ' + line)
        logger.error("Character Entity prevented ET from parsing XML in file: " + url_link)
        traceback.print_exc()
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) +
                     " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())
        # BUGFIX: the original fell through and immediately raised NameError
        # on the undefined document_root; signal the failure instead.
        return None

    # SDOBI is the bibliographic data
    r = document_root.find('SDOBI')
    if r is not None:

        # B100 Document Identification
        for B100 in r.findall('B100'):
            try:
                document_id = USPTOSanitizer.return_element_text(B100.find('B110')).strip()
                document_id = USPTOSanitizer.fix_patent_number(document_id)[:20]
            except:
                document_id = None
                logger.error("No Patent Number was found for: " + url_link)
            try:
                kind = USPTOSanitizer.return_element_text(B100.find('B130')).strip()[:2]
                app_type = USPTOSanitizer.return_xml2_app_type(args_array, kind).strip()
            except:
                kind = None
            # PATENT ISSUE DATE
            try:
                pub_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B100.find('B140')),
                    args_array, document_id)
            except:
                pub_date = None
            # B190 is Publishing Country or Organization
            # This is always US in Red Book Patent Grant documents and
            # this field is not stored or used.
            try:
                pub_country = USPTOSanitizer.return_element_text(B100.find('B190')).strip()
            except:
                pub_country = None

        # B200 is Domestic Filing Data
        for B200 in r.findall('B200'):
            # TODO: find this in XML2 applications
            app_country = None
            # Application number
            try:
                app_no = USPTOSanitizer.return_element_text(B200.find('B210')).strip()[:20]
            except:
                app_no = None
            # Application Date
            try:
                app_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B200.find('B220')),
                    args_array, document_id)
            except:
                app_date = None
            # Series Code
            try:
                series_code = USPTOSanitizer.return_element_text(B200.find('B211US')).strip()[:2]
            except:
                series_code = None

        # Collect the Grant Length
        try:
            grant_length = USPTOSanitizer.return_element_text(
                r.find("B400").find("B472").find("B474")).strip()
        except:
            grant_length = None

        # Collect Technical information such as classification and references
        # TODO: don't need the loop here
        for B500 in r.findall('B500'):

            # US Classification
            for B520 in B500.findall('B520'):
                position = 1
                # B521 USCLASS main
                for B521 in B520.findall('B521'):
                    n_class_info = USPTOSanitizer.return_element_text(B521)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    position += 1
                # B522 USCLASS FURTHER
                for B522 in B520.findall('B522'):
                    n_class_info = USPTOSanitizer.return_element_text(B522)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    position += 1

            # B510 International Class data
            # TODO: check if I need to set all variables to empty or can just leave as null
            # TODO: check if classification is parsed correctly
            for B510 in B500.findall('B510'):
                position = 1
                # B511 Main Class
                for B511 in B510.findall('B511'):
                    i_section, i_class, i_subclass = None, None, None
                    i_class_mgr, i_class_sgr, i_malformed = None, None, None
                    int_class = USPTOSanitizer.return_element_text(B511)
                    if len(int_class.split()) > 1:
                        # "A01B 3/00"-style: class token plus group token
                        sec_1, sec_2 = int_class.split()
                        sec_1 = sec_1.strip()[:15]
                        i_section = sec_1[0]
                        i_class = sec_1[1:3]
                        i_subclass = sec_1[-1]
                        i_class_mgr = sec_2.strip()[:-2]
                        i_class_sgr = sec_2.strip()[-2:]
                    else:
                        # Single-token classification; flag as malformed
                        int_class = int_class.strip()[:15]
                        i_section = int_class[0]
                        i_class = int_class[1:]
                        i_subclass = int_class[-1]
                        i_malformed = 1
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_section,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "Malformed": i_malformed,
                        "FileName": args_array['file_name']
                    })
                    position += 1
                # B512 Further International Class
                for B512 in B510.findall('B512'):
                    i_section, i_class, i_subclass = None, None, None
                    i_class_mgr, i_class_sgr, i_malformed = None, None, None
                    int_class = USPTOSanitizer.return_element_text(B512)
                    # Split class into class and group
                    if len(int_class.split()) > 1:
                        sec_1, sec_2 = int_class.split()
                        sec_1 = sec_1.strip()[:15]
                        i_section = sec_1[0]
                        i_class = sec_1[1:3]
                        i_subclass = sec_1[-1]
                        i_class_mgr = sec_2.strip()[:-2]
                        i_class_sgr = sec_2.strip()[-2:]
                    else:
                        # TODO: Is this correct??
                        int_class = int_class.strip()[:15]
                        i_section = int_class[0]
                        i_class = int_class[1:]
                        i_subclass = int_class[-1]
                        i_malformed = 1
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_section,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "Malformed": i_malformed,
                        "FileName": args_array['file_name']
                    })
                    position += 1

            # B540 Collect Title
            for B540 in B500.findall('B540'):
                try:
                    title = USPTOSanitizer.strip_for_csv(
                        USPTOSanitizer.return_element_text(B540)[:500])
                except:
                    title = None

            # Patent Citations
            for B560 in B500.findall('B560'):
                # Reset position counter for all citations loop
                position = 1
                # B561 is Patent Citation
                for B561 in B560.findall('B561'):
                    # TODO: find out how to do PCIT, DOC without loop. Only B561 needs loop
                    pcit = B561.find('PCIT')
                    # BUGFIX: guard against a B561 with no PCIT child; the
                    # original raised AttributeError on pcit.find() below.
                    if pcit is None:
                        continue
                    # Declare items in case they are not found
                    citation_name = None
                    citation_city = None
                    citation_state = None
                    citation_country = None
                    # BUGFIX: these three were undefined when DOC was absent,
                    # raising NameError at the pct_kind test below.
                    citation_document_number = None
                    pct_kind = None
                    citation_date = None
                    doc = pcit.find('DOC')
                    if doc is not None:
                        try:
                            citation_document_number = USPTOSanitizer.return_element_text(
                                doc.find('DNUM')).strip()[:15]
                        except:
                            citation_document_number = None
                        try:
                            pct_kind = USPTOSanitizer.return_element_text(
                                doc.find('KIND')).strip()[:10]
                        except:
                            pct_kind = None
                        try:
                            citation_date = USPTOSanitizer.return_formatted_date(
                                USPTOSanitizer.return_element_text(doc.find('DATE')),
                                args_array, document_id)
                        except:
                            citation_date = None
                    prt = pcit.find('PARTY-US')
                    if prt is not None:
                        try:
                            citation_name = USPTOSanitizer.return_element_text(
                                prt.find("NAM").find("SNM")).strip()[:100]
                        except:
                            citation_name = None
                        # Citation Address info
                        try:
                            citation_city = USPTOSanitizer.return_element_text(
                                prt.find('ADR').find('CITY')).strip()[:100]
                        except:
                            citation_city = None
                        try:
                            citation_state = USPTOSanitizer.return_element_text(
                                prt.find('ADR').find('STATE')).strip()[:3]
                        except:
                            citation_state = None
                        # Citation country
                        try:
                            citation_country = USPTOSanitizer.return_element_text(
                                prt.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # If state is a US state, set country to US
                                if USPTOSanitizer.is_US_state(citation_state):
                                    citation_country = "US"
                                else:
                                    citation_country = None
                            except:
                                citation_country = None
                    # Parse citation category
                    # MODERNIZED: Element.getchildren() was removed in
                    # Python 3.9; list(element) is the supported equivalent.
                    b561_children = list(B561)
                    if len(b561_children) > 1:
                        try:
                            citation_category = b561_children[1].tag.replace(
                                "\n", "").replace("\r", "").upper()
                        except:
                            citation_category = None
                    else:
                        citation_category = None
                    # TODO: be aware that there may be something crazy in the
                    # citation document number
                    # A KIND code implies a US (GRACIT) citation; otherwise
                    # treat it as a foreign patent citation.
                    if pct_kind != None:
                        # Append SQL data into dictionary to be written later
                        processed_gracit.append({
                            "table_name": "uspto.GRACIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1
                    else:
                        # Append SQL data into dictionary to be written later
                        processed_forpatcit.append({
                            "table_name": "uspto.FORPATCIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1

                # Reset position counter for non-patent citations loop
                position = 1
                # Non-patent Literature Citations
                for B562 in B560.findall('B562'):
                    NCIT = B562.find('NCIT')
                    if NCIT is not None:
                        # Sometimes, there will be '<i> or <sup>, etc.' in
                        # the reference string; we need to remove it
                        non_patent_citation_text = USPTOSanitizer.return_element_text(NCIT)
                        non_patent_citation_text = re.sub(
                            '<[^>]+>', '', non_patent_citation_text)
                    else:
                        non_patent_citation_text = None
                    # Parse citation category into code
                    # MODERNIZED: list(element) replaces removed getchildren()
                    b562_children = list(B562)
                    if len(b562_children) > 1:
                        try:
                            ncitation_category = b562_children[1].tag.replace(
                                "\n", "").replace("\r", "").upper()
                        except:
                            ncitation_category = None
                    else:
                        ncitation_category = None
                    # Append SQL data into dictionary to be written later
                    processed_nonpatcit.append({
                        "table_name": "uspto.NONPATCIT_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Citation": non_patent_citation_text,
                        "Category": ncitation_category,
                        "FileName": args_array['file_name']
                    })
                    position += 1

            # Collect number of claims
            for B570 in B500.findall('B570'):
                try:
                    claims_num = USPTOSanitizer.return_element_text(B570.find('B577')).strip()
                except:
                    claims_num = None

            # Collect number of drawings and figures
            for B590 in B500.findall('B590'):
                for B595 in B590.findall('B595'):
                    try:
                        number_of_drawings = USPTOSanitizer.return_element_text(B595).strip()
                        number_of_drawings = number_of_drawings.split("/")[0]
                    except:
                        number_of_drawings = None
                for B596 in B590.findall('B596'):
                    try:
                        number_of_figures = USPTOSanitizer.return_element_text(B596).strip()
                    except:
                        number_of_figures = None
            # TODO: B582 find out what it is. Looks like patent
            # classifications but it's all alone in the XML

        # B700 is Parties
        # TODO: find the applicant data and append to array
        for B700 in r.findall('B700'):

            # B720 Inventor
            for B720 in B700.findall('B720'):
                # Reset position for inventors
                position = 1
                # Collect inventor information
                for B721 in B720.findall('B721'):
                    for i in B721.findall('PARTY-US'):
                        # Inventor Name
                        try:
                            inventor_first_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('FNM')).strip()[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('SNM')).strip()[:100]
                        except:
                            inventor_last_name = None
                        # Inventor Address info
                        try:
                            inventor_city = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('CITY')).strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('STATE')).strip()[:3]
                        except:
                            inventor_state = None
                        # Inventor country
                        try:
                            inventor_country = USPTOSanitizer.return_element_text(
                                i.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # If state is a US state, set country to US
                                if USPTOSanitizer.is_US_state(inventor_state):
                                    inventor_country = "US"
                                else:
                                    inventor_country = None
                            except:
                                inventor_country = None
                        # Not present in XML2 grant data
                        inventor_nationality = None
                        inventor_residence = None
                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name": "uspto.INVENTOR_G",
                            "GrantID": document_id,
                            "Position": position,
                            "FirstName": inventor_first_name,
                            "LastName": inventor_last_name,
                            "City": inventor_city,
                            "State": inventor_state,
                            "Country": inventor_country,
                            "Nationality": inventor_nationality,
                            "Residence": inventor_residence,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # B730 Assignee
            # TODO: check if finding child of child is working
            # Reset position for assignees
            position = 1
            for B730 in B700.findall('B730'):
                for B731 in B730.findall('B731'):
                    for x in B731.findall('PARTY-US'):
                        try:
                            asn_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM")).strip()[:500]
                        except:
                            asn_orgname = None
                        asn_role = None
                        try:
                            asn_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY')).strip()[:100]
                        except:
                            asn_city = None
                        try:
                            asn_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE')).strip()[:30]
                        except:
                            asn_state = None
                        # Assignee country
                        try:
                            asn_country = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # Fix country if country missing
                                if USPTOSanitizer.is_US_state(asn_state):
                                    asn_country = "US"
                                else:
                                    asn_country = None
                            except:
                                asn_country = None
                        # Append SQL data into dictionary to be written later
                        processed_assignee.append({
                            "table_name": "uspto.ASSIGNEE_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": asn_orgname,
                            "Role": asn_role,
                            "City": asn_city,
                            "State": asn_state,
                            "Country": asn_country,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # B740 is Legal Agent / Attorney
            for B740 in B700.findall('B740'):
                # Reset position for agents
                position = 1
                for B741 in B740.findall('B741'):
                    for x in B741.findall('PARTY-US'):
                        # Attorney Organization
                        try:
                            agent_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM")).strip()[:300]
                        except:
                            agent_orgname = None
                        # Attorney Name
                        # NOTE(review): FNM is mapped to last name and SNM to
                        # first name here — looks swapped; verify against the
                        # AGENT_G table contents before changing.
                        try:
                            agent_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:100]
                        except:
                            agent_last_name = None
                        try:
                            agent_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:100]
                        except:
                            agent_first_name = None
                        # Attorney Address information
                        try:
                            agent_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY')).strip()[:100]
                        except:
                            agent_city = None
                        try:
                            agent_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE')).strip()[:30]
                        except:
                            agent_state = None
                        # Agent country
                        try:
                            agent_country = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # Fix country if missing
                                if USPTOSanitizer.is_US_state(agent_state):
                                    agent_country = "US"
                                else:
                                    agent_country = None
                            except:
                                agent_country = None
                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name": "uspto.AGENT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": agent_orgname,
                            "LastName": agent_last_name,
                            "FirstName": agent_first_name,
                            "Country": agent_country,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # B745 Examiner
            for B745 in B700.findall('B745'):
                position = 1
                # Primary Examiner
                for B746 in B745.findall('B746'):
                    for x in B746.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_fist_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:50]
                        except:
                            examiner_fist_name = None
                        try:
                            examiner_department = USPTOSanitizer.return_element_text(
                                B745.find('B748US')).strip()[:50]
                        except:
                            examiner_department = None
                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_fist_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })
                        position += 1
                # Assistant Examiner
                for B747 in B745.findall('B747'):
                    for x in B747.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_fist_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:50]
                        except:
                            examiner_fist_name = None
                        try:
                            examiner_department = USPTOSanitizer.return_element_text(
                                B745.find('B748US')).strip()[:50]
                        except:
                            examiner_department = None
                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_fist_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })
                        position += 1

        # B300 Foreign priority data
        position = 1
        for B300 in r.findall('B300'):
            # Country
            try:
                pc_country = USPTOSanitizer.return_element_text(
                    B300.find('B330').find('CTRY')).strip()[:5]
            except:
                pc_country = None
            # Priority filing date
            # BUGFIX: the original called return_formatted_date with only the
            # date text; every other call site in this function passes
            # args_array and document_id as well.
            try:
                pc_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(
                        B300.find('B320').find('DATE')),
                    args_array, document_id)
            except:
                pc_date = None
            # Priority document number
            try:
                pc_doc_num = USPTOSanitizer.return_element_text(
                    B300.find('B310').find('DNUM')).strip()[:45]
            except:
                # BUGFIX: was misspelled 'pc_doc_dum', leaving pc_doc_num
                # undefined and raising NameError in the append below.
                pc_doc_num = None
            # Set the fields that are not in gXML2
            pc_kind = None
            # Append SQL data into dictionary to be written later
            processed_foreignpriority.append({
                "table_name": "uspto.FOREIGNPRIORITY_G",
                "GrantID": document_id,
                "Position": position,
                "Kind": pc_kind,
                "Country": pc_country,
                "DocumentID": pc_doc_num,
                "PriorityDate": pc_date,
                "FileName": args_array['file_name']
            })
            position += 1

    # Collect Abstract from data
    try:
        a_elem = document_root.find('SDOAB')
        if a_elem is not None:
            abstract = USPTOSanitizer.strip_for_csv(
                USPTOSanitizer.return_element_text(a_elem))
        else:
            abstract = None
    except Exception as e:
        abstract = None

    # Collect detailed description from DETDESC
    try:
        d_elem = document_root.find('SDODE').find('DETDESC')
        if d_elem is not None:
            description = USPTOSanitizer.strip_for_csv(' '.join(d_elem.itertext()))
        else:
            description = None
    except Exception as e:
        description = None

    # Collect claims from data
    try:
        c_elem = document_root.find('SDOCL')
        if c_elem is not None:
            claims = USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
        else:
            claims = None
    except Exception as e:
        claims = None

    # Append SQL data into dictionary to be written later
    processed_grant.append({
        "table_name": "uspto.GRANT",
        "GrantID": document_id,
        "Title": title,
        "Claims": claims,
        "Description": description,
        "IssueDate": pub_date,
        "Kind": kind,
        "GrantLength": grant_length,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": claims_num,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "ApplicationID": app_no,
        "FileDate": app_date,
        "AppType": app_type,
        "FileName": args_array['file_name']
    })

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit,
        "processed_foreignpriority": processed_foreignpriority
    }
def process_XML_grant_content(args_array):
    """Stream a grant bulk-data zip file, parse each patent document, and store it.

    Extracts the XML payload from the zip named in args_array, splits it into
    individual patent-grant documents (gXML4 or gXML2 format), routes each
    document through the extractor, and writes results to csv files and/or
    the database.

    Args:
        args_array: runtime configuration dict; reads 'command_args',
            'database_connection', 'database_insert_mode', 'document_type',
            'file_name', 'csv_directory', 'uspto_xml_format', 'url_link'.

    Returns:
        True/truthy on success, False when the zip yields no XML, and
        None when the bulk database load fails.
    """
    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    if "database" in args_array["command_args"]:
        # Pass the database connection to variable
        database_connection = args_array['database_connection']

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(
            args_array['document_type'], args_array['file_name'], args_array['csv_directory'])

    # Set the start time of operation
    start_time = time.time()

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If xml_file_contents is None or False, then return immediately
    # (identity checks preserve the original semantics: an empty-but-valid
    # contents object must NOT trigger this early return)
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False

    # read through the file and append into groups of string.
    # Send the finished strings to be parsed
    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "gXML4":
        # Loop through all lines in the xml file
        for line in xml_file_contents:
            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)
            # This identifies the start of well formed XML segment for patent
            # grant bibliographic information. The attribute-free literal tag
            # is substituted for the original start line.
            if "<us-patent-grant" in line:
                patent_xml_started = True
                xml_string += "<us-patent-grant>"
            # This identifies end of well-formed XML segement for single patent
            # grant bibliographic information
            elif "</us-patent-grant" in line:
                patent_xml_started = False
                xml_string += line
                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreGrantData.store_grant_data(processed_data_array, args_array)
                # reset the xml string
                xml_string = ''
            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started:
                # Check which type of encoding should be used to fix the line string
                xml_string += USPTOSanitizer.replace_new_html_characters(line)

    # Used for gXML2 files
    elif args_array['uspto_xml_format'] == "gXML2":
        # Loop through all lines in the xml file
        for line in xml_file_contents:
            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)
            # This identifies the start of well formed XML segment for patent
            # grant bibliographic information
            if "<PATDOC" in line:
                patent_xml_started = True
                xml_string += "<PATDOC>"
            # This identifies end of well-formed XML segement for single patent
            # grant bibliographic information
            elif "</PATDOC" in line:
                patent_xml_started = False
                xml_string += line
                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreGrantData.store_grant_data(processed_data_array, args_array)
                # reset the xml string
                xml_string = ''
            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started:
                # Check which type of encoding should be used to fix the line string
                xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Close all the open .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds
    # This should be true, in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        database_connection.remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Load CSV file into database
        file_processed = database_connection.load_csv_bulk_data(args_array)

    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Delete all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)
        # Print message to stdout and log
        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        # Return file_processed as success status
        return file_processed
    else:
        # Print message to stdout and log
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        # Return None as failed status during database insertion
        return None
def process_XML_application_content(args_array):
    """Stream an application bulk-data zip file, parse each document, and store it.

    Extracts the XML payload from the zip named in args_array, splits it into
    individual patent-application documents (aXML4 or aXML1 format), routes
    each document through the extractor, and writes results to csv files
    and/or the database.

    Args:
        args_array: runtime configuration dict; reads 'command_args',
            'database_connection', 'database_insert_mode', 'document_type',
            'file_name', 'csv_directory', 'csv_file_array',
            'uspto_xml_format', 'url_link'.

    Returns:
        True/truthy on success, False when the zip yields no XML, and
        None when the bulk database load fails.
    """
    # Process zip file by getting .dat or .txt file and .xml filenames
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(
            args_array['document_type'], args_array['file_name'], args_array['csv_directory'])

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If xml_file_contents is None or False, then return immediately
    # (identity checks preserve the original semantics exactly)
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False

    # read through the file and append into groups of string.
    # Send the finished strings to be parsed
    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "aXML4":
        # Loop through all lines in the xml file
        for line in xml_file_contents:
            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)
            # This identifies the start of well formed XML segment for patent
            # application bibliographic information. Attribute-free literal
            # tags are substituted for the original start/end lines.
            if "<us-patent-application" in line:
                patent_xml_started = True
                xml_string += "<us-patent-application>"
            # This identifies end of well-formed XML segement for single patent
            # application bibliographic information
            elif "</us-patent-application" in line:
                patent_xml_started = False
                xml_string += "</us-patent-application>"
                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)
                # Reset the xml string
                xml_string = ''
            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started:
                xml_string += USPTOSanitizer.replace_new_html_characters(line)

    elif args_array['uspto_xml_format'] == "aXML1":
        # Loop through all lines in the xml file
        for line in xml_file_contents:
            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)
            # This identifies the start of well formed XML segment for patent
            # application bibliographic information
            if "<patent-application-publication" in line:
                patent_xml_started = True
                xml_string += "<patent-application-publication>"
            # This identifies end of well-formed XML segement for single patent
            # application bibliographic information
            elif "</patent-application-publication" in line:
                patent_xml_started = False
                xml_string += "</patent-application-publication>"
                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)
                # reset the xml string
                xml_string = ''
            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started:
                xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Close the all the .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds
    # This should be true, in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        args_array['database_connection'].remove_previous_file_records(
            args_array['document_type'], args_array['file_name'])
        # Loop through each csv file and bulk copy into database
        for key, csv_file in list(args_array['csv_file_array'].items()):
            # Load CSV file into database
            # BUGFIX: the original overwrote file_processed on every
            # iteration, so a failure was masked by any later success;
            # a single failed load now marks the whole file as failed.
            if not args_array['database_connection'].load_csv_bulk_data(args_array, key, csv_file):
                file_processed = False

    # If the file was successfully processed into the database
    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Close all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)
        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        # Return the file procecssed status
        return file_processed
    else:
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        # Return None to show database insertion failed
        return None
def extract_XML2_grant(raw_data, args_array):
    # Parse one XML2-format patent grant document into arrays of row
    # dictionaries, one list per destination database table.
    #
    # Parameters:
    #   raw_data   -- string holding a single well-formed grant XML segment
    #   args_array -- runtime settings dict; reads 'url_link' and 'file_name'
    #
    # Returns a dict mapping "processed_*" names to lists of row dicts; every
    # row dict carries a "table_name" key naming its destination table.

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Keep the source link handy for log messages
    url_link = args_array['url_link']

    # Accumulators -- one list of row dicts per database table
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []

    try:
        # Pass the raw data into an ElementTree object
        patent_root = ET.fromstring(raw_data)
    except ET.ParseError as e:
        # Dump the offending document with line numbers to aid debugging
        print_xml = raw_data.split("\n")
        for num, line in enumerate(print_xml, start=1):
            print(str(num) + ' : ' + line)
        logger.error("Character Entity prevented ET from parsing XML in file: " + url_link)
        # Print traceback and exception information to the log file
        traceback.print_exc()
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())
        # NOTE(review): on a parse failure patent_root is never bound and the
        # loop below raises NameError -- confirm whether returning None (as the
        # sibling bulk-load path does) is the intended failure mode.

    # SDOBI holds all bibliographic data for the grant
    for r in patent_root.findall('SDOBI'):

        # B100: grant document identification
        for B100 in r.findall('B100'):
            try:
                document_id = USPTOSanitizer.return_element_text(B100.find('B110'))
                document_id = USPTOSanitizer.fix_patent_number(document_id)[:20]
            except:
                document_id = None
                logger.error("No Patent Number was found for: " + url_link)
            try:
                kind = USPTOSanitizer.return_element_text(B100.find('B130'))[:2]
            except:
                kind = None
            try:
                # B140 is the patent issue date
                pub_date = USPTOSanitizer.return_formatted_date(USPTOSanitizer.return_element_text(B100.find('B140')), args_array, document_id)
            except:
                pub_date = None
            try:
                # B190 -- publication country (parsed but not stored below)
                pub_country = USPTOSanitizer.return_element_text(B100.find('B190'))
            except:
                pub_country = None

        # B200: application data
        for B200 in r.findall('B200'):
            # TODO: find these data items in XML2 applications
            app_type = None
            app_country = None
            try:
                app_no = USPTOSanitizer.return_element_text(B200.find('B210'))[:20]
            except:
                app_no = None
            try:
                # B220 is the application filing date
                app_date = USPTOSanitizer.return_formatted_date(USPTOSanitizer.return_element_text(B200.find('B220')), args_array, document_id)
            except:
                app_date = None
            try:
                series_code = USPTOSanitizer.return_element_text(B200.find('B211US'))[:2]
            except:
                series_code = None

        # Collect the grant length
        grant_length = USPTOSanitizer.return_element_text(r.find("B474"))

        for B500 in r.findall('B500'):

            # B520: US classifications
            for B520 in B500.findall('B520'):
                position = 1
                # B521 -- main US class
                for B521 in B520.findall('B521'):
                    n_class_info = USPTOSanitizer.return_element_text(B521)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    position += 1
                # B522 -- further US classes
                for B522 in B520.findall('B522'):
                    n_class_info = USPTOSanitizer.return_element_text(B522)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]
                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })
                    position += 1

            # B510: international classifications
            # TODO: check if classification is parsed correctly
            for B510 in B500.findall('B510'):
                position = 1
                # B511 -- main international class
                for B511 in B510.findall('B511'):
                    i_class_sec = None
                    int_class = USPTOSanitizer.return_element_text(B511)
                    # TODO: splitting does not handle multiple subclasses
                    if len(int_class.split()) > 1:
                        i_class, i_subclass = int_class.split()
                        i_class = i_class[:15]
                        i_subclass = i_subclass[:15]
                    else:
                        i_class = int_class[:15]
                        i_subclass = None
                    # Main group / sub group are not present in XML2
                    i_class_mgr = None
                    i_class_sgr = None
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "FileName": args_array['file_name']
                    })
                    position += 1
                # B512 -- further international classes
                # BUG FIX: this loop previously iterated B510.findall('B511'),
                # which re-counted every main class and never read the further
                # classification tags at all.
                for B512 in B510.findall('B512'):
                    i_class_sec = None
                    int_class = USPTOSanitizer.return_element_text(B512)
                    # TODO: splitting does not handle multiple subclasses
                    if len(int_class.split()) > 1:
                        i_class = int_class.split()[0][:15]
                        i_subclass = int_class.split()[1][:15]
                    else:
                        i_class = int_class[:15]
                        i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "FileName": args_array['file_name']
                    })
                    position += 1

            # B540: title
            for B540 in B500.findall('B540'):
                try:
                    title = USPTOSanitizer.return_element_text(B540)[:500]
                except:
                    title = None

            # B560: citations
            for B560 in B500.findall('B560'):
                # Reset position counter for all citations loop
                position = 1
                # B561 -- patent citations (PCIT)
                for B561 in B560.findall('B561'):
                    # TODO: find out how to do PCIT, DOC without loop. Only B561 needs loop
                    PCIT = B561.find('PCIT')
                    # TODO: determine how non-US citations are marked; PARTY-US
                    # appears on all records seen so far, so default to US.
                    citation_country = "US"
                    DOC = PCIT.find('DOC')
                    try:
                        citation_document_number = USPTOSanitizer.return_element_text(DOC.find('DNUM'))[:15]
                    except:
                        citation_document_number = None
                    try:
                        pct_kind = USPTOSanitizer.return_element_text(DOC.find('KIND'))[:10]
                    except:
                        pct_kind = None
                    try:
                        # BUG FIX: args_array and document_id were previously
                        # passed to return_element_text instead of
                        # return_formatted_date, so every citation date raised
                        # inside the try and fell back to None.
                        citation_date = USPTOSanitizer.return_formatted_date(USPTOSanitizer.return_element_text(DOC.find('DATE')), args_array, document_id)
                    except:
                        citation_date = None
                    try:
                        citation_name = USPTOSanitizer.return_element_text(PCIT.find('PARTY-US'))[:100]
                    except:
                        citation_name = None
                    # Parse citation category from the second child tag.
                    # Element.getchildren() was removed in Python 3.9;
                    # list(elem) is the supported equivalent.
                    pcit_children = list(B561)
                    if len(pcit_children) > 1:
                        citation_category = pcit_children[1].tag.replace("\n", "").replace("\r", "")
                        # TODO: check that the citation category tag matches correctly
                        if "CITED-BY-EXAMINER" in citation_category:
                            citation_category = 1
                        elif "CITED-BY-OTHER" in citation_category:
                            citation_category = 2
                        else:
                            citation_category = 0
                            logger.warning("Cited by unknown type")
                    else:
                        citation_category = None
                    # TODO: be aware that there may be something crazy in the
                    # citation document number
                    if citation_country == "US":
                        # Append SQL data into dictionary to be written later
                        processed_gracit.append({
                            "table_name": "uspto.GRACIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1
                    else:
                        # Append SQL data into dictionary to be written later
                        processed_forpatcit.append({
                            "table_name": "uspto.FORPATCIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1
                # Reset position counter for non-patent citations loop
                position = 1
                # B562 -- non-patent literature citations
                for B562 in B560.findall('B562'):
                    for NCIT in B562.findall('NCIT'):
                        non_patent_citation_text = USPTOSanitizer.return_element_text(NCIT)
                        # Strip embedded markup such as <i> or <sup>
                        non_patent_citation_text = re.sub('<[^>]+>', '', non_patent_citation_text)
                        # Parse citation category into code.
                        # BUG FIX: ET.tostring returns bytes on Python 3, which
                        # made the "in" substring tests below raise TypeError
                        # whenever B562 had no category child; decode first.
                        ncitation_category = ET.tostring(NCIT).decode('utf-8')
                        ncit_children = list(B562)
                        if len(ncit_children) > 1:
                            ncitation_category = ncit_children[1].tag.replace("\n", "").replace("\r", "")
                        if "CITED-BY-EXAMINER" in ncitation_category:
                            ncitation_category = 1
                        elif "CITED-BY-OTHER" in ncitation_category:
                            ncitation_category = 2
                        else:
                            ncitation_category = 0
                        # Append SQL data into dictionary to be written later
                        processed_nonpatcit.append({
                            "table_name": "uspto.NONPATCIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "Citation": non_patent_citation_text,
                            "Category": ncitation_category,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # B570: number of claims
            for B570 in B500.findall('B570'):
                try:
                    claims_num = USPTOSanitizer.return_element_text(B570.find('B577'))
                except:
                    claims_num = None

            # B590: number of drawings and figures
            for B590 in B500.findall('B590'):
                for B595 in B590.findall('B595'):
                    try:
                        number_of_drawings = USPTOSanitizer.return_element_text(B595)
                        number_of_drawings = number_of_drawings.split("/")[0]
                    except:
                        number_of_drawings = None
                for B596 in B590.findall('B596'):
                    try:
                        number_of_figures = USPTOSanitizer.return_element_text(B596)
                    except:
                        number_of_figures = None
            # TODO: B582 -- find out what it is; looks like patent
            # classifications but it's all alone in the XML

        # B700: party information
        # TODO: find the applicant data and append to array
        for B700 in r.findall('B700'):

            # B720: inventors
            for B720 in B700.findall('B720'):
                position = 1
                for B721 in B720.findall('B721'):
                    for i in B721.findall('PARTY-US'):
                        try:
                            inventor_first_name = USPTOSanitizer.return_element_text(i.find('NAM').find('FNM'))[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = USPTOSanitizer.return_element_text(i.find('NAM').find('SNM'))[:100]
                        except:
                            inventor_last_name = None
                        try:
                            inventor_city = USPTOSanitizer.return_element_text(i.find('ADR').find('CITY'))[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = USPTOSanitizer.return_element_text(i.find('ADR').find('STATE'))[:100]
                        except:
                            inventor_state = None
                        # TODO: find out if country can be other than US
                        inventor_country = "US"
                        inventor_nationality = None
                        inventor_residence = None
                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name": "uspto.INVENTOR_G",
                            "GrantID": document_id,
                            "Position": position,
                            "FirstName": inventor_first_name,
                            "LastName": inventor_last_name,
                            "City": inventor_city,
                            "State": inventor_state,
                            "Country": inventor_country,
                            "Nationality": inventor_nationality,
                            "Residence": inventor_residence,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # B730: assignees
            # TODO: check if finding child of child is working
            position = 1
            for B730 in B700.findall('B730'):
                for B731 in B730.findall('B731'):
                    for x in B731.findall('PARTY-US'):
                        try:
                            asn_orgname = USPTOSanitizer.return_element_text(x.find('NAM').find("ONM"))[:500]
                        except:
                            asn_orgname = None
                        asn_role = None
                        try:
                            asn_city = USPTOSanitizer.return_element_text(x.find("ADR").find('CITY'))[:100]
                        except:
                            asn_city = None
                        try:
                            asn_state = USPTOSanitizer.return_element_text(x.find("ADR").find('STATE'))[:100]
                        except:
                            asn_state = None
                        # TODO: find out if country is always US because it's
                        # never included. Check all other references also
                        asn_country = "US"
                        # Append SQL data into dictionary to be written later
                        processed_assignee.append({
                            "table_name": "uspto.ASSIGNEE_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": asn_orgname,
                            "Role": asn_role,
                            "City": asn_city,
                            "State": asn_state,
                            "Country": asn_country,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # B740: agents
            for B740 in B700.findall('B740'):
                position = 1
                for B741 in B740.findall('B741'):
                    for x in B741.findall('PARTY-US'):
                        try:
                            agent_orgname = USPTOSanitizer.return_element_text(x.find('NAM').find("ONM"))[:300]
                        except:
                            agent_orgname = None
                        # BUG FIX: these lookups previously read the stale
                        # inventor loop variable `i` instead of `x`, and mapped
                        # FNM to the last name and SNM to the first name --
                        # reversed relative to every other name parse here.
                        try:
                            agent_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM'))[:100]
                        except:
                            agent_first_name = None
                        try:
                            agent_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM'))[:100]
                        except:
                            agent_last_name = None
                        agent_country = "US"
                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name": "uspto.AGENT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": agent_orgname,
                            "LastName": agent_last_name,
                            "FirstName": agent_first_name,
                            "Country": agent_country,
                            "FileName": args_array['file_name']
                        })
                        position += 1

            # B745: examiners (B746 primary, B747 assistant)
            for B745 in B700.findall('B745'):
                position = 1
                for B746 in B745.findall('B746'):
                    for x in B746.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM'))[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM'))[:50]
                        except:
                            examiner_first_name = None
                        # TODO: find out if 748US is the department
                        examiner_department = None
                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_first_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })
                        position += 1
                for B747 in B745.findall('B747'):
                    for x in B747.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(x.find('NAM').find('SNM'))[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(x.find('NAM').find('FNM'))[:50]
                        except:
                            examiner_first_name = None
                        # TODO: find out if 748US is the department
                        examiner_department = None
                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_first_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })
                        position += 1

    # SDOAB (abstract) and SDOCL (claims) sit outside the SDOBI block
    try:
        abstr = patent_root.find('SDOAB')
        abstract = USPTOSanitizer.return_element_text(abstr)
    except:
        abstract = None
    try:
        cl = patent_root.find('SDOCL')
        claims = USPTOSanitizer.return_element_text(cl)
    except:
        traceback.print_exc()
        claims = None

    # Main grant record.
    # NOTE(review): fields such as title/claims_num are only bound when their
    # tags are present; a grant missing e.g. B540 raises NameError here --
    # this mirrors the original behavior.
    processed_grant.append({
        "table_name": "uspto.GRANT",
        "GrantID": document_id,
        "Title": title,
        "IssueDate": pub_date,
        "Kind": kind,
        "GrantLength": grant_length,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": claims_num,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "ApplicationID": app_no,
        "Claims": claims,
        "FileDate": app_date,
        "AppType": app_type,
        "FileName": args_array['file_name']
    })

    # Return a dictionary of the processed data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit
    }
def extract_XML2_grant_tag_counts(args_array):
    # Count the XML tags found in an XML2-format grant file and return a
    # per-table dictionary of expected record counts.
    #
    # args_array -- runtime settings dict; reads 'file_name' and 'stdout_level'
    # Returns the counts dictionary, or False when no XML could be extracted.
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pull the XML contents out of the zip archive
    xml_lines = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    # Bail out immediately if extraction produced nothing usable
    if xml_lines == None or xml_lines == False:
        return False

    # Tags whose occurrences map directly onto table rows
    # NOTE: CPCClASS_G, APPLICANT_G, are not available in XML2 Grant files
    tag_map = {
        "GRANT": ["<PATDOC"],
        "INTCLASS_G": ["<B510"],
        "USCLASS_G": ["<B521", "<B522"],
        "INVENTOR_G": ["<B721"],
        "AGENT_G": ["<B740"],
        "ASSIGNEE_G": ["<B730"],
        "NONPATCIT_G": ["<B562"],
        "EXAMINER_G": ["<B746", "<B747"],
        "FOREIGNPRIORITY_G": ["<B310"]
    }

    # Per-table tallies, seeded at zero
    tally = {
        "file_name": args_array['file_name'],
        "GRANT": 0,
        "INTCLASS_G": 0,
        "CPCCLASS_G": 0,
        "USCLASS_G": 0,
        "INVENTOR_G": 0,
        "AGENT_G": 0,
        "ASSIGNEE_G": 0,
        "APPLICANT_G": 0,
        "NONPATCIT_G": 0,
        "EXAMINER_G": 0,
        "GRACIT_G": 0,
        "FORPATCIT_G": 0,
        "FOREIGNPRIORITY_G": 0
    }

    # Announce start on stdout and in the log
    print("-- Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # First pass: per-line substring matching covers most tables
    for raw in xml_lines:
        decoded = USPTOSanitizer.decode_line(raw)
        for table, patterns in tag_map.items():
            # A line counts once per table, no matter how many patterns hit
            if isinstance(patterns, list) and any(p in decoded for p in patterns):
                tally[table] += 1

    # Second pass: citation counts need real XML parsing to separate
    # US (GRACIT_G) from foreign (FORPATCIT_G) patent citations
    doc_buffer = ''
    inside_doc = False
    for raw in xml_lines:
        decoded = USPTOSanitizer.decode_line(raw)
        if "<PATDOC" in decoded:
            # Start of a well-formed grant segment
            inside_doc = True
            doc_buffer += "<PATDOC>"
        elif "</PATDOC" in decoded:
            # End of the segment -- parse the accumulated buffer
            inside_doc = False
            doc_buffer += "</PATDOC>"
            try:
                document_root = ET.fromstring(doc_buffer)
                # SDOBI is the bibliographic data
                r = document_root.find('SDOBI')
                B500 = r.find('B500')
                if B500 is not None:
                    for B560 in B500.findall('B560'):
                        # B561 is a patent citation
                        for B561 in B560.findall('B561'):
                            try:
                                pcit = B561.find('PCIT').find('DOC')
                            except:
                                pcit = None
                            if pcit is not None:
                                prt = pcit.find('PARTY-US')
                                try:
                                    citation_state = USPTOSanitizer.return_element_text(prt.find('ADR').find('STATE')).strip()[:3]
                                except:
                                    citation_state = None
                                try:
                                    citation_country = USPTOSanitizer.return_element_text(prt.find("ADR").find('CTRY')).strip()[:3]
                                except:
                                    try:
                                        # No country tag: a US state implies US
                                        if USPTOSanitizer.is_US_state(citation_state):
                                            citation_country = "US"
                                        else:
                                            citation_country = None
                                    except:
                                        citation_country = None
                                if citation_country == "US" or citation_country == None:
                                    tally['GRACIT_G'] += 1
                                elif citation_country is not None:
                                    tally['FORPATCIT_G'] += 1
                # Reset the buffer for the next document
                doc_buffer = ''
            except ET.ParseError as e:
                # Log the buffer with line numbers for debugging
                for num, bad_line in enumerate(doc_buffer.split("\n"), start=1):
                    logger.error(str(num) + ' : ' + bad_line)
                logger.error("Character Entity prevented ET from parsing XML in file: " + args_array['file_name'])
                traceback.print_exc()
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())
        elif inside_doc == True:
            # Accumulate the line after fixing legacy HTML entities
            doc_buffer += USPTOSanitizer.replace_old_html_characters(decoded)

    # Announce completion on stdout and in the log
    print("-- Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Optionally echo the tallies before returning them
    if args_array['stdout_level'] == 1:
        pprint(tally)
    return tally