Example #1
0
    def build_sql_insert_query(self, insert_data_array, args_array):
        """Build a complete SQL INSERT statement from a data dictionary.

        Parameters:
            insert_data_array -- dict mapping column names to values; must
                contain a 'table_name' key naming the destination table,
                which is removed from the dict as a side effect.
            args_array -- runtime configuration dict; 'database_type'
                ('postgresql' or 'mysql') selects the value quoting style.

        Returns the INSERT statement as a single string.
        """

        logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

        # Pull the destination table name out of the data dictionary so the
        # remaining keys are exactly the column names to be inserted.
        table_name = insert_data_array['table_name']
        del insert_data_array['table_name']

        # Collect column names and quoted values, then join once at the end
        # (replaces the previous counter-based "last item gets no comma" logic).
        columns = []
        values = []
        for key, value in list(insert_data_array.items()):

            # Don't escape values that are None (NULL) or integers
            if value is not None and not isinstance(value, int):
                # Escape the value for sql insertion.
                # Bug fix: the previous str(value.encode('utf-8')) wrapped the
                # text in a Python 3 bytes repr ("b'...'") before escaping;
                # escape the plain string instead.
                value = USPTOSanitizer.escape_value_for_sql(str(value))
                # Since postgresql uses `$` as delimiter, must strip from first and last char
                value = value.strip("$").replace("$$$", "$").replace("$$", "$")

            columns.append(key)
            if value is None:
                values.append('NULL')
            # PostgreSQL strings are escaped slightly differently than MySQL
            elif args_array['database_type'] == 'postgresql':
                values.append("$$" + str(value) + "$$")
            elif args_array['database_type'] == 'mysql':
                values.append('"' + str(value) + '"')

        # Assemble the final query.  Columns are comma-space separated, values
        # comma separated, with a double space before VALUES — this matches the
        # exact output format of the original concatenation.
        sql_query_string = ("INSERT INTO " + table_name + " ("
                            + ", ".join(columns) + ")  VALUES ("
                            + ",".join(values) + ");")
        logger.info(sql_query_string)
        # Return the query string
        return sql_query_string
Example #2
0
def extract_CPC_class_dict(line):
    """Return a dictionary describing one CPC class CSV record.

    `line` is a sequence where line[0] holds the raw CPC class string and
    line[1] holds the class title.
    """

    # Split the raw CPC string into its positional components
    cpc_parts = USPTOSanitizer.return_CPC_class_application(line[0])

    # Build the class dictionary field by field
    class_dictionary = {
        "table_name" : "uspto.CPCClASS_C",
        "extraction_type" : "cpcclass",
    }
    for position, field in enumerate(("Section", "Class", "SubClass", "MainGroup", "SubGroup")):
        class_dictionary[field] = cpc_parts[position]
    # Title has embedded double-quotes removed and whitespace trimmed
    class_dictionary["Title"] = line[1].replace('"', "").strip()

    # Return the class dictionary
    return class_dictionary
def extract_csv_line(args_array, line):
    """Convert one PAIR CSV line into a column-keyed dictionary.

    Parameters:
        args_array -- runtime configuration dict; 'extraction_type' selects
            the column layout ('correspondence', 'continuityparent' or
            'continuitychild') and 'file_name' is recorded in the output.
        line -- list of raw CSV field strings.

    Returns the processed dictionary ready for storage.
    """

    extraction_type = args_array['extraction_type']

    # Base fields common to every extraction type
    processed_array = {
        "table_name": set_table_name_from_type(extraction_type),
        "FileName": args_array['file_name'],
        "extraction_type": extraction_type
    }

    # Short aliases for the sanitizer helpers used below
    clean = USPTOSanitizer.clean_PAIR_csv_item
    strip_zeros = USPTOSanitizer.strip_leading_zeros

    if extraction_type == "correspondence":
        # Correspondence records: applicant address book data
        processed_array['ApplicationID'] = strip_zeros(clean(line[0]))
        processed_array['Name1'] = clean(line[1])
        processed_array['Name2'] = clean(line[2])
        # Two address fields are merged into a single space-separated column
        processed_array['Address'] = clean(line[3]) + " " + clean(line[4])
        processed_array['City'] = clean(line[5])
        processed_array['PostalCode'] = clean(line[6])
        processed_array['RegionCode'] = clean(line[7])
        processed_array['RegionName'] = clean(line[8])
        processed_array['CountryCode'] = clean(line[9])
        processed_array['CountryName'] = clean(line[10])
        processed_array['CustomerNum'] = clean(line[11])

    elif extraction_type in ("continuityparent", "continuitychild"):
        # Continuity records share a layout; only the related-application
        # column name differs between parent and child files.
        related_key = ('ParentApplicationID'
                       if extraction_type == "continuityparent"
                       else 'ChildApplicationID')
        processed_array['ApplicationID'] = strip_zeros(clean(line[0]))
        processed_array[related_key] = strip_zeros(clean(line[1]))
        processed_array['FileDate'] = clean(line[2])
        processed_array['ContinuationType'] = clean(line[3])

    # Return the array for storage
    return processed_array
Example #4
0
def extract_XML4_grant(raw_data, args_array):

    # Stat process timer
    start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []
    processed_foreignpriority = []

    # Pass the raw_data data into Element Tree
    document_root = ET.fromstring(raw_data)

    # Start the extraction of XML data
    r = document_root.find('us-bibliographic-data-grant')
    if r is not None:
        # Find the main patent grant data
        for pr in r.findall('publication-reference'):
            for di in pr.findall('document-id'):
                try: pub_country = di.findtext('country').strip()
                except: pub_country = None
                try:
                    document_id = di.findtext('doc-number').strip()
                    document_id = USPTOSanitizer.fix_patent_number(document_id)[:20]
                except:
                    document_id = None
                    logger.error("No Patent Number was found for: " + url_link)
                try: kind = di.findtext('kind').strip()[:2]
                except: kind = None
                try: pub_date = USPTOSanitizer.return_formatted_date(di.findtext('date'), args_array, document_id)
                except: pub_date = None

        # Find the main application data
        for ar in r.findall('application-reference'):
            try: app_type = ar.attrib['appl-type'][:45].strip()
            except: app_type = None
            for di in ar.findall('document-id'):
                try: app_country = di.findtext('country').strip()
                except: app_country = None
                try: app_no = di.findtext('doc-number')[:20].strip()
                except: app_no = None
                try: app_date = USPTOSanitizer.return_formatted_date(di.findtext('date'), args_array, document_id)
                except: app_date = None

        # Get the series code
        try: series_code = r.findtext('us-application-series-code')[:2].strip()
        except: series_code = None
        # Get the length of grant
        try: grant_length = r.find("us-term-of-grant").findtext("length-of-grant").strip()
        except: grant_length = None

        # Find all international classifications
        ic = r.find('classifications-ipcr')
        position = 1
        if ic is not None:
            for icc in ic.findall('classification-ipcr'):
                for x in icc.getchildren():
                    if(USPTOSanitizer.check_tag_exists(x,'section')):
                        try: i_class_sec = x.text.strip()[:15]
                        except: i_class_sec = None
                    if(USPTOSanitizer.check_tag_exists(x,'class')):
                        try: i_class_cls = x.text.strip()[:15]
                        except:  i_class_cls = None
                    if(USPTOSanitizer.check_tag_exists(x,'subclass')):
                        try: i_class_sub = x.text.strip()[:15]
                        except: i_class_sub = None
                    if(USPTOSanitizer.check_tag_exists(x,'main-group')):
                        try: i_class_mgr = x.text.strip()[:15]
                        except: i_class_mgr = None
                    if(USPTOSanitizer.check_tag_exists(x,'subgroup')):
                        try: i_class_sgr = x.text.strip()[:15]
                        except: i_class_sgr = None

                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name" : "uspto.INTCLASS_G",
                    "GrantID" : document_id,
                    "Position" : position,
                    "Section" : i_class_sec,
                    "Class" : i_class_cls,
                    "SubClass" : i_class_sub,
                    "MainGroup" : i_class_mgr,
                    "SubGroup" : i_class_sgr,
                    "FileName" : args_array['file_name']
                })
                #print(processed_intclass)
                position += 1

        # Init positions for CPC and US classifications
        cpc_position = 1
        nc_position = 1
        # TODO: So much more fields available in the XML4 grant cpc-classifications
        # Find CPC Classifications in main root
        # This section is not required since the 'classification-cpc-text' provides same data
        #
        """
        cpcs = r.find('classifications-cpc')
        if cpcs is not None:
            cpc_main = cpcs.find('main-cpc').find('classification-cpc')
            if cpc_main is not None:
                cpc_section = None
                cpc_class = None
                cpc_subclass = None
                cpc_class_mgr = None
                cpc_class_sgr = None
                try: cpc_section = cpc_item.findtext('section')
                except: cpc_section = None
                try: cpc_class = cpc_item.findtext('class')
                except: cpc_class = None
                try: cpc_subclass = cpc_item.findtext('subclass')
                except: cpc_subclass = None
                try: cpc_class_mgr = cpc_item.findtext('main-group')
                except: cpc_class_mgr = None
                try: cpc_class_sgr = cpc_item.findtext('subgroup')
                except: cpc_class_sgr = None

                # Append SQL data into dictionary to be written later
                processed_cpcclass.append({
                    "table_name" : "uspto.CPCCLASS_G",
                    "GrantID" : document_id,
                    "Position" : cpc_position,
                    "Section" : cpc_section,
                    "Class" : cpc_class,
                    "SubClass" : cpc_subclass,
                    "MainGroup" : cpc_class_mgr,
                    "SubGroup" : cpc_class_sgr,
                    "FileName" : args_array['file_name']
                })
                #print(processed_cpcclass)
                cpc_position += 1

            # Collect further CPC classifications
            cpcf = cpcs.find('further-cpc')
            if cpcf is not None:
                for cpc_item in cpcf.findall('classification-cpc'):
                    cpc_section = None
                    cpc_class = None
                    cpc_subclass = None
                    cpc_class_mgr = None
                    cpc_class_sgr = None
                    try: cpc_section = cpc_item.findtext('section')
                    except: cpc_section = None
                    try: cpc_class = cpc_item.findtext('class')
                    except: cpc_class = None
                    try: cpc_subclass = cpc_item.findtext('subclass')
                    except: cpc_subclass = None
                    try: cpc_class_mgr = cpc_item.findtext('main-group')
                    except: cpc_class_mgr = None
                    try: cpc_class_sgr = cpc_item.findtext('subgroup')
                    except: cpc_class_sgr = None

                    # Append SQL data into dictionary to be written later
                    processed_cpcclass.append({
                        "table_name" : "uspto.CPCCLASS_G",
                        "GrantID" : document_id,
                        "Position" : cpc_position,
                        "Section" : cpc_section,
                        "Class" : cpc_class,
                        "SubClass" : cpc_subclass,
                        "MainGroup" : cpc_class_mgr,
                        "SubGroup" : cpc_class_sgr,
                        "FileName" : args_array['file_name']
                    })
                    #print(processed_cpcclass)
                    cpc_position += 1
        """

        # Find all US classifications if they are embedded in a "field-of-search" tag (XML4 2005 files)
        foc = r.find('field-of-search')
        if foc is not None:
            nc_position = 1
            # Create list of all items
            ncs = foc.findall('classification-national')
            for nc in ncs:
                # Find the main classification tag
                ncm = nc.find('main-classification')
                if ncm is not None:
                    #print(ncm.text)
                    n_class_main = None
                    n_subclass = None
                    n_malformed = None
                    try:
                        n_class_main, n_subclass = USPTOSanitizer.return_class_XML4_grant(ncm.text)
                    except Exception as e:
                        traceback.print_exc()
                        n_class_main = None
                        n_subclass = None
                        n_malformed = 1

                    # Some are labelled as "None"
                    if n_class_main != None or n_subclass != None:
                        # Append SQL data into dictionary to be written later
                        processed_usclass.append({
                            "table_name" : "uspto.USCLASS_G",
                            "GrantID" : document_id,
                            "Position" : nc_position,
                            "Class" : n_class_main,
                            "SubClass" : n_subclass,
                            "Malformed" : n_malformed,
                            "FileName" : args_array['file_name']
                        })
                        #print(processed_usclass)
                        nc_position += 1

        # Find all CPC classifications
        foc = r.find('us-field-of-classification-search')
        if foc is not None:
            for cpc in foc.findall('classification-cpc-text'):
                cpc_section = None
                cpc_class = None
                cpc_subclass = None
                cpc_class_mgr = None
                cpc_class_sgr = None
                try:
                    #print(cpc.text)
                    cpc_text = cpc.text
                    cpc_class_string, cpc_group_string = cpc_text.split(" ")
                    #print(cpc_class_string + " " + cpc_group_string)
                    cpc_section = cpc_text.strip()[0]
                    cpc_class = cpc_class_string.strip()[1:3]
                    cpc_subclass = cpc_class_string.strip()[3]
                    cpc_class_mgr, cpc_class_sgr = cpc_group_string.rsplit("/", 1)
                    cpc_class_mgr = cpc_class_mgr.strip()[:15]
                    cpc_class_sgr = cpc_class_sgr.strip()[:15]
                    #print(cpc_class_sec + " " + cpc_class + " " + cpc_subclass + " " + cpc_class_mgr + " " + cpc_class_sgr)
                except:
                    cpc_section = None
                    cpc_class = None
                    cpc_subclass = None
                    cpc_class_mgr = None
                    cpc_class_sgr = None
                    logger.warning("There was an error parsing the cpc class for Grant ID: " + document_id + " in file: " + url_link)
                    logger.warning("Traceback: " + traceback.format_exc())

                # Append SQL data into dictionary to be written later
                processed_cpcclass.append({
                    "table_name" : "uspto.CPCCLASS_G",
                    "GrantID" : document_id,
                    "Position" : cpc_position,
                    "Section" : cpc_section,
                    "Class" : cpc_class,
                    "SubClass" : cpc_subclass,
                    "MainGroup" : cpc_class_mgr,
                    "SubGroup" : cpc_class_sgr,
                    "FileName" : args_array['file_name']
                })
                #print(processed_cpcclass)
                cpc_position += 1

            # Find all US classifications
            nc_position = 1
            ncs = foc.findall('classification-national')
            for nc in ncs:
                ncm = nc.find('main-classification')
                if ncm is not None:
                    #print(ncm.text)
                    n_class_main = None
                    n_subclass = None
                    n_malformed = None
                    try:
                        n_class_main, n_subclass = USPTOSanitizer.return_class_XML4_grant(ncm.text)
                    except Exception as e:
                        traceback.print_exc()
                        exit()
                        n_class_main = None
                        n_subclass = None
                        n_malformed = 1

                    # Some are labelled as "None"
                    if n_class_main != None or n_subclass != None:
                        # Append SQL data into dictionary to be written later
                        processed_usclass.append({
                            "table_name" : "uspto.USCLASS_G",
                            "GrantID" : document_id,
                            "Position" : nc_position,
                            "Class" : n_class_main,
                            "SubClass" : n_subclass,
                            "Malformed" : n_malformed,
                            "FileName" : args_array['file_name']
                        })
                        #print(processed_usclass)
                        nc_position += 1

                # Collect further US classes
                ncf = nc.find('further-classification')
                if ncf is not None:
                    #print("Further " + ncf.text)
                    n_class_main = None
                    n_subclass = None
                    n_malformed = None
                    try: n_class_main, n_subclass = USPTOSanitizer.return_class_XML4_grant(ncf.text)
                    except Exception as e:
                        traceback.print_exc()
                        exit()
                        n_class_main = None
                        n_subclass = None
                        n_malformed = 1

                    # Some are labelled as "None"
                    if n_class_main != None or n_subclass != None:
                        # Append SQL data into dictionary to be written later
                        processed_usclass.append({
                            "table_name" : "uspto.USCLASS_G",
                            "GrantID" : document_id,
                            "Position" : position,
                            "Class" : n_class_main,
                            "SubClass" : n_subclass,
                            "Malformed" : n_malformed,
                            "FileName" : args_array['file_name']
                        })
                        #print(processed_usclass)
                        position += 1

        # Find the title of the patent
        try: title = USPTOSanitizer.strip_for_csv(r.findtext('invention-title')[:500])
        except: title = None

        # Find all references cited in the grant
        # Check if the XML format is using 'us-references-cited' or 'references-cited'
        if r.find('us-references-cited') != None: ref_cited_id_string = "us-references-cited"
        elif r.find('references-cited') != None: ref_cited_id_string = "references-cited"
        else: ref_cited_id_string = "references"
        rf = r.find(ref_cited_id_string)
        if rf != None:
            # Check if the XML format is using 'citation' or 'us-citation'
            if rf.find('citation') != None: citation_id_string = "citation"
            elif rf.find('us-citation') != None: citation_id_string = "us-citation"
            else: citation_id_string = "us-citation"
            uspatcit_position = 1
            forpatcit_position = 1
            nptc_position = 1
            all_rfc = rf.findall(citation_id_string)
            for rfc in all_rfc:
                # If the patent citation child is found must be a patent citation
                if rfc.find('patcit') != None:
                    x = rfc.find('patcit')
                    try: citation_country = x.find('document-id').findtext('country').strip()[:5]
                    except: citation_country = None
                    try: citation_grant_id = x.find('document-id').findtext('doc-number').strip()[:20]
                    except: citation_grant_id = None
                    try: citation_kind = x.find('document-id').findtext('kind').strip()[:10]
                    except: citation_kind = None
                    try: citation_name = x.find('document-id').findtext('name').strip()[:100]
                    except: citation_name = None
                    try: citation_date = USPTOSanitizer.return_formatted_date(x.find('document-id').findtext('date'), args_array, document_id)
                    except: citation_date = None
                    try: citation_category = rfc.findtext('category').strip().upper()[:20]
                    except Exception as e: citation_category = None
                    # US patent citations
                    if(citation_country.strip().upper() == 'US'):

                        # Append SQL data into dictionary to be written later
                        processed_gracit.append({
                            "table_name" : "uspto.GRACIT_G",
                            "GrantID" : document_id,
                            "Position" : uspatcit_position,
                            "CitedID" : citation_grant_id,
                            "Kind" : citation_kind,
                            "Name" : citation_name,
                            "Date" : citation_date,
                            "Country" : citation_country,
                            "Category" : citation_category,
                            "FileName" : args_array['file_name']
                        })
                        #print(processed_usclass)
                        uspatcit_position += 1

                    elif citation_country.strip().upper() != 'US':

                        # Append SQL data into dictionary to be written later
                        processed_forpatcit.append({
                            "table_name" : "uspto.FORPATCIT_G",
                            "GrantID" : document_id,
                            "Position" : forpatcit_position,
                            "CitedID" : citation_grant_id,
                            "Kind" : citation_kind,
                            "Name" : citation_name,
                            "Date" : citation_date,
                            "Country" : citation_country,
                            "Category" : citation_category,
                            "FileName" : args_array['file_name']
                        })
                        forpatcit_position += 1
                        #print(processed_forpatcit)

                # If the non-patent citations are found
                elif rfc.find('nplcit') != None:
                    x = rfc.find('nplcit')
                    # Sometimes, there will be '<i> or <sup>, etc.' in the reference string; we need to remove it
                    try:
                        npatcit_text = USPTOSanitizer.strip_for_csv(x.findtext('othercit'))
                        #npatcit_text.replace("<", "").replace(">","")
                    except: npatcit_text = None
                    try: citation_category = rfc.findtext('category').strip().upper()[:20]
                    except: citation_category = None

                    # Append SQL data into dictionary to be written later
                    processed_nonpatcit.append({
                        "table_name" : "uspto.NONPATCIT_G",
                        "GrantID" : document_id,
                        "Position" : nptc_position,
                        "Citation" : npatcit_text,
                        "Category" : citation_category,
                        "FileName" : args_array['file_name']
                    })
                    #print(processed_nonpatcit)
                    nptc_position += 1

        # Find number of claims
        try: claims_num = r.findtext('number-of-claims').strip()
        except: claims_num = None

        # Find the number of figures and number of drawings
        nof = r.find('figures')
        try:
            number_of_drawings = nof.findtext('number-of-drawing-sheets').strip()
            number_of_drawings = number_of_drawings.split("/")[0].strip()
        except: number_of_drawings = None
        try: number_of_figures = nof.findtext('number-of-figures').strip()
        except: number_of_figures = None

        # Find the parties
        # Check if XML format uses 'us-parties' or 'parties'
        if r.find('us-parties') != None: parties_id_string = "us-parties"
        elif r.find('parties') != None: parties_id_string = "parties"
        else: parties_id_string = "parties"
        # Get the main parties XML tag
        prt = r.find(parties_id_string)
        if prt != None:
            appl_position = 1
            invt_position = 1
            # Find all applicant data
            # Check if the XML format uses 'applicants' or 'us-applicants'
            if prt.find('us-applicants') != None : applicants_id_string = 'us-applicants'
            elif prt.find('applicants') != None : applicants_id_string = 'applicants'
            else: applicants_id_string = 'applicants'
            # Grab the layered applicants tag
            apts = prt.find(applicants_id_string)
            if apts != None:
                # Check if the XML format uses 'applicant' or 'us-applicant'
                if apts.find('us-applicant') != None : applicant_id_string = 'us-applicant'
                elif apts.find('applicant') != None : applicant_id_string = 'applicant'
                else: applicant_id_string = 'applicant'
                for apt in apts.findall(applicant_id_string):
                    # Get the inventor status of the applicant
                    try: inventor_status = apt.attrib['app-type']
                    except: inventor_status = None
                    if(apt.find('addressbook') != None):
                        try: applicant_orgname = apt.find('addressbook').findtext('orgname')[:300].strip()
                        except: applicant_orgname = None
                        try: applicant_first_name = apt.find('addressbook').findtext('first-name')[:100].strip()
                        except: applicant_first_name = None
                        try: applicant_last_name = apt.find('addressbook').findtext('last-name')[:100].strip()
                        except: applicant_last_name = None
                        try: applicant_city = apt.find('addressbook').find('address').findtext('city')[:100].strip()
                        except: applicant_city = None
                        try: applicant_state = apt.find('addressbook').find('address').findtext('state')[:25].strip()
                        except: applicant_state = None
                        try: applicant_country = apt.find('addressbook').find('address').findtext('country')[:5].strip()
                        except: applicant_country = None
                        try: inventor_residence = apt.findtext('residence')[:100].strip()
                        except: inventor_residence = None

                        # Append SQL data into dictionary to be written later
                        processed_applicant.append({
                            "table_name" : "uspto.APPLICANT_G",
                            "GrantID" : document_id,
                            "OrgName" : applicant_orgname,
                            "Position" : appl_position,
                            "FirstName" : applicant_first_name,
                            "LastName" : applicant_last_name,
                            "City" : applicant_city,
                            "State" : applicant_state,
                            "Country" : applicant_country,
                            "FileName" : args_array['file_name']
                        })
                        #print(processed_applicant)
                        appl_position += 1

                        # Check if the applicant is inventor
                        if "inventor" in inventor_status:
                            # Append SQL data into dictionary to be written later
                            processed_inventor.append({
                                "table_name" : "uspto.INVENTOR_G",
                                "GrantID" : document_id,
                                "Position" : invt_position,
                                "FirstName" : applicant_first_name,
                                "LastName" : applicant_last_name,
                                "City" : applicant_city,
                                "State" : applicant_state,
                                "Country" : applicant_country,
                                "Residence" : inventor_residence,
                                "FileName" : args_array['file_name']
                            })
                            #print(processed_inventor)
                            invt_position += 1

            # Find all inventor data
            for invts in prt.findall('inventors'):
                for inv in invts.findall('inventor'):
                    try: inventor_sequence = USPTOSanitizer.strip_leading_zeros(inv.attrib['sequence'])
                    except: inventor_sequence = position
                    if inv.find('addressbook') != None:
                        try: inventor_first_name = inv.find('addressbook').findtext('first-name')[:100].strip()
                        except: inventor_first_name = None
                        try: inventor_last_name = inv.find('addressbook').findtext('last-name')[:100].strip()
                        except: inventor_last_name = None
                        try: inventor_city = inv.find('addressbook').find('address').findtext('city')[:100].strip()
                        except: inventor_city = None
                        try: inventor_state = inv.find('addressbook').find('address').findtext('state')[:100].strip()
                        except: inventor_state = None
                        try:
                            inventor_country = inv.find('addressbook').find('address').findtext('country')[:5].strip()
                        except: inventor_country = None
                        try: inventor_residence = inv.find('addressbook').find('address').findtext('country')[:5].strip()
                        except: inventor_residence = None

                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name" : "uspto.INVENTOR_G",
                            "GrantID" : document_id,
                            "Position" : invt_position,
                            "FirstName" : inventor_first_name,
                            "LastName" : inventor_last_name,
                            "City" : inventor_city,
                            "State" : inventor_state,
                            "Country" : inventor_country,
                            "Residence" : inventor_residence,
                            "FileName" : args_array['file_name']
                        })
                        #print(processed_inventor)
                        invt_position += 1

            # Find all agent data
            for agns in prt.findall('agents'):
                position = 1
                for agn in agns.findall('agent'):
                    try: agent_sequence = USPTOSanitizer.strip_leading_zeros(agn.attrib['sequence'])
                    except: agent_sequence = position
                    if(agn.find('addressbook') != None):
                        try: agent_orgname = agn.find('addressbook').findtext('orgname')[:300].strip()
                        except: agent_orgname = None
                        try: agent_last_name = agn.find('addressbook').findtext('last-name')[:100].strip()
                        except: agent_last_name = None
                        try: agent_first_name = agn.find('addressbook').findtext('first-name')[:100].strip()
                        except: agent_first_name = None
                        try: agent_country = agn.find('addressbook').find('address').findtext('country')[:3].strip()
                        except: agent_country = None

                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name" : "uspto.AGENT_G",
                            "GrantID" : document_id,
                            "Position" : agent_sequence,
                            "OrgName" : agent_orgname,
                            "LastName" : agent_last_name,
                            "FirstName" : agent_first_name,
                            "Country" : agent_country,
                            "FileName" : args_array['file_name']
                        })
                        #print(processed_agent)
                        position += 1

        # Find all assignee data
        for asn in r.findall('assignees'):
            position = 1
            for x in asn.findall('assignee'):
                if(x.find('addressbook') != None):
                    try: asn_orgname = x.find('addressbook').findtext('orgname')[:500].strip()
                    except: asn_orgname = None
                    try: asn_role = x.find('addressbook').findtext('role')[:45].strip()
                    except: asn_role = None
                    try: asn_city = x.find('addressbook').find('address').findtext('city')[:100].strip()
                    except: asn_city = None
                    try: asn_state = x.find('addressbook').find('address').findtext('state')[:100].strip()
                    except: asn_state = None
                    try: asn_country = x.find('addressbook').find('address').findtext('country')[:5].strip()
                    except: asn_country = None

                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name" : "uspto.ASSIGNEE_G",
                        "GrantID" : document_id,
                        "Position" : position,
                        "OrgName" : asn_orgname,
                        "Role" : asn_role,
                        "City" : asn_city,
                        "State" : asn_state,
                        "Country" : asn_country,
                        "FileName" : args_array['file_name']
                    })
                    #print(processed_assignee)
                    position += 1

        # Find all examiner data
        for exm in r.findall('examiners'):
            position = 1
            for x in exm.findall('primary-examiner'):
                try: exm_last_name = x.findtext('last-name')[:50].strip()
                except: exm_last_name = None
                try: exm_first_name = x.findtext('first-name')[:50].strip()
                except: exm_first_name = None
                try: exm_department = x.findtext('department')[:100].strip()
                except: exm_department = None

                # Append SQL data into dictionary to be written later
                processed_examiner.append({
                    "table_name" : "uspto.EXAMINER_G",
                    "GrantID" : document_id,
                    "Position" : position,
                    "LastName" : exm_last_name,
                    "FirstName" : exm_first_name,
                    "Department" : exm_department,
                    "FileName" : args_array['file_name']
                })
                #print(processed_examiner)
                position += 1

            for x in exm.findall('assistant-examiner'):
                try: exm_last_name = x.findtext('last-name')[:50].strip()
                except: exm_last_name = None
                try: exm_first_name = x.findtext('first-name')[:50].strip()
                except: exm_first_name = None
                try: exm_department = x.findtext('department')[:100].strip()
                except: exm_department = None

                # Append SQL data into dictionary to be written later
                processed_examiner.append({
                    "table_name" : "uspto.EXAMINER_G",
                    "GrantID" : document_id,
                    "Position" : position,
                    "LastName" : exm_last_name,
                    "FirstName" : exm_first_name,
                    "Department" : exm_department,
                    "FileName" : args_array['file_name']
                })
                #print(processed_examiner)
                position += 1

        # Find main priority claims tag
        pcs = r.find('priority-claims')
        position = 1
        if pcs is not None:
            # Find all priority claims in main tag
            for pc in pcs.findall('priority-claim'):
                # Assign data to vars
                try:  pc_country = pc.findtext('country')[:5].strip()
                except: pc_country = None
                try: pc_kind = pc.attrib['kind'][:45].strip()
                except: pc_kind = None
                try: pc_doc_num = pc.findtext('doc-number')[:45].strip()
                except: pc_doc_num = None
                try: pc_date = USPTOSanitizer.return_formatted_date(pc.findtext('date'), args_array, document_id)
                except: pc_date = None

                # Append SQL data into dictionary to be written later
                processed_foreignpriority.append({
                    "table_name" : "uspto.FOREIGNPRIORITY_G",
                    "GrantID" : document_id,
                    "Position" : position,
                    "Kind" : pc_kind,
                    "Country" : pc_country,
                    "DocumentID" : pc_doc_num,
                    "PriorityDate" : pc_date,
                    "FileName" : args_array['file_name']
                })
                #print(processed_foreignpriority)
                position += 1

    # Find the abstract
    try:
        a_elem = document_root.find('abstract')
        if a_elem is not None:
            abstract = USPTOSanitizer.strip_for_csv(USPTOSanitizer.return_element_text(a_elem))
        else: abstract = None
    except Exception as e:
        abstract = None
        #traceback.print_exc()
        #logger.error("Exception while extracting abstract from " + str(document_id) + ": " + traceback.print_exc())
    #print(abstract)

    # Find the description
    try:
        d_elem = document_root.find('description')
        if d_elem is not None:
            description = USPTOSanitizer.strip_for_csv(' '.join(d_elem.itertext()))
        else: description = None
    except Exception as e:
        description = None
        #traceback.print_exc()
        #logger.error("Exception while extracting description from " + str(document_id) + ": " + traceback.print_exc())
    #print(description)

    # Find the claims
    try:
        c_elem = document_root.find('claims')
        if c_elem is not None:
            claims = USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
        else: claims = None
    except Exception as e:
        claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting claim from " + str(document_id) + ": " + traceback.print_exc())
    #print(claims)

    # Append SQL data into dictionary to be written later
    try:
        processed_grant.append({
            "table_name" : "uspto.GRANT",
            "GrantID" : document_id,
            "Title" : title,
            "IssueDate" : pub_date,
            "Kind" : kind,
            "USSeriesCode" : series_code,
            "Abstract" : abstract,
            "ClaimsNum" : claims_num,
            "DrawingsNum" : number_of_drawings,
            "FiguresNum" : number_of_figures,
            "ApplicationID" : app_no,
            "Description" : description,
            "Claims" : claims,
            "FileDate" : app_date,
            "AppType" : app_type,
            "GrantLength" : grant_length,
            "FileName" : args_array['file_name']
        })
    except Exception as e:
        traceback.print_exc()
        logger.warning("Could not append patent data to array for patent number: " + document_id + " Traceback: " + traceback.format_exc())

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant" : processed_grant,
        "processed_applicant" : processed_applicant,
        "processed_examiner" : processed_examiner,
        "processed_assignee" : processed_assignee,
        "processed_agent" : processed_agent,
        "processed_inventor" : processed_inventor,
        "processed_usclass" : processed_usclass,
        "processed_intclass" : processed_intclass,
        "processed_cpcclass" : processed_cpcclass,
        "processed_gracit" : processed_gracit,
        "processed_forpatcit" : processed_forpatcit,
        "processed_nonpatcit" : processed_nonpatcit,
        "processed_foreignpriority" : processed_foreignpriority
    }
# ----- Example #5 -----
def extract_XML4_application_tag_counts(args_array):
    """Count database-record tags in a USPTO XML4 application bulk file.

    Extracts the XML contents from the ZIP file described by `args_array`
    and scans it line by line, counting the opening tags associated with
    each destination database table.

    Args:
        args_array (dict): runtime arguments; must include 'file_name',
            'stdout_level', and the keys required by
            USPTOProcessZipFile.extract_xml_file_from_zip.

    Returns:
        dict: per-table tag counts (plus the source 'file_name'), or
        False if the XML contents could not be extracted from the ZIP.
    """

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If the contents could not be extracted, return immediately.
    # (Identity checks replace the non-idiomatic `== None` / `== False`.)
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # Map each destination table to the opening tag(s) that mark one record
    tags_dict = {
        "APPLICATION" : ["<us-patent-application"],
        "INTCLASS_A" : ["<classification-ipcr"],
        "USCLASS_A" : ["<main-classification", "<further-classification"],
        "CPCCLASS_A" : ["<classification-cpc"],
        "FOREIGNPRIORITY_A" : ["<priority-claim>", "<priority-claim "],
        "AGENT_A" : ["<agent>", "<agent "],
        "ASSIGNEE_A" : ["<assignee>", "<assignee "],
        "INVENTOR_A" : ["<inventor>", "<inventor "],
        "APPLICANT_A" : ["<us-applicant>", "<applicant>", "<us-applicant ", "<applicant "]
    }

    # Declare a dictionary to hold counts by table
    counts_dict = {
        "file_name" : args_array['file_name'],
        "APPLICATION" : 0,
        "INTCLASS_A" : 0,
        "USCLASS_A" : 0,
        "CPCCLASS_A" : 0,
        "FOREIGNPRIORITY_A" : 0,
        "AGENT_A" : 0,
        "ASSIGNEE_A" : 0,
        "INVENTOR_A" : 0,
        "APPLICANT_A" : 0
    }

    # Print to stdout and log
    print("-- Starting the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # Loop through the file contents line by line
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # Increment a table's count when one of its tags is on the line.
        # any() short-circuits on the first matching tag, so each line is
        # counted at most once per table (the original kept scanning the
        # remaining tags even after a match was found).
        for table, tags in tags_dict.items():
            if isinstance(tags, list) and any(tag in line for tag in tags):
                counts_dict[table] += 1

    # Print to stdout and log
    print("-- Finished the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    # Pretty-print the counts at the most verbose stdout level
    if args_array['stdout_level'] == 1: pprint(counts_dict)

    # Return the dictionary of counts for found tags
    return counts_dict
# ----- Example #6 -----
def extract_XML1_application_tag_counts(args_array):
    """Count database-record tags in a USPTO XML1 application bulk file.

    First scans the extracted XML contents line by line, counting the
    opening tags associated with each destination table.  XML1 files do
    not expose assignee/inventor records as uniquely countable tags, so a
    second pass reassembles each `<patent-application-publication>`
    segment, parses it with ElementTree, and counts those records from
    the parsed tree.

    Args:
        args_array (dict): runtime arguments; must include 'file_name',
            'stdout_level', and the keys required by
            USPTOProcessZipFile.extract_xml_file_from_zip.

    Returns:
        dict: per-table tag counts (plus the source 'file_name'), or
        False if the XML contents could not be extracted from the ZIP.
    """

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If the contents could not be extracted, return immediately.
    # (Identity checks replace the non-idiomatic `== None` / `== False`.)
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # Declare a dictionary to use in counting tags
    # CPCCLASS_A and APPLICANT_A are not included in XML1 applications
    tags_dict = {
        "APPLICATION" : ["<patent-application-publication"],
        "INTCLASS_A" : ["<classification-ipc-primary>", "<classification-ipc-secondary>"],
        "USCLASS_A" : ["<classification-us-primary>", "<classification-us-secondary>"],
        "FOREIGNPRIORITY_A" : ["<priority-application-number"],
        "AGENT_A" : ["<correspondence-address>"],
        "INVENTOR_A" : ["<first-named-inventor", "<inventor>"],
    }

    # Declare a dictionary to hold counts by table
    counts_dict = {
        "file_name" : args_array['file_name'],
        "APPLICATION" : 0,
        "INTCLASS_A" : 0,
        "USCLASS_A" : 0,
        "CPCCLASS_A" : 0,
        "FOREIGNPRIORITY_A" : 0,
        "AGENT_A" : 0,
        "ASSIGNEE_A" : 0,
        "INVENTOR_A" : 0,
        "APPLICANT_A" : 0
    }

    # Print to stdout and log
    print("-- Starting the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # Loop through the file contents line by line
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # Increment a table's count when one of its tags is on the line.
        # any() short-circuits on the first matching tag, so each line is
        # counted at most once per table (the original kept scanning the
        # remaining tags even after a match was found).
        for table, tags in tags_dict.items():
            if isinstance(tags, list) and any(tag in line for tag in tags):
                counts_dict[table] += 1

    # Parse the tags that need to be XML parsed
    # NOTE(review): this second loop assumes xml_file_contents is a
    # re-iterable sequence of lines (e.g. a list); a one-shot generator
    # would already be exhausted by the counting loop above — confirm
    # against USPTOProcessZipFile.extract_xml_file_from_zip.
    xml_string = ''
    patent_xml_started = False
    # Loop through all lines in the xml file
    for line in xml_file_contents:

        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)

        # This identifies the start of well formed XML segment for patent
        # grant bibliographic information.  The root tag is re-emitted
        # without attributes so the collected segment is parseable alone.
        if "<patent-application-publication" in line:
            patent_xml_started = True
            xml_string += "<patent-application-publication>"

        # This identifies end of well-formed XML segement for single patent
        # grant bibliographic information
        elif "</patent-application-publication" in line:
            patent_xml_started = False
            xml_string += "</patent-application-publication>"
            # Pass the raw_data data into Element Tree
            document_root = ET.fromstring(xml_string)
            # Extract the bibliographic root tag
            r = document_root.find('subdoc-bibliographic-information')
            # Count the number of assignee tags
            counts_dict['ASSIGNEE_A'] += len(r.findall('assignee'))
            # Count the number of inventor tags
            # NOTE(review): 'INVENTOR_A' is also incremented by the textual
            # "<inventor>" tag match in the first pass — verify this does
            # not double count before relying on the total.
            counts_dict['INVENTOR_A'] += len(r.findall('inventor'))
            # Reset the xml string for the next patent segment
            xml_string = ''

        # This is used to append lines of file when inside single patent grant
        elif patent_xml_started == True:
            # Check which type of encoding should be used to fix the line string
            xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Print to stdout and log (typo fixed: "appication" -> "application")
    print("-- Finished the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    # Pretty-print the counts at the most verbose stdout level
    if args_array['stdout_level'] == 1: pprint(counts_dict)

    # Return the dictionary of counts for found tags
    return counts_dict
# ----- Example #7 -----
def extract_XML4_grant_tag_counts(args_array):
    """Count database-record tags in a USPTO XML4 patent grant bulk file.

    First scans the extracted XML contents line by line, counting the
    opening tags associated with each destination table.  Tags that
    cannot be counted textually (CPC/US classes inside field-of-search,
    and US vs. foreign patent citations) are then counted by reassembling
    each `<us-patent-grant>` segment and parsing it with ElementTree.

    Args:
        args_array (dict): runtime arguments; must include 'file_name',
            'stdout_level', and the keys required by
            USPTOProcessZipFile.extract_xml_file_from_zip.

    Returns:
        dict: per-table tag counts (plus the source 'file_name'), or
        False if the XML contents could not be extracted from the ZIP.
    """

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If the contents could not be extracted, return immediately.
    # (Identity checks replace the non-idiomatic `== None` / `== False`.)
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # Map each destination table to the opening tag(s) that mark one record
    tags_dict = {
        "GRANT" : ["<us-patent-grant"],
        "INTCLASS_G" : ["<classification-ipcr"],
        "AGENT_G" : ["<agent>", "<agent "],
        "ASSIGNEE_G" : ["<assignee>", "<assignee "],
        "APPLICANT_G" : ["<us-applicant>", "<us-applicant ", "<applicant", "<applicant>"],
        "INVENTOR_G" : ["<inventor>", "<inventor ", "applicant-inventor"],
        "NONPATCIT_G" : ["<nplcit"],
        "EXAMINER_G" : ["<primary-examiner", "<assistant-examiner"],
        "FOREIGNPRIORITY_G" : ["<priority-claim>", "<priority-claim "]
    }

    # Declare a dictionary to hold counts by table
    counts_dict = {
        "file_name" : args_array['file_name'],
        "GRANT" : 0,
        "INTCLASS_G" : 0,
        "CPCCLASS_G" : 0,
        "USCLASS_G" : 0,
        "INVENTOR_G" : 0,
        "AGENT_G" : 0,
        "ASSIGNEE_G" : 0,
        "APPLICANT_G" : 0,
        "NONPATCIT_G" : 0,
        "EXAMINER_G" : 0,
        "GRACIT_G" : 0,
        "FORPATCIT_G" : 0,
        "FOREIGNPRIORITY_G" : 0
    }

    # Print to stdout and log
    print("-- Starting the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # Loop through the file contents line by line
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # Increment a table's count when one of its tags is on the line.
        # any() short-circuits on the first matching tag, so each line is
        # counted at most once per table (the original kept scanning the
        # remaining tags even after a match was found).
        for table, tags in tags_dict.items():
            if isinstance(tags, list) and any(tag in line for tag in tags):
                counts_dict[table] += 1

    # Parse the tags that need to be XML parsed
    # NOTE(review): this second loop assumes xml_file_contents is a
    # re-iterable sequence of lines; a one-shot generator would already
    # be exhausted by the counting loop above — confirm against
    # USPTOProcessZipFile.extract_xml_file_from_zip.
    xml_string = ''
    patent_xml_started = False
    # Loop through all lines in the xml file
    for line in xml_file_contents:

        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)

        # This identifies the start of well formed XML segment for patent
        # grant bibliographic information.  The root tag is re-emitted
        # without attributes so the collected segment is parseable alone.
        if "<us-patent-grant" in line:
            patent_xml_started = True
            xml_string += "<us-patent-grant>"

        # This identifies end of well-formed XML segement for single patent
        # grant bibliographic information
        elif "</us-patent-grant" in line:
            patent_xml_started = False
            xml_string += "</us-patent-grant>"
            # Pass the raw_data data into Element Tree
            document_root = ET.fromstring(xml_string)
            # Extract the bibliographic root tag
            r = document_root.find('us-bibliographic-data-grant')
            # Get the patent CPC / US class counts from field-of-classification-search
            foc = r.find('us-field-of-classification-search')
            if foc is not None:
                counts_dict["CPCCLASS_G"] += len(foc.findall('classification-cpc-text'))
                counts_dict["USCLASS_G"] += len(foc.findall('classification-national'))
            # Get USCLASS_G count if file format uses field-of-search
            foc = r.find('field-of-search')
            if foc is not None:
                counts_dict["USCLASS_G"] += len(foc.findall('classification-national'))
            # Resolve the references-cited container name, which varies by
            # file-format revision
            if r.find('us-references-cited') is not None: ref_cited_id_string = "us-references-cited"
            elif r.find('references-cited') is not None: ref_cited_id_string = "references-cited"
            else: ref_cited_id_string = "references"
            rf = r.find(ref_cited_id_string)
            if rf is not None:
                # Check if the XML format is using 'citation' or 'us-citation'
                # (default to 'us-citation' when neither child is present)
                if rf.find('citation') is not None: citation_id_string = "citation"
                elif rf.find('us-citation') is not None: citation_id_string = "us-citation"
                else: citation_id_string = "us-citation"
                all_rfc = rf.findall(citation_id_string)
                for rfc in all_rfc:
                    # A 'patcit' child marks a patent citation (vs. non-patent literature)
                    if rfc.find('patcit') is not None:
                        x = rfc.find('patcit')
                        try: citation_country = x.find('document-id').findtext('country').strip()
                        except: citation_country = None
                        # US citations count as GRACIT_G; everything else
                        # (including unparseable country) as FORPATCIT_G
                        if citation_country == 'US': counts_dict["GRACIT_G"] += 1
                        else: counts_dict["FORPATCIT_G"] += 1
            # Reset the xml string for the next patent segment
            xml_string = ''

        # This is used to append lines of file when inside single patent grant
        elif patent_xml_started == True:
            xml_string += line

    # Print to stdout and log
    print("-- Finished the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Pretty-print the counts at the most verbose stdout level
    if args_array['stdout_level'] == 1: pprint(counts_dict)

    # Return the dictionary of counts for found tags
    return counts_dict
# ----- Example #8 -----
def extract_XML1_application(raw_data, args_array):

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define required arrays
    processed_application = []
    processed_foreignpriority = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []

    # Set process start time
    start_time = time.time()

    # Print start message to stdout
    #print '- Starting to extract xml in USPTO application format ' + uspto_xml_format + " Start time: " + time.strftime("%c")

    #print raw_data
    # Pass the xml into Element tree object
    document_root = ET.fromstring(raw_data)
    r = document_root.find('subdoc-bibliographic-information')

    # Get and fix the document_id data
    di = r.find('document-id')
    if di is not None:
        try:
            # This document ID is NOT application number
            document_id = di.findtext('doc-number')
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try:
            kind = di.findtext('kind-code')[:2]
        except:
            kind = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(
                di.findtext('document-date'), args_array, document_id)
        except:
            pub_date = None
        try:
            app_type = r.findtext('publication-filing-type')[:45]
        except:
            app_type = None

    # Get application filing data
    ar = r.find('domestic-filing-data')
    if ar is not None:
        try:
            app_no = ar.find('application-number').findtext('doc-number')[:20]
        except:
            app_no = None
        try:
            app_date = USPTOSanitizer.return_formatted_date(
                ar.findtext('filing-date'), args_array, document_id)
        except:
            app_date = None
        try:
            series_code = ar.findtext('application-number-series-code')[:2]
        except:
            series_code = None

    technical_information_element = r.find('technical-information')
    # Init position
    position = 1
    if technical_information_element is not None:
        # Get international classification data
        ic = technical_information_element.find('classification-ipc')
        if ic is not None:

            # Process the primary international class
            icm = ic.find('classification-ipc-primary')
            #TODO: regex the class found into class, subclass and other
            #TODO: find out what maingrou and subgroup are found in this file format
            try:
                i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = USPTOSanitizer.return_international_class(
                    icm.findtext('ipc'))
                i_class_sec = i_class_sec[:15]
                i_class = i_class[:15]
                i_subclass = i_subclass[:15]
                i_class_mgr = i_class_mgr[:15]
                i_class_sgr = i_class_sgr[:15]
            except:
                i_class_sec = None
                i_class = None
                i_subclass = None
                i_class_mgr = None
                i_class_sgr = None
                logger.warning(
                    "Malformed international class found in application ID: " +
                    document_id + " in file: " + url_link)

            # Append SQL data into dictionary to be written later
            processed_intclass.append({
                "table_name": "uspto.INTCLASS_A",
                "ApplicationID": app_no,
                "Position": position,
                "Section": i_class_sec,
                "Class": i_class,
                "SubClass": i_subclass,
                "MainGroup": i_class_mgr,
                "SubGroup": i_class_sgr,
                "FileName": args_array['file_name']
            })

            # Increment Position
            position += 1
            #print processed_intclass

            # Process any secondary international classes
            ics = ic.findall('classification-ipc-secondary')
            if ics is not None:
                for ics_item in ics:
                    try:
                        i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = USPTOSanitizer.return_international_class(
                            ics_item.findtext('ipc'))
                        i_class_sec = i_class_sec[:15]
                        i_class = i_class[:15]
                        i_subclass = i_subclass[:15]
                        i_class_mgr = i_class_mgr[:15]
                        i_class_sgr = i_class_sgr[:15]
                    except:
                        i_class_sec = None
                        i_class = None
                        i_subclass = None
                        i_class_mgr = None
                        i_class_sgr = None
                        logger.warning(
                            "Malformed international class found in application ID: "
                            + document_id + " in file: " + url_link)

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "Section":
                        i_class_sec,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "FileName":
                        args_array['file_name']
                    })

                    # Increment position
                    position += 1
                    #print(processed_intclass)

    # Get US classification data
    nc = technical_information_element.find('classification-us')
    # init position
    position = 1
    if nc is not None:

        uspc = nc.find('classification-us-primary').find('uspc')
        try:
            n_class_main = uspc.findtext('class')[:5]
        except:
            n_class_main = None
        try:
            n_subclass = uspc.findtext('subclass')[:15]
        except:
            n_subclass = None

        # Append SQL data into dictionary to be written later
        processed_usclass.append({
            "table_name": "uspto.USCLASS_A",
            "ApplicationID": app_no,
            "Position": position,
            "Class": n_class_main,
            "SubClass": n_subclass,
            "FileName": args_array['file_name']
        })

        # Increment position
        position += 1
        #print processed_usclass

        us_classification_secondary_element = nc.find(
            'classification-us-secondary')
        if us_classification_secondary_element is not None:
            uspc = us_classification_secondary_element.find('uspc')
            try:
                n_class_main = uspc.findtext('class')[:5]
            except:
                n_class_main = None
            try:
                n_subclass = uspc.findtext('subclass')[:5]
            except:
                n_subclass = None

            # Append SQL data into dictionary to be written later
            processed_usclass.append({
                "table_name": "uspto.USCLASS_A",
                "ApplicationID": app_no,
                "Position": position,
                "Class": n_class_main,
                "SubClass": n_subclass,
                "FileName": args_array['file_name']
            })

            # Increment position
            position += 1
            #print processed_usclass

    # Get priority claims
    position = 1
    pc_kind = None
    for pc in r.findall('foreign-priority-data'):
        try:
            pc_country = pc.findtext('country-code')[:100]
        except:
            pc_country = None
        try:
            pc_doc_num = pc.find('priority-application-number').findtext(
                'doc-number')[:100]
        except:
            pc_doc_num = None
        try:
            pc_date = USPTOSanitizer.return_formatted_date(
                pc.findtext('filing-date'), args_array, document_id)
        except:
            pc_date = None

        # Append SQL data into dictionary to be written later
        processed_foreignpriority.append({
            "table_name": "uspto.FOREIGNPRIORITY_A",
            "ApplicationID": app_no,
            "Position": position,
            "Kind": pc_kind,
            "Country": pc_country,
            "DocumentID": pc_doc_num,
            "PriorityDate": pc_date,
            "FileName": args_array['file_name']
        })
        position += 1
        #print(processed_foreignpriority)

    # Get invention title
    try:
        title = technical_information_element.findtext(
            'title-of-invention')[:500]
    except:
        title = None

    # Get inventor data
    iv = r.find('inventors')
    if iv is not None:

        # Init position
        position = 1

        for inventor in iv.findall('first-named-inventor'):
            n = inventor.find('name')
            try:
                inventor_first_name = n.findtext('given-name')[:100]
            except:
                inventor_first_name = None
            try:
                inventor_last_name = n.findtext('family-name')[:100]
            except:
                inventor_last_name = None

            res = inventor.find('residence')
            if res is not None:
                residence_us = res.find('residence-us')
                if residence_us is not None:
                    try:
                        inventor_city = residence_us.findtext('city')[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_us.findtext('state')[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_us.findtext(
                            'country-code')[:100]
                    except:
                        inventor_country = None
                residence_non_us = res.find('residence-non-us')
                if residence_non_us is not None:
                    try:
                        inventor_city = residence_non_us.findtext('city')[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_non_us.findtext(
                            'state')[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_non_us.findtext(
                            'country-code')[:100]
                    except:
                        inventor_country = None

            # Append SQL data into dictionary to be written later
            processed_inventor.append({
                "table_name": "uspto.INVENTOR_A",
                "ApplicationID": app_no,
                "Position": position,
                "FirstName": inventor_first_name,
                "LastName": inventor_last_name,
                "City": inventor_city,
                "State": inventor_state,
                "Country": inventor_country,
                "FileName": args_array['file_name']
            })

            # Increment position
            position += 1
            #print processed_inventor

        # For all secordary inventors
        for inventor in iv.findall('inventor'):
            if inventor is not None:
                n = inventor.find('name')
                if n is not None:
                    try:
                        inventor_first_name = n.findtext('given-name')[:100]
                    except:
                        inventor_first_name = None
                    try:
                        inventor_last_name = n.findtext('family-name')[:100]
                    except:
                        inventor_last_name = None

                res = inventor.find('residence')
                if res is not None:
                    residence_us = res.find('residence-us')
                    if residence_us is not None:
                        try:
                            inventor_city = residence_us.findtext('city')[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_us.findtext(
                                'state')[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_us.findtext(
                                'country-code')[:100]
                        except:
                            inventor_country = None
                    residence_non_us = res.find('residence-non-us')
                    if residence_non_us is not None:
                        try:
                            inventor_city = residence_non_us.findtext(
                                'city')[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_non_us.findtext(
                                'state')[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_non_us.findtext(
                                'country-code')[:100]
                        except:
                            inventor_country = None

                    # Append SQL data into dictionary to be written later
                    processed_inventor.append({
                        "table_name":
                        "uspto.INVENTOR_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "FirstName":
                        inventor_first_name,
                        "LastName":
                        inventor_last_name,
                        "City":
                        inventor_city,
                        "State":
                        inventor_state,
                        "Country":
                        inventor_country,
                        "FileName":
                        args_array['file_name']
                    })

                    # Increment position
                    position += 1
                    #print(processed_inventor)

    assignee_element = r.find('assignee')
    if assignee_element is not None:
        # init position
        position = 1
        try:
            asn_role = assignee_element.findtext('assignee-type')[:100]
        except:
            asn_role = None
        try:
            asn_orgname = assignee_element.findtext('organization-name')[:300]
        except:
            asn_orgname = None
        ad = assignee_element.find('address')
        try:
            asn_city = ad.findtext('city')[:100]
        except:
            asn_city = None
        try:
            asn_state = ad.findtext('state')[:100]
        except:
            asn_state = None
        try:
            asn_country = ad.find('country').findtext('country-code')[:100]
        except:
            asn_country = None

        # Append SQL data into dictionary to be written later
        processed_assignee.append({
            "table_name": "uspto.ASSIGNEE_A",
            "ApplicationID": app_no,
            "Position": position,
            "OrgName": asn_orgname,
            "Role": asn_role,
            "City": asn_city,
            "State": asn_state,
            "Country": asn_country,
            "FileName": args_array['file_name']
        })
        #print(processed_assignee)
        # increment position
        position += 1

    # Find the agent elements
    agent_element = r.find('correspondence-address')
    # init position
    position = 1
    if agent_element is not None:
        try:
            agent_orgname = agent_element.findtext('name-1')
        except:
            agent_orgname = None
        try:
            agent_orgname_2 = agent_element.findtext('name-2')
        except:
            agent_orgname_2 = None
        # Combine Orgname 1 and 2 and shorten if needed
        if agent_orgname != None and agent_orgname_2 != None:
            agent_orgname = agent_orgname + " " + agent_orgname_2
            agent_orgname = agent_orgname[:300]
        try:
            adresss_element = agent_element.find('address')
            if address_element is not None:
                try:
                    agent_city = adresss_element.findtext('city')[:100]
                except:
                    agent_city = None
                try:
                    agent_state = adresss_element.findtext('state')[:100]
                except:
                    agent_state = None
                try:
                    agent_country = adresss_element.find('country').findtext(
                        'country-code')[:100]
                except:
                    agent_country = None
        except:
            agent_city = None
            agent_state = None
            agent_country = None

        # Append SQL data into dictionary to be written later
        processed_agent.append({
            "table_name": "uspto.AGENT_A",
            "ApplicationID": app_no,
            "Position": position,
            "OrgName": agent_orgname,
            "Country": agent_country,
            "FileName": args_array['file_name']
        })

        # increment position
        position += 1

    # Find the abstract of the application
    try:
        abstract = USPTOSanitizer.return_element_text(
            document_root.find('subdoc-abstract')).strip()
    except:
        abstract = None

    # Append SQL data into dictionary to be written later
    processed_application.append({
        "table_name": "uspto.APPLICATION",
        "ApplicationID": app_no,
        "PublicationID": document_id,
        "AppType": app_type,
        "Title": title,
        "FileDate": app_date,
        "PublishDate": pub_date,
        "Kind": kind,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "FileName": args_array['file_name']
    })

    #print processed_application

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_application": processed_application,
        "processed_foreignpriority": processed_foreignpriority,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass
    }
# Example #9
def extract_XML4_application(raw_data, args_array):

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define required arrays
    processed_application = []
    processed_priority_claims = []
    processed_assignee = []
    processed_applicant = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []

    # Set process start time
    start_time = time.time()

    # Print start message to stdout
    #print '- Starting to extract xml in USPTO application format ' + uspto_xml_format + " Start time: " + time.strftime("%c")

    # Pass the raw data into Element tree xml object
    patent_root = ET.fromstring(raw_data)

    # Start extract XML data
    for r in patent_root.findall('us-bibliographic-data-application'):

        # Get basic document ID information
        pr = r.find('publication-reference')
        pub_doc = pr.find('document-id')
        try:
            pub_country = pub_doc.findtext('country')
        except:
            pub_country = None
        try:
            document_id = pub_doc.findtext('doc-number')
            document_id = USPTOSanitizer.fix_patent_number(document_id)
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try:
            kind = pub_doc.findtext('kind')[:2]
        except:
            kind = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(
                pub_doc.findtext('date'), args_array, document_id)
        except:
            pub_date = None

        # Get application reference data
        ar = r.find('application-reference')
        if ar is not None:
            try:
                app_type = ar.attrib['appl-type'][:45]
            except:
                app_type = None
            app_doc = ar.find('document-id')
            try:
                app_country = app_doc.findtext('country')
            except:
                app_country = None
            try:
                app_no = app_doc.findtext('doc-number')[:20]
            except:
                app_no = None
            try:
                app_date = USPTOSanitizer.return_formatted_date(
                    app_doc.findtext('date'), args_array, document_id)
            except:
                app_date = None
            # Get series code
            try:
                series_code = r.findtext('us-application-series-code')[:2]
            except:
                series_code = None

        # Get priority Claims
        pcs = r.find('priority-claims')
        if pcs is not None:
            for pc in pcs.findall('priority-claim'):
                try:
                    pc_sequence = USPTOSanitizer.strip_leading_zeros(
                        pc.attrib['sequence'])
                except:
                    pc_sequence = None
                try:
                    pc_kind = pc.attrib['kind'][:100]
                except:
                    pc_kind = None
                try:
                    pc_country = pc.findtext('country')[:100]
                except:
                    pc_country = None
                try:
                    pc_doc_num = pc.findtext('doc-number')[:100]
                except:
                    pc_doc_num = None
                try:
                    pc_date = USPTOSanitizer.return_formatted_date(
                        pc.findtext('date'), args_array, document_id)
                except:
                    pc_date = None

                # Append SQL data into dictionary to be written later
                processed_priority_claims.append({
                    "table_name":
                    "uspto.FOREIGNPRIORITY_A",
                    "ApplicationID":
                    app_no,
                    "Position":
                    pc_sequence,
                    "Kind":
                    pc_kind,
                    "Country":
                    pc_country,
                    "DocumentID":
                    pc_doc_num,
                    "PriorityDate":
                    pc_date,
                    "FileName":
                    args_array['file_name']
                })

                #print processed_priority_claims

        # Get International classifcation data
        ics = r.find('classifications-ipcr')
        # Init position for int classifications
        position = 1
        if ics is not None:
            # Get all international classification
            for icc in ics.findall('classification-ipcr'):

                for x in icc.getchildren():
                    if (USPTOSanitizer.check_tag_exists(x, 'section')):
                        i_class_sec = x.text[:100]
                    if (USPTOSanitizer.check_tag_exists(x, 'class')):
                        i_class = x.text[:15]
                    if (USPTOSanitizer.check_tag_exists(x, 'subclass')):
                        i_subclass = x.text[:15]
                    if (USPTOSanitizer.check_tag_exists(x, 'main-group')):
                        i_class_mgr = x.text[:15]
                    if (USPTOSanitizer.check_tag_exists(x, 'subgroup')):
                        i_class_sgr = x.text[:15]

                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class,
                    "SubClass": i_subclass,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "FileName": args_array['file_name']
                })

                # Increment position
                position += 1

                #print processed_intclass

        # Get US Classification data
        nc = r.find('classification-national')
        # Init position
        position = 1
        if nc is not None:
            try:
                n_class_country = nc.findtext('country')
            except:
                n_class_country = None
            try:
                n_class_info = nc.findtext('main-classification')
            except:
                n_class_info = None
            try:
                n_class_main, n_subclass = USPTOSanitizer.return_class(
                    n_class_info)
                n_class_main = n_class_main[:5]
                n_subclass = n_subclass[:15]
            except:
                n_class_main = None
                n_subclass = None

            # Append SQL data into dictionary to be written later
            processed_usclass.append({
                "table_name": "uspto.USCLASS_A",
                "ApplicationID": app_no,
                "Position": position,
                "Class": n_class_main,
                "SubClass": n_subclass,
                "FileName": args_array['file_name']
            })

            # Increment position
            position += 1

            # TODO: find an instance of futher classification to parse
            if nc.findall('further-classification') is not None:
                nat_class_fur_root = nc.findall('further-classification')
                for n in nat_class_fur_root:
                    try:
                        n_class_info = n.text
                    except:
                        n_class_info = None
                    try:
                        n_class_main, n_subclass = USPTOSanitizer.return_class(
                            n_class_info)
                        n_class_main = n_class_main[:5]
                        n_subclass = n_subclass[:15]
                    except:
                        n_class_main = None
                        n_subclass = None

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })

                    # Increment position
                    position += 1

        # Get CPC Classification data
        cpc_class_element = r.find('classifications-cpc')
        # Init position
        position = 1
        if cpc_class_element is not None:
            main_cpc_class_element = cpc_class_element.find('main-cpc')
            if main_cpc_class_element is not None:
                for cpc_class_item in main_cpc_class_element.findall(
                        'classification-cpc'):
                    try:
                        cpc_section = cpc_class_item.findtext('section')[:15]
                    except:
                        cpc_section = None
                    try:
                        cpc_class = cpc_class_item.findtext('class')[:15]
                    except:
                        cpc_class = None
                    try:
                        cpc_subclass = cpc_class_item.findtext('subclass')[:15]
                    except:
                        cpc_subclass = None
                    try:
                        cpc_mgr = cpc_class_item.findtext('main-group')[:15]
                    except:
                        cpc_mgr = None
                    try:
                        cpc_sgr = cpc_class_item.findtext('subgroup')[:15]
                    except:
                        cpc_sgr = None

                    # Append SQL data into dictionary to be written later
                    processed_cpcclass.append({
                        "table_name":
                        "uspto.CPCCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "Section":
                        cpc_section,
                        "Class":
                        cpc_class,
                        "SubClass":
                        cpc_subclass,
                        "MainGroup":
                        cpc_mgr,
                        "SubGroup":
                        cpc_sgr,
                        "FileName":
                        args_array['file_name']
                    })

                    # Increment position
                    position += 1

            further_cpc_class = cpc_class_element.find('further-cpc')
            if further_cpc_class is not None:
                for cpc_class_item in further_cpc_class.findall(
                        'classification-cpc'):
                    try:
                        cpc_section = cpc_class_item.findtext('section')[:15]
                    except:
                        cpc_section = None
                    try:
                        cpc_class = cpc_class_item.findtext('class')[:15]
                    except:
                        cpc_class = None
                    try:
                        cpc_subclass = cpc_class_item.findtext('subclass')[:15]
                    except:
                        cpc_subclass = None
                    try:
                        cpc_mgr = cpc_class_item.findtext('main-group')[:15]
                    except:
                        cpc_mgr = None
                    try:
                        cpc_sgr = cpc_class_item.findtext('subgroup')[:15]
                    except:
                        cpc_sgr = None

                    # Append SQL data into dictionary to be written later
                    processed_cpcclass.append({
                        "table_name":
                        "uspto.CPCCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "Section":
                        cpc_section,
                        "Class":
                        cpc_class,
                        "SubClass":
                        cpc_subclass,
                        "MainGroup":
                        cpc_mgr,
                        "SubGroup":
                        cpc_sgr,
                        "FileName":
                        args_array['file_name']
                    })

                    # Increment position
                    position += 1

        # Get the title of the application
        try:
            title = r.findtext('invention-title')[:500]
        except:
            title = None
            logger.error("Title not Found for :" + url_link +
                         " Application ID: " + app_no)

        # Get number of claims
        try:
            claims_num = r.findtext('number-of-claims')
        except:
            claims_num = None

        # Get number of figure, drawings
        nof = r.find('figures')
        if nof is not None:
            try:
                number_of_drawings = nof.findtext('number-of-drawing-sheets')
            except:
                number_of_drawings = None
            try:
                number_of_figures = nof.findtext('number-of-figures')
            except:
                number_of_figures = None
        else:
            number_of_drawings = None
            number_of_figures = None

        # Increment position
        position = 1
        # Get Associated party data
        parties_element = r.find('us-parties')
        if parties_element is not None:
            applicant_element = parties_element.find('us-applicants')
            # Get Applicant data
            for applicant_item in applicant_element.findall('us-applicant'):
                if (applicant_item.find('addressbook') != None):
                    try:
                        applicant_orgname = applicant_item.find(
                            'addressbook').findtext('orgname')[:300]
                    except:
                        applicant_orgname = None
                    try:
                        applicant_role = applicant_item.find(
                            'addressbook').findtext('role')
                    except:
                        applicant_role = None
                    try:
                        applicant_city = applicant_item.find(
                            'addressbook').find('address').findtext(
                                'city')[:100]
                    except:
                        applicant_city = None
                    try:
                        applicant_state = applicant_item.find(
                            'addressbook').find('address').findtext(
                                'state')[:100]
                    except:
                        applicant_state = None
                    try:
                        applicant_country = applicant_item.find(
                            'addressbook').find('address').findtext(
                                'country')[:100]
                    except:
                        applicant_country = None
                    try:
                        applicant_first_name = applicant_item.find(
                            'addressbook').findtext('first-name')[:100]
                    except:
                        applicant_first_name = None
                    try:
                        applicant_last_name = applicant_item.find(
                            'addressbook').findtext('last-name')[:100]
                    except:
                        applicant_last_name = None

                    # Append SQL data into dictionary to be written later
                    processed_applicant.append({
                        "table_name":
                        "uspto.APPLICANT_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "OrgName":
                        applicant_orgname,
                        "FirstName":
                        applicant_first_name,
                        "LastName":
                        applicant_last_name,
                        "City":
                        applicant_city,
                        "State":
                        applicant_state,
                        "Country":
                        applicant_country,
                        "FileName":
                        args_array['file_name']
                    })

                    # Increment position
                    position += 1

                    #print processed_applicant

            # Get the inventor data element
            invs = parties_element.find('inventors')
            # Init position
            position = 1
            if invs is not None:
                # Get all inventors
                for inv in invs.findall("inventor"):
                    if (inv.find('addressbook') != None):
                        try:
                            inventor_first_name = inv.find(
                                'addressbook').findtext('first-name')[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = inv.find(
                                'addressbook').findtext('last-name')[:100]
                        except:
                            inventor_last_name = None
                        try:
                            inventor_city = inv.find('addressbook').find(
                                'address').findtext('city')[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = inv.find('addressbook').find(
                                'address').findtext('state')[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = inv.find('addressbook').find(
                                'address').findtext('country')[:100]
                        except:
                            inventor_country = None
                        try:
                            inventor_nationality = inv.find(
                                'nationality').findtext('country')[:100]
                        except:
                            inventor_nationality = None
                        try:
                            inventor_residence = inv.find(
                                'residence').findtext('country')[:300]
                        except:
                            inventor_residence = None

                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name":
                            "uspto.INVENTOR_A",
                            "ApplicationID":
                            app_no,
                            "Position":
                            position,
                            "FirstName":
                            inventor_first_name,
                            "LastName":
                            inventor_last_name,
                            "City":
                            inventor_city,
                            "State":
                            inventor_state,
                            "Country":
                            inventor_country,
                            "Nationality":
                            inventor_nationality,
                            "Residence":
                            inventor_residence,
                            "FileName":
                            args_array['file_name']
                        })

                        # Increment position
                        position += 1

                        #print processed_inventor

            # Init position
            position = 1
            # Get agent data
            #TODO Find if available in application ??? Where
            agents_element = parties_element.find('agents')
            if agents_element is not None:
                for agent_item in agents_element.findall('agent'):
                    try:
                        asn_sequence = agent_item.attrib['sequence']
                    except:
                        asn_sequence = None
                    if (agent_item.find('addressbook') != None):
                        try:
                            atn_orgname = agent_item.find(
                                'addressbook').findtext('orgname')[:300]
                        except:
                            atn_orgname = None
                        try:
                            atn_last_name = agent_item.find(
                                'addressbook').findtext('last-name')[:100]
                        except:
                            atn_last_name = None
                        try:
                            atn_first_name = agent_item.find(
                                'addressbook').findtext('first-name')[:100]
                        except:
                            atn_first_name = None
                        try:
                            atn_country = agent_item.find('addressbook').find(
                                'address').findtext('country')[:100]
                        except:
                            atn_country = None

                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name":
                            "uspto.AGENT_A",
                            "ApplicationID":
                            app_no,
                            "Position":
                            position,
                            "OrgName":
                            atn_orgname,
                            "LastName":
                            atn_last_name,
                            "FirstName":
                            atn_first_name,
                            "Country":
                            atn_country,
                            "FileName":
                            args_array['file_name']
                        })

                        # Increment position
                        position += 1

                        #print processed_agent

        # Get assignee data
        assignee_element = r.find('assignees')
        # Init position
        position += 1
        if assignee_element is not None:
            for assignee_item in assignee_element.findall('assignee'):
                if (assignee_item.find('addressbook') != None):
                    try:
                        assignee_orgname = assignee_item.find(
                            'addressbook').findtext('orgname')[:300]
                    except:
                        assignee_orgname = None
                    try:
                        assignee_role = assignee_item.find(
                            'addressbook').findtext('role')[:45]
                    except:
                        assignee_role = None
                    try:
                        assignee_city = assignee_item.find('addressbook').find(
                            'address').findtext('city')[:100]
                    except:
                        assignee_city = None
                    try:
                        assignee_state = assignee_item.find(
                            'addressbook').find('address').findtext(
                                'state')[:100]
                    except:
                        assignee_state = None
                    try:
                        assignee_country = assignee_item.find(
                            'addressbook').find('address').findtext(
                                'country')[:100]
                    except:
                        assignee_country = None

                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name":
                        "uspto.ASSIGNEE_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "OrgName":
                        assignee_orgname,
                        "Role":
                        assignee_role,
                        "City":
                        assignee_city,
                        "State":
                        assignee_state,
                        "Country":
                        assignee_country,
                        "FileName":
                        args_array['file_name']
                    })

                    # Increment position
                    position += 1

                    #print processed_assignee

    # Get abstract data
    # Find the abstract
    try:
        abstract_element = patent_root.find('abstract')
        if abstract_element is not None:
            abstract = USPTOSanitizer.return_element_text(abstract_element)
    except:
        abstract = None
    #print abstract

    # Append SQL data into dictionary to be written later
    processed_application.append({
        "table_name": "uspto.APPLICATION",
        "ApplicationID": app_no,
        "PublicationID": document_id,
        "AppType": app_type,
        "Title": title,
        "FileDate": app_date,
        "PublishDate": pub_date,
        "Kind": kind,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": claims_num,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "FileName": args_array['file_name']
    })

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_application": processed_application,
        "processed_priority_claims": processed_priority_claims,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass,
    }
예제 #10
0
def extract_XML1_application(raw_data, args_array):
    """Parse one USPTO "XML1"-format application document into SQL-ready rows.

    Walks the ElementTree of ``raw_data`` and collects bibliographic data
    (document/application IDs, dates, kind, title), IPC and US classifications,
    foreign priority claims, inventors, assignees, the correspondence agent,
    plus abstract, description, claims text and drawing/figure counts.

    Args:
        raw_data: the XML text of a single application document.
        args_array: runtime configuration dict; this function reads the keys
            'url_link', 'uspto_xml_format' and 'file_name'.

    Returns:
        dict mapping result-group names ("processed_application",
        "processed_foreignpriority", "processed_assignee", "processed_agent",
        "processed_inventor", "processed_usclass", "processed_intclass",
        "processed_cpcclass") to lists of row dicts; each row dict carries a
        "table_name" key plus column name/value pairs.

    Note:
        Per-field parse failures are deliberately swallowed by the broad
        ``except`` clauses (missing elements raise AttributeError/TypeError
        from the chained ``.find().findtext().strip()[:N]`` calls) and the
        field is recorded as None.  Slicing (``[:N]``) enforces the target
        column widths.
    """

    # Set process start time
    start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define required arrays
    processed_application = []
    processed_foreignpriority = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []

    # Pass the xml into Element tree object
    document_root = ET.fromstring(raw_data)
    r = document_root.find('subdoc-bibliographic-information')

    # Get and fix the document_id data
    di = r.find('document-id')
    if di is not None:
        # This document ID is NOT application number
        try:
            document_id = di.findtext('doc-number').strip()
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try:
            kind = di.findtext('kind-code').strip()[:2]
            # App type is derived from the 2-char kind code
            app_type = USPTOSanitizer.return_xml2_app_type(args_array,
                                                           kind).strip()
        except:
            kind = None
            app_type = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(
                di.findtext('document-date'), args_array, document_id)
        except:
            pub_date = None

    # Get application filing data
    ar = r.find('domestic-filing-data')
    if ar is not None:
        try:
            app_no = ar.find('application-number').findtext(
                'doc-number').strip()[:20]
        except:
            app_no = None
        try:
            app_date = USPTOSanitizer.return_formatted_date(
                ar.findtext('filing-date'), args_array, document_id)
        except:
            app_date = None
        try:
            series_code = ar.findtext(
                'application-number-series-code').strip()[:2]
        except:
            series_code = None

    # Get technical information
    ti = r.find('technical-information')
    if ti is not None:

        # Get invention title
        try:
            title = USPTOSanitizer.strip_for_csv(
                ti.findtext('title-of-invention')[:500])
        except:
            title = None

        # Get international classification data
        ic = ti.find('classification-ipc')
        if ic is not None:
            # Init position
            position = 1
            # Process the primary international class
            icm = ic.find('classification-ipc-primary')
            if icm is not None:
                #print(icm.findtext('ipc'))
                # Clear variable values
                i_class_sec = None
                i_class = None
                i_subclass = None
                i_class_mgr = None
                i_class_sgr = None
                i_malformed = None
                try:
                    # Sanitizer splits the raw IPC string into its 5 components
                    i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = USPTOSanitizer.return_international_class_XML1_application(
                        icm.findtext('ipc'))
                    i_class_sec = i_class_sec.strip()[:15]
                    i_class = i_class.strip()[:15]
                    i_subclass = i_subclass.strip()[:15]
                    i_class_mgr = i_class_mgr.strip()[:15]
                    i_class_sgr = i_class_sgr.strip()[:15]
                except Exception as e:
                    traceback.print_exc()
                    # On any parse failure the row is still written, flagged Malformed=1
                    i_class_sec = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = 1
                    # NOTE(review): document_id may be None here, which would make
                    # this string concatenation raise TypeError — confirm
                    logger.warning(
                        "Malformed international class found in application ID: "
                        + document_id + " in file: " + url_link)

                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class,
                    "SubClass": i_subclass,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "Malformed": i_malformed,
                    "FileName": args_array['file_name']
                })
                #print(processed_intclass)
                position += 1

            # Process any secondary international classes
            ics = ic.findall('classification-ipc-secondary')
            if ics is not None:
                for ics_item in ics:
                    # Clear variable values
                    i_class_sec = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    try:
                        i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = USPTOSanitizer.return_international_class_XML1_application(
                            ics_item.findtext('ipc'))
                        i_class_sec = i_class_sec.strip()[:15]
                        i_class = i_class.strip()[:15]
                        i_subclass = i_subclass.strip()[:15]
                        i_class_mgr = i_class_mgr.strip()[:15]
                        i_class_sgr = i_class_sgr.strip()[:15]
                    except Exception as e:
                        traceback.print_exc()
                        i_class_sec = None
                        i_class = None
                        i_subclass = None
                        i_class_mgr = None
                        i_class_sgr = None
                        i_malformed = 1
                        logger.warning(
                            "Malformed international class found in application ID: "
                            + document_id + " in file: " + url_link)

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "Section":
                        i_class_sec,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "Malformed":
                        i_malformed,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1

        # Get US classification data
        nc = ti.find('classification-us')
        nc_position = 1
        if nc is not None:
            uspc = nc.find('classification-us-primary').find('uspc')
            if uspc is not None:
                n_class_main = None
                n_subclass = None
                try:
                    n_class_main = uspc.findtext('class').strip()[:5]
                except:
                    n_class_main = None
                try:
                    n_subclass = uspc.findtext('subclass').strip()[:15]
                except:
                    n_subclass = None

                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": nc_position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "FileName": args_array['file_name']
                })
                #print(processed_usclass)
                nc_position += 1

            # Collect all Secondary US class
            ncs = nc.findall('classification-us-secondary')
            for ncs_item in ncs:
                n_class_main = None
                n_subclass = None
                uspc = ncs_item.find('uspc')
                if uspc is not None:
                    try:
                        n_class_main = uspc.findtext('class').strip()[:5]
                    except:
                        n_class_main = None
                    try:
                        # NOTE(review): secondary subclass is truncated to [:5]
                        # while the primary above uses [:15] — confirm intended
                        # column width; looks like a copy/paste slip
                        n_subclass = uspc.findtext('subclass').strip()[:5]
                    except:
                        n_subclass = None

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        nc_position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_usclass)
                    nc_position += 1

    # Get priority claims
    pc_position = 1
    # Kind is not present in this XML format; column is always written as NULL
    pc_kind = None
    for pc in r.findall('foreign-priority-data'):
        try:
            pc_country = pc.findtext('country-code').strip()[:100]
        except:
            pc_country = None
        try:
            pc_doc_num = pc.find('priority-application-number').findtext(
                'doc-number').strip()[:100]
        except:
            pc_doc_num = None
        try:
            pc_date = USPTOSanitizer.return_formatted_date(
                pc.findtext('filing-date'), args_array, document_id)
        except:
            pc_date = None

        # Append SQL data into dictionary to be written later
        processed_foreignpriority.append({
            "table_name": "uspto.FOREIGNPRIORITY_A",
            "ApplicationID": app_no,
            "Position": pc_position,
            "Kind": pc_kind,
            "Country": pc_country,
            "DocumentID": pc_doc_num,
            "PriorityDate": pc_date,
            "FileName": args_array['file_name']
        })
        #print(processed_foreignpriority)
        pc_position += 1

    # Get inventor data
    invs = r.find('inventors')
    if invs is not None:
        # Init position
        inv_position = 1
        for inventor in invs.findall('first-named-inventor'):
            n = inventor.find('name')
            try:
                inventor_first_name = n.findtext('given-name').strip()[:100]
            except:
                inventor_first_name = None
            try:
                inventor_last_name = n.findtext('family-name').strip()[:100]
            except:
                inventor_last_name = None
            # Get the residence tag
            # NOTE(review): inventor_city/state/country are only assigned inside
            # the residence branches below; if <residence> is absent they may be
            # unbound (NameError on first iteration) or stale from a previous
            # iteration — confirm against sample data
            res = inventor.find('residence')
            if res is not None:
                # US and non-US residences use different child tags
                residence_us = res.find('residence-us')
                if residence_us is not None:
                    try:
                        inventor_city = residence_us.findtext(
                            'city').strip()[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_us.findtext(
                            'state').strip()[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_us.findtext(
                            'country-code').strip()[:100]
                    except:
                        inventor_country = None
                residence_non_us = res.find('residence-non-us')
                if residence_non_us is not None:
                    try:
                        inventor_city = residence_non_us.findtext(
                            'city').strip()[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_non_us.findtext(
                            'state').strip()[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_non_us.findtext(
                            'country-code').strip()[:100]
                    except:
                        inventor_country = None

            # Append SQL data into dictionary to be written later
            processed_inventor.append({
                "table_name": "uspto.INVENTOR_A",
                "ApplicationID": app_no,
                "Position": inv_position,
                "FirstName": inventor_first_name,
                "LastName": inventor_last_name,
                "City": inventor_city,
                "State": inventor_state,
                "Country": inventor_country,
                "FileName": args_array['file_name']
            })
            #print(processed_inventor)
            inv_position += 1

        # For all secordary inventors
        for inv in invs.findall('inventor'):
            if inv is not None:
                n = inv.find('name')
                if n is not None:
                    try:
                        inventor_first_name = n.findtext(
                            'given-name').strip()[:100]
                    except:
                        inventor_first_name = None
                    try:
                        inventor_last_name = n.findtext(
                            'family-name').strip()[:100]
                    except:
                        inventor_last_name = None

                res = inv.find('residence')
                if res is not None:
                    residence_us = res.find('residence-us')
                    if residence_us is not None:
                        try:
                            inventor_city = residence_us.findtext(
                                'city').strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_us.findtext(
                                'state').strip()[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_us.findtext(
                                'country-code').strip()[:100]
                        except:
                            inventor_country = None
                    residence_non_us = res.find('residence-non-us')
                    if residence_non_us is not None:
                        try:
                            inventor_city = residence_non_us.findtext(
                                'city').strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_non_us.findtext(
                                'state').strip()[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_non_us.findtext(
                                'country-code').strip()[:100]
                        except:
                            inventor_country = None

                    # Append SQL data into dictionary to be written later
                    # NOTE(review): this append is nested inside `if res is not
                    # None`, so secondary inventors without a <residence> tag are
                    # dropped entirely — confirm that is intended
                    processed_inventor.append({
                        "table_name":
                        "uspto.INVENTOR_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        inv_position,
                        "FirstName":
                        inventor_first_name,
                        "LastName":
                        inventor_last_name,
                        "City":
                        inventor_city,
                        "State":
                        inventor_state,
                        "Country":
                        inventor_country,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_inventor)
                    inv_position += 1

    # Get assignee data
    # Init position
    asn_position = 1
    for asn in r.findall('assignee'):
        try:
            asn_role = asn.findtext('assignee-type').strip()[:100]
        except:
            asn_role = None
        try:
            asn_orgname = asn.findtext('organization-name').strip()[:300]
        except:
            asn_orgname = None
        adr_elem = asn.find('address')
        try:
            asn_city = adr_elem.findtext('city').strip()[:100]
        except:
            asn_city = None
        try:
            asn_state = adr_elem.findtext('state').strip()[:100]
        except:
            asn_state = None
        try:
            asn_country = adr_elem.find('country').findtext(
                'country-code').strip()[:100]
        except:
            asn_country = None
        # Infer a US country code from a recognizable US state abbreviation
        if asn_country == None:
            if USPTOSanitizer.is_US_state(asn_state):
                asn_country = "US"
        # These have not been found in XML1,
        # but a full XML parse should be done
        asn_firstname = None
        asn_lastname = None

        # Append SQL data into dictionary to be written later
        processed_assignee.append({
            "table_name": "uspto.ASSIGNEE_A",
            "ApplicationID": app_no,
            "Position": asn_position,
            "OrgName": asn_orgname,
            "FirstName": asn_firstname,
            "LastName": asn_lastname,
            "Role": asn_role,
            "City": asn_city,
            "State": asn_state,
            "Country": asn_country,
            "FileName": args_array['file_name']
        })
        #print(processed_assignee)
        asn_position += 1

    # Find the agent element
    agn = r.find('correspondence-address')
    # Init position
    agn_position = 1
    if agn is not None:
        try:
            agent_orgname = agn.findtext('name-1').strip()
        except:
            agent_orgname = None
        try:
            agent_orgname_2 = agn.findtext('name-2').strip()
        except:
            agent_orgname_2 = None
        # Combine Orgname 1 and 2 and shorten if needed
        if agent_orgname != None and agent_orgname_2 != None:
            agent_orgname = USPTOSanitizer.strip_for_csv(agent_orgname + " " +
                                                         agent_orgname_2)[:300]
        # Get the address element
        # NOTE(review): agent_address/agent_city/agent_state/agent_country are
        # only assigned inside this branch; if <address> is missing the append
        # below raises NameError — confirm whether <address> is mandatory in
        # this format
        addr_elem = agn.find('address')
        if addr_elem is not None:
            try:
                # Missing sub-lines default to "" so the concatenation succeeds
                try:
                    agent_addr_1 = addr_elem.findtext(
                        'address-1').strip()[:100]
                except:
                    agent_addr_1 = ""
                try:
                    agent_addr_2 = addr_elem.findtext(
                        'address-2').strip()[:100]
                except:
                    agent_addr_2 = ""
                agent_address = USPTOSanitizer.strip_for_csv(agent_addr_1 +
                                                             agent_addr_2)
            except:
                agent_address = None
            try:
                agent_city = addr_elem.findtext('city').strip()[:50]
            except:
                agent_city = None
            try:
                agent_state = addr_elem.findtext('state').strip()[:3]
            except:
                agent_state = None
            try:
                agent_country = addr_elem.find('country').findtext(
                    'country-code').strip()[:3]
            except:
                # Fall back to "US" when the state looks like a US state code
                if USPTOSanitizer.is_US_state(agent_state):
                    agent_country = "US"
                else:
                    agent_country = None

        # Append SQL data into dictionary to be written later
        processed_agent.append({
            "table_name": "uspto.AGENT_A",
            "ApplicationID": app_no,
            "Position": agn_position,
            "OrgName": agent_orgname,
            "Address": agent_address,
            "City": agent_city,
            "State": agent_state,
            "Country": agent_country,
            "FileName": args_array['file_name']
        })
        #print(processed_agent)
        agn_position += 1

    # Find the abstract of the application
    try:
        abstract = USPTOSanitizer.strip_for_csv(
            USPTOSanitizer.return_element_text(
                document_root.find('subdoc-abstract')))
    except:
        abstract = None

    # Find the description
    try:
        description = ""
        d_elem = document_root.find('subdoc-description')
        if d_elem is not None:
            # Flatten all nested text nodes into one space-joined string
            description += USPTOSanitizer.strip_for_csv(' '.join(
                d_elem.itertext()))
        else:
            description = None
    except Exception as e:
        description = None
        #traceback.print_exc()
        #logger.error("Exception while extracting description from " + str(app_no))
    #print(description)

    # Find the claims
    try:
        claims = ""
        c_elem = document_root.find('subdoc-claims')
        if c_elem is not None:
            claims += USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
        else:
            claims = None
    except Exception as e:
        claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting claim from " + str(app_no))
    #print(claims)

    # Find the number of claims
    # Reuses c_elem from the claims block above; any failure (including
    # c_elem unbound or None) yields number_of_claims = None
    try:
        number_of_claims = 0
        for clms in c_elem.findall('claim'):
            number_of_claims += 1
    except Exception as e:
        number_of_claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting number of claims from " + str(app_no))
    #print(number_of_claims)

    # Find the number of drawings and figures
    try:
        number_of_figures = 0
        number_of_drawings = 0
        drw_elem = document_root.find('subdoc-drawings')
        if drw_elem != None:
            # Each <figure> carries an image type attribute: DR=drawing, FG=figure
            for fg in drw_elem.findall('figure'):
                img_type = fg.find('image').attrib['ti'].strip()
                if img_type == "DR": number_of_drawings += 1
                elif img_type == "FG": number_of_figures += 1
        else:
            number_of_figures = None
            number_of_drawings = None
    except Exception as e:
        number_of_figures = None
        number_of_drawings = None
        #traceback.print_exc()
        #logger.error("Exception while extracting figures and drawings num " + str(app_no))
    #print(number_of_figures)
    #print(number_of_drawings)

    # Append SQL data into dictionary to be written later
    processed_application.append({
        "table_name": "uspto.APPLICATION",
        "ApplicationID": app_no,
        "PublicationID": document_id,
        "AppType": app_type,
        "Title": title,
        "FileDate": app_date,
        "PublishDate": pub_date,
        "Kind": kind,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": number_of_claims,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "Description": description,
        "Claims": claims,
        "FileName": args_array['file_name']
    })
    #print(processed_application)

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_application": processed_application,
        "processed_foreignpriority": processed_foreignpriority,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass
    }
예제 #11
0
def extract_XML4_grant(raw_data, args_array):
    """Extract all data from one USPTO XML version-4 patent-grant document.

    Parses the XML in `raw_data` and collects the bibliographic data,
    classifications (IPC, CPC, US national), citations (US patent, foreign
    patent, non-patent), applicants, inventors, agents, assignees and
    examiners for the grant, as flat per-table record dictionaries ready to
    be written to the database.

    Args:
        raw_data: str containing the complete XML for a single patent grant.
        args_array: dict of runtime arguments; this function reads
            'url_link', 'uspto_xml_format' and 'file_name'.

    Returns:
        dict mapping "processed_*" keys to lists of record dicts; each
        record carries its destination "table_name".
    """

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []

    # Start process timer
    start_time = time.time()

    # Pass the raw_data data into Element Tree
    patent_root = ET.fromstring(raw_data)

    # Start the extraction of XML data
    for r in patent_root.findall('us-bibliographic-data-grant'):

        # Find the main patent grant data.
        # NOTE: bare except blocks below are the file's deliberate
        # best-effort pattern — findtext() returns None for missing tags and
        # slicing None raises TypeError, which falls back to None here.
        for pr in r.findall('publication-reference'):
            for di in pr.findall('document-id'):
                try:
                    pub_country = di.findtext('country')
                except:
                    pub_country = None
                try:
                    document_id = di.findtext('doc-number')
                    document_id = USPTOSanitizer.fix_patent_number(
                        document_id)[:20]
                except:
                    document_id = None
                    logger.error("No Patent Number was found for: " + url_link)
                try:
                    kind = di.findtext('kind')[:2]
                except:
                    kind = None
                try:
                    pub_date = USPTOSanitizer.return_formatted_date(
                        di.findtext('date'), args_array, document_id)
                except:
                    pub_date = None

        # Find the main application data
        for ar in r.findall('application-reference'):
            try:
                app_type = ar.attrib['appl-type'][:45]
            except:
                app_type = None
            for di in ar.findall('document-id'):
                try:
                    app_country = di.findtext('country')
                except:
                    app_country = None
                try:
                    app_no = di.findtext('doc-number')[:20]
                except:
                    app_no = None
                try:
                    app_date = USPTOSanitizer.return_formatted_date(
                        di.findtext('date'), args_array, document_id)
                except:
                    app_date = None

        # Get the series code
        try:
            series_code = r.findtext('us-application-series-code')[:2]
        except:
            series_code = None

        # Get the length of grant
        try:
            terms_of_grant = r.find("us-term-of-grant").findtext(
                "length-of-grant")
        except:
            terms_of_grant = None

        # Find all international (IPC) classifications
        ic = r.find('classifications-ipcr')
        position = 1
        if ic is not None:
            for icc in ic.findall('classification-ipcr'):
                # Initialize all sub-fields so a missing tag cannot leave a
                # stale value (or an unbound name) in the appended record.
                i_class_sec = None
                i_class_cls = None
                i_class_sub = None
                i_class_mgr = None
                i_class_sgr = None
                # Element.getchildren() was removed in Python 3.9;
                # iterating the element yields its children directly.
                for x in icc:
                    if USPTOSanitizer.check_tag_exists(x, 'section'):
                        try:
                            i_class_sec = x.text[:15]
                        except:
                            i_class_sec = None
                    if USPTOSanitizer.check_tag_exists(x, 'class'):
                        try:
                            i_class_cls = x.text[:15]
                        except:
                            i_class_cls = None
                    if USPTOSanitizer.check_tag_exists(x, 'subclass'):
                        try:
                            i_class_sub = x.text[:15]
                        except:
                            i_class_sub = None
                    if USPTOSanitizer.check_tag_exists(x, 'main-group'):
                        try:
                            i_class_mgr = x.text[:15]
                        except:
                            i_class_mgr = None
                    if USPTOSanitizer.check_tag_exists(x, 'subgroup'):
                        try:
                            i_class_sgr = x.text[:15]
                        except:
                            i_class_sgr = None

                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_G",
                    "GrantID": document_id,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class_cls,
                    "SubClass": i_class_sub,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "FileName": args_array['file_name']
                })

                position += 1

        # Find all CPC classifications
        cpc = r.find('us-field-of-classification-search')
        if cpc is not None:
            position = 1
            for cpcc in cpc.findall('classification-cpc-text'):

                try:
                    # CPC text is e.g. "A01B 33/08": letter section, 2-digit
                    # class, 1-letter subclass, then "main-group/subgroup".
                    cpc_text = cpcc.text
                    cpc_class_string, cpc_group_string = cpc_text.split(" ")
                    cpc_class_sec = cpc_text[0]
                    cpc_class = cpc_class_string[1:3]
                    cpc_subclass = cpc_class_string[3]
                    cpc_class_mgr, cpc_class_sgr = cpc_group_string.rsplit(
                        "/", 1)
                    cpc_class_mgr = cpc_class_mgr[:15]
                    cpc_class_sgr = cpc_class_sgr[:15]
                except:
                    cpc_class_sec = None
                    cpc_class = None
                    cpc_subclass = None
                    cpc_class_mgr = None
                    cpc_class_sgr = None
                    logger.warning(
                        "There was an error parsing the cpc class for Grant ID: "
                        + document_id + " in file: " + url_link)
                    logger.warning("Traceback: " + traceback.format_exc())

                # Append SQL data into dictionary to be written later
                processed_cpcclass.append({
                    "table_name": "uspto.CPCCLASS_G",
                    "GrantID": document_id,
                    "Position": position,
                    "Section": cpc_class_sec,
                    "Class": cpc_class,
                    "SubClass": cpc_subclass,
                    "MainGroup": cpc_class_mgr,
                    "SubGroup": cpc_class_sgr,
                    "FileName": args_array['file_name']
                })

                position += 1

        # Find all US national classifications
        for nc in r.findall('classification-national'):
            position = 1
            try:
                n_class_info = nc.findtext('main-classification')
                n_class_main, n_subclass = USPTOSanitizer.return_class(
                    n_class_info)
                n_class_main = n_class_main[:5]
                n_subclass = n_subclass[:15]
            except:
                n_class_main = None
                n_subclass = None

            # Append SQL data into dictionary to be written later
            processed_usclass.append({
                "table_name": "uspto.USCLASS_G",
                "GrantID": document_id,
                "Position": position,
                "Class": n_class_main,
                "SubClass": n_subclass,
                "FileName": args_array['file_name']
            })

            position += 1

            # 'further-classification' elements hold additional US classes
            n_class_fur_root = nc.findall('further-classification')
            for n in n_class_fur_root:
                try:
                    n_class_info = n.text
                except:
                    n_class_info = None
                try:
                    n_class_main, n_subclass = USPTOSanitizer.return_class(
                        n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]
                except:
                    n_class_main = None
                    n_subclass = None

                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_G",
                    "GrantID": document_id,
                    "Position": position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "FileName": args_array['file_name']
                })

                position += 1

        # Find the title of the patent
        try:
            title = r.findtext('invention-title')[:500]
        except:
            title = None

        # Find all references cited in the grant
        for rf in r.findall('us-references-cited'):
            for rfc in rf.findall('us-citation'):
                # If the patent citation child is found must be a patent citation
                if rfc.find('patcit') is not None:
                    position = 1
                    try:
                        citation_position = USPTOSanitizer.strip_leading_zeros(
                            rfc.find('patcit').attrib['num'])
                    except:
                        citation_position = position
                    for x in rfc.findall('patcit'):
                        try:
                            citation_country = x.find('document-id').findtext(
                                'country')[:100]
                        except:
                            citation_country = None
                        try:
                            citation_grant_id = x.find('document-id').findtext(
                                'doc-number')[:20]
                        except:
                            citation_grant_id = None
                        try:
                            citation_kind = x.find('document-id').findtext(
                                'kind')[:10]
                        except:
                            citation_kind = None
                        try:
                            citation_name = x.find('document-id').findtext(
                                'name')[:100]
                        except:
                            citation_name = None
                        try:
                            citation_date = USPTOSanitizer.return_formatted_date(
                                x.find('document-id').findtext('date'),
                                args_array, document_id)
                        except:
                            citation_date = None
                        try:
                            if rfc.findtext('category') == "cited by examiner":
                                citation_category = 1
                            else:
                                citation_category = 0
                        except:
                            citation_category = None

                    # Guard against a missing country tag: calling .strip()
                    # on None would abort the whole extraction.  Citations
                    # with no country are skipped.
                    if citation_country is None:
                        pass

                    # US patent citations
                    elif citation_country.strip().upper() == 'US':

                        # Append SQL data into dictionary to be written later
                        processed_gracit.append({
                            "table_name": "uspto.GRACIT_G",
                            "GrantID": document_id,
                            "Position": citation_position,
                            "CitedID": citation_grant_id,
                            "Kind": citation_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })

                        position += 1

                    # Foreign patent citations
                    else:

                        # Append SQL data into dictionary to be written later
                        processed_forpatcit.append({
                            "table_name": "uspto.FORPATCIT_G",
                            "GrantID": document_id,
                            "Position": citation_position,
                            "CitedID": citation_grant_id,
                            "Kind": citation_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })

                        position += 1

                # If the non patent citations are found
                elif rfc.find('nplcit') is not None:
                    position = 1
                    for x in rfc.findall('nplcit'):
                        try:
                            citation_position = USPTOSanitizer.strip_leading_zeros(
                                rfc.find('nplcit').attrib['num'])
                        except:
                            citation_position = position
                        # Sometimes there is markup ('<i>', '<sup>', etc.) in
                        # the reference string; strip all tags out.
                        try:
                            non_patent_citation_text = x.findtext('othercit')
                        except:
                            non_patent_citation_text = None
                        try:
                            non_patent_citation_text = re.sub(
                                '<[^>]+>', '',
                                non_patent_citation_text).replace('\n', "")
                        except:
                            non_patent_citation_text = None
                        # TODO: parse the category into boolean for now.  How
                        # many categories are there and what are they??
                        # TODO: change category to boolean in schema
                        try:
                            if x.findtext('category') == "cited by examiner":
                                citation_category = 1
                            else:
                                citation_category = 0
                        except:
                            citation_category = None

                        # Append SQL data into dictionary to be written later
                        processed_nonpatcit.append({
                            "table_name": "uspto.NONPATCIT_G",
                            "GrantID": document_id,
                            "Position": citation_position,
                            "Citation": non_patent_citation_text,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })

                        position += 1

        # Find number of claims
        try:
            claims_num = r.findtext('number-of-claims')
        except:
            claims_num = None

        # Find the number of figures and number of drawings
        nof = r.find('figures')
        try:
            number_of_drawings = nof.findtext('number-of-drawing-sheets')
            number_of_drawings = number_of_drawings.split("/")[0]
        except:
            number_of_drawings = None
        try:
            number_of_figures = nof.findtext('number-of-figures')
        except:
            number_of_figures = None

        # Find the parties
        for prt in r.findall('us-parties'):
            # Find all applicant data
            for apts in prt.findall('us-applicants'):
                position = 1
                for apt in apts.findall('us-applicant'):
                    if apt.find('addressbook') is not None:
                        try:
                            applicant_orgname = apt.find(
                                'addressbook').findtext('orgname')[:300]
                        except:
                            applicant_orgname = None
                        try:
                            applicant_first_name = apt.find(
                                'addressbook').findtext('first-name')[:100]
                        except:
                            applicant_first_name = None
                        try:
                            applicant_last_name = apt.find(
                                'addressbook').findtext('last-name')[:100]
                        except:
                            applicant_last_name = None
                        try:
                            applicant_city = apt.find('addressbook').find(
                                'address').findtext('city')[:100]
                        except:
                            applicant_city = None
                        try:
                            applicant_state = apt.find('addressbook').find(
                                'address').findtext('state')[:100]
                        except:
                            applicant_state = None
                        try:
                            applicant_country = apt.find('addressbook').find(
                                'address').findtext('country')[:100]
                        except:
                            applicant_country = None

                        # Append SQL data into dictionary to be written later
                        processed_applicant.append({
                            "table_name": "uspto.APPLICANT_G",
                            "GrantID": document_id,
                            "OrgName": applicant_orgname,
                            "Position": position,
                            "FirstName": applicant_first_name,
                            "LastName": applicant_last_name,
                            "City": applicant_city,
                            "State": applicant_state,
                            "Country": applicant_country,
                            "FileName": args_array['file_name']
                        })

                        position += 1

            # Find all inventor data
            for apts in prt.findall('inventors'):
                position = 1
                for apt in apts.findall('inventor'):
                    try:
                        inventor_sequence = USPTOSanitizer.strip_leading_zeros(
                            apt.attrib['sequence'])
                    except:
                        inventor_sequence = position
                    if apt.find('addressbook') is not None:
                        try:
                            inventor_first_name = apt.find(
                                'addressbook').findtext('first-name')[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = apt.find(
                                'addressbook').findtext('last-name')[:100]
                        except:
                            inventor_last_name = None
                        try:
                            inventor_city = apt.find('addressbook').find(
                                'address').findtext('city')[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = apt.find('addressbook').find(
                                'address').findtext('state')[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = apt.find('addressbook').find(
                                'address').findtext('country')[:100]
                        except:
                            inventor_country = None
                        # NOTE(review): 'Residence' is read from the same
                        # 'country' tag as inventor_country (only the length
                        # cap differs) — looks suspicious; confirm against
                        # the XML4 DTD before changing.
                        try:
                            inventor_residence = apt.find('addressbook').find(
                                'address').findtext('country')[:300]
                        except:
                            inventor_residence = None

                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name": "uspto.INVENTOR_G",
                            "GrantID": document_id,
                            "Position": inventor_sequence,
                            "FirstName": inventor_first_name,
                            "LastName": inventor_last_name,
                            "City": inventor_city,
                            "State": inventor_state,
                            "Country": inventor_country,
                            "Residence": inventor_residence,
                            "FileName": args_array['file_name']
                        })

                        position += 1

            # Find all agent data
            for agns in prt.findall('agents'):
                position = 1
                for agn in agns.findall('agent'):
                    try:
                        agent_sequence = USPTOSanitizer.strip_leading_zeros(
                            agn.attrib['sequence'])
                    except:
                        agent_sequence = position
                    if agn.find('addressbook') is not None:
                        try:
                            agent_orgname = agn.find('addressbook').findtext(
                                'orgname')[:300]
                        except:
                            agent_orgname = None
                        try:
                            agent_last_name = agn.find('addressbook').findtext(
                                'last-name')[:100]
                        except:
                            agent_last_name = None
                        try:
                            agent_first_name = agn.find(
                                'addressbook').findtext('first-name')[:100]
                        except:
                            agent_first_name = None
                        try:
                            agent_country = agn.find('addressbook').find(
                                'address').findtext('country')[:100]
                        except:
                            agent_country = None

                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name": "uspto.AGENT_G",
                            "GrantID": document_id,
                            "Position": agent_sequence,
                            "OrgName": agent_orgname,
                            "LastName": agent_last_name,
                            "FirstName": agent_first_name,
                            "Country": agent_country,
                            "FileName": args_array['file_name']
                        })

                        position += 1

        # Find all assignee data
        for asn in r.findall('assignees'):
            position = 1
            for x in asn.findall('assignee'):
                if x.find('addressbook') is not None:
                    try:
                        asn_orgname = x.find('addressbook').findtext(
                            'orgname')[:500]
                    except:
                        asn_orgname = None
                    try:
                        asn_role = x.find('addressbook').findtext('role')[:45]
                    except:
                        asn_role = None
                    try:
                        asn_city = x.find('addressbook').find(
                            'address').findtext('city')[:100]
                    except:
                        asn_city = None
                    try:
                        asn_state = x.find('addressbook').find(
                            'address').findtext('state')[:100]
                    except:
                        asn_state = None
                    try:
                        asn_country = x.find('addressbook').find(
                            'address').findtext('country')[:100]
                    except:
                        asn_country = None

                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name": "uspto.ASSIGNEE_G",
                        "GrantID": document_id,
                        "Position": position,
                        "OrgName": asn_orgname,
                        "Role": asn_role,
                        "City": asn_city,
                        "State": asn_state,
                        "Country": asn_country,
                        "FileName": args_array['file_name']
                    })

                    position += 1

        # Find all examiner data (primary and assistant share one position
        # counter, matching the single EXAMINER_G table)
        for exm in r.findall('examiners'):
            position = 1
            for x in exm.findall('primary-examiner'):
                try:
                    exm_last_name = x.findtext('last-name')[:50]
                except:
                    exm_last_name = None
                try:
                    exm_first_name = x.findtext('first-name')[:50]
                except:
                    exm_first_name = None
                try:
                    exm_department = x.findtext('department')[:100]
                except:
                    exm_department = None

                # Append SQL data into dictionary to be written later
                processed_examiner.append({
                    "table_name": "uspto.EXAMINER_G",
                    "GrantID": document_id,
                    "Position": position,
                    "LastName": exm_last_name,
                    "FirstName": exm_first_name,
                    "Department": exm_department,
                    "FileName": args_array['file_name']
                })

                position += 1

            for x in exm.findall('assistant-examiner'):
                try:
                    exm_last_name = x.findtext('last-name')[:50]
                except:
                    exm_last_name = None
                try:
                    exm_first_name = x.findtext('first-name')[:50]
                except:
                    exm_first_name = None
                try:
                    exm_department = x.findtext('department')[:100]
                except:
                    exm_department = None

                # Append SQL data into dictionary to be written later
                processed_examiner.append({
                    "table_name": "uspto.EXAMINER_G",
                    "GrantID": document_id,
                    "Position": position,
                    "LastName": exm_last_name,
                    "FirstName": exm_first_name,
                    "Department": exm_department,
                    "FileName": args_array['file_name']
                })

                position += 1

    # TODO: see if it's claims or description and store accordingly
    try:
        claims = patent_root.findtext('description')
    except:
        claims = None

    # Find the abstract
    try:
        abstract = USPTOSanitizer.return_element_text(
            patent_root.find('abstract'))
    except:
        traceback.print_exc()
        abstract = None

    # Append SQL data into dictionary to be written later
    try:
        processed_grant.append({
            "table_name": "uspto.GRANT",
            "GrantID": document_id,
            "Title": title,
            "IssueDate": pub_date,
            "Kind": kind,
            "USSeriesCode": series_code,
            "Abstract": abstract,
            "ClaimsNum": claims_num,
            "DrawingsNum": number_of_drawings,
            "FiguresNum": number_of_figures,
            "ApplicationID": app_no,
            "Claims": claims,
            "FileDate": app_date,
            "AppType": app_type,
            "GrantLength": terms_of_grant,
            "FileName": args_array['file_name']
        })
    except Exception as e:
        # Fixed: original used a Python-2 print statement here, which is a
        # SyntaxError under Python 3.
        print("could not append to array")
        traceback.print_exc()
        logger.warning(
            "Could not append patent data to array for patent number: " +
            document_id + " Traceback: " + traceback.format_exc())

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit
    }
예제 #12
0
def extract_XML4_application(raw_data, args_array):
    """Extract bibliographic data from a USPTO XML v4 patent application.

    Parses the XML document in `raw_data` and collects one row dict per
    record for the APPLICATION, FOREIGNPRIORITY_A, APPLICANT_A, ASSIGNEE_A,
    AGENT_A, INVENTOR_A, USCLASS_A, INTCLASS_A and CPCCLASS_A tables.

    Args:
        raw_data: string containing the XML of a single application document.
        args_array: dict of runtime arguments; reads 'url_link',
            'uspto_xml_format' and 'file_name'.

    Returns:
        dict mapping "processed_*" keys to lists of per-table row dicts.
    """

    # Set process start time
    start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define required arrays
    processed_application = []
    processed_foreignpriority = []
    processed_assignee = []
    processed_applicant = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []

    # Pre-initialize document-level fields so the APPLICATION row built after
    # the loop cannot raise NameError when an element (or the entire
    # us-bibliographic-data-application block) is missing from the document.
    pub_country = None
    document_id = None
    kind = None
    pub_date = None
    app_type = None
    app_country = None
    app_no = None
    app_date = None
    series_code = None
    title = None

    # Pass the raw data into Element tree xml object
    document_root = ET.fromstring(raw_data)

    # Start extract XML data
    for r in document_root.findall('us-bibliographic-data-application'):

        # Get basic document ID information
        pr = r.find('publication-reference')
        pub_doc = pr.find('document-id')
        try:
            pub_country = pub_doc.findtext('country').strip()
        except:
            pub_country = None
        try:
            document_id = pub_doc.findtext('doc-number').strip()
            document_id = USPTOSanitizer.fix_patent_number(document_id)
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try:
            kind = pub_doc.findtext('kind').strip()[:2]
        except:
            kind = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(
                pub_doc.findtext('date'), args_array, document_id)
        except:
            pub_date = None

        # Get application reference data
        ar = r.find('application-reference')
        if ar is not None:
            try:
                app_type = ar.attrib['appl-type'].strip()[:45]
            except:
                app_type = None
            app_doc = ar.find('document-id')
            try:
                app_country = app_doc.findtext('country').strip()
            except:
                app_country = None
            try:
                app_no = app_doc.findtext('doc-number').strip()[:20]
            except:
                app_no = None
            try:
                app_date = USPTOSanitizer.return_formatted_date(
                    app_doc.findtext('date'), args_array, document_id)
            except:
                app_date = None
            # Get series code
            try:
                series_code = r.findtext(
                    'us-application-series-code').strip()[:2]
            except:
                series_code = None

        # Get Priority Claims
        pcs = r.find('priority-claims')
        if pcs is not None:
            for pc in pcs.findall('priority-claim'):
                try:
                    pc_sequence = USPTOSanitizer.strip_leading_zeros(
                        pc.attrib['sequence'])
                except:
                    pc_sequence = None
                try:
                    pc_kind = pc.attrib['kind'].strip()[:100]
                except:
                    pc_kind = None
                try:
                    pc_country = pc.findtext('country').strip()[:100]
                except:
                    pc_country = None
                try:
                    pc_doc_num = pc.findtext('doc-number').strip()[:100]
                except:
                    pc_doc_num = None
                try:
                    pc_date = USPTOSanitizer.return_formatted_date(
                        pc.findtext('date'), args_array, document_id)
                except:
                    pc_date = None

                # Append SQL data into dictionary to be written later
                processed_foreignpriority.append({
                    "table_name":
                    "uspto.FOREIGNPRIORITY_A",
                    "ApplicationID":
                    app_no,
                    "Position":
                    pc_sequence,
                    "Kind":
                    pc_kind,
                    "Country":
                    pc_country,
                    "DocumentID":
                    pc_doc_num,
                    "PriorityDate":
                    pc_date,
                    "FileName":
                    args_array['file_name']
                })
                #print(processed_foreignpriority)

        # Find all international classifications
        ic = r.find('classifications-ipcr')
        position = 1
        if ic is not None:
            for icc in ic.findall('classification-ipcr'):
                for x in icc.getchildren():
                    if (USPTOSanitizer.check_tag_exists(x, 'section')):
                        try:
                            i_class_sec = x.text.strip()[:15]
                        except:
                            i_class_sec = None
                    if (USPTOSanitizer.check_tag_exists(x, 'class')):
                        try:
                            i_class_cls = x.text.strip()[:15]
                        except:
                            i_class_cls = None
                    if (USPTOSanitizer.check_tag_exists(x, 'subclass')):
                        try:
                            i_class_sub = x.text.strip()[:15]
                        except:
                            i_class_sub = None
                    if (USPTOSanitizer.check_tag_exists(x, 'main-group')):
                        try:
                            i_class_mgr = x.text.strip()[:15]
                        except:
                            i_class_mgr = None
                    if (USPTOSanitizer.check_tag_exists(x, 'subgroup')):
                        try:
                            i_class_sgr = x.text.strip()[:15]
                        except:
                            i_class_sgr = None

                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class_cls,
                    "SubClass": i_class_sub,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "FileName": args_array['file_name']
                })
                #print(processed_intclass)
                position += 1

        # Get US Classification data
        nc = r.find('classification-national')
        position = 1
        if nc is not None:
            ncm = nc.find('main-classification')
            if ncm is not None:
                #print(ncm.text)
                n_class_main = None
                n_subclass = None
                n_malformed = 1
                try:
                    n_class_info = nc.findtext('main-classification')
                except:
                    n_class_info = None
                try:
                    n_class_main, n_subclass = USPTOSanitizer.return_US_class_XML4_application(
                        n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]
                except:
                    n_class_main = None
                    n_subclass = None
                    n_malformed = 1

                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "Malformed": n_malformed,
                    "FileName": args_array['file_name']
                })
                #print(processed_usclass)
                position += 1

            # TODO: find an instance of futher classification to parse
            ncs = nc.findall('further-classification')
            for ncs_item in ncs:
                #print("Further: " + ncs_item.text)
                n_class_main = None
                n_subclass = None
                n_malformed = 1
                try:
                    n_class_info = ncs_item.text
                    n_class_main, n_subclass = USPTOSanitizer.return_US_class_XML4_application(
                        n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]
                except:
                    n_class_main = None
                    n_subclass = None
                    n_malformed = 1

                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "Malformed": n_malformed,
                    "FileName": args_array['file_name']
                })
                #print(processed_usclass)
                position += 1

        # Get CPC Classification data
        cpc_class_element = r.find('classifications-cpc')
        # Init position
        position = 1
        if cpc_class_element is not None:
            main_cpc_class_element = cpc_class_element.find('main-cpc')
            if main_cpc_class_element is not None:
                for cpc_class_item in main_cpc_class_element.findall(
                        'classification-cpc'):
                    try:
                        cpc_section = cpc_class_item.findtext(
                            'section').strip()[:15]
                    except:
                        cpc_section = None
                    try:
                        cpc_class = cpc_class_item.findtext(
                            'class').strip()[:15]
                    except:
                        cpc_class = None
                    try:
                        cpc_subclass = cpc_class_item.findtext(
                            'subclass').strip()[:15]
                    except:
                        cpc_subclass = None
                    try:
                        cpc_mgr = cpc_class_item.findtext(
                            'main-group').strip()[:15]
                    except:
                        cpc_mgr = None
                    try:
                        cpc_sgr = cpc_class_item.findtext(
                            'subgroup').strip()[:15]
                    except:
                        cpc_sgr = None

                    # Append SQL data into dictionary to be written later
                    processed_cpcclass.append({
                        "table_name":
                        "uspto.CPCCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "Section":
                        cpc_section,
                        "Class":
                        cpc_class,
                        "SubClass":
                        cpc_subclass,
                        "MainGroup":
                        cpc_mgr,
                        "SubGroup":
                        cpc_sgr,
                        "FileName":
                        args_array['file_name']
                    })
                    position += 1

            further_cpc_class = cpc_class_element.find('further-cpc')
            if further_cpc_class is not None:
                for cpc_class_item in further_cpc_class.findall(
                        'classification-cpc'):
                    try:
                        cpc_section = cpc_class_item.findtext(
                            'section').strip()[:15]
                    except:
                        cpc_section = None
                    try:
                        cpc_class = cpc_class_item.findtext(
                            'class').strip()[:15]
                    except:
                        cpc_class = None
                    try:
                        cpc_subclass = cpc_class_item.findtext(
                            'subclass').strip()[:15]
                    except:
                        cpc_subclass = None
                    try:
                        cpc_mgr = cpc_class_item.findtext(
                            'main-group').strip()[:15]
                    except:
                        cpc_mgr = None
                    try:
                        cpc_sgr = cpc_class_item.findtext(
                            'subgroup').strip()[:15]
                    except:
                        cpc_sgr = None

                    # Append SQL data into dictionary to be written later
                    processed_cpcclass.append({
                        "table_name":
                        "uspto.CPCCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "Section":
                        cpc_section,
                        "Class":
                        cpc_class,
                        "SubClass":
                        cpc_subclass,
                        "MainGroup":
                        cpc_mgr,
                        "SubGroup":
                        cpc_sgr,
                        "FileName":
                        args_array['file_name']
                    })
                    position += 1

        # Get the title of the application
        try:
            title = USPTOSanitizer.strip_for_csv(
                r.findtext('invention-title')[:500])
        except:
            title = None
            # str() guards against app_no being None, which would raise an
            # uncaught TypeError inside this except handler.
            logger.error("Title not Found for :" + url_link +
                         " Application ID: " + str(app_no))

        # Get number of figure, drawings
        nof = r.find('figures')
        if nof is not None:
            try:
                number_of_drawings = nof.findtext(
                    'number-of-drawing-sheets').strip()
            except:
                number_of_drawings = None
            try:
                number_of_figures = nof.findtext('number-of-figures').strip()
            except:
                number_of_figures = None
        else:
            number_of_drawings = None
            number_of_figures = None

        # Check if XML format uses 'us-parties' or 'parties'
        if r.find('us-parties') != None: parties_id_string = "us-parties"
        elif r.find('parties') != None: parties_id_string = "parties"
        else: parties_id_string = "parties"
        prt = r.find(parties_id_string)
        if prt is not None:
            # Increment position
            appl_position = 1
            invt_position = 1
            atn_position = 1
            # Check if the XML format uses 'applicants' or 'us-applicants'
            if prt.find('us-applicants') != None:
                applicants_id_string = 'us-applicants'
            elif prt.find('applicants') != None:
                applicants_id_string = 'applicants'
            else:
                applicants_id_string = 'applicants'
            # Get Applicant data
            appl_elem = prt.find(applicants_id_string)
            # Check if the XML format uses 'applicant' or 'us-applicant'
            if appl_elem.find('us-applicant') != None:
                applicant_id_string = 'us-applicant'
            elif appl_elem.find('applicant') != None:
                applicant_id_string = 'applicant'
            else:
                applicant_id_string = 'applicant'
            for appl in appl_elem.findall(applicant_id_string):
                if (appl.find('addressbook') != None):
                    try:
                        appl_orgname = USPTOSanitizer.strip_for_csv(
                            appl.find('addressbook').findtext('orgname'))[:300]
                    except:
                        appl_orgname = None
                    try:
                        appl_role = appl.find('addressbook').findtext('role')
                    except:
                        appl_role = None
                    try:
                        appl_city = appl.find('addressbook').find(
                            'address').findtext('city').strip()[:100]
                    except:
                        appl_city = None
                    try:
                        appl_state = appl.find('addressbook').find(
                            'address').findtext('state').strip()[:100]
                    except:
                        appl_state = None
                    try:
                        appl_country = appl.find('addressbook').find(
                            'address').findtext('country').strip()[:100]
                    except:
                        appl_country = None
                    try:
                        appl_firstname = USPTOSanitizer.strip_for_csv(
                            appl.find('addressbook').findtext(
                                'first-name'))[:100]
                    except:
                        appl_firstname = None
                    try:
                        appl_lastname = USPTOSanitizer.strip_for_csv(
                            appl.find('addressbook').findtext(
                                'last-name'))[:100]
                    except:
                        appl_lastname = None

                    # Append SQL data into dictionary to be written later
                    processed_applicant.append({
                        "table_name":
                        "uspto.APPLICANT_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        appl_position,
                        "OrgName":
                        appl_orgname,
                        "FirstName":
                        appl_firstname,
                        "LastName":
                        appl_lastname,
                        "City":
                        appl_city,
                        "State":
                        appl_state,
                        "Country":
                        appl_country,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_applicant)
                    appl_position += 1

            # Get the inventor data element
            invs = prt.find('inventors')
            # Init position
            position = 1
            if invs is not None:
                # Get all inventors
                for inv in invs.findall("inventor"):
                    if (inv.find('addressbook') != None):
                        try:
                            inv_first_name = inv.find('addressbook').findtext(
                                'first-name').strip()[:100]
                        except:
                            inv_first_name = None
                        try:
                            inv_last_name = inv.find('addressbook').findtext(
                                'last-name').strip()[:100]
                        except:
                            inv_last_name = None
                        try:
                            inv_city = inv.find('addressbook').find(
                                'address').findtext('city').strip()[:100]
                        except:
                            inv_city = None
                        try:
                            inv_state = inv.find('addressbook').find(
                                'address').findtext('state').strip()[:100]
                        except:
                            inv_state = None
                        try:
                            inv_country = inv.find('addressbook').find(
                                'address').findtext('country').strip()[:100]
                        except:
                            inv_country = None
                        try:
                            inv_nationality = inv.find('nationality').findtext(
                                'country').strip()[:100]
                        except:
                            inv_nationality = None
                        try:
                            inv_residence = inv.find('residence').findtext(
                                'country').strip()[:300]
                        except:
                            inv_residence = None

                        # Append SQL data into dictionary to be written later
                        processed_inventor.append({
                            "table_name":
                            "uspto.INVENTOR_A",
                            "ApplicationID":
                            app_no,
                            "Position":
                            invt_position,
                            "FirstName":
                            inv_first_name,
                            "LastName":
                            inv_last_name,
                            "City":
                            inv_city,
                            "State":
                            inv_state,
                            "Country":
                            inv_country,
                            "Nationality":
                            inv_nationality,
                            "Residence":
                            inv_residence,
                            "FileName":
                            args_array['file_name']
                        })
                        #print(processed_inventor)
                        invt_position += 1

            # Init position
            position = 1
            # Get agent data
            #TODO Find if available in application ??? Where
            agents_element = prt.find('agents')
            if agents_element is not None:
                for agent_item in agents_element.findall('agent'):
                    try:
                        asn_sequence = agent_item.attrib['sequence']
                    except:
                        asn_sequence = None
                    if (agent_item.find('addressbook') != None):
                        try:
                            atn_orgname = agent_item.find(
                                'addressbook').findtext(
                                    'orgname').strip()[:300]
                        except:
                            atn_orgname = None
                        try:
                            atn_last_name = agent_item.find(
                                'addressbook').findtext(
                                    'last-name').strip()[:100]
                        except:
                            atn_last_name = None
                        try:
                            atn_first_name = agent_item.find(
                                'addressbook').findtext(
                                    'first-name').strip()[:100]
                        except:
                            atn_first_name = None
                        try:
                            atn_country = agent_item.find('addressbook').find(
                                'address').findtext('country').strip()[:100]
                        except:
                            atn_country = None
                        atn_address = None

                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name":
                            "uspto.AGENT_A",
                            "ApplicationID":
                            app_no,
                            "Position":
                            atn_position,
                            "OrgName":
                            atn_orgname,
                            "LastName":
                            atn_last_name,
                            "FirstName":
                            atn_first_name,
                            "Country":
                            atn_country,
                            "FileName":
                            args_array['file_name']
                        })
                        #print(processed_agent)
                        atn_position += 1

        # Get assignee data
        asn_elem = r.find('assignees')
        # Init position
        position = 1
        if asn_elem is not None:
            for asn_item in asn_elem.findall('assignee'):
                if (asn_item.find('addressbook') != None):
                    try:
                        asn_orgname = asn_item.find('addressbook').findtext(
                            'orgname').strip()[:300]
                    except:
                        asn_orgname = None
                    try:
                        asn_firstname = asn_item.find('addressbook').findtext(
                            'first-name').strip()[:100]
                    except:
                        asn_firstname = None
                    try:
                        asn_lastname = asn_item.find('addressbook').findtext(
                            'last-name').strip()[:100]
                    except:
                        asn_lastname = None
                    try:
                        asn_role = asn_item.find('addressbook').findtext(
                            'role').strip()[:5]
                    except:
                        asn_role = None
                    try:
                        asn_city = asn_item.find('addressbook').find(
                            'address').findtext('city').strip()[:50]
                    except:
                        asn_city = None
                    try:
                        asn_state = asn_item.find('addressbook').find(
                            'address').findtext('state').strip()[:10]
                    except:
                        asn_state = None
                    try:
                        asn_country = asn_item.find('addressbook').find(
                            'address').findtext('country').strip()[:3]
                    except:
                        asn_country = None

                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name":
                        "uspto.ASSIGNEE_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "OrgName":
                        asn_orgname,
                        "FirstName":
                        asn_firstname,
                        "LastName":
                        asn_lastname,
                        "Role":
                        asn_role,
                        "City":
                        asn_city,
                        "State":
                        asn_state,
                        "Country":
                        asn_country,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_assignee)
                    position += 1

    # Find the abstract
    try:
        abstract_element = document_root.find('abstract')
        if abstract_element is not None:
            abstract = USPTOSanitizer.strip_for_csv(
                USPTOSanitizer.return_element_text(abstract_element))
        else:
            abstract = None
    except:
        abstract = None
    #print(abstract)

    # Find the description
    try:
        description = ""
        d_elem = document_root.find('description')
        if d_elem is not None:
            description += USPTOSanitizer.strip_for_csv(' '.join(
                d_elem.itertext()))
        else:
            description = None
    except Exception as e:
        description = None
        #traceback.print_exc()
        #logger.error("Exception while extracting description from " + str(app_no) + ": " + traceback.print_exc())
    #print(description)

    # Find the claims
    try:
        claims = ""
        c_elem = document_root.find('claims')
        if c_elem is not None:
            claims += USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
        else:
            claims = None
    except Exception as e:
        claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting claim from " + str(app_no) + ": " + traceback.print_exc())
    #print(claims)

    # Find the number of claims
    try:
        number_of_claims = 0
        for clms in c_elem.findall('claim'):
            number_of_claims += 1
    except Exception as e:
        number_of_claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting claim from " + str(app_no))
    #print(number_of_claims)

    # Find the number of drawings and figures
    try:
        number_of_figures = 0
        number_of_drawings = 0
        drw_elem = document_root.find('drawings')
        if drw_elem != None:
            for fg in drw_elem.findall('figure'):
                img_type = fg.find('img').attrib['img-content'].strip()
                if img_type == "drawing": number_of_drawings += 1
                elif img_type == "figure": number_of_figures += 1
        else:
            number_of_figures = None
            number_of_drawings = None
    except Exception as e:
        number_of_figures = None
        number_of_drawings = None
        #traceback.print_exc()
        #logger.error("Exception while extracting figures and drawings num " + str(app_no))
    #print(number_of_figures)
    #print(number_of_drawings)

    # Append SQL data into dictionary to be written later
    processed_application.append({
        "table_name": "uspto.APPLICATION",
        "ApplicationID": app_no,
        "PublicationID": document_id,
        "AppType": app_type,
        "Title": title,
        "FileDate": app_date,
        "PublishDate": pub_date,
        "Kind": kind,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": number_of_claims,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "Description": description,
        "Claims": claims,
        "FileName": args_array['file_name']
    })
    #print(processed_application)

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_application": processed_application,
        "processed_foreignpriority": processed_foreignpriority,
        "processed_applicant": processed_applicant,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass,
    }
예제 #13
0
    def insert_2005_grant_classifications(self, args_array, json_obj):
        """Insert patched 2005 CPC and IPC classification codes into the database.

        For each item in `json_obj`, parses the CPC and IPC code strings into
        dicts and hands them to the database connection for insertion against
        the grant identified by the item's publication number.

        Args:
            args_array: dict of runtime arguments; reads 'database_connection'.
            json_obj: iterable of dicts, each with 'publication_number',
                'cpc' and 'ipc' keys whose entries hold a 'code' string.

        Exits the process with status 0 on success or 1 on any failure.
        """

        # Start timer
        start_time = time.time()
        logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

        try:

            # Insert the items in the json object into the appropiate
            # patent grant in the database
            for item in json_obj:
                # Extract the patent grant ID from the document number
                publication_number = item['publication_number'].split("-")[1]
                # Create arrays to hold all CPC and IPC codes for the patent
                cpc_array = []
                ipc_array = []
                # Loop through all CPC codes for the item and
                # convert to dict
                for code in item['cpc']:
                    cpc_dict = USPTOSanitizer.extract_BQ_CPC_string_to_dict(
                        code['code'])
                    # Append the CPC dict to array
                    cpc_array.append(cpc_dict)
                # Loop through all IPC codes for the item and
                # convert to dict.  BUGFIX: this previously read
                # `itemp['ipc']` (undefined name), raising NameError on
                # every call so IPC codes were never inserted.
                for code in item['ipc']:
                    ipc_dict = USPTOSanitizer.extract_BQ_CPC_string_to_dict(
                        code['code'])
                    # Append the IPC dict to array
                    ipc_array.append(ipc_dict)

                # Pass the publication_number and cpc_array to SQL to be inserted
                args_array['database_connection'].insert_CPC_patched_item(
                    publication_number, cpc_array)
                # Pass the publication_number and ipc_array to SQL to be inserted
                args_array['database_connection'].insert_IPC_patched_item(
                    publication_number, ipc_array)

            print(
                "-- All 2005 classification codes inserted into database successfully"
            )
            logger.info(
                "-- All 2005 classification codes inserted into database successfully"
            )
            exit(0)

        except Exception as e:
            # If the insertion process failed then exit with status 1
            traceback.print_exc()
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Exception: " + str(exc_type) + " in Filename: " +
                         str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                         " Traceback: " + traceback.format_exc())
            print(
                "-- Failed to insert all 2005 classification codes inserted into database"
            )
            logger.info(
                "-- Failed to insert all 2005 classification codes inserted into database"
            )
            exit(1)
def extract_csv_line(args_array, line):
    """
    Map a single PAIR litigation CSV row into a dict keyed by column name.

    The column layout depends on args_array['extraction_type'] ("cases",
    "pacercases", "names", "attorneys" or "patents").  Every raw field is
    passed through USPTOSanitizer.clean_PAIR_csv_item before being stored.

    Returns the populated dict, ready to be written to csv or database.
    """
    extraction_type = args_array['extraction_type']
    # Hoist the sanitizer applied to every field
    clean = USPTOSanitizer.clean_PAIR_csv_item

    # Base metadata shared by every extraction type
    processed_array = {
        "table_name": set_table_name_from_type(extraction_type),
        "FileName": args_array['file_name'],
        "extraction_type": extraction_type
    }

    # Column-name -> CSV-index layout for each supported extraction type
    layouts = {
        "cases": [
            ("CaseID", 1), ("PacerID", 2), ("CourtTitle", 3),
            ("DistrictID", 4), ("CaseTitle", 5), ("AssignedTo", 6),
            ("ReferredTo", 7), ("Cause", 8), ("JurisdictionBasis", 9),
            ("FileDate", 10), ("CloseDate", 11), ("LastFileDate", 12),
            ("JuryDemand", 13), ("Demand", 14), ("LeadCase", 15),
            ("RelatedCase", 16), ("Settlement", 17), ("CaseIDRaw", 18),
            ("CaseType1", 19), ("CaseType2", 20), ("CaseType3", 21),
            ("CaseTypeNote", 22),
        ],
        "pacercases": [
            ("ApplicationID", 0), ("ParentApplicationID", 1),
            ("FileDate", 2), ("ContinuationType", 3),
        ],
        "names": [
            ("CaseID", 1), ("PartyType", 3), ("Name", 5),
        ],
        "attorneys": [
            ("CaseID", 1), ("CaseIDRaw", 2), ("PartyType", 4),
            ("Name", 6), ("ContactInfo", 7), ("Position", 8),
        ],
        "patents": [
            ("CaseID", 2), ("PacerID", 1), ("NOS", 4),
        ],
    }

    # Copy every straight-through column for this extraction type;
    # unknown types simply yield the base metadata dict.
    for column, index in layouts.get(extraction_type, []):
        processed_array[column] = clean(line[index])

    # Patent rows carry two extra fields; PatentID additionally has its
    # leading zeros stripped after sanitizing.
    if extraction_type == "patents":
        processed_array['PatentID'] = USPTOSanitizer.strip_leading_zeros(
            clean(line[11]))
        processed_array['PatentDocType'] = clean(line[12])

    # Return the array for storage
    return processed_array
# Example #15
def process_XML_application_content(args_array):
    """
    Parse one bulk USPTO patent-application XML archive and store its data.

    Extracts the XML payload from the zip file referenced by ``args_array``,
    splits it into individual well-formed application documents (format
    "aXML4" or "aXML1"), routes each document through the data extractor,
    and writes the results to csv files and/or the database according to
    args_array['command_args'].

    args_array : dict of run configuration (command_args, uspto_xml_format,
        document_type, file_name, url_link, csv/database settings, ...).
    Returns None; the outcome is reported through the process log, stdout
    and the module logger.
    """
    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or (
            "database" in args_array['command_args']
            and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(
            args_array['document_type'], args_array['file_name'],
            args_array['csv_directory'])

    # Time the processing of the whole archive
    start_time = time.time()

    # Extract the XML file from the ZIP file as an array of lines
    xml_file_contents = USPTOProcessZipFile.extract_zip_to_array(args_array)

    # Accumulates the XML of one patent application between its tags
    xml_string = ''
    patent_xml_started = False

    # Both supported formats are parsed identically; only the tag pair and
    # the HTML-character cleaner differ, so look them up once.
    # BUG FIX: the original looped over an undefined ``xml_file`` object;
    # the extracted ``xml_file_contents`` array is iterated instead.
    format_map = {
        "aXML4": ("<us-patent-application", "</us-patent-application",
                  USPTOSanitizer.replace_new_html_characters),
        "aXML1": ("<patent-application-publication",
                  "</patent-application-publication",
                  USPTOSanitizer.replace_old_html_characters),
    }
    if args_array['uspto_xml_format'] in format_map:
        start_tag, end_tag, clean_line = format_map[
            args_array['uspto_xml_format']]
        # Read through the lines and group them into well-formed XML
        # segments, sending each finished segment to be parsed
        for line in xml_file_contents:
            # Start of a single application's bibliographic XML
            if start_tag in line:
                patent_xml_started = True
                xml_string += line
            # End of the application's XML segment: extract and store
            elif end_tag in line:
                patent_xml_started = False
                xml_string += line
                # Call the function to extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(
                    xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(
                    processed_data_array, args_array)
                # Reset the xml string for the next application
                xml_string = ''
            # Append interior lines, cleaning HTML character entities
            elif patent_xml_started:
                xml_string += clean_line(line)

    # Close all the .csv files being written to.
    # (No .xml handle to close: the contents came from an in-memory array,
    # not an open file object.)
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if args_array['database_insert_mode'] == 'bulk':
        file_processed = args_array['database_connection'].load_csv_bulk_data(
            args_array, logger)

    # If the file was successfully processed into the database
    if file_processed:
        # Have the log file rewritten to mark this source file "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # The csv files were only a bulk-load staging area; remove them
            USPTOCSVHandler.delete_csv_files(args_array)

        # Print message to stdout and log
        # BUG FIX: Python 2 ``print`` statements converted to print() calls
        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        logger.info(
            '[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'
            .format(args_array['document_type'], args_array['url_link'],
                    time.time() - start_time, time.strftime("%c")))

    else:
        # Print failure message to stdout and log
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(
            args_array['document_type'], args_array['url_link'],
            time.time() - start_time, time.strftime("%c")))
        logger.info(
            '[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'
            .format(args_array['document_type'], args_array['url_link'],
                    time.time() - start_time, time.strftime("%c")))
# Example #16
def extract_XML2_grant(raw_data, args_array):

    #
    # Data documentation on the fields in XML2 Grant data can be found
    # in the /documents/data_descriptions/PatentGrantSGMLv19-Documentation.pdf file

    # Start timer
    start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays needed to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []
    processed_foreignpriority = []

    # Pass the raw data into Element tree xml object
    try:
        document_root = ET.fromstring(raw_data)
    except ET.ParseError as e:
        print_xml = raw_data.split("\n")
        for num, line in enumerate(print_xml, start=1):
            print(str(num) + ' : ' + line)
        logger.error(
            "Character Entity prevented ET from parsing XML in file: " +
            url_link)
        traceback.print_exc()
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " +
                     str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                     " Traceback: " + traceback.format_exc())

    # SDOBI is the bibliographic data
    r = document_root.find('SDOBI')
    if r is not None:
        # B100 Document Identification
        for B100 in r.findall('B100'):
            try:
                document_id = USPTOSanitizer.return_element_text(
                    B100.find('B110')).strip()
                document_id = USPTOSanitizer.fix_patent_number(
                    document_id)[:20]
            except:
                document_id = None
                logger.error("No Patent Number was found for: " + url_link)
            try:
                kind = USPTOSanitizer.return_element_text(
                    B100.find('B130')).strip()[:2]
                app_type = USPTOSanitizer.return_xml2_app_type(
                    args_array, kind).strip()
            except:
                kind = None
            # PATENT ISSUE DATE
            try:
                pub_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B100.find('B140')),
                    args_array, document_id)
            except:
                pub_date = None
            # B190 is Publishing Country or Organization
            # This is always US in Red Book Patent Grant documents and
            # this field is not stored or used.
            try:
                pub_country = USPTOSanitizer.return_element_text(
                    B100.find('B190')).strip()
            except:
                pub_country = None

        # B200 is Domestic Filing Data
        for B200 in r.findall('B200'):
            # TODO: find this in XML2 applications
            app_country = None
            # Application number
            try:
                app_no = USPTOSanitizer.return_element_text(
                    B200.find('B210')).strip()[:20]
            except:
                app_no = None
            # Application Date
            try:
                app_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B200.find('B220')),
                    args_array, document_id)
            except:
                app_date = None
            # Series Code
            try:
                series_code = USPTOSanitizer.return_element_text(
                    B200.find('B211US')).strip()[:2]
            except:
                series_code = None

        # Collect the Grant Length
        try:
            grant_length = USPTOSanitizer.return_element_text(
                r.find("B400").find("B472").find("B474")).strip()
        except:
            grant_length = None

        # Collect Technical information
        # such as classification and references
        # TODO: don't need the loop here
        for B500 in r.findall('B500'):
            # US Classification
            for B520 in B500.findall('B520'):
                position = 1
                # USCLASS
                for B521 in B520.findall('B521'):
                    # Reset the class vars
                    n_class = None
                    n_section = None
                    n_subclass = None
                    # Collect class vars
                    n_class_info = USPTOSanitizer.return_element_text(B521)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(
                        n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_usclass)
                    position += 1

                # B522 USCLASS FURTHER
                for B522 in B520.findall('B522'):
                    n_class_info = USPTOSanitizer.return_element_text(B522)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(
                        n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })
                    position += 1

            # B510 International Class data
            # TODO: check if I need to set all variables to empty or can just leave as null
            # TODO: check if classification is parsed correctly
            for B510 in B500.findall('B510'):
                #logger.warning("International Classifcation found in XML2: " + args_array['url_link'] + " document: " + str(document_id))
                # Reset position
                position = 1
                # B511 Main Class
                for B511 in B510.findall('B511'):
                    i_section = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    int_class = USPTOSanitizer.return_element_text(B511)
                    # Int Class is:
                    if (len(int_class.split()) > 1):
                        sec_1, sec_2 = int_class.split()
                        sec_1 = sec_1.strip()[:15]
                        # Remove the Section from first character
                        i_section = sec_1[0]
                        i_class = sec_1[1:3]
                        i_subclass = sec_1[-1]
                        i_class_mgr = sec_2.strip()[:-2]
                        i_class_sgr = sec_2.strip()[-2:]
                    else:
                        int_class = int_class.strip()[:15]
                        i_section = int_class[0]
                        i_class = int_class[1:]
                        i_subclass = int_class[-1]
                        i_malformed = 1

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Section":
                        i_section,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "Malformed":
                        i_malformed,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1

                # B512 Further International Class
                for B512 in B510.findall('B512'):
                    i_section = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    int_class = USPTOSanitizer.return_element_text(B512)
                    # Split class in to class and group
                    if (len(int_class.split()) > 1):
                        sec_1, sec_2 = int_class.split()
                        sec_1 = sec_1.strip()[:15]
                        # Remove the Section from first character
                        i_section = sec_1[0]
                        i_class = sec_1[1:3]
                        i_subclass = sec_1[-1]
                        i_class_mgr = sec_2.strip()[:-2]
                        i_class_sgr = sec_2.strip()[-2:]
                    else:
                        # TODO: Is this correct??
                        int_class = int_class.strip()[:15]
                        i_section = int_class[0]
                        i_class = int_class[1:]
                        i_subclass = int_class[-1]
                        i_malformed = 1

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Section":
                        i_section,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "Malformed":
                        i_malformed,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1

            # B540 Collect Title
            for B540 in B500.findall('B540'):
                try:
                    title = USPTOSanitizer.strip_for_csv(
                        USPTOSanitizer.return_element_text(B540)[:500])
                except:
                    title = None

            # Patent Citations
            for B560 in B500.findall('B560'):
                # Reset position counter for all citations loop
                position = 1
                # B561 is Patent Citation
                for B561 in B560.findall('B561'):

                    # TODO: find out how to do PCIT, DOC without loop.  Only B561 needs loop
                    pcit = B561.find('PCIT')
                    # Determien if the patent is US or not
                    #TODO: needs to check better, what does non US patent look like
                    # If all patents have PARTY-US then perhaps a databse call to check the country of origin
                    # would still allow to separate into GRACIT and FORPATCIT_G
                    #if PCIT.find("PARTY-US") == True:
                    #print "CITATION COUNTRY US"
                    #citation_country = "US"
                    #else:
                    #citation_country = "NON-US"
                    #logger.warning("NON US patent found")

                    #citation_country = "US"

                    # Declare items in case they are not found
                    citation_name = None
                    citation_city = None
                    citation_state = None
                    citation_country = None

                    doc = pcit.find('DOC')
                    if doc is not None:
                        try:
                            citation_document_number = USPTOSanitizer.return_element_text(
                                doc.find('DNUM')).strip()[:15]
                        except:
                            citation_document_number = None
                        try:
                            pct_kind = USPTOSanitizer.return_element_text(
                                doc.find('KIND')).strip()[:10]
                        except:
                            pct_kind = None
                        try:
                            citation_date = USPTOSanitizer.return_formatted_date(
                                USPTOSanitizer.return_element_text(
                                    doc.find('DATE')), args_array, document_id)
                        except:
                            citation_date = None
                        prt = pcit.find('PARTY-US')
                        if prt is not None:
                            try:
                                citation_name = USPTOSanitizer.return_element_text(
                                    prt.find("NAM").find("SNM")).strip()[:100]
                            except:
                                citation_name = None
                            # Citation Address info
                            try:
                                citation_city = USPTOSanitizer.return_element_text(
                                    prt.find('ADR').find('CITY')).strip()[:100]
                            except:
                                citation_city = None
                            try:
                                citation_state = USPTOSanitizer.return_element_text(
                                    prt.find('ADR').find('STATE')).strip()[:3]
                            except:
                                citation_state = None
                            # Citation country
                            try:
                                citation_country = USPTOSanitizer.return_element_text(
                                    prt.find("ADR").find('CTRY')).strip()[:3]
                            except:
                                try:
                                    # If state is a US state, set country to US
                                    if USPTOSanitizer.is_US_state(
                                            citation_state):
                                        citation_country = "US"
                                    else:
                                        citation_country = None
                                except:
                                    citation_country = None

                        # Parse citation category
                        if (len(B561.getchildren()) > 1):
                            try:
                                citation_category = B561.getchildren(
                                )[1].tag.replace("\n", "").replace("\r",
                                                                   "").upper()
                            except:
                                citation_category = None
                        else:
                            citation_category = None

                        #TODO: be aware that there may be something crazy in the
                        # citation document number
                        if pct_kind != None:

                            # Append SQL data into dictionary to be written later
                            processed_gracit.append({
                                "table_name":
                                "uspto.GRACIT_G",
                                "GrantID":
                                document_id,
                                "Position":
                                position,
                                "CitedID":
                                citation_document_number,
                                "Kind":
                                pct_kind,
                                "Name":
                                citation_name,
                                "Date":
                                citation_date,
                                "Country":
                                citation_country,
                                "Category":
                                citation_category,
                                "FileName":
                                args_array['file_name']
                            })
                            #print(processed_gracit)
                            position += 1

                        else:

                            # Append SQL data into dictionary to be written later
                            processed_forpatcit.append({
                                "table_name":
                                "uspto.FORPATCIT_G",
                                "GrantID":
                                document_id,
                                "Position":
                                position,
                                "CitedID":
                                citation_document_number,
                                "Kind":
                                pct_kind,
                                "Name":
                                citation_name,
                                "Date":
                                citation_date,
                                "Country":
                                citation_country,
                                "Category":
                                citation_category,
                                "FileName":
                                args_array['file_name']
                            })
                            #print(processed_forpatcit)
                            position += 1

                # Reset position counter for non-patent citations loop
                position = 1
                # Non-patent Literature Citations
                for B562 in B560.findall('B562'):
                    NCIT = B562.find('NCIT')
                    if NCIT is not None:
                        # Sometimes, there will be '<i> or <sup>, etc.' in the reference string; we need to remove it
                        non_patent_citation_text = USPTOSanitizer.return_element_text(
                            NCIT)
                        non_patent_citation_text = re.sub(
                            '<[^>]+>', '', non_patent_citation_text)
                    else:
                        non_patent_citation_text = None

                    # Parse citation category into code
                    if (len(B562.getchildren()) > 1):
                        try:
                            ncitation_category = B562.getchildren(
                            )[1].tag.replace("\n", "").replace("\r",
                                                               "").upper()
                        except:
                            ncitation_category = None
                    else:
                        ncitation_category = None

                    # Append SQL data into dictionary to be written later
                    processed_nonpatcit.append({
                        "table_name":
                        "uspto.NONPATCIT_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Citation":
                        non_patent_citation_text,
                        "Category":
                        ncitation_category,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_nonpatcit)
                    position += 1

            # Collect number of claims
            for B570 in B500.findall('B570'):
                try:
                    claims_num = USPTOSanitizer.return_element_text(
                        B570.find('B577')).strip()
                except:
                    claims_num = None

            # Collect number of drawings and figures
            for B590 in B500.findall('B590'):
                for B595 in B590.findall('B595'):
                    try:
                        number_of_drawings = USPTOSanitizer.return_element_text(
                            B595).strip()
                        number_of_drawings = number_of_drawings.split("/")[0]
                    except:
                        number_of_drawings = None
                for B596 in B590.findall('B596'):
                    try:
                        number_of_figures = USPTOSanitizer.return_element_text(
                            B596).strip()
                    except:
                        number_of_figures = None

            # TODO: B582 find out what it is.  Looks like patent classifications but it's all alone in the XML

        # B700 is Parties
        # TODO: find the applicant data and append to array
        for B700 in r.findall('B700'):
            # B720 Inventor
            for B720 in B700.findall('B720'):
                # Reset position for inventors
                position = 1
                # Collect inventor information
                for B721 in B720.findall('B721'):
                    for i in B721.findall('PARTY-US'):
                        # Inventor Name
                        try:
                            inventor_first_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('FNM')).strip()[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('SNM')).strip()[:100]
                        except:
                            inventor_last_name = None
                        # Inventor Address info
                        try:
                            inventor_city = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('CITY')).strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('STATE')).strip()[:3]
                        except:
                            inventor_state = None
                        # Inventor country
                        try:
                            inventor_country = USPTOSanitizer.return_element_text(
                                i.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # If state is a US state, set country to US
                                if USPTOSanitizer.is_US_state(inventor_state):
                                    inventor_country = "US"
                                else:
                                    inventor_country = None
                            except:
                                inventor_country = None
                        inventor_nationality = None
                        inventor_residence = None

                    # Append SQL data into dictionary to be written later
                    processed_inventor.append({
                        "table_name":
                        "uspto.INVENTOR_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "FirstName":
                        inventor_first_name,
                        "LastName":
                        inventor_last_name,
                        "City":
                        inventor_city,
                        "State":
                        inventor_state,
                        "Country":
                        inventor_country,
                        "Nationality":
                        inventor_nationality,
                        "Residence":
                        inventor_residence,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_inventor)
                    position += 1

            # B730 Assignee
            # TODO: check if finding child of child is working
            # Reset position for assignees
            position = 1
            for B730 in B700.findall('B730'):
                for B731 in B730.findall('B731'):
                    for x in B731.findall('PARTY-US'):
                        try:
                            asn_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM")).strip()[:500]
                        except:
                            asn_orgname = None
                        asn_role = None
                        try:
                            asn_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY')).strip()[:100]
                        except:
                            asn_city = None
                        try:
                            asn_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE')).strip()[:30]
                        except:
                            asn_state = None
                        # Assignee country
                        try:
                            asn_country = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # Fix country if country missing
                                if USPTOSanitizer.is_US_state(asn_state):
                                    asn_country = "US"
                                else:
                                    asn_country = None
                            except:
                                asn_country = None

                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name":
                        "uspto.ASSIGNEE_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "OrgName":
                        asn_orgname,
                        "Role":
                        asn_role,
                        "City":
                        asn_city,
                        "State":
                        asn_state,
                        "Country":
                        asn_country,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_assignee)
                    position += 1

            # B740 is Legal Agent / Attorney
            for B740 in B700.findall('B740'):
                # Reset position for agents
                position = 1
                for B741 in B740.findall('B741'):
                    for x in B741.findall('PARTY-US'):
                        # Attorney Organization
                        try:
                            agent_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM")).strip()[:300]
                        except:
                            agent_orgname = None
                        # Attorney Name
                        try:
                            agent_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:100]
                        except:
                            agent_last_name = None
                        try:
                            agent_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:100]
                        except:
                            agent_first_name = None
                        # Attorney Address information
                        try:
                            agent_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY')).strip()[:100]
                        except:
                            agent_city = None
                        try:
                            agent_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE')).strip()[:30]
                        except:
                            agent_state = None
                        # Agent country
                        try:
                            agent_country = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # Fix country if missing
                                if USPTOSanitizer.is_US_state(agent_state):
                                    agent_country = "US"
                                else:
                                    agent_country = None
                            except:
                                agent_country = None

                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name":
                            "uspto.AGENT_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "OrgName":
                            agent_orgname,
                            "LastName":
                            agent_last_name,
                            "FirstName":
                            agent_first_name,
                            "Country":
                            agent_country,
                            "FileName":
                            args_array['file_name']
                        })
                        #print(processed_agent)
                        position += 1

            # B745 Examiner
            for B745 in B700.findall('B745'):
                position = 1
                # Primary Examiner
                for B746 in B745.findall('B746'):
                    for x in B746.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_fist_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:50]
                        except:
                            examiner_fist_name = None
                        try:
                            examiner_department = USPTOSanitizer.return_element_text(
                                B745.find('B748US')).strip()[:50]
                        except:
                            examiner_department = None

                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name":
                            "uspto.EXAMINER_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "LastName":
                            examiner_last_name,
                            "FirstName":
                            examiner_fist_name,
                            "Department":
                            examiner_department,
                            "FileName":
                            args_array['file_name']
                        })
                        #print(processed_examiner)
                        position += 1

                # Assistant Examiner
                for B747 in B745.findall('B747'):
                    for x in B747.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_fist_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:50]
                        except:
                            examiner_fist_name = None
                        try:
                            examiner_department = USPTOSanitizer.return_element_text(
                                B745.find('B748US')).strip()[:50]
                        except:
                            examiner_department = None

                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name":
                            "uspto.EXAMINER_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "LastName":
                            examiner_last_name,
                            "FirstName":
                            examiner_fist_name,
                            "Department":
                            examiner_department,
                            "FileName":
                            args_array['file_name']
                        })
                        #print(processed_examiner)
                        position += 1

        # B300 Foreign priotiry data
        position = 1
        for B300 in r.findall('B300'):
            # Country
            try:
                pc_country = USPTOSanitizer.return_element_text(
                    B300.find('B330').find('CTRY')).strip()[:5]
            except:
                pc_country = None
            # Prority filing date
            try:
                pc_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(
                        B300.find('B320').find('DATE')).strip()[:45])
            except:
                pc_date = None
            # Prority document number
            try:
                pc_doc_num = USPTOSanitizer.return_element_text(
                    B300.find('B310').find('DNUM')).strip()[:45]
            except:
                pc_doc_dum = None

            # Set the fields that are not in gXML2
            pc_kind = None

            # Append SQL data into dictionary to be written later
            processed_foreignpriority.append({
                "table_name":
                "uspto.FOREIGNPRIORITY_G",
                "GrantID":
                document_id,
                "Position":
                position,
                "Kind":
                pc_kind,
                "Country":
                pc_country,
                "DocumentID":
                pc_doc_num,
                "PriorityDate":
                pc_date,
                "FileName":
                args_array['file_name']
            })
            #print(processed_foreignpriority)
            position += 1

        # Collect Abstract from data
        try:
            a_elem = document_root.find('SDOAB')
            if a_elem is not None:
                abstract = USPTOSanitizer.strip_for_csv(
                    USPTOSanitizer.return_element_text(a_elem))
            else:
                abstract = None
        except Exception as e:
            abstract = None
            #traceback.print_exc()
            #logger.error("Exception while extracting description from " + str(document_id) + ": " + traceback.print_exc())
        #print(abstract)

        # Collect detailed description from DETDESC
        try:
            d_elem = document_root.find('SDODE').find('DETDESC')
            if d_elem is not None:
                description = USPTOSanitizer.strip_for_csv(' '.join(
                    d_elem.itertext()))
            else:
                description = None
        except Exception as e:
            description = None
            #traceback.print_exc()
            #logger.error("Exception while extracting claim from " + str(document_id) + ": " + traceback.print_exc())
        #print(description)

        # Collect claims from data
        try:
            c_elem = document_root.find('SDOCL')
            if c_elem is not None:
                claims = USPTOSanitizer.strip_for_csv(' '.join(
                    c_elem.itertext()))
                #claims = USPTOSanitizer.strip_for_csv(USPTOSanitizer.return_element_text(c_elem))
            else:
                claims = None
        except Exception as e:
            claims = None
            #traceback.print_exc()
            #logger.error("Exception while extracting claim from " + str(document_id) + ": " + traceback.print_exc())
        #print(claims)

        # Append SQL data into dictionary to be written later
        processed_grant.append({
            "table_name": "uspto.GRANT",
            "GrantID": document_id,
            "Title": title,
            "Claims": claims,
            "Description": description,
            "IssueDate": pub_date,
            "Kind": kind,
            "GrantLength": grant_length,
            "USSeriesCode": series_code,
            "Abstract": abstract,
            "ClaimsNum": claims_num,
            "DrawingsNum": number_of_drawings,
            "FiguresNum": number_of_figures,
            "ApplicationID": app_no,
            "FileDate": app_date,
            "AppType": app_type,
            "FileName": args_array['file_name']
        })
        #print(processed_grant)

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit,
        "processed_foreignpriority": processed_foreignpriority
    }
def process_XML_grant_content(args_array):
    """Parse a bulk patent-grant XML archive and store each grant record.

    Extracts the XML file from the zip referenced in ``args_array``, splits
    the concatenated stream into one well-formed XML document per patent
    grant (gXML4 ``<us-patent-grant>`` or gXML2 ``<PATDOC>`` markup), routes
    each document through the data-extraction pipeline, and writes the
    results to csv files and/or the database.  When bulk database insertion
    is requested, the generated csv files are loaded at the end.

    Args:
        args_array: dict of runtime configuration (command_args, file_name,
            uspto_xml_format, csv_directory, database settings, etc.).

    Returns:
        Truthy bulk-load status on success, ``None`` when the bulk database
        load failed, or ``False`` when the zip file could not be extracted.
    """

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    if "database" in args_array["command_args"]:
        # Pass the database connection to variable
        database_connection = args_array['database_connection']

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], args_array['file_name'], args_array['csv_directory'])

    # Set the start time of operation
    start_time = time.time()

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If extraction failed (None) or was explicitly unsuccessful (False),
    # return immediately.  Identity checks are used deliberately so that an
    # empty-but-valid contents object is not mistaken for a failure.
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False
    # read through the file and append into groups of string.
    # Send the finished strings to be parsed
    # Use uspto_xml_format to determine file contents and parse accordingly
    #print "The xml format is: " + args_array['uspto_xml_format']
    if args_array['uspto_xml_format'] == "gXML4":

        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)

            # This identifies the start of well formed XML segment for patent
            # grant bibliographic information.  The attributes on the opening
            # tag are intentionally dropped by emitting a bare tag.
            if "<us-patent-grant" in line:
                patent_xml_started = True
                xml_string += "<us-patent-grant>"

            # This identifies end of well-formed XML segement for single patent
            # grant bibliographic information
            elif "</us-patent-grant" in line:

                patent_xml_started = False
                xml_string += line
                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreGrantData.store_grant_data(processed_data_array, args_array)

                # reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started:
                # Check which type of encoding should be used to fix the line string
                xml_string += USPTOSanitizer.replace_new_html_characters(line)

    # Used for gXML2 files
    elif args_array['uspto_xml_format'] == "gXML2":

        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)

            # This identifies the start of well formed XML segment for patent
            # grant bibliographic information
            if "<PATDOC" in line:
                patent_xml_started = True
                xml_string += "<PATDOC>"

                # Print line with number
                #print str(line_number) + " : " + line
                #line_number += 1

            # This identifies end of well-formed XML segement for single patent
            # grant bibliographic information
            elif "</PATDOC" in line:
                patent_xml_started = False
                xml_string += line

                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreGrantData.store_grant_data(processed_data_array, args_array)

                # reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started:
                # Check which type of encoding should be used to fix the line string
                xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Close all the open .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds
    # This should be true, in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        database_connection.remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Load CSV file into database
        # NOTE(review): the application-processing routine calls
        # load_csv_bulk_data(args_array, key, csv_file) per csv file; this
        # single-argument call presumably loads all files at once -- confirm
        # against the database handler's signature.
        file_processed = database_connection.load_csv_bulk_data(args_array)

    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Delete all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)

        # Print message to stdout and log
        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        # Return file_processed as success status
        return file_processed
    else:
        # Print message to stdout and log
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        # Return None as failed status during database insertion
        return None
예제 #18
0
def process_XML_application_content(args_array):
    """Parse a bulk patent-application XML archive and store each record.

    Extracts the XML file from the zip referenced in ``args_array``, splits
    the concatenated stream into one well-formed XML document per patent
    application (aXML4 ``<us-patent-application>`` or aXML1
    ``<patent-application-publication>`` markup), routes each document
    through the data-extraction pipeline, and writes the results to csv
    files and/or the database.  When bulk database insertion is requested,
    each generated csv file is loaded at the end.

    Args:
        args_array: dict of runtime configuration (command_args, file_name,
            uspto_xml_format, csv_directory, database settings, etc.).

    Returns:
        ``True`` on success, ``None`` when any bulk database load failed,
        or ``False`` when the zip file could not be extracted.
    """

    # Process zip file by getting .dat or .txt file and .xml filenames
    start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], args_array['file_name'], args_array['csv_directory'])

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If extraction failed (None) or was explicitly unsuccessful (False),
    # return immediately.  Identity checks are used deliberately so that an
    # empty-but-valid contents object is not mistaken for a failure.
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False
    # read through the file and append into groups of string.
    # Send the finished strings to be parsed
    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "aXML4":

        # Loop through all lines in the xml file
        for line in xml_file_contents:
            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)

            # This identifies the start of well formed XML segment for patent
            # application bibliographic information.  The attributes on the
            # opening tag are intentionally dropped by emitting a bare tag.
            if "<us-patent-application" in line:
                patent_xml_started = True
                xml_string += "<us-patent-application>"

            # This identifies end of well-formed XML segement for single patent
            # application bibliographic information
            elif "</us-patent-application" in line:
                patent_xml_started = False
                xml_string += "</us-patent-application>"

                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)
                # Reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started:
                xml_string += USPTOSanitizer.replace_new_html_characters(line)

    elif args_array['uspto_xml_format'] == "aXML1":

        line_count = 1

        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)

            # This identifies the start of well formed XML segment for patent
            # application bibliographic information
            if "<patent-application-publication" in line:
                patent_xml_started = True
                xml_string += "<patent-application-publication>"

            # This identifies end of well-formed XML segement for single patent
            # application bibliographic information
            elif "</patent-application-publication" in line:
                patent_xml_started = False
                xml_string += "</patent-application-publication>"

                # Call the function extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)
                # reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started:
                xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Close the all the .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds
    # This should be true, in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        args_array['database_connection'].remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Loop through each csv file and bulk copy into database
        for key, csv_file in args_array['csv_file_array'].items():
            # A single failed csv load marks the whole file as failed.
            # (Previously the status was overwritten each iteration, so an
            # early failure could be masked by a later success.)
            if not args_array['database_connection'].load_csv_bulk_data(args_array, key, csv_file):
                file_processed = False

    # If the file was successfully processed into the database
    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Close all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)

        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        # Return the file processed status
        return file_processed
    else:
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        # Return None to show database insertion failed
        return None
예제 #19
0
def extract_XML2_grant(raw_data, args_array):
    """Parse one XML2-format grant document into per-table record arrays.

    Args:
        raw_data: raw XML string for a single patent grant document.
        args_array: runtime configuration dict; this function reads
            'url_link', 'uspto_xml_format' and 'file_name'.

    Returns:
        A dict mapping "processed_<table>" keys to lists of record dicts
        (each record carries its destination "table_name"), or None when
        the raw XML cannot be parsed.
    """

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays needed to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []

    # Start timer
    start_time = time.time()

    try:
        # Pass the raw data into Element tree xml object
        patent_root = ET.fromstring(raw_data)
    except ET.ParseError as e:
        # Dump the numbered XML to stdout to help locate the offending entity
        print_xml = raw_data.split("\n")
        for num, line in enumerate(print_xml, start=1):
            print(str(num) + ' : ' + line)
        logger.error(
            "Character Entity prevented ET from parsing XML in file: " +
            url_link)
        # Print traceback
        traceback.print_exc()
        # Print exception information to file
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " +
                     str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                     " Traceback: " + traceback.format_exc())
        # BUGFIX: previously execution fell through and raised NameError on
        # patent_root below; return None so the caller sees a failed parse
        return None

    # Start the parsing process for XML
    for r in patent_root.findall('SDOBI'):

        # Collect document data
        for B100 in r.findall('B100'):  # GRANT
            try:
                document_id = USPTOSanitizer.return_element_text(B100.find('B110'))
                document_id = USPTOSanitizer.fix_patent_number(document_id)[:20]
            except:
                document_id = None
                logger.error("No Patent Number was found for: " + url_link)
            try:
                kind = USPTOSanitizer.return_element_text(B100.find('B130'))[:2]
            except:
                kind = None
            try:
                # PATENT ISSUE DATE
                pub_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B100.find('B140')),
                    args_array, document_id)
            except:
                pub_date = None
            try:
                # PATENT APPLICANT COUNTRY??
                pub_country = USPTOSanitizer.return_element_text(B100.find('B190'))
            except:
                pub_country = None

        # Collect application data in document
        for B200 in r.findall('B200'):  # APPLICATION
            # TODO: find these datas in XML2 applications
            app_type = None
            app_country = None
            try:
                app_no = USPTOSanitizer.return_element_text(B200.find('B210'))[:20]
            except:
                app_no = None
            try:
                # APPLICATION DATE
                app_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B200.find('B220')),
                    args_array, document_id)
            except:
                app_date = None
            try:
                series_code = USPTOSanitizer.return_element_text(B200.find('B211US'))[:2]
            except:
                series_code = None

        # Collect the grant length
        grant_length = USPTOSanitizer.return_element_text(r.find("B474"))

        # Collect US classification
        for B500 in r.findall('B500'):
            for B520 in B500.findall('B520'):  # US CLASSIFICATION
                position = 1
                for B521 in B520.findall('B521'):  # USCLASS MAIN
                    n_class_info = USPTOSanitizer.return_element_text(B521)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })

                    position += 1
                for B522 in B520.findall('B522'):  # USCLASS FURTHER
                    n_class_info = USPTOSanitizer.return_element_text(B522)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name": "uspto.USCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Class": n_class_main,
                        "SubClass": n_subclass,
                        "FileName": args_array['file_name']
                    })

                    position += 1

            # Collect International Class data
            # TODO: check if I need to set all variables to empty or can just leave as null
            # TODO: check if classification is parsed correctly
            for B510 in B500.findall('B510'):  # INTERNATIONAL CLASS
                # Reset position
                position = 1
                for B511 in B510.findall('B511'):  # MAIN CLASS
                    i_class_version_date = None
                    i_class_action_date = None
                    i_class_gnr = None
                    i_class_level = None
                    i_class_sec = None
                    int_class = USPTOSanitizer.return_element_text(B511)
                    # TODO: check international classification and rewrite this parsing piece.
                    if (len(int_class.split()) > 1):
                        i_class, i_subclass = int_class.split()
                        i_class = i_class[:15]
                        i_subclass = i_subclass[:15]
                    else:
                        i_class = int_class[:15]
                        i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_class_sps = None
                    i_class_val = None
                    i_class_status = None
                    i_class_ds = None

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "FileName": args_array['file_name']
                    })

                    position += 1

                # INTERNATIONAL CLASS FURTHER
                # BUGFIX: was findall('B511'), which re-read the main classes
                # and never collected the B512 "further" classifications
                for B512 in B510.findall('B512'):
                    i_class_version_date = None
                    i_class_action_date = None
                    i_class_gnr = None
                    i_class_level = None
                    i_class_sec = None
                    int_class = USPTOSanitizer.return_element_text(B512)
                    # TODO: splitting int class does not include possible multiple subclasses
                    if (len(int_class.split()) > 1):
                        i_class = int_class.split()[0][:15]
                        i_subclass = int_class.split()[1][:15]
                    else:
                        i_class = int_class[:15]
                        i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_class_sps = None
                    i_class_val = None
                    i_class_status = None
                    i_class_ds = None

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name": "uspto.INTCLASS_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Section": i_class_sec,
                        "Class": i_class,
                        "SubClass": i_subclass,
                        "MainGroup": i_class_mgr,
                        "SubGroup": i_class_sgr,
                        "FileName": args_array['file_name']
                    })

                    position += 1

            # Collect Title
            for B540 in B500.findall('B540'):
                try:
                    title = USPTOSanitizer.return_element_text(B540)[:500]
                except:
                    title = None

            # Collect Citations
            for B560 in B500.findall('B560'):  # CITATIONS

                # Reset position counter for all citations loop
                position = 1

                for B561 in B560.findall('B561'):  # PATCIT
                    # TODO: find out how to do PCIT, DOC without loop.  Only B561 needs loop
                    PCIT = B561.find('PCIT')
                    # Determine if the patent is US or not
                    # TODO: needs to check better, what does non US patent look like
                    # If all patents have PARTY-US then perhaps a database call to check the
                    # country of origin would still allow to separate into GRACIT and FORPATCIT_G
                    citation_country = "US"

                    DOC = PCIT.find('DOC')
                    try:
                        citation_document_number = USPTOSanitizer.return_element_text(
                            DOC.find('DNUM'))[:15]
                    except:
                        citation_document_number = None
                    try:
                        pct_kind = USPTOSanitizer.return_element_text(DOC.find('KIND'))[:10]
                    except:
                        pct_kind = None
                    try:
                        # BUGFIX: args_array/document_id were passed to
                        # return_element_text instead of return_formatted_date,
                        # which always raised and forced citation_date to None
                        citation_date = USPTOSanitizer.return_formatted_date(
                            USPTOSanitizer.return_element_text(DOC.find('DATE')),
                            args_array, document_id)
                    except:
                        citation_date = None
                    try:
                        citation_name = USPTOSanitizer.return_element_text(
                            PCIT.find('PARTY-US'))[:100]
                    except:
                        citation_name = None

                    # Parse citation category
                    # NOTE: getchildren() was removed in Python 3.9; list(elem)
                    # is the supported equivalent
                    B561_children = list(B561)
                    if (len(B561_children) > 1):
                        citation_category = B561_children[1].tag.replace(
                            "\n", "").replace("\r", "")
                        # TODO: check that the citation category tag matches correctly
                        if "CITED-BY-EXAMINER" in citation_category:
                            citation_category = 1
                        elif "CITED-BY-OTHER" in citation_category:
                            citation_category = 2
                        else:
                            citation_category = 0
                            logger.warning("Cited by unknown type")
                    else:
                        citation_category = None

                    # TODO: be aware that there may be something crazy in the citation document number
                    if citation_country == "US":

                        # Append SQL data into dictionary to be written later
                        processed_gracit.append({
                            "table_name": "uspto.GRACIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })

                        position += 1

                    else:

                        # Append SQL data into dictionary to be written later
                        processed_forpatcit.append({
                            "table_name": "uspto.FORPATCIT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "CitedID": citation_document_number,
                            "Kind": pct_kind,
                            "Name": citation_name,
                            "Date": citation_date,
                            "Country": citation_country,
                            "Category": citation_category,
                            "FileName": args_array['file_name']
                        })

                        position += 1

                # Reset position counter for non-patent citations loop
                position = 1
                for B562 in B560.findall('B562'):  # NON-PATENT LITERATURE
                    for NCIT in B562.findall('NCIT'):
                        # sometimes, there will be '<i> or <sup>, etc.' in the
                        # reference string; we need to remove it
                        non_patent_citation_text = USPTOSanitizer.return_element_text(NCIT)
                        non_patent_citation_text = re.sub(
                            '<[^>]+>', '', non_patent_citation_text)

                        # parse citation category into code
                        # BUGFIX: ET.tostring returns bytes on Python 3; decode so
                        # the substring checks below cannot raise TypeError
                        ncitation_category = ET.tostring(NCIT).decode('utf-8', 'ignore')
                        B562_children = list(B562)
                        if (len(B562_children) > 1):
                            ncitation_category = B562_children[1].tag.replace(
                                "\n", "").replace("\r", "")
                        if "CITED-BY-EXAMINER" in ncitation_category:
                            ncitation_category = 1
                        elif "CITED-BY-OTHER" in ncitation_category:
                            ncitation_category = 2
                        else:
                            ncitation_category = 0

                    # Append SQL data into dictionary to be written later
                    # NOTE(review): appended once per B562 using the values from
                    # its last NCIT child — confirm a B562 never holds multiple NCITs
                    processed_nonpatcit.append({
                        "table_name": "uspto.NONPATCIT_G",
                        "GrantID": document_id,
                        "Position": position,
                        "Citation": non_patent_citation_text,
                        "Category": ncitation_category,
                        "FileName": args_array['file_name']
                    })

                    position += 1

            # Collect number of claims
            for B570 in B500.findall('B570'):
                try:
                    claims_num = USPTOSanitizer.return_element_text(B570.find('B577'))
                except:
                    claims_num = None

            # Collect number of drawings and figures
            for B590 in B500.findall('B590'):
                for B595 in B590.findall('B595'):
                    try:
                        number_of_drawings = USPTOSanitizer.return_element_text(B595)
                        number_of_drawings = number_of_drawings.split("/")[0]
                    except:
                        number_of_drawings = None
                for B596 in B590.findall('B596'):
                    try:
                        number_of_figures = USPTOSanitizer.return_element_text(B596)
                    except:
                        number_of_figures = None

            # TODO: B582 find out what it is.  Looks like patent classifications but it's all alone in the XML

        # Collect party information
        # TODO: find the applicant data and append to array
        for B700 in r.findall('B700'):  # PARTIES

            # Collect inventor data
            for B720 in B700.findall('B720'):  # INVENTOR
                # Reset position for inventors
                position = 1

                # Collect inventor information
                for B721 in B720.findall('B721'):
                    for i in B721.findall('PARTY-US'):
                        itSequence = position
                        try:
                            inventor_first_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('FNM'))[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('SNM'))[:100]
                        except:
                            inventor_last_name = None
                        try:
                            inventor_city = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('CITY'))[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('STATE'))[:100]
                        except:
                            inventor_state = None
                        # TODO: find out if country can be other than US
                        inventor_country = "US"
                        inventor_nationality = None
                        inventor_residence = None

                    # Append SQL data into dictionary to be written later
                    processed_inventor.append({
                        "table_name": "uspto.INVENTOR_G",
                        "GrantID": document_id,
                        "Position": position,
                        "FirstName": inventor_first_name,
                        "LastName": inventor_last_name,
                        "City": inventor_city,
                        "State": inventor_state,
                        "Country": inventor_country,
                        "Nationality": inventor_nationality,
                        "Residence": inventor_residence,
                        "FileName": args_array['file_name']
                    })

                    position += 1

            # Collect Assignee data
            # TODO: check if finding child of child is working
            # Reset position for assignees
            position = 1
            for B730 in B700.findall('B730'):
                for B731 in B730.findall('B731'):
                    for x in B731.findall('PARTY-US'):
                        try:
                            asn_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM"))[:500]
                        except:
                            asn_orgname = None
                        asn_role = None
                        try:
                            asn_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY'))[:100]
                        except:
                            asn_city = None
                        try:
                            asn_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE'))[:100]
                        except:
                            asn_state = None
                        # TODO: find out if country is always US because it's never included.  Check all other references also
                        asn_country = "US"

                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name": "uspto.ASSIGNEE_G",
                        "GrantID": document_id,
                        "Position": position,
                        "OrgName": asn_orgname,
                        "Role": asn_role,
                        "City": asn_city,
                        "State": asn_state,
                        "Country": asn_country,
                        "FileName": args_array['file_name']
                    })

                    # Increment the position placement
                    position += 1

            # Collect agent data
            for B740 in B700.findall('B740'):  # AGENT
                # Reset position for agents
                position = 1
                for B741 in B740.findall('B741'):
                    for x in B741.findall('PARTY-US'):
                        try:
                            agent_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM"))[:300]
                        except:
                            agent_orgname = None
                        # BUGFIX: these two lookups read the stale inventor loop
                        # variable `i` instead of `x`, and mapped FNM (first name)
                        # to last name and SNM (surname) to first name; mapping now
                        # matches the inventor block above
                        try:
                            agent_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM'))[:100]
                        except:
                            agent_last_name = None
                        try:
                            agent_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM'))[:100]
                        except:
                            agent_first_name = None
                        agent_country = "US"

                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name": "uspto.AGENT_G",
                            "GrantID": document_id,
                            "Position": position,
                            "OrgName": agent_orgname,
                            "LastName": agent_last_name,
                            "FirstName": agent_first_name,
                            "Country": agent_country,
                            "FileName": args_array['file_name']
                        })

                        position += 1

            # Collect examiner data
            for B745 in B700.findall('B745'):
                position = 1
                # Primary Examiner
                for B746 in B745.findall('B746'):
                    for x in B746.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM'))[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_fist_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM'))[:50]
                        except:
                            examiner_fist_name = None
                        # TODO: find out if 748US is the department
                        examiner_department = None

                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_fist_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })

                        position += 1

                # Assistant Examiner
                for B747 in B745.findall('B747'):
                    for x in B747.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM'))[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_fist_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM'))[:50]
                        except:
                            examiner_fist_name = None
                        # TODO: find out if 748US is the department
                        examiner_department = None

                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name": "uspto.EXAMINER_G",
                            "GrantID": document_id,
                            "Position": position,
                            "LastName": examiner_last_name,
                            "FirstName": examiner_fist_name,
                            "Department": examiner_department,
                            "FileName": args_array['file_name']
                        })

                        position += 1

        # Collect Abstract from data
        try:
            abstr = patent_root.find('SDOAB')
            abstract = USPTOSanitizer.return_element_text(abstr)
        except:
            abstract = None

        # Collect claims from data
        try:
            cl = patent_root.find('SDOCL')
            claims = USPTOSanitizer.return_element_text(cl)
        except:
            traceback.print_exc()
            claims = None

        # Append SQL data into dictionary to be written later
        # NOTE(review): these locals are only bound inside the section loops
        # above — a document missing e.g. B540 raises NameError here; confirm
        # XML2 grants always contain those sections
        processed_grant.append({
            "table_name": "uspto.GRANT",
            "GrantID": document_id,
            "Title": title,
            "IssueDate": pub_date,
            "Kind": kind,
            "GrantLength": grant_length,
            "USSeriesCode": series_code,
            "Abstract": abstract,
            "ClaimsNum": claims_num,
            "DrawingsNum": number_of_drawings,
            "FiguresNum": number_of_figures,
            "ApplicationID": app_no,
            "Claims": claims,
            "FileDate": app_date,
            "AppType": app_type,
            "FileName": args_array['file_name']
        })

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit
    }
예제 #20
0
def extract_XML2_grant_tag_counts(args_array):
    """Count per-table XML tags in an XML2 (PATDOC) patent grant file.

    Extracts the XML file named in ``args_array`` from its ZIP archive,
    counts occurrences of the raw tags associated with each destination
    table, then XML-parses each ``<PATDOC>`` segment to classify patent
    citations as US-issued (GRACIT_G) or foreign (FORPATCIT_G), which
    cannot be distinguished by tag name alone.

    Parameters:
        args_array -- dict of runtime arguments; this function reads
            'file_name' and 'stdout_level', and passes the whole dict to
            USPTOProcessZipFile.extract_xml_file_from_zip().

    Returns:
        dict of counts keyed by table name (plus 'file_name'), or
        False when the XML contents could not be extracted.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # Bail out immediately if extraction failed (returns None or False)
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # Raw tags counted by simple substring matching, keyed by table.
    # NOTE: CPCCLASS_G and APPLICANT_G are not available in XML2 Grant files
    tags_dict = {
        "GRANT" : ["<PATDOC"],
        "INTCLASS_G" : ["<B510"],
        "USCLASS_G" : ["<B521", "<B522"],
        "INVENTOR_G" : ["<B721"],
        "AGENT_G" : ["<B740"],
        "ASSIGNEE_G" : ["<B730"],
        "NONPATCIT_G" : ["<B562"],
        "EXAMINER_G" : ["<B746", "<B747"],
        "FOREIGNPRIORITY_G" : ["<B310"]
    }

    # Accumulator of counts by table; tables absent from tags_dict
    # (CPCCLASS_G, APPLICANT_G) stay 0, citations are filled in below.
    counts_dict = {
        "file_name" : args_array['file_name'],
        "GRANT" : 0,
        "INTCLASS_G" : 0,
        "CPCCLASS_G" : 0,
        "USCLASS_G" : 0,
        "INVENTOR_G" : 0,
        "AGENT_G" : 0,
        "ASSIGNEE_G" : 0,
        "APPLICANT_G" : 0,
        "NONPATCIT_G" : 0,
        "EXAMINER_G" : 0,
        "GRACIT_G" : 0,
        "FORPATCIT_G" : 0,
        "FOREIGNPRIORITY_G" : 0
    }

    # Print to stdout and log
    print("-- Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # First pass: count simple tag occurrences line by line.
    # A table is incremented at most once per line, even if several of its
    # tags appear on that line (matches the original item_found semantics).
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        for table, tags in tags_dict.items():
            if any(tag in line for tag in tags):
                counts_dict[table] += 1

    # Second pass: citations need real XML parsing, not raw-tag counting.
    # NOTE(review): this assumes xml_file_contents is re-iterable (e.g. a
    # list of lines), not a one-shot generator/file object — if it were a
    # generator the first loop would exhaust it and this pass would see
    # nothing. TODO confirm against extract_xml_file_from_zip().
    xml_string = ''
    patent_xml_started = False
    for line in xml_file_contents:

        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)

        # This identifies the start of a well-formed XML segment for one
        # patent grant's bibliographic information
        if "<PATDOC" in line:
            patent_xml_started = True
            xml_string += "<PATDOC>"

        # This identifies the end of the segment: parse what we collected
        elif "</PATDOC" in line:
            patent_xml_started = False
            xml_string += "</PATDOC>"
            try:
                document_root = ET.fromstring(xml_string)
                # SDOBI is the bibliographic data
                r = document_root.find('SDOBI')
                # B500 holds the citation groups
                B500 = r.find('B500')
                if B500 is not None:
                    for B560 in B500.findall('B560'):
                        # B561 is a Patent Citation
                        for B561 in B560.findall('B561'):
                            # find() returns None when absent, so the
                            # chained .find raises AttributeError
                            try:
                                pcit = B561.find('PCIT').find('DOC')
                            except AttributeError:
                                pcit = None
                            if pcit is not None:
                                prt = pcit.find('PARTY-US')
                                try:
                                    citation_state = USPTOSanitizer.return_element_text(prt.find('ADR').find('STATE')).strip()[:3]
                                except:
                                    citation_state = None
                                try:
                                    citation_country = USPTOSanitizer.return_element_text(prt.find("ADR").find('CTRY')).strip()[:3]
                                except:
                                    try:
                                        # If state is a US state, set country to US
                                        if USPTOSanitizer.is_US_state(citation_state):
                                            citation_country = "US"
                                        else:
                                            citation_country = None
                                    except:
                                        citation_country = None
                                # Missing country is assumed to be a US citation
                                if citation_country == "US" or citation_country is None:
                                    counts_dict['GRACIT_G'] += 1
                                else:
                                    counts_dict['FORPATCIT_G'] += 1

            except ET.ParseError:
                # Log the offending segment line by line for diagnosis.
                # Use a distinct name so the outer `line` is not shadowed.
                print_xml = xml_string.split("\n")
                for num, err_line in enumerate(print_xml, start = 1):
                    logger.error(str(num) + ' : ' + err_line)
                logger.error("Character Entity prevented ET from parsing XML in file: " + args_array['file_name'] )
                traceback.print_exc()
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())
            finally:
                # BUGFIX: always reset the buffer. Previously it was reset
                # only on a successful parse, so one malformed segment left
                # its bad XML in xml_string and corrupted the parse of every
                # subsequent <PATDOC> segment in the file.
                xml_string = ''

        # Inside a segment: accumulate the line, fixing legacy HTML entities
        elif patent_xml_started:
            xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Print to stdout and log
    print("-- Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Return the dictionary of counts for found tags
    if args_array['stdout_level'] == 1: pprint(counts_dict)
    return counts_dict