Example #1
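# Assumed module-level imports (a sketch; the original module presumably imports
# these standard-library modules plus the project-local USPTOLogger and
# USPTOSanitizer helpers referenced below).
import os
import re
import sys
import time
import traceback
import xml.etree.ElementTree as ET

import USPTOLogger
import USPTOSanitizer
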
def extract_XML2_grant(raw_data, args_array):

    # Data documentation on the fields in XML2 Grant data can be found in the
    # /documents/data_descriptions/PatentGrantSGMLv19-Documentation.pdf file

    # Start timer
    start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define all arrays needed to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []
    processed_foreignpriority = []

    # Pass the raw data into Element tree xml object
    try:
        document_root = ET.fromstring(raw_data)
    except ET.ParseError as e:
        print_xml = raw_data.split("\n")
        for num, line in enumerate(print_xml, start=1):
            print(str(num) + ' : ' + line)
        logger.error(
            "Character Entity prevented ET from parsing XML in file: " +
            url_link)
        traceback.print_exc()
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " +
                     str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                     " Traceback: " + traceback.format_exc())

    # SDOBI is the bibliographic data
    r = document_root.find('SDOBI')
    if r is not None:
        # B100 Document Identification
        for B100 in r.findall('B100'):
            try:
                document_id = USPTOSanitizer.return_element_text(
                    B100.find('B110')).strip()
                document_id = USPTOSanitizer.fix_patent_number(
                    document_id)[:20]
            except:
                document_id = None
                logger.error("No Patent Number was found for: " + url_link)
            try:
                kind = USPTOSanitizer.return_element_text(
                    B100.find('B130')).strip()[:2]
                app_type = USPTOSanitizer.return_xml2_app_type(
                    args_array, kind).strip()
            except:
                kind = None
                app_type = None
            # PATENT ISSUE DATE
            try:
                pub_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B100.find('B140')),
                    args_array, document_id)
            except:
                pub_date = None
            # B190 is Publishing Country or Organization
            # This is always US in Red Book Patent Grant documents and
            # this field is not stored or used.
            try:
                pub_country = USPTOSanitizer.return_element_text(
                    B100.find('B190')).strip()
            except:
                pub_country = None

        # B200 is Domestic Filing Data
        for B200 in r.findall('B200'):
            # TODO: find this in XML2 applications
            app_country = None
            # Application number
            try:
                app_no = USPTOSanitizer.return_element_text(
                    B200.find('B210')).strip()[:20]
            except:
                app_no = None
            # Application Date
            try:
                app_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B200.find('B220')),
                    args_array, document_id)
            except:
                app_date = None
            # Series Code
            try:
                series_code = USPTOSanitizer.return_element_text(
                    B200.find('B211US')).strip()[:2]
            except:
                series_code = None

        # Collect the Grant Length
        try:
            grant_length = USPTOSanitizer.return_element_text(
                r.find("B400").find("B472").find("B474")).strip()
        except:
            grant_length = None

        # Collect technical information such as classification and references
        # TODO: don't need the loop here
        for B500 in r.findall('B500'):
            # US Classification
            for B520 in B500.findall('B520'):
                position = 1
                # USCLASS
                for B521 in B520.findall('B521'):
                    # Reset the class vars
                    n_class = None
                    n_section = None
                    n_subclass = None
                    # Collect class vars
                    n_class_info = USPTOSanitizer.return_element_text(B521)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(
                        n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_usclass)
                    position += 1

                # B522 USCLASS FURTHER
                for B522 in B520.findall('B522'):
                    n_class_info = USPTOSanitizer.return_element_text(B522)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(
                        n_class_info)
                    n_class_main = n_class_main.strip()[:5]
                    n_subclass = n_subclass.strip()[:15]

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })
                    position += 1

            # B510 International Class data
            # TODO: check whether all variables need to be reset or can just be left as null
            # TODO: check if classification is parsed correctly
            for B510 in B500.findall('B510'):
                #logger.warning("International Classifcation found in XML2: " + args_array['url_link'] + " document: " + str(document_id))
                # Reset position
                position = 1
                # B511 Main Class
                for B511 in B510.findall('B511'):
                    i_section = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    int_class = USPTOSanitizer.return_element_text(B511)
                    # A well-formed international class string holds both the
                    # class and the group parts separated by whitespace
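                    # Illustrative assumption: a value like "B32B 1508" would be split
                    # by the slicing below into section "B", class "32", subclass "B",
                    # main group "15" and subgroup "08"; single-token values are parsed
                    # partially and flagged as malformed instead.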
                    if (len(int_class.split()) > 1):
                        sec_1, sec_2 = int_class.split()
                        sec_1 = sec_1.strip()[:15]
                        # Remove the Section from first character
                        i_section = sec_1[0]
                        i_class = sec_1[1:3]
                        i_subclass = sec_1[-1]
                        i_class_mgr = sec_2.strip()[:-2]
                        i_class_sgr = sec_2.strip()[-2:]
                    else:
                        int_class = int_class.strip()[:15]
                        i_section = int_class[0]
                        i_class = int_class[1:]
                        i_subclass = int_class[-1]
                        i_malformed = 1

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Section":
                        i_section,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "Malformed":
                        i_malformed,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1

                # B512 Further International Class
                for B512 in B510.findall('B512'):
                    i_section = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    int_class = USPTOSanitizer.return_element_text(B512)
                    # Split the class text into class and group parts
                    if (len(int_class.split()) > 1):
                        sec_1, sec_2 = int_class.split()
                        sec_1 = sec_1.strip()[:15]
                        # Remove the Section from first character
                        i_section = sec_1[0]
                        i_class = sec_1[1:3]
                        i_subclass = sec_1[-1]
                        i_class_mgr = sec_2.strip()[:-2]
                        i_class_sgr = sec_2.strip()[-2:]
                    else:
                        # TODO: Is this correct??
                        int_class = int_class.strip()[:15]
                        i_section = int_class[0]
                        i_class = int_class[1:]
                        i_subclass = int_class[-1]
                        i_malformed = 1

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Section":
                        i_section,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "Malformed":
                        i_malformed,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1

            # B540 Collect Title
            for B540 in B500.findall('B540'):
                try:
                    title = USPTOSanitizer.strip_for_csv(
                        USPTOSanitizer.return_element_text(B540)[:500])
                except:
                    title = None

            # Patent Citations
            for B560 in B500.findall('B560'):
                # Reset position counter for all citations loop
                position = 1
                # B561 is Patent Citation
                for B561 in B560.findall('B561'):

                    # TODO: find out how to handle PCIT and DOC without a loop; only B561 needs a loop
                    pcit = B561.find('PCIT')
                    # Determine whether the patent is US or not
                    # TODO: needs a better check; what does a non-US patent look like?
                    # If all patents have PARTY-US then perhaps a database call to check
                    # the country of origin would still allow separating into GRACIT and FORPATCIT_G
                    #if PCIT.find("PARTY-US") == True:
                    #print "CITATION COUNTRY US"
                    #citation_country = "US"
                    #else:
                    #citation_country = "NON-US"
                    #logger.warning("NON US patent found")

                    #citation_country = "US"

                    # Declare items in case they are not found
                    citation_name = None
                    citation_city = None
                    citation_state = None
                    citation_country = None

                    doc = pcit.find('DOC')
                    if doc is not None:
                        try:
                            citation_document_number = USPTOSanitizer.return_element_text(
                                doc.find('DNUM')).strip()[:15]
                        except:
                            citation_document_number = None
                        try:
                            pct_kind = USPTOSanitizer.return_element_text(
                                doc.find('KIND')).strip()[:10]
                        except:
                            pct_kind = None
                        try:
                            citation_date = USPTOSanitizer.return_formatted_date(
                                USPTOSanitizer.return_element_text(
                                    doc.find('DATE')), args_array, document_id)
                        except:
                            citation_date = None
                        prt = pcit.find('PARTY-US')
                        if prt is not None:
                            try:
                                citation_name = USPTOSanitizer.return_element_text(
                                    prt.find("NAM").find("SNM")).strip()[:100]
                            except:
                                citation_name = None
                            # Citation Address info
                            try:
                                citation_city = USPTOSanitizer.return_element_text(
                                    prt.find('ADR').find('CITY')).strip()[:100]
                            except:
                                citation_city = None
                            try:
                                citation_state = USPTOSanitizer.return_element_text(
                                    prt.find('ADR').find('STATE')).strip()[:3]
                            except:
                                citation_state = None
                            # Citation country
                            try:
                                citation_country = USPTOSanitizer.return_element_text(
                                    prt.find("ADR").find('CTRY')).strip()[:3]
                            except:
                                try:
                                    # If state is a US state, set country to US
                                    if USPTOSanitizer.is_US_state(
                                            citation_state):
                                        citation_country = "US"
                                    else:
                                        citation_country = None
                                except:
                                    citation_country = None

                        # Parse citation category
                        if (len(list(B561)) > 1):
                            try:
                                citation_category = list(B561)[1].tag.replace(
                                    "\n", "").replace("\r", "").upper()
                            except:
                                citation_category = None
                        else:
                            citation_category = None

                        #TODO: be aware that there may be something crazy in the
                        # citation document number
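                        # Heuristic used below: citations carrying a kind code are
                        # stored as US grant citations (GRACIT_G), while citations
                        # without one are treated as foreign patent citations
                        # (FORPATCIT_G).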
                        if pct_kind is not None:

                            # Append SQL data into dictionary to be written later
                            processed_gracit.append({
                                "table_name":
                                "uspto.GRACIT_G",
                                "GrantID":
                                document_id,
                                "Position":
                                position,
                                "CitedID":
                                citation_document_number,
                                "Kind":
                                pct_kind,
                                "Name":
                                citation_name,
                                "Date":
                                citation_date,
                                "Country":
                                citation_country,
                                "Category":
                                citation_category,
                                "FileName":
                                args_array['file_name']
                            })
                            #print(processed_gracit)
                            position += 1

                        else:

                            # Append SQL data into dictionary to be written later
                            processed_forpatcit.append({
                                "table_name":
                                "uspto.FORPATCIT_G",
                                "GrantID":
                                document_id,
                                "Position":
                                position,
                                "CitedID":
                                citation_document_number,
                                "Kind":
                                pct_kind,
                                "Name":
                                citation_name,
                                "Date":
                                citation_date,
                                "Country":
                                citation_country,
                                "Category":
                                citation_category,
                                "FileName":
                                args_array['file_name']
                            })
                            #print(processed_forpatcit)
                            position += 1

                # Reset position counter for non-patent citations loop
                position = 1
                # Non-patent Literature Citations
                for B562 in B560.findall('B562'):
                    NCIT = B562.find('NCIT')
                    if NCIT is not None:
                        # Sometimes markup such as <i> or <sup> appears in the
                        # reference string; it needs to be removed
                        non_patent_citation_text = USPTOSanitizer.return_element_text(
                            NCIT)
                        non_patent_citation_text = re.sub(
                            '<[^>]+>', '', non_patent_citation_text)
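                        # e.g. "<i>Nature</i>, vol. 5" becomes "Nature, vol. 5"
                        # (illustrative string; the regex simply strips any markup tags)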
                    else:
                        non_patent_citation_text = None

                    # Parse citation category into code
                    if (len(list(B562)) > 1):
                        try:
                            ncitation_category = list(B562)[1].tag.replace(
                                "\n", "").replace("\r", "").upper()
                        except:
                            ncitation_category = None
                    else:
                        ncitation_category = None

                    # Append SQL data into dictionary to be written later
                    processed_nonpatcit.append({
                        "table_name":
                        "uspto.NONPATCIT_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Citation":
                        non_patent_citation_text,
                        "Category":
                        ncitation_category,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_nonpatcit)
                    position += 1

            # Collect number of claims
            for B570 in B500.findall('B570'):
                try:
                    claims_num = USPTOSanitizer.return_element_text(
                        B570.find('B577')).strip()
                except:
                    claims_num = None

            # Collect number of drawings and figures
            for B590 in B500.findall('B590'):
                for B595 in B590.findall('B595'):
                    try:
                        number_of_drawings = USPTOSanitizer.return_element_text(
                            B595).strip()
                        number_of_drawings = number_of_drawings.split("/")[0]
                    except:
                        number_of_drawings = None
                for B596 in B590.findall('B596'):
                    try:
                        number_of_figures = USPTOSanitizer.return_element_text(
                            B596).strip()
                    except:
                        number_of_figures = None

            # TODO: B582 find out what it is.  Looks like patent classifications but it's all alone in the XML

        # B700 is Parties
        # TODO: find the applicant data and append to array
        for B700 in r.findall('B700'):
            # B720 Inventor
            for B720 in B700.findall('B720'):
                # Reset position for inventors
                position = 1
                # Collect inventor information
                for B721 in B720.findall('B721'):
                    for i in B721.findall('PARTY-US'):
                        # Inventor Name
                        try:
                            inventor_first_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('FNM')).strip()[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('SNM')).strip()[:100]
                        except:
                            inventor_last_name = None
                        # Inventor Address info
                        try:
                            inventor_city = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('CITY')).strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('STATE')).strip()[:3]
                        except:
                            inventor_state = None
                        # Inventor country
                        try:
                            inventor_country = USPTOSanitizer.return_element_text(
                                i.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # If state is a US state, set country to US
                                if USPTOSanitizer.is_US_state(inventor_state):
                                    inventor_country = "US"
                                else:
                                    inventor_country = None
                            except:
                                inventor_country = None
                        inventor_nationality = None
                        inventor_residence = None

                    # Append SQL data into dictionary to be written later
                    processed_inventor.append({
                        "table_name":
                        "uspto.INVENTOR_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "FirstName":
                        inventor_first_name,
                        "LastName":
                        inventor_last_name,
                        "City":
                        inventor_city,
                        "State":
                        inventor_state,
                        "Country":
                        inventor_country,
                        "Nationality":
                        inventor_nationality,
                        "Residence":
                        inventor_residence,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_inventor)
                    position += 1

            # B730 Assignee
            # TODO: check if finding child of child is working
            # Reset position for assignees
            position = 1
            for B730 in B700.findall('B730'):
                for B731 in B730.findall('B731'):
                    for x in B731.findall('PARTY-US'):
                        try:
                            asn_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM")).strip()[:500]
                        except:
                            asn_orgname = None
                        asn_role = None
                        try:
                            asn_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY')).strip()[:100]
                        except:
                            asn_city = None
                        try:
                            asn_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE')).strip()[:30]
                        except:
                            asn_state = None
                        # Assignee country
                        try:
                            asn_country = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # Fix country if country missing
                                if USPTOSanitizer.is_US_state(asn_state):
                                    asn_country = "US"
                                else:
                                    asn_country = None
                            except:
                                asn_country = None

                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name":
                        "uspto.ASSIGNEE_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "OrgName":
                        asn_orgname,
                        "Role":
                        asn_role,
                        "City":
                        asn_city,
                        "State":
                        asn_state,
                        "Country":
                        asn_country,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_assignee)
                    position += 1

            # B740 is Legal Agent / Attorney
            for B740 in B700.findall('B740'):
                # Reset position for agents
                position = 1
                for B741 in B740.findall('B741'):
                    for x in B741.findall('PARTY-US'):
                        # Attorney Organization
                        try:
                            agent_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM")).strip()[:300]
                        except:
                            agent_orgname = None
                        # Attorney Name
                        try:
                            agent_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:100]
                        except:
                            agent_first_name = None
                        try:
                            agent_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:100]
                        except:
                            agent_last_name = None
                        # Attorney Address information
                        try:
                            agent_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY')).strip()[:100]
                        except:
                            agent_city = None
                        try:
                            agent_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE')).strip()[:30]
                        except:
                            agent_state = None
                        # Agent country
                        try:
                            agent_country = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CTRY')).strip()[:3]
                        except:
                            try:
                                # Fix country if missing
                                if USPTOSanitizer.is_US_state(agent_state):
                                    agent_country = "US"
                                else:
                                    agent_country = None
                            except:
                                agent_country = None

                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name":
                            "uspto.AGENT_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "OrgName":
                            agent_orgname,
                            "LastName":
                            agent_last_name,
                            "FirstName":
                            agent_first_name,
                            "Country":
                            agent_country,
                            "FileName":
                            args_array['file_name']
                        })
                        #print(processed_agent)
                        position += 1

            # B745 Examiner
            for B745 in B700.findall('B745'):
                position = 1
                # Primary Examiner
                for B746 in B745.findall('B746'):
                    for x in B746.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:50]
                        except:
                            examiner_first_name = None
                        try:
                            examiner_department = USPTOSanitizer.return_element_text(
                                B745.find('B748US')).strip()[:50]
                        except:
                            examiner_department = None

                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name":
                            "uspto.EXAMINER_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "LastName":
                            examiner_last_name,
                            "FirstName":
                            examiner_fist_name,
                            "Department":
                            examiner_department,
                            "FileName":
                            args_array['file_name']
                        })
                        #print(processed_examiner)
                        position += 1

                # Assistant Examiner
                for B747 in B745.findall('B747'):
                    for x in B747.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM')).strip()[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM')).strip()[:50]
                        except:
                            examiner_first_name = None
                        try:
                            examiner_department = USPTOSanitizer.return_element_text(
                                B745.find('B748US')).strip()[:50]
                        except:
                            examiner_department = None

                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name":
                            "uspto.EXAMINER_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "LastName":
                            examiner_last_name,
                            "FirstName":
                            examiner_fist_name,
                            "Department":
                            examiner_department,
                            "FileName":
                            args_array['file_name']
                        })
                        #print(processed_examiner)
                        position += 1

        # B300 Foreign priority data
        position = 1
        for B300 in r.findall('B300'):
            # Country
            try:
                pc_country = USPTOSanitizer.return_element_text(
                    B300.find('B330').find('CTRY')).strip()[:5]
            except:
                pc_country = None
            # Priority filing date
            try:
                pc_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(
                        B300.find('B320').find('DATE')).strip()[:45],
                    args_array, document_id)
            except:
                pc_date = None
            # Priority document number
            try:
                pc_doc_num = USPTOSanitizer.return_element_text(
                    B300.find('B310').find('DNUM')).strip()[:45]
            except:
                pc_doc_num = None

            # Set the fields that are not in gXML2
            pc_kind = None

            # Append SQL data into dictionary to be written later
            processed_foreignpriority.append({
                "table_name":
                "uspto.FOREIGNPRIORITY_G",
                "GrantID":
                document_id,
                "Position":
                position,
                "Kind":
                pc_kind,
                "Country":
                pc_country,
                "DocumentID":
                pc_doc_num,
                "PriorityDate":
                pc_date,
                "FileName":
                args_array['file_name']
            })
            #print(processed_foreignpriority)
            position += 1

        # Collect Abstract from data
        try:
            a_elem = document_root.find('SDOAB')
            if a_elem is not None:
                abstract = USPTOSanitizer.strip_for_csv(
                    USPTOSanitizer.return_element_text(a_elem))
            else:
                abstract = None
        except Exception as e:
            abstract = None
            #traceback.print_exc()
            #logger.error("Exception while extracting description from " + str(document_id) + ": " + traceback.print_exc())
        #print(abstract)

        # Collect detailed description from DETDESC
        try:
            d_elem = document_root.find('SDODE').find('DETDESC')
            if d_elem is not None:
                description = USPTOSanitizer.strip_for_csv(' '.join(
                    d_elem.itertext()))
            else:
                description = None
        except Exception as e:
            description = None
            #traceback.print_exc()
            #logger.error("Exception while extracting claim from " + str(document_id) + ": " + traceback.print_exc())
        #print(description)

        # Collect claims from data
        try:
            c_elem = document_root.find('SDOCL')
            if c_elem is not None:
                claims = USPTOSanitizer.strip_for_csv(' '.join(
                    c_elem.itertext()))
                #claims = USPTOSanitizer.strip_for_csv(USPTOSanitizer.return_element_text(c_elem))
            else:
                claims = None
        except Exception as e:
            claims = None
            #traceback.print_exc()
            #logger.error("Exception while extracting claim from " + str(document_id) + ": " + traceback.print_exc())
        #print(claims)

        # Append SQL data into dictionary to be written later
        processed_grant.append({
            "table_name": "uspto.GRANT",
            "GrantID": document_id,
            "Title": title,
            "Claims": claims,
            "Description": description,
            "IssueDate": pub_date,
            "Kind": kind,
            "GrantLength": grant_length,
            "USSeriesCode": series_code,
            "Abstract": abstract,
            "ClaimsNum": claims_num,
            "DrawingsNum": number_of_drawings,
            "FiguresNum": number_of_figures,
            "ApplicationID": app_no,
            "FileDate": app_date,
            "AppType": app_type,
            "FileName": args_array['file_name']
        })
        #print(processed_grant)

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit,
        "processed_foreignpriority": processed_foreignpriority
    }
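
# Usage sketch (assumptions: the args_array keys below are the ones read directly in
# extract_XML2_grant; helper functions receiving args_array may expect more keys, and
# the URL, format tag and file name are illustrative placeholders only).
def _example_run_grant_extractor(xml_path):
    # Read one raw grant document and hand it to the extractor above
    with open(xml_path, "r") as infile:
        raw_data = infile.read()
    args_array = {
        "url_link": "https://bulkdata.uspto.gov/",  # illustrative source link
        "uspto_xml_format": "gXML2",                # assumed format tag for XML2 grants
        "file_name": xml_path,
    }
    return extract_XML2_grant(raw_data, args_array)
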
Example #2
def extract_XML1_application(raw_data, args_array):

    # Set process start time
    start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    # Define required arrays
    processed_application = []
    processed_foreignpriority = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_cpcclass = []

    # Pass the xml into Element tree object
    document_root = ET.fromstring(raw_data)
    r = document_root.find('subdoc-bibliographic-information')

    # Get and fix the document_id data
    di = r.find('document-id')
    if di is not None:
        # This document ID is NOT the application number
        try:
            document_id = di.findtext('doc-number').strip()
        except:
            document_id = None
            logger.error("No Patent Number was found for: " + url_link)
        try:
            kind = di.findtext('kind-code').strip()[:2]
            app_type = USPTOSanitizer.return_xml2_app_type(args_array,
                                                           kind).strip()
        except:
            kind = None
            app_type = None
        try:
            pub_date = USPTOSanitizer.return_formatted_date(
                di.findtext('document-date'), args_array, document_id)
        except:
            pub_date = None

    # Get application filing data
    ar = r.find('domestic-filing-data')
    if ar is not None:
        try:
            app_no = ar.find('application-number').findtext(
                'doc-number').strip()[:20]
        except:
            app_no = None
        try:
            app_date = USPTOSanitizer.return_formatted_date(
                ar.findtext('filing-date'), args_array, document_id)
        except:
            app_date = None
        try:
            series_code = ar.findtext(
                'application-number-series-code').strip()[:2]
        except:
            series_code = None

    # Get technical information
    ti = r.find('technical-information')
    if ti is not None:

        # Get invention title
        try:
            title = USPTOSanitizer.strip_for_csv(
                ti.findtext('title-of-invention')[:500])
        except:
            title = None

        # Get international classification data
        ic = ti.find('classification-ipc')
        if ic is not None:
            # Init position
            position = 1
            # Process the primary international class
            icm = ic.find('classification-ipc-primary')
            if icm is not None:
                #print(icm.findtext('ipc'))
                # Clear variable values
                i_class_sec = None
                i_class = None
                i_subclass = None
                i_class_mgr = None
                i_class_sgr = None
                i_malformed = None
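                # The sanitizer helper is expected to split an IPC string such as
                # "B32B 15/08" (illustrative value) into section, class, subclass,
                # main group and subgroup.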
                try:
                    i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = USPTOSanitizer.return_international_class_XML1_application(
                        icm.findtext('ipc'))
                    i_class_sec = i_class_sec.strip()[:15]
                    i_class = i_class.strip()[:15]
                    i_subclass = i_subclass.strip()[:15]
                    i_class_mgr = i_class_mgr.strip()[:15]
                    i_class_sgr = i_class_sgr.strip()[:15]
                except Exception as e:
                    traceback.print_exc()
                    i_class_sec = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = 1
                    logger.warning(
                        "Malformed international class found in application ID: "
                        + document_id + " in file: " + url_link)

                # Append SQL data into dictionary to be written later
                processed_intclass.append({
                    "table_name": "uspto.INTCLASS_A",
                    "ApplicationID": app_no,
                    "Position": position,
                    "Section": i_class_sec,
                    "Class": i_class,
                    "SubClass": i_subclass,
                    "MainGroup": i_class_mgr,
                    "SubGroup": i_class_sgr,
                    "Malformed": i_malformed,
                    "FileName": args_array['file_name']
                })
                #print(processed_intclass)
                position += 1

            # Process any secondary international classes
            ics = ic.findall('classification-ipc-secondary')
            if ics is not None:
                for ics_item in ics:
                    # Clear variable values
                    i_class_sec = None
                    i_class = None
                    i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_malformed = None
                    try:
                        i_class_sec, i_class, i_subclass, i_class_mgr, i_class_sgr = USPTOSanitizer.return_international_class_XML1_application(
                            ics_item.findtext('ipc'))
                        i_class_sec = i_class_sec.strip()[:15]
                        i_class = i_class.strip()[:15]
                        i_subclass = i_subclass.strip()[:15]
                        i_class_mgr = i_class_mgr.strip()[:15]
                        i_class_sgr = i_class_sgr.strip()[:15]
                    except Exception as e:
                        traceback.print_exc()
                        i_class_sec = None
                        i_class = None
                        i_subclass = None
                        i_class_mgr = None
                        i_class_sgr = None
                        i_malformed = 1
                        logger.warning(
                            "Malformed international class found in application ID: "
                            + document_id + " in file: " + url_link)

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        position,
                        "Section":
                        i_class_sec,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "Malformed":
                        i_malformed,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_intclass)
                    position += 1

        # Get US classification data
        nc = ti.find('classification-us')
        nc_position = 1
        if nc is not None:
            uspc = nc.find('classification-us-primary').find('uspc')
            if uspc is not None:
                n_class_main = None
                n_subclass = None
                try:
                    n_class_main = uspc.findtext('class').strip()[:5]
                except:
                    n_class_main = None
                try:
                    n_subclass = uspc.findtext('subclass').strip()[:15]
                except:
                    n_subclass = None

                # Append SQL data into dictionary to be written later
                processed_usclass.append({
                    "table_name": "uspto.USCLASS_A",
                    "ApplicationID": app_no,
                    "Position": nc_position,
                    "Class": n_class_main,
                    "SubClass": n_subclass,
                    "FileName": args_array['file_name']
                })
                #print(processed_usclass)
                nc_position += 1

            # Collect all secondary US classes
            ncs = nc.findall('classification-us-secondary')
            for ncs_item in ncs:
                n_class_main = None
                n_subclass = None
                uspc = ncs_item.find('uspc')
                if uspc is not None:
                    try:
                        n_class_main = uspc.findtext('class').strip()[:5]
                    except:
                        n_class_main = None
                    try:
                        n_subclass = uspc.findtext('subclass').strip()[:15]
                    except:
                        n_subclass = None

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        nc_position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_usclass)
                    nc_position += 1

    # Get priority claims
    pc_position = 1
    pc_kind = None
    for pc in r.findall('foreign-priority-data'):
        try:
            pc_country = pc.findtext('country-code').strip()[:100]
        except:
            pc_country = None
        try:
            pc_doc_num = pc.find('priority-application-number').findtext(
                'doc-number').strip()[:100]
        except:
            pc_doc_num = None
        try:
            pc_date = USPTOSanitizer.return_formatted_date(
                pc.findtext('filing-date'), args_array, document_id)
        except:
            pc_date = None

        # Append SQL data into dictionary to be written later
        processed_foreignpriority.append({
            "table_name": "uspto.FOREIGNPRIORITY_A",
            "ApplicationID": app_no,
            "Position": pc_position,
            "Kind": pc_kind,
            "Country": pc_country,
            "DocumentID": pc_doc_num,
            "PriorityDate": pc_date,
            "FileName": args_array['file_name']
        })
        #print(processed_foreignpriority)
        pc_position += 1

    # Get inventor data
    invs = r.find('inventors')
    if invs is not None:
        # Init position
        inv_position = 1
        for inventor in invs.findall('first-named-inventor'):
            n = inventor.find('name')
            try:
                inventor_first_name = n.findtext('given-name').strip()[:100]
            except:
                inventor_first_name = None
            try:
                inventor_last_name = n.findtext('family-name').strip()[:100]
            except:
                inventor_last_name = None
            # Get the residence tag
            res = inventor.find('residence')
            if res is not None:
                residence_us = res.find('residence-us')
                if residence_us is not None:
                    try:
                        inventor_city = residence_us.findtext(
                            'city').strip()[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_us.findtext(
                            'state').strip()[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_us.findtext(
                            'country-code').strip()[:100]
                    except:
                        inventor_country = None
                residence_non_us = res.find('residence-non-us')
                if residence_non_us is not None:
                    try:
                        inventor_city = residence_non_us.findtext(
                            'city').strip()[:100]
                    except:
                        inventor_city = None
                    try:
                        inventor_state = residence_non_us.findtext(
                            'state').strip()[:100]
                    except:
                        inventor_state = None
                    try:
                        inventor_country = residence_non_us.findtext(
                            'country-code').strip()[:100]
                    except:
                        inventor_country = None

            # Append SQL data into dictionary to be written later
            processed_inventor.append({
                "table_name": "uspto.INVENTOR_A",
                "ApplicationID": app_no,
                "Position": inv_position,
                "FirstName": inventor_first_name,
                "LastName": inventor_last_name,
                "City": inventor_city,
                "State": inventor_state,
                "Country": inventor_country,
                "FileName": args_array['file_name']
            })
            #print(processed_inventor)
            inv_position += 1

        # For all secondary inventors
        for inv in invs.findall('inventor'):
            if inv is not None:
                n = inv.find('name')
                if n is not None:
                    try:
                        inventor_first_name = n.findtext(
                            'given-name').strip()[:100]
                    except:
                        inventor_first_name = None
                    try:
                        inventor_last_name = n.findtext(
                            'family-name').strip()[:100]
                    except:
                        inventor_last_name = None

                res = inv.find('residence')
                if res is not None:
                    residence_us = res.find('residence-us')
                    if residence_us is not None:
                        try:
                            inventor_city = residence_us.findtext(
                                'city').strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_us.findtext(
                                'state').strip()[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_us.findtext(
                                'country-code').strip()[:100]
                        except:
                            inventor_country = None
                    residence_non_us = res.find('residence-non-us')
                    if residence_non_us is not None:
                        try:
                            inventor_city = residence_non_us.findtext(
                                'city').strip()[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = residence_non_us.findtext(
                                'state').strip()[:100]
                        except:
                            inventor_state = None
                        try:
                            inventor_country = residence_non_us.findtext(
                                'country-code').strip()[:100]
                        except:
                            inventor_country = None

                    # Append SQL data into dictionary to be written later
                    processed_inventor.append({
                        "table_name":
                        "uspto.INVENTOR_A",
                        "ApplicationID":
                        app_no,
                        "Position":
                        inv_position,
                        "FirstName":
                        inventor_first_name,
                        "LastName":
                        inventor_last_name,
                        "City":
                        inventor_city,
                        "State":
                        inventor_state,
                        "Country":
                        inventor_country,
                        "FileName":
                        args_array['file_name']
                    })
                    #print(processed_inventor)
                    inv_position += 1

    # Get assignee data
    # Init position
    asn_position = 1
    for asn in r.findall('assignee'):
        try:
            asn_role = asn.findtext('assignee-type').strip()[:100]
        except:
            asn_role = None
        try:
            asn_orgname = asn.findtext('organization-name').strip()[:300]
        except:
            asn_orgname = None
        adr_elem = asn.find('address')
        try:
            asn_city = adr_elem.findtext('city').strip()[:100]
        except:
            asn_city = None
        try:
            asn_state = adr_elem.findtext('state').strip()[:100]
        except:
            asn_state = None
        try:
            asn_country = adr_elem.find('country').findtext(
                'country-code').strip()[:100]
        except:
            asn_country = None
        if asn_country is None:
            if USPTOSanitizer.is_US_state(asn_state):
                asn_country = "US"
        # These have not been found in XML1,
        # but a full XML parse should be done
        asn_firstname = None
        asn_lastname = None

        # Append SQL data into dictionary to be written later
        processed_assignee.append({
            "table_name": "uspto.ASSIGNEE_A",
            "ApplicationID": app_no,
            "Position": asn_position,
            "OrgName": asn_orgname,
            "FirstName": asn_firstname,
            "LastName": asn_lastname,
            "Role": asn_role,
            "City": asn_city,
            "State": asn_state,
            "Country": asn_country,
            "FileName": args_array['file_name']
        })
        #print(processed_assignee)
        asn_position += 1

    # Find the agent element
    agn = r.find('correspondence-address')
    # Init position
    agn_position = 1
    if agn is not None:
        try:
            agent_orgname = agn.findtext('name-1').strip()
        except:
            agent_orgname = None
        try:
            agent_orgname_2 = agn.findtext('name-2').strip()
        except:
            agent_orgname_2 = None
        # Combine Orgname 1 and 2 and shorten if needed
        if agent_orgname is not None and agent_orgname_2 is not None:
            agent_orgname = USPTOSanitizer.strip_for_csv(agent_orgname + " " +
                                                         agent_orgname_2)[:300]
        # Get the address element and pre-set the address fields so the
        # append below never hits an undefined name when the address is missing
        addr_elem = agn.find('address')
        agent_address = agent_city = agent_state = agent_country = None
        if addr_elem is not None:
            try:
                try:
                    agent_addr_1 = addr_elem.findtext(
                        'address-1').strip()[:100]
                except:
                    agent_addr_1 = ""
                try:
                    agent_addr_2 = addr_elem.findtext(
                        'address-2').strip()[:100]
                except:
                    agent_addr_2 = ""
                agent_address = USPTOSanitizer.strip_for_csv(
                    agent_addr_1 + " " + agent_addr_2)
            except:
                agent_address = None
            try:
                agent_city = addr_elem.findtext('city').strip()[:50]
            except:
                agent_city = None
            try:
                agent_state = addr_elem.findtext('state').strip()[:3]
            except:
                agent_state = None
            try:
                agent_country = addr_elem.find('country').findtext(
                    'country-code').strip()[:3]
            except:
                if USPTOSanitizer.is_US_state(agent_state):
                    agent_country = "US"
                else:
                    agent_country = None

        # Append SQL data into dictionary to be written later
        processed_agent.append({
            "table_name": "uspto.AGENT_A",
            "ApplicationID": app_no,
            "Position": agn_position,
            "OrgName": agent_orgname,
            "Address": agent_address,
            "City": agent_city,
            "State": agent_state,
            "Country": agent_country,
            "FileName": args_array['file_name']
        })
        #print(processed_agent)
        agn_position += 1

    # Find the abstract of the application
    try:
        abstract = USPTOSanitizer.strip_for_csv(
            USPTOSanitizer.return_element_text(
                document_root.find('subdoc-abstract')))
    except:
        abstract = None

    # Find the description
    try:
        description = ""
        d_elem = document_root.find('subdoc-description')
        if d_elem is not None:
            description += USPTOSanitizer.strip_for_csv(' '.join(
                d_elem.itertext()))
        else:
            description = None
    except Exception as e:
        description = None
        #traceback.print_exc()
        #logger.error("Exception while extracting description from " + str(app_no))
    #print(description)

    # Find the claims
    try:
        claims = ""
        c_elem = document_root.find('subdoc-claims')
        if c_elem is not None:
            claims += USPTOSanitizer.strip_for_csv(' '.join(c_elem.itertext()))
        else:
            claims = None
    except Exception as e:
        claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting claim from " + str(app_no))
    #print(claims)

    # Find the number of claims
    try:
        number_of_claims = 0
        for clms in c_elem.findall('claim'):
            number_of_claims += 1
    except Exception as e:
        number_of_claims = None
        #traceback.print_exc()
        #logger.error("Exception while extracting number of claims from " + str(app_no))
    #print(number_of_claims)

    # Find the number of drawings and figures
    try:
        number_of_figures = 0
        number_of_drawings = 0
        drw_elem = document_root.find('subdoc-drawings')
        if drw_elem is not None:
            for fg in drw_elem.findall('figure'):
                img_type = fg.find('image').attrib['ti'].strip()
                if img_type == "DR": number_of_drawings += 1
                elif img_type == "FG": number_of_figures += 1
        else:
            number_of_figures = None
            number_of_drawings = None
    except Exception as e:
        number_of_figures = None
        number_of_drawings = None
        #traceback.print_exc()
        #logger.error("Exception while extracting figures and drawings num " + str(app_no))
    #print(number_of_figures)
    #print(number_of_drawings)

    # Append SQL data into dictionary to be written later
    processed_application.append({
        "table_name": "uspto.APPLICATION",
        "ApplicationID": app_no,
        "PublicationID": document_id,
        "AppType": app_type,
        "Title": title,
        "FileDate": app_date,
        "PublishDate": pub_date,
        "Kind": kind,
        "USSeriesCode": series_code,
        "Abstract": abstract,
        "ClaimsNum": number_of_claims,
        "DrawingsNum": number_of_drawings,
        "FiguresNum": number_of_figures,
        "Description": description,
        "Claims": claims,
        "FileName": args_array['file_name']
    })
    #print(processed_application)

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_application": processed_application,
        "processed_foreignpriority": processed_foreignpriority,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_cpcclass": processed_cpcclass
    }
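
A note on the return value: each of these example parsers hands back a dictionary that maps a processed_* key to a list of row dictionaries, and every row carries its destination table in the "table_name" field plus the source "FileName". The snippet below is a minimal, hypothetical sketch of how such a result could be flattened into one CSV file per table; the helper name write_processed_arrays_to_csv and the CSV output format are illustrative assumptions, not part of the original project.

# Hypothetical consumer of the dictionary returned above (illustrative only)
import csv
import os
from collections import defaultdict

def write_processed_arrays_to_csv(processed, out_dir="."):
    # Group every row by its destination table so each table gets one CSV file
    rows_by_table = defaultdict(list)
    for table_rows in processed.values():
        for row in table_rows:
            rows_by_table[row["table_name"]].append(row)
    # Write one CSV per table, using the first row's keys as the header
    for table_name, rows in rows_by_table.items():
        out_path = os.path.join(out_dir, table_name.replace("uspto.", "") + ".csv")
        with open(out_path, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)
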
Example #3
0
def extract_XML2_grant_tag_counts(args_array):

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # Declare a dictionary to use in counting tags
    # NOTE: CPCCLASS_G and APPLICANT_G are not available in XML2 Grant files
    tags_dict = {
        "GRANT" : ["<PATDOC"],
        "INTCLASS_G" : ["<B510"],
        "USCLASS_G" : ["<B521", "<B522"],
        "INVENTOR_G" : ["<B721"],
        "AGENT_G" : ["<B740"],
        "ASSIGNEE_G" : ["<B730"],
        "NONPATCIT_G" : ["<B562"],
        "EXAMINER_G" : ["<B746", "<B747"],
        "FOREIGNPRIORITY_G" : ["<B310"]
    }

    # Declare a dictionary to hold counts by table
    counts_dict = {
        "file_name" : args_array['file_name'],
        "GRANT" : 0,
        "INTCLASS_G" : 0,
        "CPCCLASS_G" : 0,
        "USCLASS_G" : 0,
        "INVENTOR_G" : 0,
        "AGENT_G" : 0,
        "ASSIGNEE_G" : 0,
        "APPLICANT_G" : 0,
        "NONPATCIT_G" : 0,
        "EXAMINER_G" : 0,
        "GRACIT_G" : 0,
        "FORPATCIT_G" : 0,
        "FOREIGNPRIORITY_G" : 0
    }

    # Print to stdout and log
    print("-- Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # Loop through the file contents line by line
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        # Loop through tags_dict items and look for XML tag
        for table, tag in tags_dict.items():
            item_found = False
            # If list is provided
            if isinstance(tag, list):
                for item in tag:
                    # Look for field tag
                    if item in line:
                        item_found = True
            if item_found:
                # Increment the count for appropriate table
                counts_dict[table] += 1

    # Count the items that cannot be counted by tag matching alone
    # and therefore require the XML segments to be parsed
    # Create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False
    # Loop through all lines in the xml file
    for line in xml_file_contents:

        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)

        # This identifies the start of a well-formed XML segment for patent
        # grant bibliographic information
        if "<PATDOC" in line:
            patent_xml_started = True
            xml_string += "<PATDOC>"

        # This identifies the end of a well-formed XML segment for a single patent
        # grant bibliographic information
        elif "</PATDOC" in line:
            patent_xml_started = False
            xml_string += "</PATDOC>"
            #print(xml_string)
            # Pass the raw data into Element Tree
            try:
                document_root = ET.fromstring(xml_string)
                # SDOBI is the bibliographic data
                r = document_root.find('SDOBI')
                # Patent Citations
                B500 = r.find('B500')
                if B500 is not None:
                    for B560 in B500.findall('B560'):
                        # B561 is Patent Citation
                        for B561 in B560.findall('B561'):
                            try: pcit = B561.find('PCIT').find('DOC')
                            except: pcit = None
                            if pcit is not None:
                                prt = pcit.find('PARTY-US')
                                try: citation_state = USPTOSanitizer.return_element_text(prt.find('ADR').find('STATE')).strip()[:3]
                                except: citation_state = None
                                try: citation_country = USPTOSanitizer.return_element_text(prt.find("ADR").find('CTRY')).strip()[:3]
                                except:
                                    try:
                                        # If state is a US state, set country to US
                                        if USPTOSanitizer.is_US_state(citation_state):
                                            citation_country = "US"
                                        else: citation_country = None
                                    except: citation_country = None
                                if citation_country == "US" or citation_country == None: counts_dict['GRACIT_G'] += 1
                                elif citation_country is not None: counts_dict['FORPATCIT_G'] += 1
                # Reset the xml string
                xml_string = ''

            except ET.ParseError as e:
                print_xml = xml_string.split("\n")
                for num, line in enumerate(print_xml, start = 1):
                    #print(str(num) + ' : ' + line)
                    logger.error(str(num) + ' : ' + line)
                logger.error("Character Entity prevented ET from parsing XML in file: " + args_array['file_name'] )
                traceback.print_exc()
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())


        # This is used to append lines of file when inside single patent grant
        elif patent_xml_started:
            # Check which type of encoding should be used to fix the line string
            xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Print to stdout and log
    print("-- Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Return the dictionary of counts for found tags
    if args_array['stdout_level'] == 1: pprint(counts_dict)
    return counts_dict
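
For reference, here is a hedged sketch of how the tag-counting helper above might be invoked. The function body clearly reads args_array['file_name'] and args_array['stdout_level'], but the key used by USPTOProcessZipFile.extract_xml_file_from_zip to locate the downloaded archive is not visible here, so the 'temp_zip_file_name' key below is an assumption for illustration only.

# Hypothetical invocation of extract_XML2_grant_tag_counts() (illustrative only)
args_array = {
    "file_name": "pftaps19980106_wk01",                    # used in logging and stored in counts_dict
    "temp_zip_file_name": "/tmp/pftaps19980106_wk01.zip",  # assumed key for the archive path
    "stdout_level": 1,                                     # 1 => pprint the final counts_dict
}

counts = extract_XML2_grant_tag_counts(args_array)
if counts is not False:
    # counts maps table names to tag counts, e.g. number of <PATDOC> documents
    # and the GRACIT_G / FORPATCIT_G split derived from the parsed citations
    print(counts["GRANT"], counts["GRACIT_G"], counts["FORPATCIT_G"])
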
Example #4
0
def extract_XML2_grant(raw_data, args_array):

    #
    # Data documentation on the fields in XML2 Grant data can be found
    # in the /documents/data_descriptions/PatentGrantSGMLv19-Documentation.pdf file
    #

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pass the url_link and format into local variables
    url_link = args_array['url_link']
    uspto_xml_format = args_array['uspto_xml_format']

    #print raw_data

    # Define all arrays needed to hold the data
    processed_grant = []
    processed_applicant = []
    processed_examiner = []
    processed_assignee = []
    processed_agent = []
    processed_inventor = []
    processed_usclass = []
    processed_intclass = []
    processed_gracit = []
    processed_forpatcit = []
    processed_nonpatcit = []
    processed_foreignpriority = []

    # Start timer
    start_time = time.time()

    try:
        # Pass the raw data into Element tree xml object
        patent_root = ET.fromstring(raw_data)

    except ET.ParseError as e:
        print_xml = raw_data.split("\n")
        for num, line in enumerate(print_xml, start=1):
            print(str(num) + ' : ' + line)
        logger.error(
            "Character Entity prevented ET from parsing XML in file: " +
            url_link)
        # Print traceback
        traceback.print_exc()
        # Print exception information to file
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Exception: " + str(exc_type) + " in Filename: " +
                     str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                     " Traceback: " + traceback.format_exc())

    # Start the parsing process for XML
    for r in patent_root.findall('SDOBI'):

        # Collect main grant document data
        for B100 in r.findall('B100'):
            try:
                document_id = USPTOSanitizer.return_element_text(
                    B100.find('B110'))
                document_id = USPTOSanitizer.fix_patent_number(
                    document_id)[:20]
            except:
                document_id = None
                logger.error("No Patent Number was found for: " + url_link)
            try:
                kind = USPTOSanitizer.return_element_text(
                    B100.find('B130'))[:2]
                app_type = USPTOSanitizer.return_xml2_app_type(
                    args_array, kind)
            except:
                kind = None
            try:
                # PATENT ISSUE DATE
                pub_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B100.find('B140')),
                    args_array, document_id)
            except:
                pub_date = None
            try:
                # PATENT APPLICANT COUNTRY??
                pub_country = USPTOSanitizer.return_element_text(
                    B100.find('B190'))
            except:
                pub_country = None

        # Collect application data in the document
        for B200 in r.findall('B200'):
            # TODO: find this in XML2 applications
            app_country = None
            try:
                # Application number
                app_no = USPTOSanitizer.return_element_text(
                    B200.find('B210'))[:20]
            except:
                app_no = None
            try:
                # Application date
                app_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(B200.find('B220')),
                    args_array, document_id)
            except:
                app_date = None
            try:
                series_code = USPTOSanitizer.return_element_text(
                    B200.find('B211US'))[:2]
            except:
                series_code = None

        # Collect the grant length
        grant_length = USPTOSanitizer.return_element_text(r.find("B474"))

        # Collect US classification
        for B500 in r.findall('B500'):
            # US Classification
            for B520 in B500.findall('B520'):
                position = 1
                # USCLASS
                for B521 in B520.findall('B521'):
                    n_class_info = USPTOSanitizer.return_element_text(B521)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(
                        n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })

                    position += 1
                for B522 in B520.findall('B522'):
                    # USCLASS FURTHER
                    n_class_info = USPTOSanitizer.return_element_text(B522)
                    n_class_main, n_subclass = USPTOSanitizer.return_class(
                        n_class_info)
                    n_class_main = n_class_main[:5]
                    n_subclass = n_subclass[:15]

                    # Append SQL data into dictionary to be written later
                    processed_usclass.append({
                        "table_name":
                        "uspto.USCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Class":
                        n_class_main,
                        "SubClass":
                        n_subclass,
                        "FileName":
                        args_array['file_name']
                    })

                    position += 1

            # Collect International Class data
            # TODO: check if I need to set all variables to empty or can just leave as null
            # TODO: check if classification is parsed correctly
            for B510 in B500.findall('B510'):  # INTERNATIONAL CLASS
                #logger.warning("International Classifcation found in XML2: " + args_array['url_link'] + " document: " + str(document_id))
                # Reset position
                position = 1
                for B511 in B510.findall('B511'):  #MAIN CLASS
                    i_class_version_date = None
                    i_class_action_date = None
                    i_class_gnr = None
                    i_class_level = None
                    i_class_sec = None
                    int_class = USPTOSanitizer.return_element_text(B511)
                    # TODO: check international classification and rewrite this parsing piece.
                    if (len(int_class.split()) > 1):
                        i_class, i_subclass = int_class.split()
                        i_class = i_class[:15]
                        i_subclass = i_subclass[:15]
                    else:
                        i_class = int_class[:15]
                        i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_class_sps = None
                    i_class_val = None
                    i_class_status = None
                    i_class_ds = None

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Section":
                        i_class_sec,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "FileName":
                        args_array['file_name']
                    })

                    position += 1

                # INTERNATIONAL CLASS FURTHER
                for B512 in B510.findall('B512'):
                    i_class_version_date = None
                    i_class_action_date = None
                    i_class_gnr = None
                    i_class_level = None
                    i_class_sec = None
                    int_class = USPTOSanitizer.return_element_text(B512)
                    # TODO: splitting int class does not include possible multiple subclasses
                    if (len(int_class.split()) > 1):
                        i_class = int_class.split()[0][:15]
                        i_subclass = int_class.split()[1][:15]
                    else:
                        i_class = int_class[:15]
                        i_subclass = None
                    i_class_mgr = None
                    i_class_sgr = None
                    i_class_sps = None
                    i_class_val = None
                    i_class_status = None
                    i_class_ds = None

                    # Append SQL data into dictionary to be written later
                    processed_intclass.append({
                        "table_name":
                        "uspto.INTCLASS_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Section":
                        i_class_sec,
                        "Class":
                        i_class,
                        "SubClass":
                        i_subclass,
                        "MainGroup":
                        i_class_mgr,
                        "SubGroup":
                        i_class_sgr,
                        "FileName":
                        args_array['file_name']
                    })

                    position += 1

            # Collect Title
            for B540 in B500.findall('B540'):
                try:
                    title = USPTOSanitizer.return_element_text(B540)[:500]
                except:
                    title = None

            # Collect Citations
            for B560 in B500.findall('B560'):  # CITATIONS

                # Reset position counter for all citations loop
                position = 1

                for B561 in B560.findall('B561'):  #PATCIT
                    # TODO: find out how to do PCIT, DOC without loop.  Only B561 needs loop
                    PCIT = B561.find('PCIT')
                    # Determine whether the cited patent is US or foreign
                    # TODO: needs a better check; what does a non-US patent citation look like?
                    # If all patents have PARTY-US then perhaps a database call to check the
                    # country of origin would still allow separating into GRACIT_G and FORPATCIT_G
                    #if PCIT.find("PARTY-US") is not None:
                    #    citation_country = "US"
                    #else:
                    #    citation_country = "NON-US"
                    #    logger.warning("NON US patent found")

                    citation_country = "US"

                    DOC = PCIT.find('DOC')
                    try:
                        citation_document_number = USPTOSanitizer.return_element_text(
                            DOC.find('DNUM'))[:15]
                    except:
                        citation_document_number = None
                    try:
                        pct_kind = USPTOSanitizer.return_element_text(
                            DOC.find('KIND'))[:10]
                    except:
                        pct_kind = None
                    try:
                        citation_date = USPTOSanitizer.return_formatted_date(
                            USPTOSanitizer.return_element_text(
                                DOC.find('DATE')), args_array, document_id)
                    except:
                        citation_date = None
                    try:
                        citation_name = USPTOSanitizer.return_element_text(
                            PCIT.find('PARTY-US'))[:100]
                    except:
                        citation_name = None

                    # Parse citation category
                    # Note: getchildren() was removed in Python 3.9, so use len()/list() directly
                    if len(B561) > 1:
                        citation_category = list(B561)[1].tag.replace(
                            "\n", "").replace("\r", "")
                        #print type(citation_category)
                        # TODO: check that the citation category tag matches correctly
                        #print "Citation Category = " + citation_category + " Length: " + str(len(citation_category))
                        if "CITED-BY-EXAMINER" in citation_category:
                            citation_category = 1
                        elif "CITED-BY-OTHER" in citation_category:
                            citation_category = 2
                        else:
                            citation_category = 0
                            logger.warning("Cited by unknown type")
                    else:
                        citation_category = None

                    #TODO: be aware that there may be something crazy in the citation document number
                    if citation_country == "US":

                        # Append SQL data into dictionary to be written later
                        processed_gracit.append({
                            "table_name":
                            "uspto.GRACIT_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "CitedID":
                            citation_document_number,
                            "Kind":
                            pct_kind,
                            "Name":
                            citation_name,
                            "Date":
                            citation_date,
                            "Country":
                            citation_country,
                            "Category":
                            citation_category,
                            "FileName":
                            args_array['file_name']
                        })

                        position += 1

                    else:

                        # Append SQL data into dictionary to be written later
                        processed_forpatcit.append({
                            "table_name":
                            "uspto.FORPATCIT_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "CitedID":
                            citation_document_number,
                            "Kind":
                            pct_kind,
                            "Name":
                            citation_name,
                            "Date":
                            citation_date,
                            "Country":
                            citation_country,
                            "Category":
                            citation_category,
                            "FileName":
                            args_array['file_name']
                        })

                        position += 1

                # Reset position counter for non-patent citations loop
                position = 1
                for B562 in B560.findall('B562'):  #NON-PATENT LITERATURE
                    for NCIT in B562.findall('NCIT'):
                        # Sometimes markup such as <i> or <sup> appears in the reference string, so strip it out
                        non_patent_citation_text = USPTOSanitizer.return_element_text(
                            NCIT)
                        non_patent_citation_text = re.sub(
                            '<[^>]+>', '', non_patent_citation_text)

                        # Parse citation category into code
                        ncitation_category = ET.tostring(NCIT, encoding="unicode")
                        if len(B562) > 1:
                            ncitation_category = list(B562)[1].tag.replace(
                                "\n", "").replace("\r", "")
                            #print type(ncitation_category)
                            #rint "Non patent citation category" + ncitation_category
                        if "CITED-BY-EXAMINER" in ncitation_category:
                            ncitation_category = 1
                        elif "CITED-BY-OTHER" in ncitation_category:
                            ncitation_category = 2
                        else:
                            ncitation_category = 0

                    # Append SQL data into dictionary to be written later
                    processed_nonpatcit.append({
                        "table_name":
                        "uspto.NONPATCIT_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "Citation":
                        non_patent_citation_text,
                        "Category":
                        ncitation_category,
                        "FileName":
                        args_array['file_name']
                    })

                    position += 1

            # Collect number of claims
            for B570 in B500.findall('B570'):
                try:
                    claims_num = USPTOSanitizer.return_element_text(
                        B570.find('B577'))
                except:
                    claims_num = None

            # Collect number of drawings and figures
            for B590 in B500.findall('B590'):
                for B595 in B590.findall('B595'):
                    try:
                        number_of_drawings = USPTOSanitizer.return_element_text(
                            B595)
                        number_of_drawings = number_of_drawings.split("/")[0]
                    except:
                        number_of_drawings = None
                for B596 in B590.findall('B596'):
                    try:
                        number_of_figures = USPTOSanitizer.return_element_text(
                            B596)
                    except:
                        number_of_figures = None

            # TODO: B582 find out what it is.  Looks like patent classifications but it's all alone in the XML

        # Collect party information
        # TODO: find the applicant data and append to array
        for B700 in r.findall('B700'):  #PARTIES

            # Collect inventor data
            for B720 in B700.findall('B720'):  #INVENTOR
                # Reset position for inventors
                position = 1

                # Collect inventor information
                for B721 in B720.findall('B721'):
                    for i in B721.findall('PARTY-US'):
                        itSequence = position
                        try:
                            inventor_first_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('FNM'))[:100]
                        except:
                            inventor_first_name = None
                        try:
                            inventor_last_name = USPTOSanitizer.return_element_text(
                                i.find('NAM').find('SNM'))[:100]
                        except:
                            inventor_last_name = None
                        try:
                            inventor_city = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('CITY'))[:100]
                        except:
                            inventor_city = None
                        try:
                            inventor_state = USPTOSanitizer.return_element_text(
                                i.find('ADR').find('STATE'))[:3]
                        except:
                            inventor_state = None
                        # Inventor country
                        try:
                            inventor_country = USPTOSanitizer.return_element_text(
                                i.find("ADR").find('CTRY'))[:3]
                        except:
                            try:
                                if USPTOSanitizer.is_US_state(inventor_state):
                                    inventor_country = "US"
                                else:
                                    inventor_country = None
                            except:
                                inventor_country = None
                        inventor_nationality = None
                        inventor_residence = None

                    # Append SQL data into dictionary to be written later
                    processed_inventor.append({
                        "table_name":
                        "uspto.INVENTOR_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "FirstName":
                        inventor_first_name,
                        "LastName":
                        inventor_last_name,
                        "City":
                        inventor_city,
                        "State":
                        inventor_state,
                        "Country":
                        inventor_country,
                        "Nationality":
                        inventor_nationality,
                        "Residence":
                        inventor_residence,
                        "FileName":
                        args_array['file_name']
                    })

                    position += 1

            # Collect Assignee data
            # TODO: check if finding child of child is working
            # Reset position for assignees
            position = 1
            for B730 in B700.findall('B730'):
                for B731 in B730.findall('B731'):
                    for x in B731.findall('PARTY-US'):
                        try:
                            asn_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM"))[:500]
                        except:
                            asn_orgname = None
                        asn_role = None
                        try:
                            asn_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY'))[:100]
                        except:
                            asn_city = None
                        try:
                            asn_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE'))[:30]
                        except:
                            asn_state = None
                        # Assignee country
                        try:
                            asn_country = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CTRY'))[:3]
                        except:
                            try:
                                if USPTOSanitizer.is_US_state(asn_state):
                                    asn_country = "US"
                                else:
                                    asn_country = None
                            except:
                                asn_country = None

                    # Append SQL data into dictionary to be written later
                    processed_assignee.append({
                        "table_name":
                        "uspto.ASSIGNEE_G",
                        "GrantID":
                        document_id,
                        "Position":
                        position,
                        "OrgName":
                        asn_orgname,
                        "Role":
                        asn_role,
                        "City":
                        asn_city,
                        "State":
                        asn_state,
                        "Country":
                        asn_country,
                        "FileName":
                        args_array['file_name']
                    })

                    # Increment the position placement
                    position += 1

            # Collect agent data
            for B740 in B700.findall('B740'):
                # Reset position for agents
                position = 1
                for B741 in B740.findall('B741'):
                    for x in B741.findall('PARTY-US'):
                        try:
                            agent_orgname = USPTOSanitizer.return_element_text(
                                x.find('NAM').find("ONM"))[:300]
                        except:
                            agent_orgname = None
                        try:
                            agent_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM'))[:100]
                        except:
                            agent_first_name = None
                        try:
                            agent_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM'))[:100]
                        except:
                            agent_last_name = None
                        # Attorney Address information
                        try:
                            agent_city = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CITY'))[:100]
                        except:
                            agent_city = None
                        try:
                            agent_state = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('STATE'))[:30]
                        except:
                            agent_state = None
                        # Agent country
                        try:
                            agent_country = USPTOSanitizer.return_element_text(
                                x.find("ADR").find('CTRY'))[:3]
                        except:
                            try:
                                if USPTOSanitizer.is_US_state(agent_state):
                                    agent_country = "US"
                                else:
                                    agent_country = None
                            except:
                                agent_country = None

                        # Append SQL data into dictionary to be written later
                        processed_agent.append({
                            "table_name":
                            "uspto.AGENT_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "OrgName":
                            agent_orgname,
                            "LastName":
                            agent_last_name,
                            "FirstName":
                            agent_first_name,
                            "Country":
                            agent_country,
                            "FileName":
                            args_array['file_name']
                        })

                        position += 1

            # Collect examiner data
            for B745 in B700.findall('B745'):
                position = 1
                # Primary Examiner
                for B746 in B745.findall('B746'):
                    for x in B746.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM'))[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM'))[:50]
                        except:
                            examiner_first_name = None
                        #TODO: find out if 748US is the department
                        examiner_department = None

                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name":
                            "uspto.EXAMINER_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "LastName":
                            examiner_last_name,
                            "FirstName":
                            examiner_first_name,
                            "Department":
                            examiner_department,
                            "FileName":
                            args_array['file_name']
                        })

                        position += 1

                # Assistant Examiner
                for B747 in B745.findall('B747'):
                    for x in B747.findall('PARTY-US'):
                        try:
                            examiner_last_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('SNM'))[:50]
                        except:
                            examiner_last_name = None
                        try:
                            examiner_first_name = USPTOSanitizer.return_element_text(
                                x.find('NAM').find('FNM'))[:50]
                        except:
                            examiner_first_name = None
                        #TODO: find out if 748US is the department
                        examiner_department = None

                        # Append SQL data into dictionary to be written later
                        processed_examiner.append({
                            "table_name":
                            "uspto.EXAMINER_G",
                            "GrantID":
                            document_id,
                            "Position":
                            position,
                            "LastName":
                            examiner_last_name,
                            "FirstName":
                            examiner_first_name,
                            "Department":
                            examiner_department,
                            "FileName":
                            args_array['file_name']
                        })

                        position += 1

        # Collect foreign priority data
        position = 1
        for B300 in r.findall('B300'):
            # Country
            try:
                pc_country = USPTOSanitizer.return_element_text(
                    B300.find('B330').find('CTRY'))[:5]
            except:
                pc_country = None
            # Priority filing date
            try:
                pc_date = USPTOSanitizer.return_formatted_date(
                    USPTOSanitizer.return_element_text(
                        B300.find('B320').find('DATE'))[:45],
                    args_array, document_id)
            except:
                pc_date = None
            # Priority document number
            try:
                pc_doc_num = USPTOSanitizer.return_element_text(
                    B300.find('B310').find('DNUM'))[:45]
            except:
                pc_doc_num = None

            # Set the fields that are not in gXML2
            pc_kind = None

            # Append SQL data into dictionary to be written later
            processed_foreignpriority.append({
                "table_name":
                "uspto.FOREIGNPRIORITY_G",
                "GrantID":
                document_id,
                "Position":
                position,
                "Kind":
                pc_kind,
                "Country":
                pc_country,
                "DocumentID":
                pc_doc_num,
                "PriorityDate":
                pc_date,
                "FileName":
                args_array['file_name']
            })
            #print(processed_foreignpriority)
            # Increment Position
            position += 1

        # Collect Abstract from data
        try:
            abstr = patent_root.find('SDOAB')
            abstract = USPTOSanitizer.return_element_text(abstr).strip()
            #print abstract
        except:
            abstract = None

        # Collect claims from data
        try:
            cl = patent_root.find('SDOCL')
            claims = USPTOSanitizer.return_element_text(cl)
            #print claims
        except:
            traceback.print_exc()
            claims = None

        # Append SQL data into dictionary to be written later
        processed_grant.append({
            "table_name": "uspto.GRANT",
            "GrantID": document_id,
            "Title": title,
            "IssueDate": pub_date,
            "Kind": kind,
            "GrantLength": grant_length,
            "USSeriesCode": series_code,
            "Abstract": abstract,
            "ClaimsNum": claims_num,
            "DrawingsNum": number_of_drawings,
            "FiguresNum": number_of_figures,
            "ApplicationID": app_no,
            "Claims": claims,
            "FileDate": app_date,
            "AppType": app_type,
            "FileName": args_array['file_name']
        })

    # Return a dictionary of the processed_ data arrays
    return {
        "processed_grant": processed_grant,
        "processed_applicant": processed_applicant,
        "processed_examiner": processed_examiner,
        "processed_assignee": processed_assignee,
        "processed_agent": processed_agent,
        "processed_inventor": processed_inventor,
        "processed_usclass": processed_usclass,
        "processed_intclass": processed_intclass,
        "processed_gracit": processed_gracit,
        "processed_forpatcit": processed_forpatcit,
        "processed_nonpatcit": processed_nonpatcit,
        "processed_foreignpriority": processed_foreignpriority
    }
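
Finally, a hedged sketch of driving the grant parser above: raw_data is expected to be a single <PATDOC>...</PATDOC> segment taken from a bulk Red Book grant file, and the function reads args_array['url_link'], args_array['uspto_xml_format'] and args_array['file_name']. The file path and the args_array values below are placeholders for illustration, not the project's actual configuration.

# Hypothetical driver for extract_XML2_grant() (illustrative only)
# patdoc_segment.xml is a placeholder path holding one <PATDOC>...</PATDOC> segment
with open("patdoc_segment.xml", "r", encoding="utf-8") as f:
    raw_data = f.read()

args_array = {
    "url_link": "https://bulkdata.uspto.gov/",   # placeholder source URL, used only in log messages
    "uspto_xml_format": "gXML2",                 # format label carried in args_array
    "file_name": "pftaps19980106_wk01",          # recorded in every output row
}

parsed = extract_XML2_grant(raw_data, args_array)
# Each value in the result is a list of row dictionaries tagged with a destination
# table, e.g. the "uspto.GRANT" rows carry GrantID, Title, Claims and so on
for grant_row in parsed["processed_grant"]:
    print(grant_row["GrantID"], grant_row["Title"])
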