Python USPTOSanitizer.decode_line 예제들

프로그래밍 언어: Python

클래스/타입: USPTOSanitizer

메소드/함수: decode_line

hotexamples.com에서의 예제들: 6

Python USPTOSanitizer.decode_line - 예제 6개를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python USPTOSanitizer.decode_line의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

return_element_text(9)

return_formatted_date(8)

strip_leading_zeros(6)

decode_line(6)

fix_patent_number(6)

replace_old_html_characters(5)

strip_for_csv(4)

return_class(4)

check_tag_exists(4)

is_US_state(4)

replace_new_html_characters(3)

return_xml2_app_type(3)

clean_PAIR_csv_item(2)

return_CPC_class_application(1)

return_class_XML4_grant(1)

extract_BQ_CPC_string_to_dict(1)

return_international_class(1)

return_international_class_XML1_application(1)

escape_value_for_sql(1)

return_US_class_XML4_application(1)

예제 #1

파일 보기

파일: USPTOProcessXMLGrant.py 프로젝트: bioinfonerd-forks/uspto

def _store_grant_documents(xml_file_contents, args_array, start_marker, end_marker, root_tag, fix_entities):
    """Cut the file into single-grant XML documents and store each one.

    A document starts on the line containing ``start_marker`` and ends on the
    line containing ``end_marker``.  The starting line itself is not kept; the
    bare ``root_tag`` is used in its place.  The ending line is appended as-is
    and every line in between is passed through ``fix_entities``.  Each
    finished document is handed to ``USPTOProcessLinks.extract_data_router``
    and the result to ``USPTOStoreGrantData.store_grant_data``.
    """
    document_lines = []
    inside_document = False

    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)

        if start_marker in line:
            # Start of the XML segment for a single patent grant
            inside_document = True
            document_lines.append(root_tag)

        elif end_marker in line:
            # End of the XML segment for a single patent grant
            inside_document = False
            document_lines.append(line)
            # Extract the data, then write it to csv or database
            processed_data_array = USPTOProcessLinks.extract_data_router("".join(document_lines), args_array)
            USPTOStoreGrantData.store_grant_data(processed_data_array, args_array)
            # Start collecting the next document from scratch
            document_lines = []

        elif inside_document:
            # Lines outside of a grant document are ignored
            document_lines.append(fix_entities(line))


def process_XML_grant_content(args_array):
    """Parse one USPTO grant XML file and store every grant document in it.

    Reads from ``args_array``: ``command_args``, ``database_connection`` and
    ``database_insert_mode`` (when "database" is requested), ``document_type``,
    ``file_name``, ``csv_directory``, ``uspto_xml_format`` ("gXML4" or "gXML2";
    any other value stores nothing) and ``url_link``.  Writes
    ``args_array['csv_file_array']`` when csv output or bulk insertion is
    requested.

    Returns:
        False if the XML contents could not be extracted, the (truthy)
        processing status on success, or None if the bulk database load
        reported failure.
    """
    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    if "database" in args_array["command_args"]:
        # Pass the database connection to variable
        database_connection = args_array['database_connection']

    # If csv file insertion is required, then open all the files
    # into args_array
    csv_files_opened = False
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], args_array['file_name'], args_array['csv_directory'])
        csv_files_opened = True

    # Set the start time of operation
    start_time = time.time()

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents is None or xml_file_contents is False:
        # Do not leave the csv files opened above dangling on the failure path
        if csv_files_opened:
            USPTOCSVHandler.close_csv_files(args_array)
        return False

    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "gXML4":
        _store_grant_documents(
            xml_file_contents, args_array,
            "<us-patent-grant", "</us-patent-grant", "<us-patent-grant>",
            USPTOSanitizer.replace_new_html_characters,
        )
    # Used for gXML2 files
    elif args_array['uspto_xml_format'] == "gXML2":
        _store_grant_documents(
            xml_file_contents, args_array,
            "<PATDOC", "</PATDOC", "<PATDOC>",
            USPTOSanitizer.replace_old_html_characters,
        )

    # Close all the open .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds
    # This should be true, in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        database_connection.remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Load CSV file into database
        file_processed = database_connection.load_csv_bulk_data(args_array)

    if not file_processed:
        # Print message to stdout and log
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        # Return None as failed status during database insertion
        return None

    # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
    USPTOLogger.write_process_log(args_array)
    if "csv" not in args_array['command_args']:
        # The csv files were only an intermediate step, so delete them
        USPTOCSVHandler.delete_csv_files(args_array)

    # Print message to stdout and log
    print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
    logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
    # Return file_processed as success status
    return file_processed

예제 #2

파일 보기

파일: USPTOVerifyLinks.py 프로젝트: ayxemma/uspto

def extract_XML4_application_tag_counts(args_array):
    """Count, per table, the lines of an XML4 application file that hold a known tag.

    A line adds at most one to a table's count, however many of that table's
    tag markers it contains.

    Returns:
        False when the XML contents could not be extracted, otherwise a dict
        with ``file_name`` plus one integer count per table.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pull the XML contents out of the ZIP file; bail out if that failed
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    if xml_file_contents in (None, False):
        return False

    # Tag markers to look for, keyed by the table they are counted under
    tags_dict = {
        "APPLICATION" : ["<us-patent-application"],
        "INTCLASS_A" : ["<classification-ipcr"],
        "USCLASS_A" : ["<main-classification", "<further-classification"],
        "CPCCLASS_A" : ["<classification-cpc"],
        "FOREIGNPRIORITY_A" : ["<priority-claim>", "<priority-claim "],
        "AGENT_A" : ["<agent>", "<agent "],
        "ASSIGNEE_A" : ["<assignee>", "<assignee "],
        "INVENTOR_A" : ["<inventor>", "<inventor "],
        "APPLICANT_A" : ["<us-applicant>", "<applicant>", "<us-applicant ", "<applicant "]
    }

    # Every table starts at zero
    counts_dict = {"file_name" : args_array['file_name']}
    for table_name in (
        "APPLICATION",
        "INTCLASS_A",
        "USCLASS_A",
        "CPCCLASS_A",
        "FOREIGNPRIORITY_A",
        "AGENT_A",
        "ASSIGNEE_A",
        "INVENTOR_A",
        "APPLICANT_A",
    ):
        counts_dict[table_name] = 0

    # Announce the start on stdout and in the log
    print("-- Starting the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    for raw_line in xml_file_contents:
        # Decode the line from byte-object
        text = USPTOSanitizer.decode_line(raw_line)
        for table, markers in tags_dict.items():
            # Only list-valued entries are searched
            if isinstance(markers, list) and any(marker in text for marker in markers):
                counts_dict[table] += 1

    # Announce the end on stdout and in the log
    print("-- Finished the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML4 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Dump the counts to stdout when requested
    if args_array['stdout_level'] == 1:
        pprint(counts_dict)

    return counts_dict

예제 #3

파일 보기

파일: USPTOVerifyLinks.py 프로젝트: ayxemma/uspto

def extract_XML1_application_tag_counts(args_array):
    """Count tags per table in an XML1 application file.

    Two passes are made over the extracted contents.  The first counts, per
    table, the lines containing one of that table's tag markers.  The second
    rebuilds each ``patent-application-publication`` document, parses it with
    ElementTree, and adds the number of ``assignee`` and ``inventor`` elements
    found directly under ``subdoc-bibliographic-information`` to ASSIGNEE_A
    and INVENTOR_A.

    Returns:
        False when the XML contents could not be extracted, otherwise a dict
        with ``file_name`` plus one integer count per table.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pull the XML contents out of the ZIP file; bail out if that failed
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    if xml_file_contents in (None, False):
        return False

    # Tag markers keyed by table.
    # CPCCLASS_A and APPLICANT_A have no marker here, so this pass never
    # increments them; ASSIGNEE_A is only counted by the XML pass below.
    tags_dict = {
        "APPLICATION" : ["<patent-application-publication"],
        "INTCLASS_A" : ["<classification-ipc-primary>", "<classification-ipc-secondary>"],
        "USCLASS_A" : ["<classification-us-primary>", "<classification-us-secondary>"],
        "FOREIGNPRIORITY_A" : ["<priority-application-number"],
        "AGENT_A" : ["<correspondence-address>"],
        "INVENTOR_A" : ["<first-named-inventor", "<inventor>"],
    }

    # Every table starts at zero
    counts_dict = {"file_name" : args_array['file_name']}
    for table_name in (
        "APPLICATION",
        "INTCLASS_A",
        "USCLASS_A",
        "CPCCLASS_A",
        "FOREIGNPRIORITY_A",
        "AGENT_A",
        "ASSIGNEE_A",
        "INVENTOR_A",
        "APPLICANT_A",
    ):
        counts_dict[table_name] = 0

    # Announce the start on stdout and in the log
    print("-- Starting the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # First pass: plain substring search, line by line
    for raw_line in xml_file_contents:
        text = USPTOSanitizer.decode_line(raw_line)
        for table, markers in tags_dict.items():
            # Only list-valued entries are searched
            if isinstance(markers, list) and any(marker in text for marker in markers):
                counts_dict[table] += 1

    # Second pass: count the tags that need real XML parsing
    document_parts = []
    inside_document = False
    for raw_line in xml_file_contents:
        text = USPTOSanitizer.decode_line(raw_line)

        if "<patent-application-publication" in text:
            # Opening line of one application; a bare root tag is stored instead of it
            inside_document = True
            document_parts.append("<patent-application-publication>")

        elif "</patent-application-publication" in text:
            # Closing line of one application; parse what was collected
            inside_document = False
            document_parts.append("</patent-application-publication>")
            document_root = ET.fromstring("".join(document_parts))
            biblio = document_root.find('subdoc-bibliographic-information')
            counts_dict['ASSIGNEE_A'] += len(biblio.findall('assignee'))
            counts_dict['INVENTOR_A'] += len(biblio.findall('inventor'))
            # Begin the next document with an empty buffer
            document_parts = []

        elif inside_document == True:
            # Body line of the current application
            document_parts.append(USPTOSanitizer.replace_old_html_characters(text))

    # Announce the end on stdout and in the log
    print("-- Finished the XML1 appication tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML1 application tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Dump the counts to stdout when requested
    if args_array['stdout_level'] == 1:
        pprint(counts_dict)

    return counts_dict

예제 #4

파일 보기

파일: USPTOVerifyLinks.py 프로젝트: ayxemma/uspto

def extract_XML4_grant_tag_counts(args_array):
    """Count tags per table in an XML4 grant file.

    Two passes are made over the extracted contents.  The first counts, per
    table, the lines containing one of that table's tag markers.  The second
    rebuilds each ``us-patent-grant`` document, parses it with ElementTree and
    counts CPC classes, US classes and patent citations.  A patent citation
    whose country is 'US' goes to GRACIT_G; any other (including one whose
    country cannot be read) goes to FORPATCIT_G.

    Returns:
        False when the XML contents could not be extracted, otherwise a dict
        with ``file_name`` plus one integer count per table.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Pull the XML contents out of the ZIP file; bail out if that failed
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    if xml_file_contents in (None, False):
        return False

    # Tag markers to look for, keyed by the table they are counted under
    tags_dict = {
        "GRANT" : ["<us-patent-grant"],
        "INTCLASS_G" : ["<classification-ipcr"],
        "AGENT_G" : ["<agent>", "<agent "],
        "ASSIGNEE_G" : ["<assignee>", "<assignee "],
        "APPLICANT_G" : ["<us-applicant>", "<us-applicant ", "<applicant", "<applicant>"],
        "INVENTOR_G" : ["<inventor>", "<inventor ", "applicant-inventor"],
        "NONPATCIT_G" : ["<nplcit"],
        "EXAMINER_G" : ["<primary-examiner", "<assistant-examiner"],
        "FOREIGNPRIORITY_G" : ["<priority-claim>", "<priority-claim "]
    }

    # Every table starts at zero
    counts_dict = {"file_name" : args_array['file_name']}
    for table_name in (
        "GRANT",
        "INTCLASS_G",
        "CPCCLASS_G",
        "USCLASS_G",
        "INVENTOR_G",
        "AGENT_G",
        "ASSIGNEE_G",
        "APPLICANT_G",
        "NONPATCIT_G",
        "EXAMINER_G",
        "GRACIT_G",
        "FORPATCIT_G",
        "FOREIGNPRIORITY_G",
    ):
        counts_dict[table_name] = 0

    # Announce the start on stdout and in the log
    print("-- Starting the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # First pass: plain substring search, line by line
    for raw_line in xml_file_contents:
        text = USPTOSanitizer.decode_line(raw_line)
        for table, markers in tags_dict.items():
            # Only list-valued entries are searched
            if isinstance(markers, list) and any(marker in text for marker in markers):
                counts_dict[table] += 1

    # Second pass: count the tags that need real XML parsing
    document_parts = []
    inside_document = False
    for raw_line in xml_file_contents:
        text = USPTOSanitizer.decode_line(raw_line)

        if "<us-patent-grant" in text:
            # Opening line of one grant; a bare root tag is stored instead of it
            inside_document = True
            document_parts.append("<us-patent-grant>")

        elif "</us-patent-grant" in text:
            # Closing line of one grant; parse what was collected
            inside_document = False
            document_parts.append("</us-patent-grant>")
            document_root = ET.fromstring("".join(document_parts))
            biblio = document_root.find('us-bibliographic-data-grant')

            # CPC and US classes listed under us-field-of-classification-search
            search_field = biblio.find('us-field-of-classification-search')
            if search_field is not None:
                counts_dict["CPCCLASS_G"] += len(search_field.findall('classification-cpc-text'))
                counts_dict["USCLASS_G"] += len(search_field.findall('classification-national'))

            # US classes when the file format uses field-of-search instead
            search_field = biblio.find('field-of-search')
            if search_field is not None:
                counts_dict["USCLASS_G"] += len(search_field.findall('classification-national'))

            # The citation container goes by one of three names; take the first present
            references = None
            for container_name in ("us-references-cited", "references-cited", "references"):
                references = biblio.find(container_name)
                if references is not None:
                    break

            if references is not None:
                # Citations are tagged 'citation' when present, otherwise 'us-citation'
                citation_tag = "citation" if references.find('citation') is not None else "us-citation"
                for citation in references.findall(citation_tag):
                    patcit = citation.find('patcit')
                    # Only citations with a patcit child are patent citations
                    if patcit is None:
                        continue
                    # Country is None whenever document-id or country is missing
                    document_id = patcit.find('document-id')
                    citation_country = None
                    if document_id is not None:
                        country_text = document_id.findtext('country')
                        if country_text is not None:
                            citation_country = country_text.strip()
                    # US citations and everything else are counted separately
                    if citation_country == 'US':
                        counts_dict["GRACIT_G"] += 1
                    else:
                        counts_dict["FORPATCIT_G"] += 1

            # Begin the next document with an empty buffer
            document_parts = []

        elif inside_document == True:
            # Body line of the current grant, appended unmodified
            document_parts.append(text)

    # Announce the end on stdout and in the log
    print("-- Finished the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML4 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Dump the counts to stdout when requested
    if args_array['stdout_level'] == 1:
        pprint(counts_dict)

    return counts_dict

예제 #5

파일 보기

파일: USPTOVerifyLinks.py 프로젝트: ayxemma/uspto

def _count_XML2_patent_citations(document_root, counts_dict):
    """Add the patent citations of one parsed PATDOC to GRACIT_G / FORPATCIT_G.

    A citation whose country resolves to "US", or cannot be resolved at all,
    is counted under GRACIT_G; any other country is counted under FORPATCIT_G.
    Documents without SDOBI or B500 contribute nothing.
    """
    # SDOBI is the bibliographic data
    r = document_root.find('SDOBI')
    B500 = r.find('B500') if r is not None else None
    if B500 is None:
        return

    for B560 in B500.findall('B560'):
        # B561 is Patent Citation
        for B561 in B560.findall('B561'):
            try:
                pcit = B561.find('PCIT').find('DOC')
            except AttributeError:
                # No PCIT element under this citation
                pcit = None
            if pcit is None:
                continue

            prt = pcit.find('PARTY-US')
            # Any missing element below makes the lookup fail, leaving the value as None
            try:
                citation_state = USPTOSanitizer.return_element_text(prt.find('ADR').find('STATE')).strip()[:3]
            except Exception:
                citation_state = None
            try:
                citation_country = USPTOSanitizer.return_element_text(prt.find("ADR").find('CTRY')).strip()[:3]
            except Exception:
                try:
                    # If state is a US state, set country to US
                    citation_country = "US" if USPTOSanitizer.is_US_state(citation_state) else None
                except Exception:
                    citation_country = None

            if citation_country == "US" or citation_country is None:
                counts_dict['GRACIT_G'] += 1
            else:
                counts_dict['FORPATCIT_G'] += 1


def extract_XML2_grant_tag_counts(args_array):
    """Count tags per table in an XML2 grant file.

    Two passes are made over the extracted contents.  The first counts, per
    table, the lines containing one of that table's tag markers.  The second
    rebuilds each ``PATDOC`` document, parses it with ElementTree and counts
    its patent citations.  A document that fails to parse is logged line by
    line and skipped; the remaining documents are still processed.

    Reads ``file_name`` and ``stdout_level`` from ``args_array``.

    Returns:
        False when the XML contents could not be extracted, otherwise a dict
        with ``file_name`` plus one integer count per table.
    """
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents is None or xml_file_contents is False:
        return False

    # Declare a dictionary to use in counting tags
    # NOTE: CPCClASS_G, APPLICANT_G, are not available in XML2 Grant files
    tags_dict = {
        "GRANT" : ["<PATDOC"],
        "INTCLASS_G" : ["<B510"],
        "USCLASS_G" : ["<B521", "<B522"],
        "INVENTOR_G" : ["<B721"],
        "AGENT_G" : ["<B740"],
        "ASSIGNEE_G" : ["<B730"],
        "NONPATCIT_G" : ["<B562"],
        "EXAMINER_G" : ["<B746", "<B747"],
        "FOREIGNPRIORITY_G" : ["<B310"]
    }

    # Declare a dictionary to hold counts by table
    counts_dict = {
        "file_name" : args_array['file_name'],
        "GRANT" : 0,
        "INTCLASS_G" : 0,
        "CPCCLASS_G" : 0,
        "USCLASS_G" : 0,
        "INVENTOR_G" : 0,
        "AGENT_G" : 0,
        "ASSIGNEE_G" : 0,
        "APPLICANT_G" : 0,
        "NONPATCIT_G" : 0,
        "EXAMINER_G" : 0,
        "GRACIT_G" : 0,
        "FORPATCIT_G" : 0,
        "FOREIGNPRIORITY_G" : 0
    }

    # Print to stdout and log
    print("-- Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))
    logger.info("Starting the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Started: " + time.strftime("%c"))

    # First pass: a line adds at most one to each table whose tag it contains
    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)
        for table, tags in tags_dict.items():
            if any(tag in line for tag in tags):
                counts_dict[table] += 1

    # Second pass: count the items that cannot be counted by only tags
    xml_string = ''
    patent_xml_started = False
    for line in xml_file_contents:

        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)

        # Start of the XML segment for a single patent grant
        if "<PATDOC" in line:
            patent_xml_started = True
            xml_string += "<PATDOC>"

        # End of the XML segment for a single patent grant
        elif "</PATDOC" in line:
            patent_xml_started = False
            xml_string += "</PATDOC>"
            try:
                # Pass the raw data into Element Tree
                document_root = ET.fromstring(xml_string)
                _count_XML2_patent_citations(document_root, counts_dict)

            except ET.ParseError:
                # Log the offending document with line numbers
                for num, xml_line in enumerate(xml_string.split("\n"), start = 1):
                    logger.error(str(num) + ' : ' + xml_line)
                logger.error("Character Entity prevented ET from parsing XML in file: " + args_array['file_name'] )
                traceback.print_exc()
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())

            finally:
                # Always reset the buffer, even after a parse failure; otherwise
                # the broken document would be prepended to every following one
                # and make each of them fail to parse as well
                xml_string = ''

        # Body line of the current patent grant
        elif patent_xml_started:
            xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Print to stdout and log
    print("-- Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))
    logger.info("Finished the XML2 grant tag counting process for contents of: " + args_array['file_name'] + ". Time Finished: " + time.strftime("%c"))

    # Print the counts to stdout when requested, then return them
    if args_array['stdout_level'] == 1:
        pprint(counts_dict)
    return counts_dict

예제 #6

파일 보기

def _store_application_documents(xml_file_contents, args_array, start_marker, end_marker, root_open, root_close, fix_entities):
    """Cut the file into single-application XML documents and store each one.

    A document starts on the line containing ``start_marker`` and ends on the
    line containing ``end_marker``.  Neither of those lines is kept; the bare
    ``root_open`` / ``root_close`` tags are used in their place.  Every line in
    between is passed through ``fix_entities``.  Each finished document is
    handed to ``USPTOProcessLinks.extract_data_router`` and the result to
    ``USPTOStoreApplicationData.store_application_data``.
    """
    document_lines = []
    inside_document = False

    for line in xml_file_contents:
        # Decode the line from byte-object
        line = USPTOSanitizer.decode_line(line)

        if start_marker in line:
            # Start of the XML segment for a single patent application
            inside_document = True
            document_lines.append(root_open)

        elif end_marker in line:
            # End of the XML segment for a single patent application
            inside_document = False
            document_lines.append(root_close)
            # Extract the data, then write it to csv or database
            processed_data_array = USPTOProcessLinks.extract_data_router("".join(document_lines), args_array)
            USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)
            # Start collecting the next document from scratch
            document_lines = []

        elif inside_document:
            # Lines outside of an application document are ignored
            document_lines.append(fix_entities(line))


def process_XML_application_content(args_array):
    """Parse one USPTO application XML file and store every application in it.

    Reads from ``args_array``: ``command_args``, ``database_connection`` and
    ``database_insert_mode`` (when "database" is requested), ``document_type``,
    ``file_name``, ``csv_directory``, ``uspto_xml_format`` ("aXML4" or "aXML1";
    any other value stores nothing) and ``url_link``.  Writes
    ``args_array['csv_file_array']`` when csv output or bulk insertion is
    requested.

    Returns:
        False if the XML contents could not be extracted, the (truthy)
        processing status on success, or None if any bulk database load
        reported failure.
    """
    # Set the start time of operation
    start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # If csv file insertion is required, then open all the files
    # into args_array
    csv_files_opened = False
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], args_array['file_name'], args_array['csv_directory'])
        csv_files_opened = True

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)

    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents is None or xml_file_contents is False:
        # Do not leave the csv files opened above dangling on the failure path
        if csv_files_opened:
            USPTOCSVHandler.close_csv_files(args_array)
        return False

    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "aXML4":
        _store_application_documents(
            xml_file_contents, args_array,
            "<us-patent-application", "</us-patent-application",
            "<us-patent-application>", "</us-patent-application>",
            USPTOSanitizer.replace_new_html_characters,
        )
    elif args_array['uspto_xml_format'] == "aXML1":
        _store_application_documents(
            xml_file_contents, args_array,
            "<patent-application-publication", "</patent-application-publication",
            "<patent-application-publication>", "</patent-application-publication>",
            USPTOSanitizer.replace_old_html_characters,
        )

    # Close the all the .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds
    # This should be true, in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        args_array['database_connection'].remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Loop through each csv file and bulk copy into database
        any_load_failed = False
        for key, csv_file in list(args_array['csv_file_array'].items()):
            # Load CSV file into database
            file_processed = args_array['database_connection'].load_csv_bulk_data(args_array, key, csv_file)
            if not file_processed:
                any_load_failed = True
        # A later successful load must not hide an earlier failed one
        if any_load_failed:
            file_processed = False

    # If any bulk load failed, report it and stop here
    if not file_processed:
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        # Return None to show database insertion failed
        return None

    # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
    USPTOLogger.write_process_log(args_array)
    if "csv" not in args_array['command_args']:
        # The csv files were only an intermediate step, so delete them
        USPTOCSVHandler.delete_csv_files(args_array)

    print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
    logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
    # Return the file processed status
    return file_processed