def verify_link_file(args_array):

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Download the file and append temp location to args array
    args_array['temp_zip_file_name'] = USPTOProcessLinks.download_zip_file(args_array)
    # Route to the correct extraction function
    counts_dict = verification_extract_data_router(args_array)

    # Store the expected tag counts in database
    if counts_dict:
        file_processed_success = args_array['database_connection'].storeVerificationExtraction(counts_dict, args_array)
        # Log the file as verified
        if file_processed_success == True:
            USPTOLogger.write_verified_log(args_array)
        else:
            # Print to stdout and log
            print("The contents of: " + args_array['file_name'] + " could not be stored into the database! Time Finished: " + time.strftime("%c"))
            logger.error("The contents of: " + args_array['file_name'] + " could not be stored into the database! Time Finished: " + time.strftime("%c"))
    else:
        # Print to stdout and log
        print("The contents of: " + args_array['file_name'] + " could not be verified. Time Finished: " + time.strftime("%c"))
        logger.error("The contents of: " + args_array['file_name'] + " could not be verified. Time Finished: " + time.strftime("%c"))

    # Print to stdout and log
    print("-- Finished the verification process for contents of: " + args_array['file_name'] + " Time Finished: " + time.strftime("%c"))
    logger.info("Finished the verification process for contents of: " + args_array['file_name'] + " Time Finished: " + time.strftime("%c"))
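# Illustrative usage sketch (not part of the original module): verify_link_file
# reads and writes the args_array keys shown in the function above. The key names
# come from that usage; the example values and the db_conn handle are assumptions
# for illustration only.
#
#   args_array = {
#       "file_name": "ipg230103.zip",        # assumed example value
#       "database_connection": db_conn,      # database handle used elsewhere (assumed)
#       # ...other keys populated by the caller...
#   }
#   verify_link_file(args_array)
#   # download_zip_file() adds 'temp_zip_file_name' to args_array; on success the
#   # per-tag counts are stored and the file is written to the verified log.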
def process_XML_grant_content(args_array):

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    if "database" in args_array["command_args"]:
        # Pass the database connection to variable
        database_connection = args_array['database_connection']

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], args_array['file_name'], args_array['csv_directory'])

    # Set the start time of operation
    start_time = time.time()

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents == None or xml_file_contents == False:
        return False

    # Create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False

    # Read through the file and append lines into groups of strings.
    # Send each finished string to be parsed.
    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "gXML4":

        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)

            # This identifies the start of well-formed XML segment for patent
            # grant bibliographic information
            if "<us-patent-grant" in line:
                patent_xml_started = True
                xml_string += "<us-patent-grant>"

            # This identifies end of well-formed XML segment for single patent
            # grant bibliographic information
            elif "</us-patent-grant" in line:
                patent_xml_started = False
                xml_string += line

                # Call the function to extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreGrantData.store_grant_data(processed_data_array, args_array)

                # Reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started == True:
                # Check which type of encoding should be used to fix the line string
                xml_string += USPTOSanitizer.replace_new_html_characters(line)

    # Used for gXML2 files
    elif args_array['uspto_xml_format'] == "gXML2":

        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)

            # This identifies the start of well-formed XML segment for patent
            # grant bibliographic information
            if "<PATDOC" in line:
                patent_xml_started = True
                xml_string += "<PATDOC>"

            # This identifies end of well-formed XML segment for single patent
            # grant bibliographic information
            elif "</PATDOC" in line:
                patent_xml_started = False
                xml_string += line

                # Call the function to extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreGrantData.store_grant_data(processed_data_array, args_array)

                # Reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent grant
            elif patent_xml_started == True:
                # Check which type of encoding should be used to fix the line string
                xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Close all the open .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds.
    # This should be True in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        database_connection.remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Load CSV file into database
        file_processed = database_connection.load_csv_bulk_data(args_array)

    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Delete all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)

        # Print message to stdout and log
        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))

        # Return file_processed as success status
        return file_processed
    else:
        # Print message to stdout and log
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))

        # Return None as failed status during database insertion
        return None
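# Minimal, self-contained sketch of the line-accumulation approach used above:
# collect lines between the opening and closing grant tags and yield each complete
# document string for parsing. Tag handling mirrors the gXML4 branch; the helper
# itself is illustrative only and is not part of the original module.
def _iter_grant_documents(lines, open_tag="<us-patent-grant", close_tag="</us-patent-grant"):
    doc = ''
    inside = False
    for line in lines:
        if open_tag in line:
            # Start a new document with a clean opening tag
            inside = True
            doc = "<us-patent-grant>"
        elif close_tag in line:
            # Closing tag seen: the accumulated string is one well-formed document
            inside = False
            doc += line
            yield doc
            doc = ''
        elif inside:
            # Inside a document: keep appending body lines
            doc += line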
def process_PAIR_content(args_array):

    # Set the start time of operation
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Extract the .CSV file from the ZIP file
    csv_file_name = USPTOProcessZipFile.extract_csv_file_from_zip(args_array)
    # If csv_file_name is None or False, then return immediately
    if csv_file_name == None or csv_file_name == False:
        return False

    # Set a flag based on filename to call the extraction function
    args_array['extraction_type'] = set_extraction_type(csv_file_name)
    csv_output_filename = set_csv_output_filename(csv_file_name)

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], csv_output_filename, args_array['csv_directory'], args_array['extraction_type'])

    # Open file in read mode
    with open(csv_file_name, 'r') as read_obj:
        # Pass the file object to reader() to get the reader object
        csv_reader = reader(read_obj)
        # Iterate over each row in the csv using reader object
        line_cnt = 0
        for line in csv_reader:
            # Skip the header row
            if line_cnt != 0:
                # Extract the line into array
                processed_data_array = extract_csv_line(args_array, line)
                # Store the array into newly formatted CSV
                USPTOStorePAIRData.store_PAIR_data(processed_data_array, args_array)
            line_cnt += 1

    # If not sandbox mode, then delete the .zip file
    if args_array['sandbox'] == False and os.path.exists(args_array['temp_zip_file_name']):
        # Print message to stdout and log
        print('[Purging .zip file ' + args_array['temp_zip_file_name'] + '...]')
        logger.info('Purging .zip file ' + args_array['temp_zip_file_name'] + '...')
        os.remove(args_array['temp_zip_file_name'])

    # Close all the open .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds.
    # This should be True in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        args_array['database_connection'].remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Loop through each csv file and bulk copy into database
        for key, csv_file in list(args_array['csv_file_array'].items()):
            # Only load csv file to database if it is for this instance
            if key == args_array['extraction_type']:
                # Load CSV file into database
                file_processed = args_array['database_connection'].load_csv_bulk_data(args_array, key, csv_file)

    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Delete all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)

        # Print message to stdout and log
        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))

        # Return file_processed as success status
        return file_processed
    else:
        # Print message to stdout and log
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))

        # Return None as failed status during database insertion
        return None
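# Note on the CSV read loop above: the bare reader() call assumes a module-level
# "from csv import reader". An equivalent, header-skipping form of the loop is
# sketched below for illustration only (csv_file_name is the value returned by
# extract_csv_file_from_zip inside the function).
#
#   from csv import reader
#   with open(csv_file_name, 'r') as read_obj:
#       rows = reader(read_obj)
#       next(rows, None)            # skip the header row instead of counting lines
#       for row in rows:
#           ...                     # extract and store each data row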
def process_XML_application_content(args_array):

    # Process zip file by getting .dat or .txt file and .xml filenames
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], args_array['file_name'], args_array['csv_directory'])

    # Extract the XML file from the ZIP file
    xml_file_contents = USPTOProcessZipFile.extract_xml_file_from_zip(args_array)
    # If xml_file_contents is None or False, then return immediately
    if xml_file_contents == None or xml_file_contents == False:
        return False

    # Create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False

    # Read through the file and append lines into groups of strings.
    # Send each finished string to be parsed.
    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "aXML4":

        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)

            # This identifies the start of well-formed XML segment for patent
            # application bibliographic information
            if "<us-patent-application" in line:
                patent_xml_started = True
                xml_string += "<us-patent-application>"

            # This identifies end of well-formed XML segment for single patent
            # application bibliographic information
            elif "</us-patent-application" in line:
                patent_xml_started = False
                xml_string += "</us-patent-application>"

                # Call the function to extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)

                # Reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent application
            elif patent_xml_started == True:
                xml_string += USPTOSanitizer.replace_new_html_characters(line)

    elif args_array['uspto_xml_format'] == "aXML1":

        line_count = 1
        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # Decode the line from byte-object
            line = USPTOSanitizer.decode_line(line)

            # This identifies the start of well-formed XML segment for patent
            # application bibliographic information
            if "<patent-application-publication" in line:
                patent_xml_started = True
                xml_string += "<patent-application-publication>"

            # This identifies end of well-formed XML segment for single patent
            # application bibliographic information
            elif "</patent-application-publication" in line:
                patent_xml_started = False
                xml_string += "</patent-application-publication>"

                # Call the function to extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)

                # Reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent application
            elif patent_xml_started == True:
                xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Close all the .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds.
    # This should be True in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        args_array['database_connection'].remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Loop through each csv file and bulk copy into database
        for key, csv_file in list(args_array['csv_file_array'].items()):
            # Load CSV file into database
            file_processed = args_array['database_connection'].load_csv_bulk_data(args_array, key, csv_file)

    # If the file was successfully processed into the database
    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Delete all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)

        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))

        # Return the file processed status
        return file_processed
    else:
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))

        # Return None to show database insertion failed
        return None
def process_class_content(args_array):

    # Set the start time of operation
    start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Set the extraction type
    args_array['extraction_type'] = set_extraction_type(args_array['uspto_xml_format'])

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], args_array['file_name'], args_array['csv_directory'], args_array['extraction_type'])

    # Check the classification filetype code and process accordingly
    if args_array['uspto_xml_format'] == "USCLS":

        # Open file in read mode
        with open(args_array['url_link'], 'r') as read_obj:
            # Iterate over each line in the file
            for line in read_obj:
                # Extract the line into array
                processed_data_array = return_US_class_dict(line)
                processed_data_array['FileName'] = args_array['file_name']
                # Store the array into newly formatted CSV
                class_id = str(processed_data_array['Class']) + " " + str(processed_data_array['SubClass'])
                USPTOStoreClassificationData.store_classification_data(processed_data_array, args_array, class_id)

    # Titles for CPC classifications
    elif args_array['uspto_xml_format'] == "CPCCLS":

        extraction_type = "cpc"
        # Open file in read mode
        with open(args_array['url_link'], 'r') as read_obj:
            # Pass the file object to reader() to get the reader object
            csv_reader = reader(read_obj)
            # Iterate over each row in the csv using reader object
            line_cnt = 0
            for line in csv_reader:
                # Skip the header row
                if line_cnt != 0:
                    # Extract the line into array
                    processed_data_array = extract_CPC_class_dict(line)
                    # Store the array into newly formatted CSV
                    processed_data_array['FileName'] = args_array['file_name']
                    class_id = str(processed_data_array['Section']) + str(processed_data_array['Class']) + str(processed_data_array['SubClass']) + " " + str(processed_data_array['MainGroup']) + "/" + str(processed_data_array['SubGroup'])
                    USPTOStoreClassificationData.store_classification_data(processed_data_array, args_array, class_id)
                line_cnt += 1

    # USPC to CPC classification concordance table
    elif args_array['uspto_xml_format'] == "USCPCCLS":

        # Open file in read mode
        with open(args_array['url_link'], 'r') as read_obj:
            # Pass the file object to reader() to get the reader object
            csv_reader = reader(read_obj)
            # Iterate over each row in the csv using reader object
            line_cnt = 0
            for line in csv_reader:
                # Skip the header row
                if line_cnt != 0:
                    # Extract the line into array
                    processed_data_array = extract_USCPC_class_dict(line, args_array['file_name'])
                    if len(processed_data_array) != 0:
                        # Store the array into newly formatted CSV
                        class_id = str(processed_data_array[0]['USClass'])
                        USPTOStoreClassificationData.store_classification_data(processed_data_array, args_array, class_id)
                line_cnt += 1

    # Set a flag file_processed to ensure that the bulk insert succeeds.
    # This should be True in case the database insertion method is not bulk
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if "database" in args_array["command_args"] and args_array['database_insert_mode'] == 'bulk':
        # Check for previous attempt to process the file and clean database if required
        args_array['database_connection'].remove_previous_file_records(args_array['document_type'], args_array['file_name'])
        # Loop through each csv file and bulk copy into database
        for key, csv_file in list(args_array['csv_file_array'].items()):
            # Load CSV file into database
            file_processed = args_array['database_connection'].load_csv_bulk_data(args_array, key, csv_file)

    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Delete all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)

        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.info('Loaded {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))

        # Return file_processed as success status
        return file_processed
    else:
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3}'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))

        # Return None as failed status during database insertion
        return None
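# For reference on the class_id strings built above: in the CPCCLS branch, a row
# whose Section/Class/SubClass/MainGroup/SubGroup fields were "A", "01", "B", "1",
# "00" would yield the id "A01B 1/00", while the USCLS branch joins Class and
# SubClass with a space. The field values here are illustrative examples only,
# not taken from any specific classification file.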
"app_config_file": app_config_file, "allowed_args_array": allowed_args_array, "log_lock_file": log_lock_file, "classification_process_log_file": classification_process_log_file, "classification_text_filename": classification_text_filename, "grant_process_log_file": grant_process_log_file, "application_process_log_file": application_process_log_file, "application_pair_process_log_file": application_pair_process_log_file, "pair_process_log_file": pair_process_log_file, "temp_directory": app_temp_dirpath, "csv_directory": app_csv_dirpath, "sandbox_downloads_dirpath": sandbox_downloads_dirpath } # Setup logger USPTOLogger.setup_logger(args_array['log_level'], app_log_file) # Include logger in the main function logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction") # Perform analysis of command line args and store in args_array args_array["command_args"] = build_command_arguments(sys.argv, args_array) # If command_args are checked OK! Start app if args_array["command_args"]: # Print the ASCII header print_ascii_header() # Set saved app configuration based on current command arguments # and collect existing config settings from file and append to args_array args_array = set_config_using_command_args(args_array)
def process_XML_application_content(args_array):

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # If csv file insertion is required, then open all the files
    # into args_array
    if "csv" in args_array['command_args'] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):
        args_array['csv_file_array'] = USPTOCSVHandler.open_csv_files(args_array['document_type'], args_array['file_name'], args_array['csv_directory'])

    # Process zip file by getting .dat or .txt file and .xml filenames
    start_time = time.time()

    # Extract the XML file from the ZIP file into an array of lines
    xml_file_contents = USPTOProcessZipFile.extract_zip_to_array(args_array)

    # Create variables needed to parse the file
    xml_string = ''
    patent_xml_started = False

    # Read through the file and append lines into groups of strings.
    # Send each finished string to be parsed.
    # Use uspto_xml_format to determine file contents and parse accordingly
    if args_array['uspto_xml_format'] == "aXML4":

        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # This identifies the start of well-formed XML segment for patent
            # application bibliographic information
            if "<us-patent-application" in line:
                patent_xml_started = True
                xml_string += line

            # This identifies end of well-formed XML segment for single patent
            # application bibliographic information
            elif "</us-patent-application" in line:
                patent_xml_started = False
                xml_string += line

                # Call the function to extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)

                # Reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent application
            elif patent_xml_started == True:
                xml_string += USPTOSanitizer.replace_new_html_characters(line)

    elif args_array['uspto_xml_format'] == "aXML1":

        line_count = 1
        # Loop through all lines in the xml file
        for line in xml_file_contents:

            # This identifies the start of well-formed XML segment for patent
            # application bibliographic information
            if "<patent-application-publication" in line:
                patent_xml_started = True
                xml_string += line

            # This identifies end of well-formed XML segment for single patent
            # application bibliographic information
            elif "</patent-application-publication" in line:
                patent_xml_started = False
                xml_string += line

                # Call the function to extract data
                processed_data_array = USPTOProcessLinks.extract_data_router(xml_string, args_array)
                # Call function to write data to csv or database
                USPTOStoreApplicationData.store_application_data(processed_data_array, args_array)

                # Reset the xml string
                xml_string = ''

            # This is used to append lines of file when inside single patent application
            elif patent_xml_started == True:
                xml_string += USPTOSanitizer.replace_old_html_characters(line)

    # Close all the .csv files being written to
    USPTOCSVHandler.close_csv_files(args_array)

    # Set a flag file_processed to ensure that the bulk insert succeeds
    file_processed = True

    # If data is to be inserted as bulk csv files, then call the sql function
    if args_array['database_insert_mode'] == 'bulk':
        file_processed = args_array['database_connection'].load_csv_bulk_data(args_array, logger)

    # If the file was successfully processed into the database
    if file_processed:
        # Send the information to USPTOLogger.write_process_log to have log file rewritten to "Processed"
        USPTOLogger.write_process_log(args_array)
        if "csv" not in args_array['command_args']:
            # Delete all the open csv files
            USPTOCSVHandler.delete_csv_files(args_array)

        # Print message to stdout and log
        print('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.info('[Loaded {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
    else:
        # Print message to stdout and log
        print('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))
        logger.error('[Failed to bulk load {0} data for {1} into database. Time:{2} Finished Time: {3} ]'.format(args_array['document_type'], args_array['url_link'], time.time() - start_time, time.strftime("%c")))