def store_classification_data(processed_data_array, args_array, class_id):

    # Set process start time
    start_time = time.time()

    # Extract some variables from args_array
    file_name = args_array['file_name']
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # If the arguments specify csv storage, or csv is needed for bulk database insertion
    if "csv" in args_array["command_args"] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):

        # Process a single classification record into the already opened
        # csv.DictWriter object stored in args_array.
        # The table name must be appended to the dictionary for later processing.
        if args_array['stdout_level'] == 1:
            # Print start message to stdout and log
            print('- Starting to write {0} to .csv file {1} for document: {2}. Start Time: {3}'.format(args_array['document_type'], file_name, class_id, time.strftime("%c")))

        # The try/except avoids failing the whole file when HTML-entity
        # characters or another error occur in a single record
        try:
            if isinstance(processed_data_array, list):
                for item in processed_data_array:
                    # Move the table name to a temp variable and remove it from the record
                    table_name = item['table_name']
                    extraction_type = item['extraction_type']
                    del item['table_name']
                    del item['extraction_type']
                    # Write the dictionary of document data to the .csv file
                    args_array['csv_file_array'][extraction_type]['csv_writer'].writerow(item)
            else:
                # Move the table name to a temp variable and remove it from the record
                table_name = processed_data_array['table_name']
                extraction_type = processed_data_array['extraction_type']
                del processed_data_array['table_name']
                del processed_data_array['extraction_type']
                # Write the dictionary of document data to the .csv file
                args_array['csv_file_array'][extraction_type]['csv_writer'].writerow(processed_data_array)
            # Attach the table name to the csv file record
            args_array['csv_file_array'][extraction_type]['table_name'] = table_name
        except Exception as e:
            print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(args_array['document_type'], file_name, class_id, table_name, time.strftime("%c")))
            logger.info('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(args_array['document_type'], file_name, class_id, table_name, time.strftime("%c")))
            traceback.print_exc()

    # If the command args specify to insert each record into the database
    elif "database" in args_array["command_args"] and args_array['database_insert_mode'] == "each":

        # Reset the start time
        start_time = time.time()
        print('- Starting to write {0} to database. Start Time: {1}'.format(file_name, time.strftime("%c")))

        # Store the table name for stdout and strip it off the record
        args_array['table_name'] = processed_data_array['table_name']
        del processed_data_array['table_name']
        # Build the insert query and pass it to the database loader
        args_array['database_connection'].load(SQLProcessor.build_sql_insert_query(processed_data_array, args_array), args_array)

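# A minimal sketch of the record shape store_classification_data() expects.
# Only 'table_name' and 'extraction_type' are read by the code above; the
# remaining column names and values here are hypothetical illustrations, not
# the parser's actual schema.
#
# example_record = {
#     "table_name": "uspto.USCLASSIFICATION",   # assumed table name, for illustration only
#     "extraction_type": "usclass",             # must match a key in args_array['csv_file_array']
#     "Class": "704",                           # hypothetical column
#     "SubClass": "9",                          # hypothetical column
# }
# store_classification_data([example_record], args_array, class_id="704/9")
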
def validate_existing_database_structure(args_array):

    # Only run when a verification of the existing parsed database was requested
    if "verify" in args_array['command_args']:
        # Connect to database
        database_connection = SQLProcessor.SQLProcess(database_args)
        database_connection.connect()
        # Check if the PARSER_VERIFICATION table exists and create it if not
        database_connection.checkParserVerificationTable(args_array)
        # Close the database connection
        database_connection.close()

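# A minimal sketch of how this check might be driven from argument parsing;
# the 'command_args' value shown is only an assumed example of the structure
# the function inspects (a collection of command flags), not a value taken
# from this file.
#
# args_array['command_args'] = ["verify", "database"]
# validate_existing_database_structure(args_array)   # creates PARSER_VERIFICATION if missing
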
def store_application_data(processed_data_array, args_array):

    # Extract critical variables from args_array
    uspto_xml_format = args_array['uspto_xml_format']
    database_connection = args_array['database_connection']
    file_name = args_array['file_name']

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Set process start time
    start_time = time.time()

    # If the arguments specify csv storage, or csv is needed for bulk database insertion
    if "csv" in args_array["command_args"] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):

        # Each processed group maps onto a writer in args_array['csv_file_array'].
        # All groups share the same write logic, so they are handled in one loop.
        csv_groups = [
            ("processed_application", "application"),
            ("processed_agent", "agent"),
            ("processed_assignee", "assignee"),
            ("processed_inventor", "inventor"),
            ("processed_usclass", "usclass"),
            ("processed_intclass", "intclass"),
            ("processed_cpcclass", "cpcclass"),
            ("processed_foreignpriority", "foreignpriority")
        ]

        # Process all the collected application data for one patent record into .csv files
        # using the already opened csv.DictWriter objects stored in args_array.
        # The table name must be appended to the dictionary for later processing.
        for group_key, csv_key in csv_groups:
            if group_key in processed_data_array and len(processed_data_array[group_key]):
                for data_item in processed_data_array[group_key]:
                    # Print start message to stdout and log for the main application record
                    if group_key == "processed_application" and args_array['stdout_level'] == 1:
                        print('- Starting to write {0} to .csv file {1} for document: {2}. Start Time: {3}'.format(args_array['document_type'], file_name, data_item['ApplicationID'], time.strftime("%c")))
                    # Move the table name to a temp variable and remove it from the record
                    table_name = data_item['table_name']
                    del data_item['table_name']
                    # The try/except avoids failing the whole file when HTML-entity
                    # characters or another error occur in a single record
                    try:
                        # Write the dictionary of document data to the .csv file
                        args_array['csv_file_array'][csv_key]['csv_writer'].writerow(data_item)
                        # Attach the table name to the csv file record
                        args_array['csv_file_array'][csv_key]['table_name'] = table_name
                    except Exception as e:
                        print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(args_array['document_type'], file_name, data_item['ApplicationID'], table_name, time.strftime("%c")))
                        logger.info('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(args_array['document_type'], file_name, data_item['ApplicationID'], table_name, time.strftime("%c")))
                        traceback.print_exc()

    # If the command args specify to insert each record into the database
    elif "database" in args_array["command_args"] and args_array['database_insert_mode'] == "each":

        # Print start message to stdout
        print('- Starting to write {0} to database. Start Time: {1}'.format(file_name, time.strftime("%c")))
        # Reset the start time
        start_time = time.time()

        # Strip the processed_application item off the array and process it first
        processed_application = processed_data_array['processed_application']
        del processed_data_array['processed_application']
        for item in processed_application:
            # Store the table name and document id for stdout
            args_array['table_name'] = item['table_name']
            args_array['document_id'] = item['ApplicationID']
            # Build the insert query and pass it to the database loader
            database_connection.load(SQLProcessor.build_sql_insert_query(item, args_array), args_array)

        # Loop through the rest of processed_data_array, build sql queries and execute them
        for key, value in list(processed_data_array.items()):
            for item in value:
                args_array['table_name'] = item['table_name']
                args_array['document_id'] = item['ApplicationID']
                database_connection.load(SQLProcessor.build_sql_insert_query(item, args_array), args_array)

def store_grant_data(processed_data_array, args_array):

    # Extract critical variables from args_array
    uspto_xml_format = args_array['uspto_xml_format']
    database_connection = args_array['database_connection']
    file_name = args_array['file_name']

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Set process start time
    start_time = time.time()

    # If the arguments specify csv storage, or csv is needed for bulk database insertion
    if "csv" in args_array["command_args"] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):

        # Each processed group maps onto a writer in args_array['csv_file_array'].
        # All groups share the same write logic, so they are handled in one loop.
        csv_groups = [
            ("processed_grant", "grant"),
            ("processed_applicant", "applicant"),
            ("processed_examiner", "examiner"),
            ("processed_agent", "agent"),
            ("processed_assignee", "assignee"),
            ("processed_inventor", "inventor"),
            ("processed_gracit", "gracit"),
            ("processed_nonpatcit", "nonpatcit"),
            ("processed_forpatcit", "forpatcit"),
            ("processed_usclass", "usclass"),
            ("processed_intclass", "intclass"),
            ("processed_cpcclass", "cpcclass")
        ]

        # Process all the collected grant data for one patent record into .csv files
        # using the already opened csv.DictWriter objects stored in args_array.
        # The table name must be appended to the dictionary for later processing.
        for group_key, csv_key in csv_groups:
            if group_key in processed_data_array and len(processed_data_array[group_key]):
                for data_item in processed_data_array[group_key]:
                    # Print start message to stdout and log for the main grant record
                    if group_key == "processed_grant":
                        print('- Starting to write {0} to .csv file {1} for document: {2}. Start Time: {3}'.format(args_array['document_type'], file_name, data_item['GrantID'], time.strftime("%c")))
                    # Move the table name to a temp variable and remove it from the record
                    table_name = data_item['table_name']
                    del data_item['table_name']
                    # The try/except avoids failing the whole file when HTML-entity
                    # characters or another error occur in a single record
                    try:
                        # Write the dictionary of document data to the .csv file
                        args_array['csv_file_array'][csv_key]['csv_writer'].writerow(data_item)
                        # Attach the table name to the csv file record
                        args_array['csv_file_array'][csv_key]['table_name'] = table_name
                    except Exception as e:
                        print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(args_array['document_type'], file_name, data_item['GrantID'], table_name, time.strftime("%c")))
                        logger.info('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(args_array['document_type'], file_name, data_item['GrantID'], table_name, time.strftime("%c")))
                        traceback.print_exc()

    # If the command args specify to insert each record into the database
    elif "database" in args_array["command_args"] and args_array['database_insert_mode'] == "each":

        # Print start message to stdout
        print('- Starting to write {0} to database. Start Time: {1}'.format(file_name, time.strftime("%c")))
        # Reset the start time
        start_time = time.time()

        # Strip the processed_grant item off the array and process it first
        processed_grant = processed_data_array['processed_grant']
        del processed_data_array['processed_grant']
        for item in processed_grant:
            # Store the table name and document id for stdout
            args_array['table_name'] = item['table_name']
            args_array['document_id'] = item['GrantID']
            # Build the insert query and pass it to the database loader
            database_connection.load(SQLProcessor.build_sql_insert_query(item, args_array), args_array, logger)

        # Loop through the rest of processed_data_array, build sql queries and execute them
        for key, value in processed_data_array.items():
            for item in value:
                # Store the table name and document id for stdout
                args_array['table_name'] = item['table_name']
                args_array['document_id'] = item['GrantID']
                # Build the insert query and pass it to the database loader
                database_connection.load(SQLProcessor.build_sql_insert_query(item, args_array), args_array, logger)

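# A minimal sketch of the grant payload shape that store_grant_data() consumes.
# The group keys and the 'GrantID'/'table_name' fields come from the code above;
# the table names and remaining column names are hypothetical placeholders, not
# the parser's actual schema.
#
# processed_data_array = {
#     "processed_grant": [
#         {"table_name": "uspto.GRANT", "GrantID": "US09999999", "Title": "..."},      # hypothetical columns
#     ],
#     "processed_inventor": [
#         {"table_name": "uspto.INVENTOR_G", "GrantID": "US09999999", "FirstName": "..."},
#     ],
# }
# store_grant_data(processed_data_array, args_array)
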
def main_process(link_queue, args_array, spooling_value):

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Check the spooling value in args_array and set a wait time
    args_array['spooling_value'] = spooling_value
    if args_array['spooling_value'] > 4:
        print('[Sleeping thread for initial spooling thread number ' + str(spooling_value) + '...]')
        logger.info('Sleeping thread for initial spooling thread number ' + str(spooling_value) + '...')
        time.sleep((args_array['spooling_value']) * args_array['thread_spool_delay'])
        print('[Thread number ' + str(spooling_value) + ' is waking from sleep...]')
        logger.info('Thread number ' + str(spooling_value) + ' is waking from sleep...')
        args_array['spooling_value'] = 0

    # Print message to stdout
    print('Process {0} is starting to work! Start Time: {1}'.format(os.getpid(), time.strftime("%c")))
    # Set process start time
    process_start_time = time.time()

    # Create the database connection here so that each process uses its own connection,
    # hopefully to increase the bandwidth to the database.
    if "database" in args_array["command_args"]:
        # Create a database connection for each process
        database_connection = SQLProcessor.SQLProcess(database_args)
        database_connection.connect()
        args_array['database_connection'] = database_connection

    # Go through each link in the queue passed in
    while not link_queue.empty():

        # Get the next item in the queue
        item = link_queue.get()
        # Separate the link item into link, file type, and document type, and append to args_array
        args_array['url_link'] = item[0]
        args_array['uspto_xml_format'] = item[1]
        args_array['document_type'] = item[3]
        # file_name is used to keep track of the .zip base filename
        args_array['file_name'] = os.path.basename(args_array['url_link']).replace(".zip", "")

        # Set process time
        start_time = time.time()

        # Start the main processing of each link in the queue
        print("Processing .zip file: " + args_array['url_link'] + " Started at: " + time.strftime("%c"))

        # If using item-by-item database insertion, check whether args_array['file_name']
        # has previously been partially processed. If it has, remove all records from the
        # previous partial processing; if it has not, it is inserted into STARTED_FILES.
        if "database" in args_array['command_args'] and args_array['database_insert_mode'] != "bulk":
            database_connection.remove_previous_file_records(args_array['document_type'], args_array['file_name'])

        # Call the function to collect patent data from each link
        # and store it to the specified place (csv and/or database)
        try:
            USPTOProcessLinks.process_link_file(args_array)
            # Print and log notification that one .zip package is finished
            print('[Finished processing one .zip package! Time consumed: {0} Time Finished: {1}]'.format(time.time() - start_time, time.strftime("%c")))
            logger.info('[Finished processing one .zip package! Time consumed: {0} Time Finished: {1}]'.format(time.time() - start_time, time.strftime("%c")))
        except Exception as e:
            # Print and log general failure message
            print("Processing a file failed... " + args_array['file_name'] + " from link " + args_array['url_link'] + " at: " + time.strftime("%c"))
            logger.error("Processing a file failed... " + args_array['file_name'] + " from link " + args_array['url_link'])
            # Print traceback
            traceback.print_exc()
            # Log exception information
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())

    # TODO: check for unprocessed files (will have to add a "processing" flag) and add a check before
    # starting processing to avoid collisions of link piles. Make the link_pile loop into a function and
    # then call it again. OR make the link pile a super global, and somehow be able to check against
    # other processes and rebalance and pop off from link piles.

    # Print message that the process is finished
    print('[Process {0} is finished. Time consumed: {1} Time Finished: {2}]'.format(os.getpid(), time.time() - process_start_time, time.strftime("%c")))

def verification_process(link_queue, args_array, database_args, spooling_value):

    # Set process start time
    process_start_time = time.time()
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Check the spooling value in args_array and set a wait time
    args_array['spooling_value'] = spooling_value
    if args_array['spooling_value'] > 4:
        print('[Sleeping thread number ' + str(spooling_value) + ' to wait for download...]')
        logger.info('Sleeping thread for initial spooling thread number ' + str(spooling_value) + '...')
        time.sleep((args_array['spooling_value']) * args_array['thread_spool_delay'])
        print('[Thread number ' + str(spooling_value) + ' is waking from sleep...]')
        logger.info('Thread number ' + str(spooling_value) + ' is waking from sleep...')
        args_array['spooling_value'] = 0

    print('Process {0} is starting to work! Start Time: {1}'.format(os.getpid(), time.strftime("%c")))

    # Create a database connection for each process
    database_connection = SQLProcessor.SQLProcess(database_args)
    database_connection.connect()
    args_array['database_connection'] = database_connection

    # Go through each link in link_queue
    while not link_queue.empty():

        # Set process time
        start_time = time.time()
        # Get the next item in the queue
        item = link_queue.get()
        # Separate the link item into (1) link url, (2) file format type, and (3) document type,
        # and append them to args_array to be passed with the item through the parsing route
        args_array['url_link'] = item[0]
        args_array['uspto_xml_format'] = item[1]
        args_array['document_type'] = item[-1]
        # file_name is used to keep track of the downloaded file's base filename (no file extension)
        args_array['file_name'] = os.path.basename(args_array['url_link']).replace(".zip", "").replace(".csv", "").replace(".txt", "")

        print("-- Verifying " + args_array['uspto_xml_format'] + " file: " + args_array['file_name'] + " Started at: " + time.strftime("%c"))

        # Call the function to verify data for each link
        # and store the expected values in the PARSER_VERIFICATION table
        try:
            USPTOVerifyLinks.verify_link_file(args_array)
            # Print and log notification that one .zip package is finished
            print('[Finished processing one .zip package! Time consumed: {0} Time Finished: {1}]'.format(time.time() - start_time, time.strftime("%c")))
            logger.info('[Finished processing one .zip package! Time consumed: {0} Time Finished: {1}]'.format(time.time() - start_time, time.strftime("%c")))
        except Exception as e:
            # Print and log general failure message
            print("Processing a file failed... " + args_array['file_name'] + " from link " + args_array['url_link'] + " at: " + time.strftime("%c"))
            logger.error("Processing a file failed... " + args_array['file_name'] + " from link " + args_array['url_link'])
            traceback.print_exc()
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Exception: " + str(exc_type) + " in Filename: " + str(fname) + " on Line: " + str(exc_tb.tb_lineno) + " Traceback: " + traceback.format_exc())

    # At this point all links have been processed.
    #
    # TODO: check log files again for unprocessed files (will have to add an additional
    # "processing" flag to links in the log file) to avoid collisions of link piles.
    # Make the link_pile loop into a function and then call it again. OR make the link
    # pile a super global, and somehow be able to check against other processes and
    # rebalance and pop off from link piles.

    # Print message that the process is finished
    print('[Process {0} is finished. Time consumed: {1} Time Finished: {2}]'.format(os.getpid(), time.time() - process_start_time, time.strftime("%c")))

    all_files_processed = "Error"

# Else if the read list of unprocessed links is not empty
elif len(all_links_array["grants"]) != 0 or len(all_links_array["applications"]) != 0 \
        or len(all_links_array["PAIR"]) != 0 or len(all_links_array["classifications"]) != 0 \
        or len(all_links_array["legal"]) != 0:

    # If the command args specify to patch missing data
    # from the USPTO bulk dataset
    if "patch" in args_array['command_args']:
        # Import the USPTOBigQuery module
        import USPTOBigQuery
        # Create a database connection
        database_connection = SQLProcessor.SQLProcess(database_args)
        #database_connection.connect()
        #args_array['database_connection'] = database_connection
        # Instantiate the class object
        bq = USPTOBigQuery.PatentBigQuery(args_array)
        # Collect the CPC classifications for all 2005 patents
        json_obj = bq.get_2005_grant_classifications(args_array)
        # Insert the CPC classes into the main database
        insert_2005_grant_classifications(args_array, json_obj)
        # Exit with success status, since no other process
        # is run when patching missing data
        exit(0)

    # Set the string for the output depending on whether verifying or parsing bulk data
    if "verify" in args_array['command_args']:
        action = "verified"
    else:
        action = "collected"

def create_list_queue(args):

    # Include logger
    logger = AlexaLogger.logging.getLogger("Alexa_Database_Construction")

    print("-- Creating a link queue for all Alexa sites in list...")
    logger.info("-- Creating a link queue for all Alexa sites in list...")

    # Create a database connection
    db_conn = SQLProcessor.SQLProcess(database_args, args)
    db_conn.connect()

    # Get the highest position already in the database and any missing positions
    next_pos, missing = db_conn.get_next_position(args)

    # Initialize a queue of queues
    qq = []
    # Initialize a queue to fill with items
    list_queue = Queue()
    # Set the max size of the list queue
    if args['list_limit'] is None:
        max_count = args['max_queue_count']
    else:
        max_count = args['list_limit']

    # Open the csv file and read it into a list.
    # The first column is the rank, the last is the url.
    with open(args['alexa_list'], "r") as infile:
        alexa_list = infile.readlines()
    print("Starting Alexa List size: " + str(len(alexa_list)))

    # Append all unprocessed sites to the queue of queues
    size = 0
    count = 1
    for site in alexa_list:
        # Split the rank and domain
        arr = site.split(",")
        # Only add the item if not already processed
        if int(arr[0]) in missing or int(arr[0]) >= next_pos:
            print("[adding " + arr[-1].strip() + " to the list...]")
            list_queue.put({"pos": arr[0].strip(), "domain": arr[-1].strip()})
            size += 1
            if count == max_count:
                # Put the full list queue onto the queue of queues
                qq.append(list_queue)
                time.sleep(0.2)
                # Reinitialize the queue
                list_queue = Queue()
                # Reset the counter
                count = 0
            # Increment the position
            count += 1
        # Print message for skipped items
        else:
            print("[skipping " + arr[-1].strip() + " from the list...]")

    # Append the last, partially-filled queue
    qq.append(list_queue)

    # Return the queue of queues
    print("-- Finished adding sites to link queue for all Alexa sites in list...")
    logger.info("-- Finished adding sites to link queue for all Alexa sites in list...")
    time.sleep(2)
    print("-- Queue Size: " + str(size))
    return qq

def parse_items_thread(database_args, args, qq):

    # Include logger
    logger = AlexaLogger.logging.getLogger("Alexa_Database_Construction")

    # Create a database connection
    db_conn = SQLProcessor.SQLProcess(database_args, args)
    db_conn.connect()
    args['db_conn'] = db_conn

    # Create a PyCurl object
    curl = pycurl.Curl()

    # Keep track of the item number being processed
    item_num = args['max_queue_count']

    # Loop through each queue in the queue of queues
    for list_queue in qq:

        # Go through each link in the list_queue
        while not list_queue.empty():

            # Get the next item from the queue
            print("[ Process " + str(os.getpid()) + " is picking next item from queue...]")
            item = list_queue.get()
            domain = item["domain"]
            position = item["pos"]

            # Only process if the domain is not already in the database
            if args['db_conn'].all_already_scraped(len(args['schemes_and_subdomains']), domain, position) == False:

                for ext in args['schemes_and_subdomains']:

                    # Only process if this scheme/subdomain is not already in the database
                    if args['db_conn'].is_already_scraped(ext, domain, position) == False:

                        # Instantiate the data object
                        data_obj = Headers()

                        # Get headers using pycurl
                        try:
                            # Set some other information in the data object
                            data_obj.tld = domain
                            data_obj.ext = ext.replace("https://", "").replace("http://", "")
                            data_obj.url = ext + domain
                            data_obj.position = int(position)
                            print("-- Checking " + ext + domain + " for HTTP headers...")
                            # Set the URL and transfer options
                            curl.setopt(curl.URL, ext + domain)
                            b_obj = BytesIO()
                            curl.setopt(pycurl.FOLLOWLOCATION, args['curl_follow_redirect'])
                            curl.setopt(pycurl.MAXREDIRS, args['curl_max_redirect'])
                            curl.setopt(pycurl.CONNECTTIMEOUT, args['curl_conn_timeout'])
                            curl.setopt(pycurl.TIMEOUT, args['curl_timeout'])
                            curl.setopt(curl.HEADERFUNCTION, data_obj.display_header)
                            curl.setopt(curl.WRITEDATA, b_obj)
                            curl.perform()
                            data_obj.get_http_return_code()
                            data_obj.get_ip()
                            # Only look up MX records once, since they apply to both domain and subdomain
                            if "https://" in data_obj.url:
                                data_obj.get_mx_records()
                        except Exception as e:
                            data_obj.http_code = 0
                            print("[ ** HTTP header request failed to respond " + ext + domain + "...]")
                            traceback.print_exc()
                            logger.error("[ ** HTTP header request failed to respond " + ext + domain + "...]")
                            logger.error(traceback.format_exc())

                        # Store the results to the database
                        args['db_conn'].store_headers_to_database(args, data_obj)
                        if len(data_obj.mx):
                            args['db_conn'].store_mx_to_database(args, data_obj)
                        # Delete the object
                        del data_obj

    # End the curl session
    curl.close()
    return

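# A minimal sketch, assuming the same multiprocessing pattern as the USPTO
# workers above, of how create_list_queue() and parse_items_thread() might be
# wired together; the helper name and default worker count are illustrative
# assumptions, not code from this module.
def _example_spawn_parse_workers(database_args, args, worker_count=4):
    from multiprocessing import Process
    # Build the queue-of-queues once, then let each worker drain it
    qq = create_list_queue(args)
    workers = [Process(target=parse_items_thread, args=(database_args, args, qq)) for _ in range(worker_count)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
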