Example #1
def store_classification_data(processed_data_array, args_array, class_id):

    # Set process start time
    start_time = time.time()
    # Extract some variables from args_array
    file_name = args_array['file_name']

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # If the command args specify storing data into a csv file, or a csv is needed for bulk database insertion
    if "csv" in args_array["command_args"] or ("database" in args_array['command_args'] and args_array['database_insert_mode'] == "bulk"):

        # Process a single classification csv record into a new formatted csv file
        # Using the already opened csv.DictWriter object stored in args array.
        # Table name must be appended to the dictionary for later processing
        if args_array['stdout_level'] == 1:
            # Print start message to stdout and log
            print('- Starting to write {0} to .csv file {1} for document: {2}. Start Time: {3}'.format(args_array['document_type'], file_name, class_id, time.strftime("%c")))
        # The try/except avoids failing the whole file when
        # HTML entity characters are found or another error occurs
        try:
            if type(processed_data_array) == list:
                for item in processed_data_array:
                    # Move the table name to temp variable and remove from table
                    table_name = item['table_name']
                    extraction_type = item['extraction_type']
                    del item['table_name']
                    del item['extraction_type']
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array'][extraction_type]['csv_writer'].writerow(item)
            else:
                # Move the table name to temp variable and remove from table
                table_name = processed_data_array['table_name']
                extraction_type = processed_data_array['extraction_type']
                del processed_data_array['table_name']
                del processed_data_array['extraction_type']
                # Write the dictionary of document data to .csv file
                args_array['csv_file_array'][extraction_type]['csv_writer'].writerow(processed_data_array)
            # Append the table onto the array
            args_array['csv_file_array'][extraction_type]['table_name'] = table_name
        except Exception as e:
            print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(args_array['document_type'], file_name, class_id, table_name, time.strftime("%c")))
            logger.info('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(args_array['document_type'], file_name, class_id, table_name, time.strftime("%c")))
            traceback.print_exc()

    # If command arg is set to put data into database
    elif "database" in args_array["command_args"] and args_array['database_insert_mode'] == "each":

        # Reset the start time
        start_time = time.time()

        print('- Starting to write {0} to database. Start Time: {1}'.format(file_name, time.strftime("%c")))

        # Strip the metadata item off the array and process it first
        # Store table name for stdout
        args_array['table_name'] = processed_data_array['table_name']
        del processed_data_array['table_name']
        # Build query and pass to database loader
        args_array['database_connection'].load(SQLProcessor.build_sql_insert_query(processed_data_array, args_array), args_array)
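
store_classification_data assumes that args_array['csv_file_array'] already holds one open csv.DictWriter per extraction type. A minimal sketch of what that setup might look like; the file paths and field names here are assumptions, not taken from the project:

import csv

def open_csv_writers(args_array, extraction_types, field_names):
    # Hypothetical setup: one .csv file and DictWriter per extraction type,
    # stored under args_array['csv_file_array'] the way the functions above expect.
    args_array['csv_file_array'] = {}
    for extraction_type in extraction_types:
        file_handle = open(extraction_type + ".csv", "w", newline="")
        csv_writer = csv.DictWriter(file_handle, fieldnames=field_names[extraction_type])
        csv_writer.writeheader()
        args_array['csv_file_array'][extraction_type] = {
            "file_handle": file_handle,
            "csv_writer": csv_writer
        }
    return args_array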
Example #2
def validate_existing_database_structure(args_array):

    # If doing a verification of existing parsed database
    if "verify" in args_array['command_args']:
        # Connect to database
        database_connection = SQLProcessor.SQLProcess(database_args)
        database_connection.connect()
        # Check if PARSER_VERIFICATION table exists and if not create it
        database_connection.checkParserVerificationTable(args_array)
        # Close the database connection
        database_connection.close()
    else:
        pass
Example #3
def store_application_data(processed_data_array, args_array):

    # Extract critical variables from args_array
    uspto_xml_format = args_array['uspto_xml_format']
    database_connection = args_array['database_connection']
    file_name = args_array['file_name']

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Set process start time
    start_time = time.time()

    # If the command args specify storing data into a csv file, or a csv is needed for bulk database insertion
    if "csv" in args_array["command_args"] or (
            "database" in args_array['command_args']
            and args_array['database_insert_mode'] == "bulk"):

        # Process all the collected application data for one patent record into .csv file
        # Using the already opened csv.DictWriter object stored in args array.
        if "processed_application" in processed_data_array and len(
                processed_data_array['processed_application']):
            for data_item in processed_data_array["processed_application"]:
                # Print start message to stdout and log
                if args_array['stdout_level'] == 1:
                    print(
                        '- Starting to write {0} to .csv file {1} for document: {2}. Start Time: {3}'
                        .format(args_array['document_type'],
                                file_name, data_item['ApplicationID'],
                                time.strftime("%c")))
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['application'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['application'][
                        'table_name'] = table_name
                except Exception as e:
                    print(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_agent" in processed_data_array and len(
                processed_data_array['processed_agent']):
            for data_item in processed_data_array["processed_agent"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['agent'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['agent'][
                        'table_name'] = table_name
                except Exception as e:
                    print(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_assignee" in processed_data_array and len(
                processed_data_array['processed_assignee']):
            for data_item in processed_data_array["processed_assignee"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['assignee'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['assignee'][
                        'table_name'] = table_name
                except Exception as e:
                    print(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_inventor" in processed_data_array and len(
                processed_data_array['processed_inventor']):
            for data_item in processed_data_array["processed_inventor"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['inventor'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['inventor'][
                        'table_name'] = table_name
                except Exception as e:
                    print(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_usclass" in processed_data_array and len(
                processed_data_array['processed_usclass']):
            for data_item in processed_data_array["processed_usclass"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['usclass'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['usclass'][
                        'table_name'] = table_name
                except Exception as e:
                    print(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_intclass" in processed_data_array and len(
                processed_data_array['processed_intclass']):
            for data_item in processed_data_array["processed_intclass"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['intclass'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['intclass'][
                        'table_name'] = table_name
                except Exception as e:
                    print(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_cpcclass" in processed_data_array and len(
                processed_data_array['processed_cpcclass']):
            for data_item in processed_data_array["processed_cpcclass"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['cpcclass'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['cpcclass'][
                        'table_name'] = table_name
                except Exception as e:
                    print(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_foreignpriority" in processed_data_array and len(
                processed_data_array['processed_foreignpriority']):
            for data_item in processed_data_array["processed_foreignpriority"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['foreignpriority'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['foreignpriority'][
                        'table_name'] = table_name
                except Exception as e:
                    print(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['ApplicationID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()

    elif "database" in args_array["command_args"] and args_array[
            'database_insert_mode'] == "each":

        # Print start message to stdout
        print('- Starting to write {0} to database. Start Time: {1}'.format(
            file_name, time.strftime("%c")))

        # Reset the start time
        start_time = time.time()

        # Strip the processed_application item off the array and process it first
        processed_application = processed_data_array['processed_application']
        del processed_data_array['processed_application']
        for item in processed_application:
            args_array['table_name'] = item['table_name']
            args_array['document_id'] = item['ApplicationID']
            # Build query and pass to database loader
            database_connection.load(
                SQLProcessor.build_sql_insert_query(item, args_array),
                args_array)

        # Loop through the processed_data_array, create sql queries and execute them
        for key, value in list(processed_data_array.items()):
            for item in value:
                args_array['table_name'] = item['table_name']
                args_array['document_id'] = item['ApplicationID']
                database_connection.load(
                    SQLProcessor.build_sql_insert_query(item, args_array),
                    args_array)
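
The per-type blocks above (application, agent, assignee, inventor, usclass, intclass, cpcclass, foreignpriority) all repeat the same write-and-log pattern. A condensed sketch of that pattern as a generic helper; the name and parameters are illustrative, not part of the project:

import time
import traceback

def write_csv_records(args_array, processed_data_array, key, csv_key, id_field, logger):
    # Write every record of one extraction type to its already-open csv.DictWriter,
    # mirroring the repeated blocks in store_application_data above.
    if key in processed_data_array and len(processed_data_array[key]):
        for data_item in processed_data_array[key]:
            table_name = data_item.pop('table_name')
            try:
                args_array['csv_file_array'][csv_key]['csv_writer'].writerow(data_item)
                args_array['csv_file_array'][csv_key]['table_name'] = table_name
            except Exception:
                message = '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                    args_array['document_type'], args_array['file_name'],
                    data_item.get(id_field), table_name, time.strftime("%c"))
                print(message)
                logger.info(message)
                traceback.print_exc()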
Example #4
def store_grant_data(processed_data_array, args_array):

    # Extract critical variables from args_array
    uspto_xml_format = args_array['uspto_xml_format']
    database_connection = args_array['database_connection']
    file_name = args_array['file_name']

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Set process start time
    start_time = time.time()

    # If the command args specify storing data into a csv file, or a csv is needed for bulk database insertion
    if "csv" in args_array["command_args"] or (
            "database" in args_array['command_args']
            and args_array['database_insert_mode'] == "bulk"):

        # Process all the collected grant data for one patent record into csv file
        # Using the already opened csv.DictWriter object stored in args array.
        # Table name must be appended to the dictionary for later processing
        if "processed_grant" in processed_data_array and len(
                processed_data_array['processed_grant']):
            for data_item in processed_data_array['processed_grant']:
                # Print start message to stdout and log
                print('- Starting to write {0} to .csv file {1} for document: {2}. Start Time: {3}'.format(
                    args_array['document_type'], file_name,
                    data_item['GrantID'], time.strftime("%c")))
                #logger.info('- Starting to write {0} to .csv file {1} for document: {2}. Start Time: {3}'.format(args_array['document_type'], file_name, data_item['GrantID'], time.strftime("%c")))
                # Move the table name to temp variable and remove from table
                table_name = data_item['table_name']
                del data_item['table_name']
                # The try/except avoids failing the whole file when
                # HTML entity characters are found or another error occurs
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['grant'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['grant'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_applicant" in processed_data_array and len(
                processed_data_array['processed_applicant']):
            for data_item in processed_data_array['processed_applicant']:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['applicant'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['applicant'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_examiner" in processed_data_array and len(
                processed_data_array['processed_examiner']):
            for data_item in processed_data_array['processed_examiner']:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['examiner'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['examiner'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_agent" in processed_data_array and len(
                processed_data_array['processed_agent']):
            for data_item in processed_data_array["processed_agent"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['agent'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['agent'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_assignee" in processed_data_array and len(
                processed_data_array['processed_assignee']):
            for data_item in processed_data_array["processed_assignee"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['assignee'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['assignee'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_inventor" in processed_data_array and len(
                processed_data_array['processed_inventor']):
            for data_item in processed_data_array["processed_inventor"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['inventor'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['inventor'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_gracit" in processed_data_array and len(
                processed_data_array['processed_gracit']):
            for data_item in processed_data_array["processed_gracit"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['gracit'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['gracit'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_nonpatcit" in processed_data_array and len(
                processed_data_array['processed_nonpatcit']):
            for data_item in processed_data_array["processed_nonpatcit"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['nonpatcit'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['nonpatcit'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_forpatcit" in processed_data_array and len(
                processed_data_array['processed_forpatcit']):
            for data_item in processed_data_array["processed_forpatcit"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['forpatcit'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['forpatcit'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_usclass" in processed_data_array and len(
                processed_data_array['processed_usclass']):
            for data_item in processed_data_array["processed_usclass"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['usclass'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['usclass'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_intclass" in processed_data_array and len(
                processed_data_array['processed_intclass']):
            for data_item in processed_data_array["processed_intclass"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['intclass'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['intclass'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()
        if "processed_cpcclass" in processed_data_array and len(
                processed_data_array['processed_cpcclass']):
            for data_item in processed_data_array["processed_cpcclass"]:
                table_name = data_item['table_name']
                del data_item['table_name']
                try:
                    # Write the dictionary of document data to .csv file
                    args_array['csv_file_array']['cpcclass'][
                        'csv_writer'].writerow(data_item)
                    # Append the table onto the array
                    args_array['csv_file_array']['cpcclass'][
                        'table_name'] = table_name
                except Exception as e:
                    print('- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'.format(
                        args_array['document_type'], file_name,
                        data_item['GrantID'], table_name, time.strftime("%c")))
                    logger.info(
                        '- Error writing {0} to .csv file {1} for document: {2} into table {3}. Start Time: {4}'
                        .format(args_array['document_type'], file_name,
                                data_item['GrantID'], table_name,
                                time.strftime("%c")))
                    traceback.print_exc()

    # If command arg is set to put data into database
    elif "database" in args_array["command_args"] and args_array[
            'database_insert_mode'] == "each":

        # Print start message to stdout
        print('- Starting to write {0} to database. Start Time: {1}'.format(
            file_name, time.strftime("%c")))

        # Reset the start time
        start_time = time.time()

        # Strip the processed_grant item off the array and process it first
        processed_grant = processed_data_array['processed_grant']
        del processed_data_array['processed_grant']
        for item in processed_grant:
            # Store table name for stdout
            args_array['table_name'] = item['table_name']
            args_array['document_id'] = item['GrantID']
            # Build query and pass to database loader
            database_connection.load(
                SQLProcessor.build_sql_insert_query(item, args_array),
                args_array, logger)

        # Loop through the processed_data_array, create sql queries and execute them
        for key, value in processed_data_array.items():
            for item in value:
                # Store table name for stdout
                args_array['table_name'] = item['table_name']
                args_array['document_id'] = item['GrantID']
                # Build query and pass to database loader
                database_connection.load(
                    SQLProcessor.build_sql_insert_query(item, args_array),
                    args_array, logger)
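
In the "each" insert mode the records are handed to SQLProcessor.build_sql_insert_query and then to database_connection.load. The project's implementation is not shown here; as a rough illustration of the general idea only, a generic dict-to-INSERT builder might look like this (name, parameters, and return shape are assumptions):

def build_insert_query_sketch(item, table_name):
    # Build a parameterized INSERT statement from a flat record dictionary.
    # Illustrative only; not the project's SQLProcessor.build_sql_insert_query.
    columns = list(item.keys())
    placeholders = ", ".join(["%s"] * len(columns))
    sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(
        table_name, ", ".join(columns), placeholders)
    values = [item[column] for column in columns]
    return sql, values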
Example #5
def main_process(link_queue, args_array, spooling_value):

    # Import logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Check the spooling value in args_array and set a wait time
    args_array['spooling_value'] = spooling_value
    if args_array['spooling_value'] > 4:
        print('[Sleeping thread for initial spooling thread number ' +
              str(spooling_value) + '...]')
        logger.info('Sleeping thread for initial spooling thread number ' +
                    str(spooling_value) + '...')
        time.sleep(
            (args_array['spooling_value']) * args_array['thread_spool_delay'])
        print('[Thread number ' + str(spooling_value) +
              ' is waking from sleep...]')
        logger.info('Thread number ' + str(spooling_value) +
                    ' is waking from sleep...')

        args_array['spooling_value'] = 0

    # Print message to stdout
    print('Process {0} is starting to work! Start Time: {1}'.format(
        os.getpid(), time.strftime("%c")))
    # Set process start time
    process_start_time = time.time()

    # Create the database connection here so that each process uses its own connection,
    # hopefully to increase the bandwidth to the database.
    if "database" in args_array["command_args"]:
        # Create a database connection for each worker process
        database_connection = SQLProcessor.SQLProcess(database_args)
        database_connection.connect()
        args_array['database_connection'] = database_connection

    # Go through each link in the array passed in.
    while not link_queue.empty():
        #for item in link_pile:

        # Get the next item in the queue
        item = link_queue.get()
        # Separate link item into link and file_type and append to args_array for item
        args_array['url_link'] = item[0]
        args_array['uspto_xml_format'] = item[1]
        args_array['document_type'] = item[3]
        # file_name is used to keep track of the .zip base filename
        args_array['file_name'] = os.path.basename(
            args_array['url_link']).replace(".zip", "")

        # Set process time
        start_time = time.time()

        # Start the main processing of each link in link_pile array
        print("Processing .zip file: " + args_array['url_link'] +
              " Started at: " + time.strftime("%c"))

        # If using item by item database insertion check if the args_array['file_name']
        # has previously been partially processed.
        # If it has, then remove all records from the previous partial processing.
        # If it has not, then insert into STARTED_FILES as having been started.
        if "database" in args_array['command_args'] and args_array[
                'database_insert_mode'] != "bulk":
            database_connection.remove_previous_file_records(
                args_array['document_type'], args_array['file_name'])

        # Call the function to collect patent data from each link
        # and store it to specified place (csv and/or database)
        try:
            USPTOProcessLinks.process_link_file(args_array)
            # Print and log notification that one .zip package is finished
            print(
                '[Finished processing one .zip package! Time consuming:{0} Time Finished: {1}]'
                .format(time.time() - start_time, time.strftime("%c")))
            logger.info(
                '[Finished processing one .zip package! Time consuming:{0} Time Finished: {1}]'
                .format(time.time() - start_time, time.strftime("%c")))

        except Exception as e:
            # Print and log general fail comment
            print("Processing a file failed... " + args_array['file_name'] +
                  " from link " + args_array['url_link'] + " at: " +
                  time.strftime("%c"))
            logger.error("Processing a file failed... " +
                         args_array['file_name'] + " from link " +
                         args_array['url_link'])
            # Print traceback
            traceback.print_exc()
            # Print exception information to file
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Exception: " + str(exc_type) + " in Filename: " +
                         str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                         " Traceback: " + traceback.format_exc())

    # TODO: check for unprocessed (will have to add "processing" flag.) and add a check before starting
    # processing to avoid collisions of link piles.  Make link_pile loop into a function and
    # then call it again.  OR... make link pile a super global, and somehow be able to check against
    # other processes and rebalance and pop off from link piles.

    # Print message that process is finished
    print('[Process {0} is finished. Time consuming: {1} Time Finished: {2}]'.
          format(os.getpid(), time.time() - process_start_time, time.strftime("%c")))
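
main_process is written to be run by several workers that share a queue of links. One possible way to launch it, assuming the link items have already been collected; the queue type and process count here are assumptions, not taken from the project:

import multiprocessing

def start_processes(all_links, args_array, number_of_processes=4):
    # Put every link item on a shared queue and start one worker per spooling value.
    link_queue = multiprocessing.Manager().Queue()
    for link in all_links:
        link_queue.put(link)
    workers = []
    for spooling_value in range(number_of_processes):
        worker = multiprocessing.Process(
            target=main_process,
            args=(link_queue, args_array, spooling_value))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()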
Example #6
def verification_process(link_queue, args_array, database_args,
                         spooling_value):
    # Set process start time
    process_start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Check the delay value in args_array and set a wait time
    args_array['spooling_value'] = spooling_value
    if args_array['spooling_value'] > 4:
        print('[Sleeping thread number ' + str(spooling_value) +
              ' to wait for download...]')
        logger.info('Sleeping thread for initial spooling thread number ' +
                    str(spooling_value) + '...')
        time.sleep(
            (args_array['spooling_value']) * args_array['thread_spool_delay'])
        print('[Thread number ' + str(spooling_value) +
              ' is waking from sleep...]')
        logger.info('Thread number ' + str(spooling_value) +
                    ' is waking from sleep...')

        args_array['spooling_value'] = 0

    print('Process {0} is starting to work! Start Time: {1}'.format(
        os.getpid(), time.strftime("%c")))

    # Create a database connection for each worker process
    database_connection = SQLProcessor.SQLProcess(database_args)
    database_connection.connect()
    args_array['database_connection'] = database_connection

    # Go through each link in link_queue
    while not link_queue.empty():

        # Set process time
        start_time = time.time()

        # Get the next item in the queue
        item = link_queue.get()
        # Separate link item into (1) link url, (2) file format type,
        # and (3) the document type and append to args_array to be
        # passed with the item through parsing route
        args_array['url_link'] = item[0]
        args_array['uspto_xml_format'] = item[1]
        args_array['document_type'] = item[-1]

        # file_name is used to keep track of the downloaded file's
        # base filename (no file extension)
        args_array['file_name'] = os.path.basename(
            args_array['url_link']).replace(".zip", "").replace(".csv",
                                                                "").replace(
                                                                    ".txt", "")

        print("-- Verifying " + args_array['uspto_xml_format'] + " file: " +
              args_array['file_name'] + " Started at: " + time.strftime("%c"))

        # Call function to verify data for each link
        # and store the expected values in the PARSER_VERIFICATION table
        try:
            USPTOVerifyLinks.verify_link_file(args_array)
            # Print and log notification that one .zip package is finished
            print(
                '[Finished processing one .zip package! Time consuming:{0} Time Finished: {1}]'
                .format(time.time() - start_time, time.strftime("%c")))
            logger.info(
                '[Finished processing one .zip package! Time consuming:{0} Time Finished: {1}]'
                .format(time.time() - start_time, time.strftime("%c")))

        except Exception as e:
            # Print and log general fail comment
            print("Processing a file failed... " + args_array['file_name'] +
                  " from link " + args_array['url_link'] + " at: " +
                  time.strftime("%c"))
            logger.error("Processing a file failed... " +
                         args_array['file_name'] + " from link " +
                         args_array['url_link'])
            traceback.print_exc()
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Exception: " + str(exc_type) + " in Filename: " +
                         str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                         " Traceback: " + traceback.format_exc())

    # At this point all links have been processed
    #
    # TODO: check logs files again for unprocessed files
    # (will have to add an additional "processing" flag to links in log file.)
    # processing to avoid collisions of link piles.  Make link_pile loop into a function and
    # then call it again.  OR... make link pile a super global, and somehow be able to check against
    # other processes and rebalance and pop off from link piles.

    # Print message that process is finished
    print('[Process {0} is finished. Time consuming: {1} Time Finished: {2}]'.
          format(os.getpid(), time.time() - process_start_time, time.strftime("%c")))
Example #7
                all_files_processed = "Error"

            # Else if the read list of unprocessed links is not empty
            elif len(all_links_array["grants"]) != 0 or len(
                    all_links_array["applications"]) != 0 or len(
                        all_links_array["PAIR"]) != 0 or len(
                            all_links_array["classifications"]) != 0 or len(
                                all_links_array["legal"]) != 0:

                # If the command args specify to patch missing data
                # from the USPTO bulk dataset
                if "patch" in args_array['command_args']:
                    # Import the USPTOBigQuery module
                    import USPTOBigQuery
                    # Create a database connection
                    database_connection = SQLProcessor.SQLProcess(
                        database_args)
                    #database_connection.connect()
                    #args_array['database_connection'] = database_connection
                    # Instantiate class object
                    bq = USPTOBigQuery.PatentBigQuery(args_array)
                    # Collect the CPC classification for all 2005 patents
                    json_obj = bq.get_2005_grant_classifications(args_array)
                    # Insert the CPC class into the main database
                    insert_2005_grant_classifications(args_array, json_obj)
                    # Exit with success status
                    # Since if patch-missing no other process will be done
                    exit(0)

                # Set the string for the output depending on whether verifying or parsing bulk data
                if "verify" in args_array['command_args']: action = "verified"
                else: action = "collected"
Example #8
def create_list_queue(args):

    # Include logger
    logger = AlexaLogger.logging.getLogger("Alexa_Database_Construction")

    print("-- Creating a link queue for all Alexa sites in list...")
    logger.info("-- Creating a link queue for all Alexa sites in list...")

    # Create a database connection
    db_conn = SQLProcessor.SQLProcess(database_args, args)
    db_conn.connect()
    # Get the highest number in the database
    next_pos, missing = db_conn.get_next_position(args)

    # Initialize a list to hold the queues
    qq = []
    # Initialize a queue to fill with items
    list_queue = Queue()

    # Set the max size of the list queue
    if args['list_limit'] is None: max_count = args['max_queue_count']
    else: max_count = args['list_limit']

    # Open the csv file and make into list
    # First column is only rank, second is url
    with open(args['alexa_list'], "r") as infile:
        alexa_list = infile.readlines()
    print("Starting Alexa List size: " + str(len(alexa_list)))

    # Append all to the list
    size = 0
    count = 1
    for site in alexa_list:
        # Get the domain
        arr = site.split(",")
        # Only add the item if not processed
        if int(arr[0]) in missing or int(arr[0]) >= next_pos:
            print("[adding " + arr[-1].strip() + " to the list...]")
            list_queue.put({ "pos" : arr[0].strip(), "domain" : arr[-1].strip()})
            size += 1
            if count == max_count:
                # Put the list queue on to the qq
                qq.append(list_queue)
                time.sleep(0.2)
                # Reinitialize the queue
                list_queue = Queue()
                # Reset the counter
                count = 0
            # Increment the counter
            count += 1
        # Print message for skipping item
        else: print("[skipping " + arr[-1].strip() + " from the list...]")

    # Append the last partially-filled queue
    qq.append(list_queue)
    # Return the qq
    print("-- Finished adding sites to link queue for all Alexa sites in list...")
    logger.info("-- Finished adding sites to link queue for all Alexa sites in list...")
    time.sleep(2)
    print("-- Queue Size: " + str(size))

    return qq
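
create_list_queue expects each line of args['alexa_list'] to hold a rank and a domain separated by a comma. A hypothetical line parses into the queue item shape used above:

# Hypothetical line from the Alexa list file
site = "1,google.com\n"
arr = site.split(",")
queue_item = {"pos": arr[0].strip(), "domain": arr[-1].strip()}
# queue_item == {"pos": "1", "domain": "google.com"}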
Example #9
def parse_items_thread(database_args, args, qq):

    # Include logger
    logger = AlexaLogger.logging.getLogger("Alexa_Database_Construction")

    # Create a database connection
    db_conn = SQLProcessor.SQLProcess(database_args, args)
    db_conn.connect()
    args['db_conn'] = db_conn

    # Create a PyCurl object
    curl = pycurl.Curl()
    # Keep track of the number I'm on
    item_num = args['max_queue_count']

    # Loop through each queue item in qq
    for queue in qq:

        # Pull the queue item off
        list_queue = queue

        # Go through each link in link_queue
        while not list_queue.empty():

            # Get item from queue
            print("[ Process " + str(os.getpid()) + " is picking next item from queue...]")
            item = list_queue.get()
            domain = item["domain"]
            position = item["pos"]

            # Only process if not found in database already
            if args['db_conn'].all_already_scraped(len(args['schemes_and_subdomains']), domain, position) == False:

                for ext in args['schemes_and_subdomains']:

                    # Only process if not found in database already
                    if args['db_conn'].is_already_scraped(ext, domain, position) == False:

                        # Instantiate object
                        data_obj = Headers()

                        # Get headers using pycurl
                        try:

                            # Set some other information in the data object
                            data_obj.tld = domain
                            data_obj.ext = ext.replace("https://","").replace("http://", "")
                            data_obj.url = ext + domain
                            data_obj.position = int(position)

                            print("-- Checking " + ext + domain + " for HTTP headers...")
                            # Set URL value
                            curl.setopt(curl.URL, ext + domain)
                            b_obj = BytesIO()
                            #user_agent = '-H "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.2403.89 Safari/537.36"'
                            #command = "curl " + user_agent + " -I https://" + domain
                            #print(command)
                            #output = subprocess.check_output(command, shell=True)
                            curl.setopt(pycurl.FOLLOWLOCATION, args['curl_follow_redirect'])
                            curl.setopt(pycurl.MAXREDIRS, args['curl_max_redirect'])
                            curl.setopt(pycurl.CONNECTTIMEOUT, args['curl_conn_timeout'])
                            curl.setopt(pycurl.TIMEOUT, args['curl_timeout'])
                            curl.setopt(curl.HEADERFUNCTION, data_obj.display_header)
                            curl.setopt(curl.WRITEDATA, b_obj)
                            curl.perform()

                            data_obj.get_http_return_code()
                            data_obj.get_ip()
                            # Only want to do this once since it's for domain and subdomain
                            if "https://" in data_obj.url: data_obj.get_mx_records()
                            #print('Header values:-')
                            #print(data_obj.headers)

                        except Exception as e:
                            data_obj.http_code = 0
                            print("[ ** HTTP header request failed to respond " + ext + domain + "...]")
                            traceback.print_exc()
                            logger.error("[ ** HTTP header request failed to respond " + ext + domain + "...]")
                            logger.error(traceback.format_exc())

                        # Store the results to database
                        args['db_conn'].store_headers_to_database(args, data_obj)
                        if len(data_obj.mx): args['db_conn'].store_mx_to_database(args, data_obj)

                        # Delete the object
                        del data_obj

    # End curl session
    curl.close()
    return
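
parse_items_thread relies on a Headers object whose display_header method is registered as the pycurl HEADERFUNCTION callback, which receives each raw header line as bytes. A minimal sketch of such an object; everything beyond that callback contract is an assumption, and the get_http_return_code/get_ip/get_mx_records helpers used above are omitted:

class Headers(object):

    def __init__(self):
        # Fields the scraping loop above reads or assigns.
        self.headers = {}
        self.http_code = None
        self.mx = []
        self.tld = ""
        self.ext = ""
        self.url = ""
        self.position = 0

    def display_header(self, header_line):
        # pycurl passes each header line as bytes, e.g. b"Server: nginx\r\n"
        line = header_line.decode('iso-8859-1').strip()
        if ":" in line:
            name, value = line.split(":", 1)
            self.headers[name.strip().lower()] = value.strip()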