Example #1
def validate_existing_database_structure(args_array):

    # If doing a verification of existing parsed database
    if "verify" in args_array['command_args']:
        # Connect to database
        database_connection = SQLProcessor.SQLProcess(database_args)
        database_connection.connect()
        # Check if PARSER_VERIFICATION table exists and if not create it
        database_connection.checkParserVerificationTable(args_array)
        # Close the database connection
        database_connection.close()
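The function above only inspects args_array['command_args'] for the string "verify"; everything else (the SQLProcessor module and the database_args dictionary) is provided elsewhere in the project. A minimal sketch of how it might be invoked, assuming command_args is simply the list of command-line flags:

# Hypothetical driver (not part of the original project): build args_array
# from the command line and run the verification-table check.
import sys

if __name__ == "__main__":
    args_array = {
        # e.g. `python run.py verify` -> command_args == ["verify"]
        "command_args": sys.argv[1:],
    }
    validate_existing_database_structure(args_array)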
Example #2
def main_process(link_queue, args_array, spooling_value):

    # Get the logger
    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Check the spooling value in args_array and set a wait time
    args_array['spooling_value'] = spooling_value
    if args_array['spooling_value'] > 4:
        print('[Sleeping thread for initial spooling thread number ' +
              str(spooling_value) + '...]')
        logger.info('Sleeping thread for initial spooling thread number ' +
                    str(spooling_value) + '...')
        time.sleep(
            (args_array['spooling_value']) * args_array['thread_spool_delay'])
        print('[Thread number ' + str(spooling_value) +
              ' is waking from sleep...]')
        logger.info('Thread number ' + str(spooling_value) +
                    ' is waking from sleep...')

        args_array['spooling_value'] = 0

    # Print message to stdout
    print('Process {0} is starting to work! Start Time: {1}'.format(
        os.getpid(), time.strftime("%c")))
    # Set process start time
    process_start_time = time.time()

    # Create the database connection here so that each process uses its own connection,
    # hopefully to increase the bandwidth to the database.
    if "database" in args_array["command_args"]:
        # Create a database connection for each worker process
        database_connection = SQLProcessor.SQLProcess(database_args)
        database_connection.connect()
        args_array['database_connection'] = database_connection

    # Go through each link in the queue passed in
    while not link_queue.empty():

        # Get the next item in the queue
        item = link_queue.get()
        # Separate the link item into the link url, the file format type, and
        # the document type, and append each to args_array for this item
        args_array['url_link'] = item[0]
        args_array['uspto_xml_format'] = item[1]
        args_array['document_type'] = item[3]
        # file_name is used to keep track of the .zip base filename
        args_array['file_name'] = os.path.basename(
            args_array['url_link']).replace(".zip", "")

        # Set process time
        start_time = time.time()

        # Start the main processing of each link in the queue
        print("Processing .zip file: " + args_array['url_link'] +
              " Started at: " + time.strftime("%c"))

        # If using item-by-item database insertion, check whether args_array['file_name']
        # has previously been partially processed.
        # If it has, remove all records from the previous partial processing.
        # If it has not, insert it into STARTED_FILES as having been started.
        if "database" in args_array['command_args'] and args_array[
                'database_insert_mode'] != "bulk":
            database_connection.remove_previous_file_records(
                args_array['document_type'], args_array['file_name'])

        # Call the function to collect patent data from each link
        # and store it to specified place (csv and/or database)
        try:
            USPTOProcessLinks.process_link_file(args_array)
            # Print and log notification that one .zip package is finished
            print(
                '[Finished processing one .zip package! Time consuming:{0} Time Finished: {1}]'
                .format(time.time() - start_time, time.strftime("%c")))
            logger.info(
                '[Finished processing one .zip package! Time consuming:{0} Time Finished: {1}]'
                .format(time.time() - start_time, time.strftime("%c")))

        except Exception as e:
            # Print and log general fail comment
            print("Processing a file failed... " + args_array['file_name'] +
                  " from link " + args_array['url_link'] + " at: " +
                  time.strftime("%c"))
            logger.error("Processing a file failed... " +
                         args_array['file_name'] + " from link " +
                         args_array['url_link'])
            # Print traceback
            traceback.print_exc()
            # Print exception information to file
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Exception: " + str(exc_type) + " in Filename: " +
                         str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                         " Traceback: " + traceback.format_exc())

    # TODO: check for unprocessed files (will have to add a "processing" flag) and add a check
    # before starting processing to avoid collisions of link piles.  Make the link_pile loop into
    # a function and then call it again.  OR... make the link pile a super global, and somehow be
    # able to check against other processes and rebalance and pop off from link piles.

    # Print message that process is finished
    print('[Process {0} is finished. Time consuming:{1} Time Finished: {2}]'.format(
        os.getpid(), time.time() - process_start_time, time.strftime("%c")))
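A rough sketch of how main_process might be driven follows; this runner is an assumption rather than part of the original code, and the queue item layout is inferred from the item[...] accesses above (indexes 0, 1, and 3 holding the url, the USPTO XML format, and the document type).

# Hypothetical runner for main_process; the 4-tuple layout
# (url, xml_format, <unused>, doc_type) is inferred from the code above.
import multiprocessing

def start_workers(args_array, links, number_of_threads=4):
    # Fill a shared queue with the link tuples
    link_queue = multiprocessing.Queue()
    for link in links:
        link_queue.put(link)
    # Spawn one process per thread slot, passing its spooling number
    processes = []
    for spooling_value in range(number_of_threads):
        process = multiprocessing.Process(
            target=main_process,
            args=(link_queue, args_array, spooling_value))
        process.start()
        processes.append(process)
    # Wait for all workers to drain the queue
    for process in processes:
        process.join()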
Example #3
def verification_process(link_queue, args_array, database_args,
                         spooling_value):
    # Set process start time
    process_start_time = time.time()

    logger = USPTOLogger.logging.getLogger("USPTO_Database_Construction")

    # Check the delay value in args_array and set a wait time
    args_array['spooling_value'] = spooling_value
    if args_array['spooling_value'] > 4:
        print('[Sleeping thread number ' + str(spooling_value) +
              ' to wait for download...]')
        logger.info('Sleeping thread for initial spooling thread number ' +
                    str(spooling_value) + '...')
        time.sleep(
            (args_array['spooling_value']) * args_array['thread_spool_delay'])
        print('[Thread number ' + str(spooling_value) +
              ' is waking from sleep...]')
        logger.info('Thread number ' + str(spooling_value) +
                    ' is waking from sleep...')

        args_array['spooling_value'] = 0

    print('Process {0} is starting to work! Start Time: {1}'.format(
        os.getpid(), time.strftime("%c")))

    # Create a database connection for each worker process
    database_connection = SQLProcessor.SQLProcess(database_args)
    database_connection.connect()
    args_array['database_connection'] = database_connection

    # Go through each link in link_queue
    while not link_queue.empty():

        # Set process time
        start_time = time.time()

        # Get the next item in the queue
        item = link_queue.get()
        # Separate the link item into (1) the link url, (2) the file format type,
        # and (3) the document type, and append each to args_array to be
        # passed with the item through the parsing route
        args_array['url_link'] = item[0]
        args_array['uspto_xml_format'] = item[1]
        args_array['document_type'] = item[-1]

        # file_name is used to keep track of the downloaded file's
        # base filename (no file extension)
        args_array['file_name'] = (
            os.path.basename(args_array['url_link'])
            .replace(".zip", "")
            .replace(".csv", "")
            .replace(".txt", ""))

        print("-- Verifying " + args_array['uspto_xml_format'] + " file: " +
              args_array['file_name'] + " Started at: " + time.strftime("%c"))

        # Call function to verify data for each link
        # and store the expected values in the PARSER_VERIFICATION table
        try:
            USPTOVerifyLinks.verify_link_file(args_array)
            # Print and log notification that one .zip package is finished
            print(
                '[Finished processing one .zip package! Time consuming:{0} Time Finished: {1}]'
                .format(time.time() - start_time, time.strftime("%c")))
            logger.info(
                '[Finished processing one .zip package! Time consuming:{0} Time Finished: {1}]'
                .format(time.time() - start_time, time.strftime("%c")))

        except Exception as e:
            # Print and log general fail comment
            print("Processing a file failed... " + args_array['file_name'] +
                  " from link " + args_array['url_link'] + " at: " +
                  time.strftime("%c"))
            logger.error("Processing a file failed... " +
                         args_array['file_name'] + " from link " +
                         args_array['url_link'])
            traceback.print_exc()
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error("Exception: " + str(exc_type) + " in Filename: " +
                         str(fname) + " on Line: " + str(exc_tb.tb_lineno) +
                         " Traceback: " + traceback.format_exc())

    # At this point all links have been processed
    #
    # TODO: check the log files again for unprocessed files (will have to add an additional
    # "processing" flag to links in the log file) and add a check before starting processing
    # to avoid collisions of link piles.  Make the link_pile loop into a function and then
    # call it again.  OR... make the link pile a super global, and somehow be able to check
    # against other processes and rebalance and pop off from link piles.

    # Print message that process is finished
    print('[Process {0} is finished. Time consuming:{1} Time Finished: {2}]'.format(
        os.getpid(), time.time() - process_start_time, time.strftime("%c")))
Example #4
                all_files_processed = "Error"

            # Else if the read list of unprocessed links is not empty
            elif len(all_links_array["grants"]) != 0 or len(
                    all_links_array["applications"]) != 0 or len(
                        all_links_array["PAIR"]) != 0 or len(
                            all_links_array["classifications"]) != 0 or len(
                                all_links_array["legal"]) != 0:

                # If the command args specify to patch missing data
                # from the USPTO bulk dataset
                if "patch" in args_array['command_args']:
                    # Import the USPTOBigQuery module
                    import USPTOBigQuery
                    # Create a database connection
                    database_connection = SQLProcessor.SQLProcess(
                        database_args)
                    #database_connection.connect()
                    #args_array['database_connection'] = database_connection
                    # Instantiate the class object
                    bq = USPTOBigQuery.PatentBigQuery(args_array)
                    # Collect the CPC classification for all 2005 patents
                    json_obj = bq.get_2005_grant_classifications(args_array)
                    # Insert the CPC class into the main database
                    insert_2005_grant_classifications(args_array, json_obj)
                    # Exit with success status, since when patching missing
                    # data no other processing will be done
                    exit(0)

                # Set the string for the output depending on whether verifying or parsing bulk data
                if "verify" in args_array['command_args']: action = "verified"
                else: action = "collected"
Example #5
def create_list_queue(args):

    # Get the logger
    logger = AlexaLogger.logging.getLogger("Alexa_Database_Construction")

    print("-- Creating a link queue for all Alexa sites in list...")
    logger.info("-- Creating a link queue for all Alexa sites in list...")

    # Create a database connection
    db_conn = SQLProcessor.SQLProcess(database_args, args)
    db_conn.connect()
    # Get the next position and any missing positions from the database
    next_pos, missing = db_conn.get_next_position(args)

    # Initialize a queue of queues (a plain list)
    qq = []
    # Initialize a queue to fill with items
    list_queue = Queue()

    # Set the max size of the list queue
    if args['list_limit'] is None: max_count = args['max_queue_count']
    else: max_count = args['list_limit']

    # Open the csv file and read it into a list
    # The first column is the rank, the second is the url
    with open(args['alexa_list'], "r") as infile:
        alexa_list = infile.readlines()
    print("Starting Alexa List size: " + str(len(alexa_list)))

    # Append all to the list
    size = 0
    count = 1
    for site in alexa_list:
        # Get the domain
        arr = site.split(",")
        # Only add the item if not processed
        if int(arr[0]) in missing or int(arr[0]) >= next_pos:
            print("[adding " + arr[-1].strip() + " to the list...]")
            list_queue.put({ "pos" : arr[0].strip(), "domain" : arr[-1].strip()})
            size += 1
            if count == max_count:
                # Put the list queue on to the qq
                qq.append(list_queue)
                time.sleep(0.2)
                # Reinitialize the queue
                list_queue = Queue()
                # Reset the counter
                count = 0
            # Increment the counter
            count += 1
        # Print message for skipping item
        else: print("[skipping " + arr[-1].strip() + " from the list...]")

    # Append the last partially-filled queue
    qq.append(list_queue)
    # Return the qq
    print("-- Finished adding sites to link queue for all Alexa sites in list...")
    logger.info("-- Finished adding sites to link queue for all Alexa sites in list...")
    time.sleep(2)
    print("-- Queue Size: " + str(size))

    return qq
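create_list_queue returns qq, a plain list of Queue objects each holding at most max_count items. The following is a sketch of how that list might be split across worker processes running parse_items_thread (shown in the next example); the worker count and the slicing scheme are assumptions, and Queue is assumed to be multiprocessing.Queue so the queues can be handed to child processes.

# Hypothetical fan-out of the queue-of-queues returned by create_list_queue.
import multiprocessing

def start_parsers(database_args, args, qq, number_of_workers=4):
    workers = []
    for i in range(number_of_workers):
        # Give worker i every number_of_workers-th queue from the list
        worker_queues = qq[i::number_of_workers]
        process = multiprocessing.Process(
            target=parse_items_thread,
            args=(database_args, args, worker_queues))
        process.start()
        workers.append(process)
    # Wait for all workers to finish their queues
    for process in workers:
        process.join()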
Example #6
def parse_items_thread(database_args, args, qq):

    # Get the logger
    logger = AlexaLogger.logging.getLogger("Alexa_Database_Construction")

    # Create a database connection
    db_conn = SQLProcessor.SQLProcess(database_args, args)
    db_conn.connect()
    args['db_conn'] = db_conn

    # Create a PyCurl object
    curl = pycurl.Curl()
    # Keep track of the item number being processed
    item_num = args['max_queue_count']

    # Loop through each queue item in qq
    for queue in qq:

        # Pull the queue item off
        list_queue = queue

        # Go through each link in link_queue
        while not list_queue.empty():

            # Get item from queue
            print("[ Process " + str(os.getpid()) + " is picking next item from queue...]")
            item = list_queue.get()
            domain = item["domain"]
            position = item["pos"]

            # Only process if not found in database already
            if not args['db_conn'].all_already_scraped(
                    len(args['schemes_and_subdomains']), domain, position):

                for ext in args['schemes_and_subdomains']:

                    # Only process if not found in database already
                    if not args['db_conn'].is_already_scraped(ext, domain, position):

                        # Instantiate object
                        data_obj = Headers()

                        # Get headers using pycurl
                        try:

                            # Set some other information in the data object
                            data_obj.tld = domain
                            data_obj.ext = ext.replace("https://","").replace("http://", "")
                            data_obj.url = ext + domain
                            data_obj.position = int(position)

                            print("-- Checking " + ext + domain + " for HTTP headers...")
                            # Set URL value
                            curl.setopt(curl.URL, ext + domain)
                            b_obj = BytesIO()
                            #user_agent = '-H "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.2403.89 Safari/537.36"'
                            #command = "curl " + user_agent + " -I https://" + domain
                            #print(command)
                            #output = subprocess.check_output(command, shell=True)
                            curl.setopt(pycurl.FOLLOWLOCATION, args['curl_follow_redirect'])
                            curl.setopt(pycurl.MAXREDIRS, args['curl_max_redirect'])
                            curl.setopt(pycurl.CONNECTTIMEOUT, args['curl_conn_timeout'])
                            curl.setopt(pycurl.TIMEOUT, args['curl_timeout'])
                            curl.setopt(curl.HEADERFUNCTION, data_obj.display_header)
                            curl.setopt(curl.WRITEDATA, b_obj)
                            curl.perform()

                            data_obj.get_http_return_code()
                            data_obj.get_ip()
                            # Only do this once since it covers both the domain and subdomain
                            if "https://" in data_obj.url: data_obj.get_mx_records()
                            #print('Header values:-')
                            #print(data_obj.headers)

                        except Exception as e:
                            data_obj.http_code = 0
                            print("[ ** HTTP header request failed to respond " + ext + domain + "...]")
                            traceback.print_exc()
                            logger.error("[ ** HTTP header request failed to respond " + ext + domain + "...]")
                            logger.error(traceback.format_exc())

                        # Store the results to database
                        args['db_conn'].store_headers_to_database(args, data_obj)
                        if len(data_obj.mx): args['db_conn'].store_mx_to_database(args, data_obj)

                        # Delete the object
                        del data_obj

    # End curl session
    curl.close()
    return
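parse_items_thread depends on a Headers class that is not shown in these examples. The following is a hypothetical stand-in covering only the attributes and methods the code above touches; every name is inferred from the calls above rather than taken from the project's real implementation.

# Hypothetical stand-in for the Headers object used by parse_items_thread.
# Attribute and method names are inferred from the calls above, not copied
# from the real project.
import socket

class Headers(object):

    def __init__(self):
        self.tld = ""          # bare domain, e.g. "example.com"
        self.ext = ""          # scheme/subdomain prefix with the scheme stripped
        self.url = ""          # full URL that was requested
        self.position = 0      # Alexa rank position
        self.headers = {}      # accumulated response headers
        self.http_code = 0     # final HTTP status code
        self.ip = None         # resolved IP address
        self.mx = []           # MX records, filled by get_mx_records()

    def display_header(self, header_line):
        # pycurl HEADERFUNCTION callback: collect "Name: value" header lines
        line = header_line.decode("iso-8859-1").strip()
        if ":" in line:
            name, _, value = line.partition(":")
            self.headers[name.strip().lower()] = value.strip()

    def get_http_return_code(self):
        # The real class presumably reads the status from the curl handle or
        # the status line; left as-is here since this is only a sketch.
        pass

    def get_ip(self):
        # Resolve the domain to an IPv4 address
        try:
            self.ip = socket.gethostbyname(self.tld)
        except socket.gaierror:
            self.ip = None

    def get_mx_records(self):
        # The real project likely uses a DNS library for MX lookups;
        # left empty here to avoid guessing at an API.
        self.mx = []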