Beispiel #1
0
def setup_job():
    """
    Prepare a job run: IAM profile, output S3 bucket, and the file queue.

    Steps, in order:
      1. Call setup_iam(). Any ordinary error is printed and logged with its
         traceback, then ignored so the remaining steps still run.
      2. Create the S3 bucket named by OUTPUT_S3_BUCKET.
      3. Fetch the file list of type FILE_TYPE for crawl CRAWL_ID and add it
         to the queue JOB_QUEUE through filequeue.FileQueue.
    """
    # IAM
    try:
        setup_iam()
    except Exception:
        # Best-effort step: the failure is assumed to mean the profile already
        # exists. Catch Exception rather than using a bare except so that
        # Ctrl-C (KeyboardInterrupt) and SystemExit still abort the setup.
        print("Error while setting up IAM PROFILE, most likely due to existing profile")
        logging.exception("Error while setting up IAM PROFILE, most likely due to existing profile")

    # S3 bucket
    # Imports are kept function-local, as in the original, so importing this
    # module does not require boto / cclib.
    from boto.s3.connection import S3Connection
    from cclib.commoncrawl import CommonCrawl
    # Only CRITICAL boto messages are let through from here on.
    logging.getLogger('boto').setLevel(logging.CRITICAL)
    import filequeue

    s3_connection = S3Connection()
    logging.info("Creating bucket %s", OUTPUT_S3_BUCKET)
    s3_connection.create_bucket(OUTPUT_S3_BUCKET)
    logging.info("bucket created")

    # SQS
    crawl = CommonCrawl(CRAWL_ID)
    file_list = crawl.get_file_list(FILE_TYPE)
    file_queue = filequeue.FileQueue(JOB_QUEUE, VISIBILITY_TIMEOUT, file_list)
    logging.debug("Adding %s %s files to queue %s", len(file_list), FILE_TYPE, JOB_QUEUE)
    file_queue.add_files()
    logging.debug("Finished adding files")
    print("Finished adding files")
Beispiel #2
0
def setup_job():
    """
    Set up everything a job needs before workers start.

    Calls setup_iam() (errors are reported and then ignored), creates the
    output bucket OUTPUT_S3_BUCKET on S3, and adds the FILE_TYPE files of
    crawl CRAWL_ID to the queue JOB_QUEUE.
    """
    iam_error_message = (
        "Error while setting up IAM PROFILE, most likely due to existing profile"
    )

    # IAM
    try:
        setup_iam()
    except Exception:
        # Deliberately non-fatal, but narrowed from a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        print(iam_error_message)
        logging.exception(iam_error_message)

    # S3 bucket
    # Function-local imports are preserved from the original.
    from boto.s3.connection import S3Connection
    from cclib.commoncrawl import CommonCrawl
    # Raise boto's logger threshold to CRITICAL.
    logging.getLogger('boto').setLevel(logging.CRITICAL)
    import filequeue

    connection = S3Connection()
    logging.info("Creating bucket %s", OUTPUT_S3_BUCKET)
    connection.create_bucket(OUTPUT_S3_BUCKET)
    logging.info("bucket created")

    # SQS
    crawl = CommonCrawl(CRAWL_ID)
    file_list = crawl.get_file_list(FILE_TYPE)
    file_queue = filequeue.FileQueue(JOB_QUEUE, VISIBILITY_TIMEOUT, file_list)
    logging.debug(
        "Adding %s %s files to queue %s",
        len(file_list), FILE_TYPE, JOB_QUEUE,
    )
    file_queue.add_files()
    logging.debug("Finished adding files")
    print("Finished adding files")
Beispiel #3
0
def setup_test():
    """
    Test variant of the job setup.

    Runs setup_common(), fetches the FILE_TYPE file list for crawl CRAWL_ID,
    and calls add_files(count=5) on a FileQueue built for JOB_QUEUE.
    """
    setup_common()

    listed_files = CommonCrawl(CRAWL_ID).get_file_list(FILE_TYPE)
    queue = FileQueue(JOB_QUEUE, VISIBILITY_TIMEOUT, listed_files)

    adding_message = (
        "Adding " + str(len(listed_files)) + " " + FILE_TYPE
        + " files to queue " + JOB_QUEUE
    )
    logging.debug(adding_message)

    # NOTE(review): count=5 presumably limits how many files are queued for a
    # test run; confirm against FileQueue.add_files.
    queue.add_files(count=5)

    logging.debug("Finished adding files")
    print("Finished adding files")