def setup_job():
    """
    Sets up the queue, adds all files (text or warc or wat or wet),
    and creates the bucket to store output.
    """
    # IAM
    try:
        setup_iam()
    except Exception:
        print "Error while setting up IAM PROFILE, most likely due to existing profile"
        logging.exception("Error while setting up IAM PROFILE, most likely due to existing profile")

    # S3 bucket
    from boto.s3.connection import S3Connection
    from cclib.commoncrawl import CommonCrawl
    logging.getLogger('boto').setLevel(logging.CRITICAL)  # silence verbose boto logging
    import filequeue

    S3 = S3Connection()
    logging.info("Creating bucket " + OUTPUT_S3_BUCKET)
    S3.create_bucket(OUTPUT_S3_BUCKET)
    logging.info("bucket created")

    # SQS
    crawl = CommonCrawl(CRAWL_ID)
    file_list = crawl.get_file_list(FILE_TYPE)  # Text files
    file_queue = filequeue.FileQueue(JOB_QUEUE, VISIBILITY_TIMEOUT, file_list)
    logging.debug("Adding " + str(len(file_list)) + " " + FILE_TYPE + " files to queue " + JOB_QUEUE)
    file_queue.add_files()
    logging.debug("Finished adding files")
    print "Finished adding files"
def setup_test():
    """
    Sets up the queue, adds a small sample of files (text or warc or wat or wet)
    for testing, and creates the bucket to store output.
    """
    setup_common()
    crawl = CommonCrawl(CRAWL_ID)
    file_list = crawl.get_file_list(FILE_TYPE)  # Text files
    file_queue = FileQueue(JOB_QUEUE, VISIBILITY_TIMEOUT, file_list)
    logging.debug("Adding " + str(len(file_list)) + " " + FILE_TYPE + " files to queue " + JOB_QUEUE)
    file_queue.add_files(count=5)  # only enqueue 5 files for a test run
    logging.debug("Finished adding files")
    print "Finished adding files"
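
# Minimal entry-point sketch (an assumption, not part of the original module): it
# presumes the module-level constants CRAWL_ID, FILE_TYPE, OUTPUT_S3_BUCKET,
# JOB_QUEUE and VISIBILITY_TIMEOUT, along with setup_iam() and setup_common(),
# are defined elsewhere in this file, and that AWS credentials are available to boto.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    setup_job()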