# Agent bootstrap script: configure logging, derive a unique agent id,
# start the sender thread (and the recorder thread when in live mode),
# then park the main thread so the workers keep running.
import env
import time
import threading
from uuid import getnode as get_mac

from recorder import Recorder
from sender import Sender
from filequeue import FileQueue

import mlog
import logging

# Route the 'agent' logger through the project's logging setup.
mlog.configLoggers(['agent'], logs_folder=env.logs_folder, debug_mode=env.debug_mode)
logger = logging.getLogger('agent')

# Unique agent identifier: "<network interface>.<MAC address as integer>".
env.uuid = '{0}.{1}'.format(env.iface, get_mac())

# Shared on-disk queue between the recorder (producer) and sender (consumer).
queue = FileQueue(env)

sender = Sender(env, queue)
sender.start()

# Only capture traffic when the agent is configured for live operation.
if env.mode == "live":
    recorder = Recorder(env, queue)
    recorder.start()

logger.info('Agent %s is up in %s mode', env.uuid, env.mode)

# Keep the main thread alive; the worker threads do all the work.
while True:
    time.sleep(1)
# NOTE(review): this chunk begins mid-function — the enclosing `def`, its
# opening `try:`, and the initialization of `count`, `counts`, `amazon`,
# `error`, and `filename` are outside the visible source, so the
# indentation below is a best-effort reconstruction. `counts.iteritems()`
# marks this as Python 2 code.
        # Scan WARC metadata records line by line, counting target-URI
        # headers, tallying hits per hostname, and collecting amazon.com URIs.
        for line in fileobj:
            line = line.strip()
            if line.startswith('WARC-Target-URI'):
                count += 1
                if "http://" in line:
                    # Hostname = text between "http://" and the first "/".
                    # Assumes counts is a Counter/defaultdict(int) — TODO confirm
                    # against the (unseen) function head.
                    counts[line.split('http://')[1].split('/')[0]] += 1
                if "amazon.com" in line.lower():
                    # Keep the full target URI for amazon.com hits.
                    amazon.append(
                        line.split('WARC-Target-URI:')[1].strip())
    # Bare except: deliberately best-effort — log the failure, flag it in
    # the result, and let the caller continue with other files.
    except:
        logging.exception("error while processing file")
        error = True
        pass
    return {
        # NOTE(review): "metdata_lines" key typo preserved — it is a runtime
        # dict key that downstream consumers may depend on.
        'metdata_lines': count,
        'amazon': amazon,
        # Only report hosts seen more than 10 times (Python 2 iteritems).
        'counts': [(k, v) for k, v in counts.iteritems() if v > 10],
        "filename": filename,
        "error": error
    }


if __name__ == '__main__':
    import sys
    # Any "test" argument switches the run into test mode.
    if "test" in sys.argv:
        test = True
    else:
        test = False
    # CRAWL_ID / JOB_QUEUE / commoncrawl / FileQueue / process_queue are
    # defined elsewhere in the (unseen) module head — presumably imports
    # and constants; verify against the full file.
    crawl = commoncrawl.CommonCrawl(CRAWL_ID)
    queue = FileQueue(JOB_QUEUE, files=None)
    process_queue(queue, crawl, test)