Example #1
0
import env
import time
import threading
from uuid import getnode as get_mac
from recorder import Recorder
from sender import Sender
from filequeue import FileQueue

import mlog
import logging
# Agent start-up script: set up logging, derive the agent id, create the
# shared file queue, start the worker objects and then idle forever.

# Ask mlog to configure the 'agent' logger, passing the log folder and the
# debug flag taken from env.
mlog.configLoggers(['agent'],
                   logs_folder=env.logs_folder,
                   debug_mode=env.debug_mode)
logger = logging.getLogger('agent')
# The agent id is "<env.iface>.<node>", where <node> is the integer returned
# by uuid.getnode() (imported above as get_mac). The id is stored on env
# before env is handed to the objects created below.
env.uuid = '%s.%s' % (env.iface, str(get_mac()))

# One queue instance is passed to both the sender and (in live mode) the
# recorder.
# NOTE(review): presumably the recorder fills the queue and the sender
# drains it -- confirm in the recorder/sender modules.
queue = FileQueue(env)

sender = Sender(env, queue)
sender.start()

# A recorder is only created and started when env.mode is exactly "live";
# in any other mode only the sender runs.
if env.mode == "live":
    recorder = Recorder(env, queue)
    recorder.start()
logger.info('Agent %s is up in %s mode', env.uuid, env.mode)

# Keep the main thread alive indefinitely, waking once per second.
# NOTE(review): presumably Sender/Recorder do their work in background
# threads started by .start() -- confirm.
while True:
    time.sleep(1)
Example #2
0
        for line in fileobj:
            line = line.strip()
            if line.startswith('WARC-Target-URI'):
                count += 1
                if "http://" in line:
                    counts[line.split('http://')[1].split('/')[0]] += 1
                    if "amazon.com" in line.lower():
                        amazon.append(
                            line.split('WARC-Target-URI:')[1].strip())
    except:
        logging.exception("error while processing file")
        error = True
        pass
    return {
        'metdata_lines': count,
        'amazon': amazon,
        'counts': [(k, v) for k, v in counts.iteritems() if v > 10],
        "filename": filename,
        "error": error
    }


if __name__ == '__main__':
    # Script entry point: build the crawl handle and the job queue, then
    # hand both to process_queue.
    import sys

    # Test mode is on when the exact argument "test" appears anywhere in
    # sys.argv.
    test = "test" in sys.argv

    crawl = commoncrawl.CommonCrawl(CRAWL_ID)
    queue = FileQueue(JOB_QUEUE, files=None)
    process_queue(queue, crawl, test)