def run(src_directory, dst, worker_processes, limit=None, move=False):
    """Upload report files from ``src_directory`` to ``dst`` via luigi tasks.

    Schedules one ``S3CopyRawReport`` task per listed file (up to ``limit``
    files when given), runs them on a local luigi worker, and reports which
    dates were touched.

    :param src_directory: source location passed to ``list_report_files``.
    :param dst: destination prefix for ``S3CopyRawReport``.
    :param worker_processes: number of luigi worker processes.
    :param limit: optional cap on how many files are scheduled.
    :param move: if True, the task moves instead of copying.
    :return: unique date strings (basename of each uploaded file's parent
        directory), in first-seen order.
    """
    sch = luigi.scheduler.CentralPlannerScheduler()
    w = luigi.worker.Worker(scheduler=sch,
                            worker_processes=worker_processes)

    uploaded_files = []
    # enumerate replaces the hand-maintained idx counter; idx counts files
    # already scheduled, so "idx >= limit" stops after `limit` files.
    for idx, filename in enumerate(list_report_files(
            src_directory, aws_access_key_id=config.aws.access_key_id,
            aws_secret_access_key=config.aws.secret_access_key)):
        if limit is not None and idx >= limit:
            break
        logging.info("uploading %s", filename)  # lazy %-args, not eager %
        task = S3CopyRawReport(src=filename, dst=dst, move=move)
        uploaded_files.append(task.output().path)
        w.add(task)
    w.run()
    w.stop()

    # Deduplicate the dates while preserving first-seen order; the seen-set
    # makes this O(n) instead of the O(n^2) list-membership version.
    uploaded_dates = []
    seen = set()
    for uploaded_file in uploaded_files:
        uploaded_date = os.path.basename(os.path.dirname(uploaded_file))
        if uploaded_date not in seen:
            seen.add(uploaded_date)
            uploaded_dates.append(uploaded_date)
    return uploaded_dates
def run(srcs, dst_private, worker_processes=16):
    """Schedule a ``MoveReportFiles`` task per source and run them.

    :param srcs: iterable of source locations to list via
        ``list_report_files`` (SSH-based listing, host-key check disabled).
    :param dst_private: private destination passed to ``MoveReportFiles``.
    :param worker_processes: number of luigi worker processes.
    :return: list of ALL report files scheduled across every src.

    BUG FIX: the original returned only the last iteration's
    ``report_files`` (and raised ``NameError`` when ``srcs`` was empty);
    it now accumulates and returns files from every src.
    """
    sch = luigi.scheduler.CentralPlannerScheduler()
    w = luigi.worker.Worker(scheduler=sch,
                            worker_processes=worker_processes)

    all_report_files = []
    for src in srcs:
        logging.info("Adding headers for src: %s", src)
        report_files = list(list_report_files(
            src, key_file=config.core.ssh_private_key_file,
            no_host_key_check=True
        ))
        all_report_files.extend(report_files)
        task = MoveReportFiles(report_files=report_files,
                               dst_private=dst_private)
        w.add(task)
    w.run()
    w.stop()
    return all_report_files
# Example #3
# (scrape artifact — original snippet separator; kept as a comment so the
# file remains valid Python)
    def run(self):
        """Process every report file for ``self.date`` under ``self.src``.

        Loads the bridge DB from the path in config, opens this task's two
        output streams, and runs ``process_report`` on each listed file.
        Per-file failures are logged and skipped (best-effort semantics).
        """
        with get_luigi_target(config.ooni.bridge_db_path).open('r') as f:
            self.bridge_db = json.load(f)

        output = self.output()
        # try/finally fixes a resource leak: the original never closed the
        # streams if listing the path (or opening the second stream) raised.
        raw_streams = output["raw_streams"].open('w')
        try:
            sanitised_streams = output["sanitised_streams"].open('w')
            try:
                reports_path = os.path.join(self.src,
                                            self.date.strftime("%Y-%m-%d"))
                logger.debug("listing path %s", reports_path)
                for filename in list_report_files(
                        reports_path,
                        config.aws.access_key_id,
                        config.aws.secret_access_key):
                    logger.debug("got filename %s", filename)
                    try:
                        self.process_report(filename, sanitised_streams,
                                            raw_streams)
                    except Exception:
                        # Deliberate best-effort: log and continue with the
                        # remaining files rather than aborting the task.
                        logger.error("error in processing %s", filename)
                        logger.error(traceback.format_exc())
            finally:
                sanitised_streams.close()
        finally:
            raw_streams.close()