    def test_write_filename(self):
        test_file = os.path.join(self.get_test_dir(), "test.log")
        self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

        test_file = os.path.join(self.get_test_dir(), "test2.log")
        # Now test writing an object
        self.storage.write_filename("foo", {"bar":"baz"}, test_file)
        test_file_md5, test_file_size = fileutil.md5file(test_file)
        self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")
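# The test above feeds write_filename() both a pre-serialized JSON string and a
# plain dict and expects byte-identical output. A minimal sketch of the kind of
# normalization that implies (hypothetical helper, not the actual StorageLayout
# code; note the compact separators are what would make the two writes hash the
# same):
import json

def to_json_line(value):
    if isinstance(value, basestring):  # Python 2, like the surrounding snippets
        return value
    return json.dumps(value, separators=(",", ":"))

assert to_json_line({"bar": "baz"}) == to_json_line('{"bar":"baz"}')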
    def export_batch(self, data_dir, conn, bucket, files):
        print self.label, "Uploading", ",".join(files)
        if self.dry_run:
            return 0

        # Time the s3funnel call:
        start = datetime.now()
        result = subprocess.call(self.s3f_cmd + files, cwd=data_dir)
        sec = timer.delta_sec(start)

        total_size = 0
        if result == 0:
            # Success! Verify each file's checksum, then truncate it.
            for f in files:
                # Verify checksum and track cumulative size so we can figure out MB/s
                full_filename = os.path.join(data_dir, f)
                md5, size = fileutil.md5file(full_filename)
                total_size += size
                # f is the key name - it does not include the full path to the
                # data dir.
                key = bucket.get_key(f)
                # Strip quotes from md5
                remote_md5 = key.etag[1:-1]
                if md5 != remote_md5:
                    # TODO: add it to a "failed" queue.
                    print "ERROR: %s failed checksum verification: Local=%s, Remote=%s" % (f, md5, remote_md5)
                    self.bad_records += 1
                    result = -1
                # TODO: else add it to a "succeeded" queue and remove it locally.
        else:
            print "Failed to upload one or more files in the current batch. Error code was", result

        total_mb = float(total_size) / 1024.0 / 1024.0
        print "Transferred %.2fMB in %.2fs (%.2fMB/s)" % (total_mb, sec, total_mb / sec)
        return result
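# export_batch() above trusts that key.etag (minus the surrounding quotes) is
# the object's MD5. That only holds for single-part uploads: a multipart
# upload's ETag has the form "<digest>-<part count>" and will never match
# fileutil.md5file(). A hedged guard, as a hypothetical helper rather than part
# of the original module:
def etag_is_plain_md5(etag):
    # S3 wraps ETag values in double quotes; multipart ETags carry a
    # "-<part count>" suffix and are not an MD5 of the object bytes.
    return "-" not in etag.strip('"')

# Sketch of use inside the verification loop:
#     if etag_is_plain_md5(key.etag) and md5 != key.etag.strip('"'):
#         ...treat as a checksum failure...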
    def test_write(self):
        dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
        test_dir = self.get_test_dir()
        test_file = self.schema.get_filename(test_dir, dims)
        self.assertEqual(test_file, test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")

        self.storage.write("foo", '{"bar":"baz"}', dims)
        md5, size = fileutil.md5file(test_file)
        self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")
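# The assertion above pins down the dims-to-path mapping: every dimension except
# the last two becomes a directory, and the last two (presumably appBuildID and
# submission_date) form the file name together with a schema version suffix.
# Illustrative only; the real logic lives in TelemetrySchema.get_filename():
import os

def example_filename(base_dir, dims, schema_version=1):
    # All but the last two dimensions become directories; the last two become
    # the base name, followed by the schema version and the ".log" suffix.
    basename = "%s.%s.v%d.log" % (dims[-2], dims[-1], schema_version)
    return os.path.join(base_dir, *(dims[:-2] + [basename]))

# POSIX separators, matching the expected value in the test:
assert example_filename("test_dir", ["r1", "a1", "c1", "v1", "b1", "20130102"]) \
    == "test_dir/r1/a1/c1/v1/b1.20130102.v1.log"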
def fetch_s3_files(incoming_files, fetch_cwd, bucket, aws_key, aws_secret_key):
    result = 0
    if len(incoming_files) > 0:
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)

        files = []
        for f in incoming_files:
            full_filename = os.path.join(fetch_cwd, f)
            if os.path.isfile(full_filename):
                md5, size = fileutil.md5file(full_filename)
                # f is the key name - it does not include the full path to the
                # data dir.
                key = bucket.get_key(f)
                # Strip quotes from md5
                remote_md5 = key.etag[1:-1]
                if md5 != remote_md5:
                    files.append(f)
                else:
                    print "Already downloaded", f
            else:
                files.append(f)
        fetch_cmd = [S3FUNNEL_PATH]
        fetch_cmd.append(bucket.name)
        fetch_cmd.append("get")
        fetch_cmd.append("-a")
        fetch_cmd.append(aws_key)
        fetch_cmd.append("-s")
        fetch_cmd.append(aws_secret_key)
        fetch_cmd.append("-t")
        fetch_cmd.append("8")
        # Fetch in batches of 8 files at a time
        while len(files) > 0:
            current_files = files[0:8]
            files = files[8:]
            start = datetime.now()
            result = subprocess.call(fetch_cmd + current_files, cwd=fetch_cwd)
            duration_sec = timer.delta_sec(start)
            # TODO: verify MD5s
            downloaded_bytes = sum([
                os.path.getsize(os.path.join(fetch_cwd, f))
                for f in current_files
            ])
            downloaded_mb = downloaded_bytes / 1024.0 / 1024.0
            print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (
                downloaded_mb, duration_sec, downloaded_mb / duration_sec)
            if result != 0:
                break
    return result
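# A hypothetical invocation of fetch_s3_files(), assuming boto is set up the
# same way as in the snippets above. The credentials, bucket name and key names
# here are placeholders:
from boto.s3.connection import S3Connection

aws_key, aws_secret_key = "AKIA...", "..."             # placeholders
conn = S3Connection(aws_key, aws_secret_key)
bucket = conn.get_bucket("example-telemetry-bucket")   # hypothetical bucket

status = fetch_s3_files(["2013/01/02/foo.log", "2013/01/02/bar.log"],  # key names
                        "/tmp/telemetry_work", bucket, aws_key, aws_secret_key)
if status != 0:
    print "One or more downloads failed with code", status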
def main():
    args = get_args()
    logging.basicConfig()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.WARNING)

    if not os.path.exists(args.work_dir):
        os.makedirs(args.work_dir)

    logger.info("Sanitizing FirefoxOS data from {} and moving it to {}".format(
        args.source_bucket, args.dest_bucket))
    logger.debug("Connecting to S3...")
    conn = S3Connection(args.aws_key, args.aws_secret_key)
    source_bucket = conn.get_bucket(args.source_bucket)
    dest_bucket = conn.get_bucket(args.dest_bucket)

    compress_cmd = [StorageLayout.COMPRESS_PATH] + StorageLayout.COMPRESSION_ARGS
    prefix = args.prefix
    last_key = ''
    done = False
    total_count = 0
    total_bytes = 0
    start_time = datetime.now()
    dupe_map = {}
    while not done:
        try:
            for k in source_bucket.list(prefix=prefix, marker=last_key):
                if k.name.endswith('/'):
                    logger.debug("Skipping directory '{}'".format(k.name))
                    continue
                if skip_by_date(k.name, args.min_date, logger):
                    logger.debug("Skipping file older than {}: {}".format(
                        args.min_date, k.name))
                    continue
                total_count += 1
                total_bytes += k.size
                last_key = k.name
                if total_count % 100 == 0:
                    logger.info(
                        "Looked at {} total records in {} seconds. Last key was {}"
                        .format(total_count, timer.delta_sec(start_time),
                                last_key))
                logger.debug("Fetching {} from source bucket".format(k.name))
                full_source_filename = os.path.join(args.work_dir, "__source",
                                                    k.name)
                full_dest_filename = os.path.join(args.work_dir, "__dest",
                                                  k.name)

                # Ensure that the necessary local dirs exist:
                for f in [full_source_filename, full_dest_filename]:
                    dirname = os.path.dirname(f)
                    if dirname != '' and not os.path.exists(dirname):
                        os.makedirs(dirname)
                logger.debug("Getting '{}' to '{}'".format(
                    k.name, full_source_filename))
                k.get_contents_to_filename(full_source_filename)

                logger.info("Removing pingIDs...")
                tmp_out_file = full_dest_filename + ".tmp"
                out_handle = open(tmp_out_file, "w")
                logger.debug("Uncompressing...")
                if full_source_filename.endswith(StorageLayout.COMPRESSED_SUFFIX):
                    decompress_cmd = [StorageLayout.COMPRESS_PATH] + StorageLayout.DECOMPRESSION_ARGS
                    raw_handle = open(full_source_filename, "rb")
                    # Popen the decompressing version of StorageLayout.COMPRESS_PATH
                    p_decompress = Popen(decompress_cmd,
                                         bufsize=65536,
                                         stdin=raw_handle,
                                         stdout=PIPE,
                                         stderr=sys.stderr)
                    handle = p_decompress.stdout
                else:
                    handle = open(full_source_filename, "r")
                    raw_handle = None

                logger.debug("Generating new pingIDs...")
                for line in handle:
                    # Lines are of the form <key><tab><json payload><newline>.
                    # Split on tab character to get the pieces.
                    key, payload = line.split(u"\t", 1)
                    # Replace key with a fresh UUID:
                    if key in dupe_map:
                        logger.info(
                            "Already saw key {}, skipping any more occurrences"
                            .format(key))
                    else:
                        new_key = str(uuid4())
                        dupe_map[key] = new_key
                        out_handle.write(u"%s\t%s" % (new_key, payload))

                handle.close()
                out_handle.close()
                if raw_handle:
                    raw_handle.close()

                sql_update = None
                empty_result = False
                if os.stat(tmp_out_file).st_size > 0:
                    logger.debug("Compressing new file...")
                    f_comp = open(full_dest_filename, "wb")
                    f_raw = open(tmp_out_file, "r", 1)
                    p_compress = Popen(compress_cmd,
                                       bufsize=65536,
                                       stdin=f_raw,
                                       stdout=f_comp,
                                       stderr=sys.stderr)
                    p_compress.communicate()
                    f_raw.close()
                    f_comp.close()
                    local_md5, size = fu.md5file(full_dest_filename)
                    sql_update = "UPDATE published_files SET " \
                          "file_md5 = '{0}', " \
                          "file_size = {1}, " \
                          "bucket_name = '{2}' " \
                          "WHERE file_name = '{3}';".format(local_md5, size,
                            dest_bucket.name, k.name)
                else:
                    # Don't upload empty files.
                    empty_result = True
                    sql_update = "DELETE FROM published_files WHERE file_name = '{0}';".format(
                        k.name)
                    logger.debug(
                        "File was empty, skipping: {}".format(tmp_out_file))

                logger.info(
                    "Removing temp output file: {}".format(tmp_out_file))
                os.remove(tmp_out_file)

                if not empty_result and should_run(args.dry_run, logger,
                                                   "Uploading to dest bucket"):
                    dest_key = dest_bucket.new_key(k.name)
                    dest_key.set_contents_from_filename(full_dest_filename)
                    # Compare the md5 to be sure it succeeded.
                    dest_md5 = dest_key.etag[1:-1]
                    local_md5, size = fu.md5file(full_dest_filename)
                    if dest_md5 != local_md5:
                        raise Exception(
                            "Failed to upload {}".format(full_dest_filename))

                if should_run(
                        args.dry_run, logger, "Removing input file: {}".format(
                            full_source_filename)):
                    os.remove(full_source_filename)

                if not empty_result and should_run(
                        args.dry_run, logger,
                        "Removing output file: {}".format(full_dest_filename)):
                    os.remove(full_dest_filename)

                if empty_result or args.source_bucket != args.dest_bucket:
                    if should_run(args.dry_run, logger,
                                  "Deleting from source bucket"):
                        k.delete()
                else:
                    logger.info(
                        "Not deleting source: either non-empty or same bucket: {}"
                        .format(k.name))

                if sql_update is None:
                    logger.error("Missing sql_update :(")
                else:
                    logger.info(sql_update)
                if should_run(args.dry_run, logger, "Notifying coordinator"):
                    #TODO
                    logger.debug("Should be actually notifying coordinator")

            done = True
        except socket.error as e:
            logger.error("Error listing keys: {}".format(e))
            logger.error(traceback.format_exc())
            logger.info("Continuing from last seen key: {}".format(last_key))
      "allowed_values": ["c1", "c2", "c3"]
    },
    {
      "field_name": "appVersion",
      "allowed_values": "*"
    },
    {
      "field_name": "appBuildID",
     "allowed_values": "*"
    },
    {
      "field_name": "submission_date",
      "allowed_values": {
          "min": "20130101",
          "max": "20131231"
      }
    }
  ]
}

try:
    schema = TelemetrySchema(schema_spec)
    storage = StorageLayout(schema, test_dir, 10000)
    test_file_1 = os.path.join(test_dir, "test.log")
    storage.write_filename("foo", '{"bar": "baz"}', test_file_1)
    test_file_1_md5, test_file_1_size = fileutil.md5file(test_file_1)
    assert test_file_1_md5 == "206dd2d33a04802c31d2c74f10cc472b"
    assert storage.clean_newlines("ab\n\ncd\r\n") == "ab  cd  "
finally:
    shutil.rmtree(test_dir)
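# The last assertion pins down clean_newlines(): every carriage return and line
# feed is replaced by a single space, keeping each record on one line. A sketch
# of what that implies (not the actual StorageLayout implementation):
def clean_newlines_sketch(value):
    return value.replace("\r", " ").replace("\n", " ")

assert clean_newlines_sketch("ab\n\ncd\r\n") == "ab  cd  "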