def main(argv):
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {"format": LOG_FORMAT, "datefmt": LOG_DATEFMT, "level": log_level}
    logging.basicConfig(**kwargs)
    logger = logging.getLogger()

    # map each SSC file label to its source label
    with open(args.map_file) as mapfile:
        reader = csv.DictReader(mapfile, delimiter="\t")
        ssc_src_map = {}
        for row in reader:
            ssc_src_map[row["ssc_label"]] = row["source_label"]

    ssc_files = []
    for ssc in os.listdir(args.ssc_dir):
        # build a filtered list instead of removing items from the list
        # while iterating over it, which skips elements
        if ssc.endswith(".ssc"):
            ssc_files.append(ssc)
        else:
            logger.debug("%s is not an SSC file, it will be ignored" % ssc)
    logger.info("%d files ready to be processed" % len(ssc_files))

    with nested(open(args.ds_fn, "w"), open(args.do_fn, "w")) as (ds_f, do_f):
        ds_w = DataSampleWriter(ds_f)
        do_w = DataObjectWriter(do_f)
        for w in ds_w, do_w:
            w.writeheader()
        for i, ssc_f in enumerate(ssc_files):
            logger.info("Processing %s (file %d/%d)" % (ssc_f, i + 1, len(ssc_files)))
            try:
                with open(os.path.join(args.ssc_dir, ssc_f)) as infile:
                    ds_w.writerow(
                        {
                            "label": os.path.basename(infile.name),
                            "source": ssc_src_map[os.path.basename(infile.name)],
                            "device": args.device_label,
                            "device_type": DEVICE_TYPE,
                            "data_sample_type": DATA_SAMPLE_TYPE,
                            "markers_set": args.marker_set_label,
                            "status": "USABLE",
                        }
                    )
                    do_w.writerow(
                        {
                            "path": os.path.realpath(infile.name),
                            "data_sample_label": os.path.basename(infile.name),
                            "mimetype": mimetypes.SSC_FILE,
                            "size": str(os.stat(infile.name).st_size),
                            "sha1": compute_sha1(infile.name),
                        }
                    )
            except KeyError as ke:
                logger.error("File %s has no source mapped, skipping it" % ke)
def write_ssc_data_objects_import_file(fname, ssc_data_set):
  # open with a with block so the output file is flushed and closed on exit
  with open(fname, mode='w') as out_f:
    fo = csv.DictWriter(out_f,
                        fieldnames=['path', 'data_sample_label', 'mimetype',
                                    'size', 'sha1'],
                        delimiter='\t')
    fo.writeheader()
    # the loop variable is renamed so it doesn't shadow fname, the output
    # file's name
    for label, sample_id, device_id, data_fn in ssc_data_set:
      size = os.stat(data_fn).st_size
      sha1 = compute_sha1(data_fn)
      fo.writerow({
        'path': 'file://' + os.path.realpath(data_fn),
        'data_sample_label': label,
        'mimetype': mimetypes.SSC_FILE,
        'size': size,
        'sha1': sha1,
        })
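A usage sketch for write_ssc_data_objects_import_file; the labels, ids and paths below are made up for illustration, and each tuple follows the (label, sample_id, device_id, data file name) shape the loop unpacks. Note the data files must exist, since the function stats and hashes them:

# hypothetical data set: (label, sample_id, device_id, path to .ssc file)
ssc_data_set = [
    ("sample_001.ssc", "SMP001", "DEV01", "/data/ssc/sample_001.ssc"),
    ("sample_002.ssc", "SMP002", "DEV01", "/data/ssc/sample_002.ssc"),
]
write_ssc_data_objects_import_file("data_objects.tsv", ssc_data_set)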
def main(argv):
    parser = make_parser()
    args = parser.parse_args(argv)

    # can't use argparse's mutually exclusive group: one arg is positional, not optional
    if args.input_list and args.ifiles:
        sys.exit("ERROR: no positional arg accepted if --input-list is specified")
    if not args.input_list and not args.ifiles:
        sys.exit("ERROR: no input source has been specified")

    log_level = getattr(logging, args.loglevel)
    kwargs = {"format": LOG_FORMAT, "datefmt": LOG_DATEFMT, "level": log_level}
    if args.logfile:
        kwargs["filename"] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger()

    if args.input_list:
        input_map = get_input_map(args.input_list)
    else:
        input_map = OrderedDict((fn, Everything()) for fn in args.ifiles)
    logger.info("total input files: %d" % len(input_map))

    snp_name_to_label = get_snp_name_to_label(args.annot_file)
    logger.info("total SNPs: %d" % len(snp_name_to_label))

    with nested(open(args.ds_fn, "w"), open(args.do_fn, "w")) as (ds_f, do_f):
        ds_w = DataSampleWriter(ds_f)
        do_w = DataObjectWriter(do_f)
        for w in ds_w, do_w:
            w.writeheader()
        for k, (fn, requested_sample_ids) in enumerate(input_map.iteritems()):
            logger.info("processing %s (%d/%d)" % (fn, k + 1, len(input_map)))
            # the plate barcode is the second "_"-separated token of the
            # extension-stripped file name
            plate_barcode = os.path.splitext(os.path.basename(fn))[0].split("_")[1]
            with open(fn) as f:
                data_reader = DataReader(f)
                header = data_reader.header
                header["markers_set"] = args.marker_set_label
                n_blocks = header.get("Num Samples", "?")
                for i, block in enumerate(data_reader.get_sample_iterator()):
                    logger.info("processing block %d/%s " % (i + 1, n_blocks))
                    sample_id, out_fn = write_block(
                        block, plate_barcode, args.out_dir, snp_name_to_label, header, requested_sample_ids
                    )
                    if out_fn is None:
                        logger.info("skipped sample %r (not requested)" % sample_id)
                        continue
                    label = os.path.basename(out_fn)
                    out_path = os.path.abspath(out_fn)
                    ds_w.writerow(
                        {
                            "label": label,
                            "source": sample_id,
                            "device": args.device_label,
                            "device_type": DEVICE_TYPE,
                            "data_sample_type": DATA_SAMPLE_TYPE,
                            "markers_set": args.marker_set_label,
                            "status": "USABLE",
                        }
                    )
                    do_w.writerow(
                        {
                            "path": out_path,
                            "data_sample_label": label,
                            "mimetype": mimetypes.SSC_FILE,
                            "size": str(os.stat(out_path).st_size),
                            "sha1": compute_sha1(out_path),
                        }
                    )
                    # flush and fsync after every record so rows written so
                    # far survive a crash mid-run
                    for outf in ds_f, do_f:
                        outf.flush()
                        os.fsync(outf.fileno())
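When --input-list is absent, every sample in every file is wanted, and Everything() stands in for the set of requested sample ids. A minimal sketch of such a match-all sentinel, assuming it is only ever used in membership tests inside write_block (the real class may do more):

class Everything(object):
    # set-like object that claims to contain any item, so
    # "sample_id in requested_sample_ids" always succeeds
    def __contains__(self, item):
        return True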