def main(argv):
    """Build the data-sample and data-object import files from a directory of
    SSC files, mapping each SSC label to its source label via ``args.map_file``.

    :param argv: command-line arguments (parsed by :func:`make_parser`)
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    log_level = getattr(logging, args.loglevel)
    kwargs = {"format": LOG_FORMAT, "datefmt": LOG_DATEFMT, "level": log_level}
    logging.basicConfig(**kwargs)
    logger = logging.getLogger()
    # Get all sources mapped into a dict (tab-separated: ssc_label -> source_label)
    with open(args.map_file) as mapfile:
        reader = csv.DictReader(mapfile, delimiter="\t")
        ssc_src_map = {}
        for row in reader:
            ssc_src_map[row["ssc_label"]] = row["source_label"]
    # BUG FIX: the original removed entries from ssc_files while iterating over
    # it, which skips elements (consecutive non-.ssc files survive the filter).
    # Build a filtered list instead of mutating the one being iterated.
    ssc_files = []
    for entry in os.listdir(args.ssc_dir):
        if entry.endswith(".ssc"):
            ssc_files.append(entry)
        else:
            logger.debug("%s is not a SSC file, this element will be ignored" % entry)
    logger.info("%d files ready to be processed" % len(ssc_files))
    with nested(open(args.ds_fn, "w"), open(args.do_fn, "w")) as (ds_f, do_f):
        ds_w = DataSampleWriter(ds_f)
        do_w = DataObjectWriter(do_f)
        for w in ds_w, do_w:
            w.writeheader()
        # BUG FIX: use enumerate instead of ssc_files.index(ssc_f); index() is
        # an O(n) scan per iteration and returns the FIRST match, so duplicate
        # names would report the wrong position.
        for i, ssc_f in enumerate(ssc_files):
            logger.info("Processing %s (file %d/%d)" % (ssc_f, i + 1, len(ssc_files)))
            try:
                with open(os.path.join(args.ssc_dir, ssc_f)) as infile:
                    ds_w.writerow(
                        {
                            "label": os.path.basename(infile.name),
                            "source": ssc_src_map[os.path.basename(infile.name)],
                            "device": args.device_label,
                            "device_type": DEVICE_TYPE,
                            "data_sample_type": DATA_SAMPLE_TYPE,
                            "markers_set": args.marker_set_label,
                            "status": "USABLE",
                        }
                    )
                    do_w.writerow(
                        {
                            "path": os.path.realpath(infile.name),
                            "data_sample_label": os.path.basename(infile.name),
                            "mimetype": mimetypes.SSC_FILE,
                            "size": str(os.stat(infile.name).st_size),
                            "sha1": compute_sha1(infile.name),
                        }
                    )
            except KeyError as ke:
                # File has no entry in the ssc_label -> source_label map.
                logger.error("File %s has no source mapped, skipping line" % ke)
def write_ssc_data_objects_import_file(fname, ssc_data_set):
    """Write a tab-separated data-object import file for a set of SSC files.

    :param fname: path of the import file to create
    :param ssc_data_set: iterable of ``(label, sample_id, device_id, data_fn)``
      tuples; ``sample_id`` and ``device_id`` are currently unused here
    """
    fieldnames = ['path', 'data_sample_label', 'mimetype', 'size', 'sha1']
    # BUG FIX: the original opened the output file without ever closing it,
    # leaking the handle and risking loss of buffered rows; use a with block.
    with open(fname, mode='w') as out_f:
        fo = csv.DictWriter(out_f, fieldnames=fieldnames, delimiter='\t')
        fo.writeheader()
        # Loop variable renamed from `fname` so it no longer shadows the
        # output-filename parameter.
        for label, sample_id, device_id, data_fn in ssc_data_set:
            size = os.stat(data_fn).st_size
            sha1 = compute_sha1(data_fn)
            fo.writerow({
                'path': 'file://' + os.path.realpath(data_fn),
                'data_sample_label': label,
                'mimetype': mimetypes.SSC_FILE,
                'size': size,
                'sha1': sha1,
            })
def main(argv):
    """Split per-plate input files into per-sample blocks and emit the
    data-sample and data-object import files describing the results.

    :param argv: command-line arguments (parsed by :func:`make_parser`)
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    # can't use argparse's mutually exclusive group, one arg is not optional
    if args.input_list and args.ifiles:
        sys.exit("ERROR: no positional arg accepted if --input-list is specified")
    if not args.input_list and not args.ifiles:
        sys.exit("ERROR: no input source has been specified")
    log_conf = {
        "format": LOG_FORMAT,
        "datefmt": LOG_DATEFMT,
        "level": getattr(logging, args.loglevel),
    }
    if args.logfile:
        log_conf["filename"] = args.logfile
    logging.basicConfig(**log_conf)
    logger = logging.getLogger()
    # Either an explicit file -> requested-sample-ids map, or every sample of
    # every positional file (Everything() accepts any id).
    if args.input_list:
        input_map = get_input_map(args.input_list)
    else:
        input_map = OrderedDict((fn, Everything()) for fn in args.ifiles)
    logger.info("total input files: %d" % len(input_map))
    snp_name_to_label = get_snp_name_to_label(args.annot_file)
    logger.info("total SNPs: %d" % len(snp_name_to_label))
    with nested(open(args.ds_fn, "w"), open(args.do_fn, "w")) as (ds_f, do_f):
        ds_w = DataSampleWriter(ds_f)
        do_w = DataObjectWriter(do_f)
        ds_w.writeheader()
        do_w.writeheader()
        n_files = len(input_map)
        for idx, (in_fn, wanted_ids) in enumerate(input_map.iteritems()):
            logger.info("processing %s (%d/%d)" % (in_fn, idx + 1, n_files))
            # Plate barcode is the second "_"-separated token of the stem.
            stem = os.path.splitext(os.path.basename(in_fn))[0]
            plate_barcode = stem.split("_")[1]
            with open(in_fn) as stream:
                data_reader = DataReader(stream)
                header = data_reader.header
                header["markers_set"] = args.marker_set_label
                n_blocks = header.get("Num Samples", "?")
                for i, block in enumerate(data_reader.get_sample_iterator()):
                    logger.info("processing block %d/%s " % (i + 1, n_blocks))
                    sample_id, out_fn = write_block(
                        block, plate_barcode, args.out_dir,
                        snp_name_to_label, header, wanted_ids
                    )
                    if out_fn is None:
                        # write_block declined: sample not in the requested set.
                        logger.info("skipped sample %r (not requested)" % sample_id)
                        continue
                    label = os.path.basename(out_fn)
                    out_path = os.path.abspath(out_fn)
                    ds_w.writerow({
                        "label": label,
                        "source": sample_id,
                        "device": args.device_label,
                        "device_type": DEVICE_TYPE,
                        "data_sample_type": DATA_SAMPLE_TYPE,
                        "markers_set": args.marker_set_label,
                        "status": "USABLE",
                    })
                    do_w.writerow({
                        "path": out_path,
                        "data_sample_label": label,
                        "mimetype": mimetypes.SSC_FILE,
                        "size": str(os.stat(out_path).st_size),
                        "sha1": compute_sha1(out_path),
                    })
                    # Push each sample's rows to disk as soon as written.
                    for outf in (ds_f, do_f):
                        outf.flush()
                        os.fsync(outf.fileno())
def main(argv):
    """Build the data-sample and data-object import files from a directory of
    SSC files, mapping each SSC label to its source label via ``args.map_file``.

    :param argv: command-line arguments (parsed by :func:`make_parser`)
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT, 'datefmt': LOG_DATEFMT, 'level': log_level}
    logging.basicConfig(**kwargs)
    logger = logging.getLogger()
    # Get all sources mapped into a dict (tab-separated: ssc_label -> source_label)
    with open(args.map_file) as mapfile:
        reader = csv.DictReader(mapfile, delimiter='\t')
        ssc_src_map = {}
        for row in reader:
            ssc_src_map[row['ssc_label']] = row['source_label']
    # BUG FIX: the original removed entries from ssc_files while iterating over
    # it, which skips elements (consecutive non-.ssc files survive the filter).
    # Build a filtered list instead of mutating the one being iterated.
    ssc_files = []
    for entry in os.listdir(args.ssc_dir):
        if entry.endswith('.ssc'):
            ssc_files.append(entry)
        else:
            logger.debug('%s is not a SSC file, this element will be ignored' % entry)
    logger.info('%d files ready to be processed' % len(ssc_files))
    with nested(open(args.ds_fn, 'w'), open(args.do_fn, 'w')) as (ds_f, do_f):
        ds_w = DataSampleWriter(ds_f)
        do_w = DataObjectWriter(do_f)
        for w in ds_w, do_w:
            w.writeheader()
        # BUG FIX: use enumerate instead of ssc_files.index(ssc_f); index() is
        # an O(n) scan per iteration and returns the FIRST match, so duplicate
        # names would report the wrong position.
        for i, ssc_f in enumerate(ssc_files):
            logger.info('Processing %s (file %d/%d)' % (ssc_f, i + 1, len(ssc_files)))
            try:
                with open(os.path.join(args.ssc_dir, ssc_f)) as infile:
                    ds_w.writerow({
                        'label': os.path.basename(infile.name),
                        'source': ssc_src_map[os.path.basename(infile.name)],
                        'device': args.device_label,
                        'device_type': DEVICE_TYPE,
                        'data_sample_type': DATA_SAMPLE_TYPE,
                        'markers_set': args.marker_set_label,
                        'status': 'USABLE',
                    })
                    do_w.writerow({
                        'path': os.path.realpath(infile.name),
                        'data_sample_label': os.path.basename(infile.name),
                        'mimetype': mimetypes.SSC_FILE,
                        'size': str(os.stat(infile.name).st_size),
                        'sha1': compute_sha1(infile.name)
                    })
            except KeyError as ke:
                # File has no entry in the ssc_label -> source_label map.
                logger.error('File %s has no source mapped, skipping line' % ke)
def main(argv):
    """Split per-plate input files into per-sample blocks and emit the
    data-sample and data-object import files describing the results.

    :param argv: command-line arguments (parsed by :func:`make_parser`)
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    # can't use argparse's mutually exclusive group, one arg is not optional
    if args.input_list and args.ifiles:
        sys.exit(
            "ERROR: no positional arg accepted if --input-list is specified")
    if not args.input_list and not args.ifiles:
        sys.exit("ERROR: no input source has been specified")
    log_conf = {
        'format': LOG_FORMAT,
        'datefmt': LOG_DATEFMT,
        'level': getattr(logging, args.loglevel),
    }
    if args.logfile:
        log_conf['filename'] = args.logfile
    logging.basicConfig(**log_conf)
    logger = logging.getLogger()
    # Either an explicit file -> requested-sample-ids map, or every sample of
    # every positional file (Everything() accepts any id).
    if args.input_list:
        input_map = get_input_map(args.input_list)
    else:
        input_map = OrderedDict((fn, Everything()) for fn in args.ifiles)
    logger.info("total input files: %d" % len(input_map))
    snp_name_to_label = get_snp_name_to_label(args.annot_file)
    logger.info("total SNPs: %d" % len(snp_name_to_label))
    with nested(open(args.ds_fn, 'w'), open(args.do_fn, 'w')) as (ds_f, do_f):
        ds_w = DataSampleWriter(ds_f)
        do_w = DataObjectWriter(do_f)
        ds_w.writeheader()
        do_w.writeheader()
        n_files = len(input_map)
        for idx, (in_fn, wanted_ids) in enumerate(input_map.iteritems()):
            logger.info("processing %s (%d/%d)" % (in_fn, idx + 1, n_files))
            # Plate barcode is the second "_"-separated token of the stem.
            stem = os.path.splitext(os.path.basename(in_fn))[0]
            plate_barcode = stem.split("_")[1]
            with open(in_fn) as stream:
                data_reader = DataReader(stream)
                header = data_reader.header
                header['markers_set'] = args.marker_set_label
                n_blocks = header.get('Num Samples', '?')
                for i, block in enumerate(data_reader.get_sample_iterator()):
                    logger.info("processing block %d/%s " % (i + 1, n_blocks))
                    sample_id, out_fn = write_block(
                        block, plate_barcode, args.out_dir,
                        snp_name_to_label, header, wanted_ids
                    )
                    if out_fn is None:
                        # write_block declined: sample not in the requested set.
                        logger.info("skipped sample %r (not requested)" % sample_id)
                        continue
                    label = os.path.basename(out_fn)
                    out_path = os.path.abspath(out_fn)
                    ds_w.writerow({
                        "label": label,
                        "source": sample_id,
                        "device": args.device_label,
                        "device_type": DEVICE_TYPE,
                        "data_sample_type": DATA_SAMPLE_TYPE,
                        "markers_set": args.marker_set_label,
                        "status": "USABLE",
                    })
                    do_w.writerow({
                        "path": out_path,
                        "data_sample_label": label,
                        "mimetype": mimetypes.SSC_FILE,
                        "size": str(os.stat(out_path).st_size),
                        "sha1": compute_sha1(out_path),
                    })
                    # Push each sample's rows to disk as soon as written.
                    for outf in (ds_f, do_f):
                        outf.flush()
                        os.fsync(outf.fileno())