import os
import csv
import shelve
import tempfile
from contextlib import nested  # Python 2 stdlib; removed in Python 3

# SeqNameSerializer, MARKER_DEF_FIELDS, Status, HP, SYNC_INTERVAL and the
# build_*/get_* helpers used below are assumed to be defined elsewhere in
# this module or imported from the surrounding package.


def write_output(logger, args):
    """Look up each marker's sequence in the dbSNP index and write an
    annotated copy of the original marker definition file."""
    serializer = SeqNameSerializer()
    index = None
    fields = MARKER_DEF_FIELDS + ("status", "extended_mask")
    try:
        index = shelve.open(args.index_file, "r")
        logger.info("getting extracted sequences")
        extracted_seqs = get_extracted_seqs(args.input_file)
        if args.align_file:
            logger.info("getting sorting order from %r" % args.align_file)
            idx_map = get_sort_idx(args.align_file)
            max_idx = max(idx_map.itervalues())
            fields += ("marker_indx",)
        with nested(open(args.orig_file),
                    open(args.output_file, 'w')) as (f, outf):
            outf.write("\t".join(fields) + "\n")
            reader = csv.DictReader(f, delimiter="\t")
            logger.info("looking up against %r" % args.index_file)
            for i, r in enumerate(reader):
                label = r['label']
                old_rs_label = r['rs_label']
                mask = r['mask']
                try:
                    seq, alleles = extracted_seqs[label]
                except KeyError:
                    rs_label = extended_mask = 'None'
                    status = Status.NO_INFO
                else:
                    extended_mask = build_mask(seq, alleles)
                    key = build_index_key(seq)
                    tags = index.get(key, [])
                    n_matches = len(tags)
                    if n_matches != 1:
                        logger.warning("%r maps to %d tags: %r"
                                       % (label, n_matches, tags))
                        rs_label = 'None'
                        status = (Status.NO_MATCH if n_matches == 0
                                  else Status.MULTI_MATCH)
                    else:
                        rs_label, _, _, _ = serializer.deserialize(tags[0])
                        if old_rs_label == "None":
                            status = Status.ADDED
                        else:
                            status = (Status.CONFIRMED
                                      if rs_label == old_rs_label
                                      else Status.REPLACED)
                # fall back to the marker's own label when no rs id was found
                if rs_label == 'None':
                    rs_label = label
                out_r = [label, rs_label, mask, r['allele_a'], r['allele_b'],
                         status, extended_mask]
                if args.align_file:
                    try:
                        idx = idx_map[label]
                    except KeyError:
                        # unaligned marker: append it after the sorted block
                        max_idx += 1
                        idx = max_idx
                    out_r.append(str(idx))
                outf.write("%s\n" % "\t".join(out_r))
            logger.info("processed %d records overall" % (i + 1))
    finally:
        if index:
            index.close()
def main(logger, args):
    """Build a shelve db that maps sequence index keys to dbSNP tags."""
    index = None
    # mkstemp only reserves a unique name; shelve must create the file
    # itself, so close and remove the placeholder right away
    fd, temp_index_fn = tempfile.mkstemp(dir=args.output_dir)
    os.close(fd)
    os.remove(temp_index_fn)
    try:
        index = shelve.open(temp_index_fn, "n", protocol=HP, writeback=True)
        with open(args.input_file) as f:
            bn = os.path.basename(args.input_file)
            logger.info("processing %r" % bn)
            reader = csv.reader(f, delimiter="\t")
            first_seq_len = None
            for i, r in enumerate(reader):
                try:
                    tag = r[3]
                    seq = r[-1].upper()
                    if first_seq_len is None:
                        first_seq_len = len(seq)
                    elif len(seq) != first_seq_len:
                        msg = "found input sequences of different length"
                        logger.critical(msg)
                        raise ValueError(msg)
                except IndexError:
                    msg = "%r: bad input format, bailing out" % bn
                    logger.critical(msg)
                    raise ValueError(msg)
                else:
                    key = build_index_key(seq)
                    index.setdefault(key, []).append(tag)
                if (i + 1) % SYNC_INTERVAL == 0:
                    logger.info("processed %d records: syncing db" % (i + 1))
                    index.sync()
            logger.info("processed %d records overall" % (i + 1))
    finally:
        if index:
            index.close()
    final_output_fn = os.path.join(
        args.output_dir,
        "dbsnp_index_%s_%d.db" % (args.reftag, first_seq_len)
    )
    os.rename(temp_index_fn, final_output_fn)
    logger.info("index stored to: %s" % final_output_fn)
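

# A minimal driver sketch (not part of the original module) showing how the
# two stages above might be wired together from the command line: "build"
# creates the shelve index, "lookup" annotates a marker definition file
# against it. The subcommand and flag names below are assumptions made for
# illustration; the real entry point presumably lives elsewhere.
if __name__ == "__main__":
    import logging
    import argparse

    parser = argparse.ArgumentParser(description="dbSNP index tools (sketch)")
    sub = parser.add_subparsers(dest="cmd")

    p_build = sub.add_parser("build", help="build a dbSNP shelve index")
    p_build.add_argument("input_file", help="tab-separated dbSNP dump")
    p_build.add_argument("--output-dir", dest="output_dir", default=".")
    p_build.add_argument("--reftag", default="hg19",
                         help="reference tag used in the index file name")

    p_lookup = sub.add_parser("lookup", help="annotate markers via the index")
    p_lookup.add_argument("input_file", help="extracted sequences file")
    p_lookup.add_argument("orig_file", help="original marker definitions")
    p_lookup.add_argument("output_file", help="annotated output file")
    p_lookup.add_argument("--index-file", dest="index_file", required=True)
    p_lookup.add_argument("--align-file", dest="align_file", default=None)

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("dbsnp_index")
    if args.cmd == "build":
        main(logger, args)
    else:
        write_output(logger, args)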