# Stdlib imports needed by this section (Python 2: dict.itervalues and
# contextlib.nested are used below); SeqNameSerializer, Status,
# MARKER_DEF_FIELDS, NullLogger and the various build_*/get_* helpers
# come from elsewhere in this package.
import csv
import shelve
from contextlib import nested


def write_output(logger, args):
    serializer = SeqNameSerializer()
    index = None
    fields = MARKER_DEF_FIELDS + ("status", "extended_mask")
    try:
        index = shelve.open(args.index_file, "r")
        logger.info("getting extracted sequences")
        extracted_seqs = get_extracted_seqs(args.input_file)
        if args.align_file:
            logger.info("getting sorting order from %r" % (args.align_file))
            idx_map = get_sort_idx(args.align_file)
            max_idx = max(idx_map.itervalues())
            fields += ("marker_indx",)
        with nested(open(args.orig_file),
                    open(args.output_file, 'w')) as (f, outf):
            outf.write("\t".join(fields) + "\n")
            reader = csv.DictReader(f, delimiter="\t")
            logger.info("looking up against %r" % args.index_file)
            for i, r in enumerate(reader):
                label = r['label']
                old_rs_label = r['rs_label']
                mask = r['mask']
                try:
                    seq, alleles = extracted_seqs[label]
                except KeyError:
                    rs_label = extended_mask = 'None'
                    status = Status.NO_INFO
                else:
                    extended_mask = build_mask(seq, alleles)
                    key = build_index_key(seq)
                    tags = index.get(key, [])
                    n_matches = len(tags)
                    if n_matches != 1:
                        logger.warning("%r maps to %d tags: %r"
                                       % (label, n_matches, tags))
                        rs_label = 'None'
                        status = (Status.NO_MATCH if n_matches == 0
                                  else Status.MULTI_MATCH)
                    else:
                        rs_label, _, _, _ = serializer.deserialize(tags[0])
                        if old_rs_label == "None":
                            status = Status.ADDED
                        else:
                            status = (Status.CONFIRMED
                                      if rs_label == old_rs_label
                                      else Status.REPLACED)
                # fall back to the marker's own label when no rs id is known
                if rs_label == 'None':
                    rs_label = label
                out_r = [label, rs_label, mask, r['allele_a'], r['allele_b'],
                         status, extended_mask]
                if args.align_file:
                    try:
                        idx = idx_map[label]
                    except KeyError:
                        # unseen label: append past the known maximum index
                        max_idx += 1
                        idx = max_idx
                    out_r.append(str(idx))
                outf.write("%s\n" % "\t".join(out_r))
        logger.info("processed %d records overall" % (i + 1))
    finally:
        if index:
            index.close()
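# Illustrative driver for write_output() above, kept as an unused helper.
# The Namespace attributes mirror exactly the fields the function reads;
# all file names below are hypothetical.
def _demo_write_output():
    import logging
    from argparse import Namespace
    logging.basicConfig(level=logging.INFO)
    args = Namespace(
        index_file="markers.db",     # shelve index built elsewhere
        input_file="extracted.tsv",  # parsed by get_extracted_seqs
        orig_file="orig.tsv",        # tab-separated, with a header row
        output_file="out.tsv",
        align_file=None,             # or an alignment file, to add marker_indx
    )
    write_output(logging.getLogger("marker_lookup"), args)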
def get_extracted_seqs(fn):
    """
    Map each marker label to its (sequence, alleles) pair, reading the
    serialized name from column 4 and the sequence from the last column.
    """
    with open(fn) as f:
        reader = csv.reader(f, delimiter="\t")
        data = {}
        serializer = SeqNameSerializer()
        for r in reader:
            try:
                label, _, _, alleles = serializer.deserialize(r[3])
                seq = r[-1].upper()
            except IndexError:
                raise ValueError("%r: bad input format" % fn)
            data[label] = (seq, alleles)
        return data
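# Hedged usage sketch for get_extracted_seqs(): column 4 of each row must
# hold a name that SeqNameSerializer can deserialize into
# (label, _, _, alleles), and the last column the flanking sequence.
# The file name and marker label below are hypothetical.
def _demo_get_extracted_seqs():
    extracted_seqs = get_extracted_seqs("extracted.tsv")
    seq, alleles = extracted_seqs["rs12345"]
    return seq, alleles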
def write_output(reader, outf, logger=None):
    logger = logger or NullLogger()
    seq_count = 0
    name_serializer = SeqNameSerializer()
    for r in reader:
        fastq_records = build_fastq_records(
            r['label'], r['mask'], name_serializer, logger=logger
        )
        seq_count += len(fastq_records)
        for rec in fastq_records:  # rec is a sequence of FASTQ lines
            outf.write("%s\n" % "\n".join(rec))
    return seq_count
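# Sketch of a typical call to the reader-based write_output() above,
# assuming a tab-separated input with 'label' and 'mask' columns (the
# only fields the function reads); the file names are hypothetical.
def _demo_write_fastq():
    with open("markers.tsv") as f, open("markers.fastq", "w") as outf:
        reader = csv.DictReader(f, delimiter="\t")
        return write_output(reader, outf)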
class SnpHitProcessor(object):

    HEADER = MARKER_AL_FIELDS

    def __init__(self, ref_tag, outf, outfmt=DEFAULT_OUTPUT_FORMAT,
                 flank_size=DEFAULT_FLANK_SIZE, logger=None):
        self.logger = logger or NullLogger()
        self.ref_tag = ref_tag
        self.outf = outf
        self.outfmt = outfmt
        self.flank_size = flank_size
        self.current_id = None
        self.current_hits = []
        self.serializer = SeqNameSerializer()

    def process(self, hit):
        """
        Process a hit in the SAMMapping format, looking for a perfect
        (edit distance, i.e., NM tag value == 0) and unambiguous
        (mapping quality > 0) hit.
        """
        name = hit.get_name()
        id_, allele, snp_offset, _ = self.serializer.deserialize(name)
        if id_ != self.current_id:
            # new marker: flush hits accumulated for the previous one
            if self.current_id is not None:
                self.dump_current_hits()
            self.current_id = id_
            self.current_hits = []
        nm = hit.tag_value('NM')
        seq = hit.get_seq_5()
        mapped = hit.is_mapped()
        if mapped and nm <= 0 and hit.qual > 0:
            snp_pos = hit.get_untrimmed_pos() + snp_offset
            chr_code = CHR_CODES.get(hit.tid, 'None')
            strand = '-' if hit.is_on_reverse() else '+'
            if self.outfmt == DEFAULT_OUTPUT_FORMAT:
                r = [id_, self.ref_tag, str(chr_code), str(snp_pos),
                     strand, allele]
            else:
                if hit.tid is None:
                    self.logger.error("%r: can't use null chr for %r output"
                                      % (name, self.outfmt))
                    return
                # interval centered on the SNP, flank_size on each side
                start = snp_pos - self.flank_size - 1
                end = snp_pos + self.flank_size
                r = [hit.tid, str(start), str(end), name, '0', strand]
            self.current_hits.append(r)
        else:
            self.logger.info("%r: mapped:%r; NM:%r; qual:%r"
                             % (name, mapped, nm, hit.qual))

    def dump_current_hits(self):
        nh = len(self.current_hits)
        if nh != 1:
            self.logger.warn("hit count for %s: %d != 1"
                             % (self.current_id, nh))
        if nh == 0 and self.outfmt == DEFAULT_OUTPUT_FORMAT:
            # emit a placeholder row so the marker still appears in the output
            self.current_hits.append([
                self.current_id,
                self.ref_tag,
                DUMMY_AL_VALUES["chromosome"],
                DUMMY_AL_VALUES["pos"],
                DUMMY_AL_VALUES["strand"],
                DUMMY_AL_VALUES["allele"],
            ])
        if self.outfmt == DEFAULT_OUTPUT_FORMAT:
            for hit in self.current_hits:
                hit.append(str(nh))
                assert hit[0] == self.current_id
                self.write_row(hit)
        else:
            if nh == 1:
                self.write_row(self.current_hits[0])

    def write_row(self, data):
        self.outf.write("\t".join(data) + "\n")

    def write_header(self):
        if self.outfmt == DEFAULT_OUTPUT_FORMAT:
            self.write_row(self.HEADER)

    def close_open_handles(self):
        self.outf.close()
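# Driver sketch for SnpHitProcessor. The hit source is passed in as a
# callable because this module does not define one; any iterable of
# SAMMapping objects will do. The contract visible in process() is that
# hits for the same marker arrive consecutively (state is flushed on each
# id change), so the last group needs an explicit dump_current_hits()
# before closing.
def _demo_snp_hit_processor(iterate_sam_hits):
    outf = open("hits.tsv", "w")
    proc = SnpHitProcessor("hg19", outf)  # ref_tag value is illustrative
    proc.write_header()
    for hit in iterate_sam_hits():
        proc.process(hit)
    proc.dump_current_hits()  # flush the final marker's hits
    proc.close_open_handles()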