Example #1
import csv
import shelve

# MARKER_DEF_FIELDS, Status, SeqNameSerializer, build_mask, build_index_key,
# get_sort_idx and get_extracted_seqs come from the surrounding module.
def write_output(logger, args):
  serializer = SeqNameSerializer()
  index = None
  fields = MARKER_DEF_FIELDS + ("status", "extended_mask")
  try:
    index = shelve.open(args.index_file, "r")
    logger.info("getting extracted sequences")
    extracted_seqs = get_extracted_seqs(args.input_file)
    if args.align_file:
      logger.info("getting sorting order from %r" % (args.align_file))
      idx_map = get_sort_idx(args.align_file)
      max_idx = max(idx_map.values())
      fields += ("marker_indx",)
    with open(args.orig_file) as f, open(args.output_file, 'w') as outf:
      outf.write("\t".join(fields)+"\n")
      reader = csv.DictReader(f, delimiter="\t")
      logger.info("looking up against %r" % args.index_file)
      i = -1  # so the final count is right even when the reader is empty
      for i, r in enumerate(reader):
        label = r['label']
        old_rs_label = r['rs_label']
        mask = r['mask']
        try:
          seq, alleles = extracted_seqs[label]
        except KeyError:
          rs_label = extended_mask = 'None'
          status = Status.NO_INFO
        else:
          extended_mask = build_mask(seq, alleles)
          key = build_index_key(seq)
          tags = index.get(key, [])
          n_matches = len(tags)
          if n_matches != 1:
            logger.warning("%r maps to %d tags: %r" % (label, n_matches, tags))
            rs_label = 'None'
            status = Status.NO_MATCH if n_matches == 0 else Status.MULTI_MATCH
          else:
            rs_label, _, _, _ = serializer.deserialize(tags[0])
            if old_rs_label == "None":
              status = Status.ADDED
            else:
              status = (Status.CONFIRMED if rs_label == old_rs_label
                        else Status.REPLACED)
        if rs_label == 'None':
          rs_label = label
        out_r = [label, rs_label, mask, r['allele_a'], r['allele_b'],
                 status, extended_mask]
        if args.align_file:
          try:
            idx = idx_map[label]
          except KeyError:
            max_idx += 1
            idx = max_idx
          out_r.append(str(idx))
        outf.write("%s\n" % "\t".join(out_r))
      logger.info("processed %d records overall" % (i+1))
  finally:
    if index is not None:  # an empty shelf is falsy, so check identity
      index.close()
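
The function expects a logger plus an args object carrying the file paths it reads. A minimal driver might look like the sketch below; the attribute names are taken from the function body itself, but the wiring, logger name, and file names are assumptions.

import argparse
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("reannotate")  # hypothetical logger name

# Hypothetical wiring: only the attribute names come from write_output.
args = argparse.Namespace(
    index_file="tags.db",         # shelve index mapping sequence keys to tags
    input_file="extracted.tsv",   # input for get_extracted_seqs
    align_file=None,              # optional alignment file for sort order
    orig_file="markers.tsv",      # original marker table read by DictReader
    output_file="markers_out.tsv",
)
write_output(logger, args)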
Example #2
import csv

def get_extracted_seqs(fn):
  """Map each marker label to its (uppercased sequence, alleles) pair."""
  with open(fn) as f:
    reader = csv.reader(f, delimiter="\t")
    data = {}
    serializer = SeqNameSerializer()
    for r in reader:
      try:
        label, _, _, alleles = serializer.deserialize(r[3])
        seq = r[-1].upper()
      except IndexError:
        raise ValueError("%r: bad input format" % fn)
      data[label] = (seq, alleles)
    return data
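
SeqNameSerializer is not shown in these examples; its deserialize method evidently returns a (label, allele, snp_offset, alleles) 4-tuple. Purely as an illustration, a stand-in that packs those four fields into one delimited name would satisfy both this function and the hit processor below; the delimiter and field order here are assumptions, not the real format.

class SeqNameSerializer(object):
  # Illustrative stand-in only: the real serialization format is not shown.
  SEP = "|"

  def serialize(self, label, allele, snp_offset, alleles):
    return self.SEP.join([label, allele, str(snp_offset), alleles])

  def deserialize(self, name):
    label, allele, snp_offset, alleles = name.split(self.SEP)
    return label, allele, int(snp_offset), alleles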
Example #3
def __init__(self,
             ref_tag,
             outf,
             outfmt=DEFAULT_OUTPUT_FORMAT,
             flank_size=DEFAULT_FLANK_SIZE,
             logger=None):
    self.logger = logger or NullLogger()
    self.ref_tag = ref_tag
    self.outf = outf
    self.outfmt = outfmt
    self.flank_size = flank_size
    self.current_id = None
    self.current_hits = []
    self.serializer = SeqNameSerializer()
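
The logger or NullLogger() idiom makes the logger argument optional. NullLogger is not defined in these examples; a minimal no-op version along these lines would do (a sketch, not the project's actual class):

class NullLogger(object):
    """No-op stand-in: accepts and ignores every logging call."""

    def _noop(self, *args, **kwargs):
        pass

    debug = info = warning = warn = error = critical = _noop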
Example #4
def write_output(reader, outf, logger=None):
    logger = logger or NullLogger()
    seq_count = 0
    name_serializer = SeqNameSerializer()
    for r in reader:
        fastq_records = build_fastq_records(r['label'],
                                            r['mask'],
                                            name_serializer,
                                            logger=logger)
        seq_count += len(fastq_records)
        for rec in fastq_records:  # rec, not r: avoid shadowing the row
            outf.write("%s\n" % "\n".join(rec))
    return seq_count
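
build_fastq_records is not shown; the "\n".join(rec) call only requires each record to be an iterable of lines. A hypothetical builder that expands an Illumina-style [A/G] mask into one four-line FASTQ record per allele could look like the sketch below; the mask format and the serializer signature are both assumptions.

def build_fastq_records(label, mask, name_serializer, logger=None):
    # Hypothetical sketch: assumes masks look like "ACGT[A/G]TCCA" and that
    # one read per allele is emitted, named via the serializer.
    left, _, rest = mask.partition("[")
    alleles_str, _, right = rest.partition("]")
    records = []
    for allele in alleles_str.split("/"):
        seq = left + allele + right
        name = name_serializer.serialize(label, allele, len(left), alleles_str)
        records.append(("@%s" % name, seq, "+", "I" * len(seq)))  # dummy qual
    return records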
Example #5
class SnpHitProcessor(object):

    HEADER = MARKER_AL_FIELDS

    def __init__(self,
                 ref_tag,
                 outf,
                 outfmt=DEFAULT_OUTPUT_FORMAT,
                 flank_size=DEFAULT_FLANK_SIZE,
                 logger=None):
        self.logger = logger or NullLogger()
        self.ref_tag = ref_tag
        self.outf = outf
        self.outfmt = outfmt
        self.flank_size = flank_size
        self.current_id = None
        self.current_hits = []
        self.serializer = SeqNameSerializer()

    def process(self, hit):
        """
        Process a hit in the SAMMapping format, looking for a perfect
        (edit distance, i.e., NM tag value == 0) and unambiguous (mapping
        quality > 0) hit.
        """
        name = hit.get_name()
        id_, allele, snp_offset, _ = self.serializer.deserialize(name)
        if id_ != self.current_id:
            if self.current_id is not None:
                self.dump_current_hits()
            self.current_id = id_
            self.current_hits = []
        nm = hit.tag_value('NM')
        seq = hit.get_seq_5()
        mapped = hit.is_mapped()
        if mapped and nm <= 0 and hit.qual > 0:
            snp_pos = hit.get_untrimmed_pos() + snp_offset
            chr_code = CHR_CODES.get(hit.tid, 'None')
            strand = '-' if hit.is_on_reverse() else '+'
            if self.outfmt == DEFAULT_OUTPUT_FORMAT:
                r = [
                    id_, self.ref_tag,
                    str(chr_code),
                    str(snp_pos), strand, allele
                ]
            else:
                if hit.tid is None:
                    self.logger.error("%r: can't use null chr for %r output" %
                                      (name, self.outfmt))
                    return
                start = snp_pos - self.flank_size - 1
                end = snp_pos + self.flank_size
                r = [hit.tid, str(start), str(end), name, '0', strand]
            self.current_hits.append(r)
        else:
            self.logger.info("%r: mapped:%r; NM:%r; qual:%r" %
                             (name, mapped, nm, hit.qual))

    def dump_current_hits(self):
        nh = len(self.current_hits)
        if nh != 1:
            self.logger.warning("hit count for %s: %d != 1" %
                                (self.current_id, nh))
        if nh == 0 and self.outfmt == DEFAULT_OUTPUT_FORMAT:
            self.current_hits.append([
                self.current_id,
                self.ref_tag,
                DUMMY_AL_VALUES["chromosome"],
                DUMMY_AL_VALUES["pos"],
                DUMMY_AL_VALUES["strand"],
                DUMMY_AL_VALUES["allele"],
            ])
        if self.outfmt == DEFAULT_OUTPUT_FORMAT:
            for hit in self.current_hits:
                hit.append(str(nh))
                assert hit[0] == self.current_id
                self.write_row(hit)
        else:
            if nh == 1:
                self.write_row(self.current_hits[0])

    def write_row(self, data):
        self.outf.write("\t".join(data) + "\n")

    def write_header(self):
        if self.outfmt == DEFAULT_OUTPUT_FORMAT:
            self.write_row(self.HEADER)

    def close_open_handles(self):
        self.outf.close()
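
Hits must arrive grouped by read name: process() flushes the previous id's hits whenever the id changes, so the final group has to be flushed explicitly. A hypothetical driver (the hit iterator, ref_tag value, and file names are all assumptions) might be:

with open("hits.tsv", "w") as outf:
    proc = SnpHitProcessor("hg19", outf)  # ref_tag value is illustrative
    proc.write_header()
    for hit in iter_sam_hits("alignments.sam"):  # assumed, grouped by name
        proc.process(hit)
    if proc.current_id is not None:
        proc.dump_current_hits()  # flush the last group explicitly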