Example 1
import csv
import shelve

# Assumed to be defined elsewhere in the module: SeqNameSerializer,
# MARKER_DEF_FIELDS, Status, get_extracted_seqs, get_sort_idx, build_mask,
# build_index_key.

def write_output(logger, args):
  serializer = SeqNameSerializer()
  index = None
  fields = MARKER_DEF_FIELDS + ("status", "extended_mask")
  try:
    index = shelve.open(args.index_file, "r")
    logger.info("getting extracted sequences")
    extracted_seqs = get_extracted_seqs(args.input_file)
    if args.align_file:
      logger.info("getting sorting order from %r" % args.align_file)
      idx_map = get_sort_idx(args.align_file)
      max_idx = max(idx_map.values())
      fields += ("marker_indx",)
    with open(args.orig_file) as f, open(args.output_file, 'w') as outf:
      outf.write("\t".join(fields) + "\n")
      reader = csv.DictReader(f, delimiter="\t")
      logger.info("looking up against %r" % args.index_file)
      for i, r in enumerate(reader):
        label = r['label']
        old_rs_label = r['rs_label']
        mask = r['mask']
        try:
          seq, alleles = extracted_seqs[label]
        except KeyError:
          # No extracted sequence for this marker: emit placeholders.
          rs_label = extended_mask = 'None'
          status = Status.NO_INFO
        else:
          extended_mask = build_mask(seq, alleles)
          key = build_index_key(seq)
          tags = index.get(key, [])
          n_matches = len(tags)
          if n_matches != 1:
            logger.warning("%r maps to %d tags: %r" % (label, n_matches, tags))
            rs_label = 'None'
            status = Status.NO_MATCH if n_matches == 0 else Status.MULTI_MATCH
          else:
            rs_label, _, _, _ = serializer.deserialize(tags[0])
            if old_rs_label == "None":
              status = Status.ADDED
            else:
              status = (Status.CONFIRMED if rs_label == old_rs_label
                        else Status.REPLACED)
        if rs_label == 'None':
          rs_label = label
        out_r = [label, rs_label, mask, r['allele_a'], r['allele_b'],
                 status, extended_mask]
        if args.align_file:
          try:
            idx = idx_map[label]
          except KeyError:
            # Marker absent from the alignment: place it after the last
            # known index.
            max_idx += 1
            idx = max_idx
          out_r.append(str(idx))
        outf.write("%s\n" % "\t".join(out_r))
      logger.info("processed %d records overall" % (i + 1))
  finally:
    # An empty Shelf is falsy, so test identity rather than truthiness.
    if index is not None:
      index.close()
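
One fix above is worth calling out: the cleanup guard tests index is not None instead of the original if index:, because shelve.Shelf is a MutableMapping whose truth value comes from len(), so an empty shelf is falsy and close() would be silently skipped. A minimal sketch of the pitfall (the temp path is illustrative only):

import os
import shelve
import tempfile

# An empty Shelf reports len() == 0, so it is falsy even though it is a
# live handle that still needs close().
s = shelve.open(os.path.join(tempfile.mkdtemp(), "empty_idx"), "n")
assert len(s) == 0
assert not s          # "if s:" would wrongly skip cleanup here
assert s is not None  # identity is the safe guard
s.close()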
Example 2
import csv
import os
import shelve
import tempfile

# Assumed to be defined elsewhere in the module: HP (pickle protocol),
# SYNC_INTERVAL, build_index_key.

def main(logger, args):
    index = None
    first_seq_len = None
    # Reserve a unique path in the output dir, then hand it to shelve,
    # which creates the actual database file.
    fd, temp_index_fn = tempfile.mkstemp(dir=args.output_dir)
    os.close(fd)
    os.remove(temp_index_fn)
    try:
        index = shelve.open(temp_index_fn, "n", protocol=HP, writeback=True)
        with open(args.input_file) as f:
            bn = os.path.basename(args.input_file)
            logger.info("processing %r" % bn)
            reader = csv.reader(f, delimiter="\t")
            for i, r in enumerate(reader):
                try:
                    tag = r[3]
                    seq = r[-1].upper()
                    if first_seq_len is None:
                        first_seq_len = len(seq)
                    elif len(seq) != first_seq_len:
                        msg = "found input sequences of different length"
                        logger.critical(msg)
                        raise ValueError(msg)
                except IndexError:
                    msg = "%r: bad input format, bailing out" % bn
                    logger.critical(msg)
                    raise ValueError(msg)
                else:
                    key = build_index_key(seq)
                    # Requires writeback=True: the appended tag lands in the
                    # shelf's cache and is persisted on sync()/close().
                    index.setdefault(key, []).append(tag)
                    if (i + 1) % SYNC_INTERVAL == 0:
                        logger.info("processed %d records: syncing db" %
                                    (i + 1))
                        index.sync()
            logger.info("processed %d records overall" % (i + 1))
    finally:
        # An empty Shelf is falsy, so test identity rather than truthiness.
        if index is not None:
            index.close()
            # Only publish the index if at least one record was read;
            # otherwise first_seq_len is still None and the final name
            # cannot be built.
            if first_seq_len is not None:
                final_output_fn = os.path.join(
                    args.output_dir,
                    "dbsnp_index_%s_%d.db" % (args.reftag, first_seq_len))
                os.rename(temp_index_fn, final_output_fn)
                logger.info("index stored to: %s" % final_output_fn)
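
A second shelve subtlety in this example: index.setdefault(key, []).append(tag) only works because the shelf is opened with writeback=True, which caches values returned from the shelf and writes them back on sync()/close(). Without writeback, the append would mutate an in-memory copy that never reaches the database. A minimal demonstration (file names are illustrative):

import os
import shelve
import tempfile

d = tempfile.mkdtemp()

# Without writeback: setdefault stores the empty list, but the append
# mutates only the returned in-memory object, so the change is lost.
s = shelve.open(os.path.join(d, "no_wb"), "n")
s.setdefault("k", []).append("tag1")
assert s["k"] == []
s.close()

# With writeback: the mutated list sits in the shelf's cache and is
# persisted on close().
s = shelve.open(os.path.join(d, "wb"), "n", writeback=True)
s.setdefault("k", []).append("tag1")
s.close()
s = shelve.open(os.path.join(d, "wb"))
assert s["k"] == ["tag1"]
s.close()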
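The two examples are complementary: main builds the shelve index that write_output later opens read-only via args.index_file. A hypothetical driver for the builder is sketched below; the flag names simply mirror the attributes main reads (input_file, output_dir, reftag) and are an assumption, not part of the original code.

import argparse
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("dbsnp_index")
    parser = argparse.ArgumentParser(description="build a dbSNP tag index")
    parser.add_argument("-i", "--input-file", required=True)
    parser.add_argument("-o", "--output-dir", default=".")
    parser.add_argument("--reftag", required=True)
    args = parser.parse_args()
    main(logger, args)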