def write_output(reader, outf, logger=None):
    # Turn each parsed record into FASTQ records and write them to outf;
    # returns the total number of FASTQ records written.
    logger = logger or NullLogger()
    seq_count = 0
    name_serializer = SeqNameSerializer()
    for r in reader:
        fastq_records = build_fastq_records(r['label'],
                                            r['mask'],
                                            name_serializer,
                                            logger=logger)
        seq_count += len(fastq_records)
        for rec in fastq_records:  # renamed to avoid shadowing the outer r
            outf.write("%s\n" % "\n".join(rec))
    return seq_count
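A minimal usage sketch, assuming the reader yields dicts with 'label' and 'mask' keys and that build_fastq_records, SeqNameSerializer and NullLogger come from the surrounding module; the record values and file name below are illustrative:

records = [{'label': 'snp_0001', 'mask': 'ACGT[A/G]TTCA'}]  # hypothetical input
with open('probes.fastq', 'w') as outf:
    n = write_output(iter(records), outf)
print('wrote %d FASTQ records' % n)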
Example #2
import csv

def get_extracted_seqs(fn):
    # Map each deserialized label to its (sequence, alleles) pair, reading a
    # tab-separated file whose 4th column holds a serialized name and whose
    # last column holds the extracted sequence.
    with open(fn) as f:
        reader = csv.reader(f, delimiter="\t")
        data = {}
        serializer = SeqNameSerializer()
        for r in reader:
            try:
                label, _, _, alleles = serializer.deserialize(r[3])
                seq = r[-1].upper()
            except IndexError:
                raise ValueError("%r: bad input format" % fn)
            data[label] = (seq, alleles)
        return data
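A usage sketch, assuming a tab-separated input (e.g. the BED-like output of a sequence-extraction step) in the column layout described above; the file name is illustrative:

data = get_extracted_seqs('extracted_seqs.bed')  # hypothetical path
for label, (seq, alleles) in data.items():
    print(label, alleles, seq[:20])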
Example #3
def __init__(self,
             ref_tag,
             outf,
             outfmt=DEFAULT_OUTPUT_FORMAT,
             flank_size=DEFAULT_FLANK_SIZE,
             logger=None):
    self.logger = logger or NullLogger()
    self.ref_tag = ref_tag            # tag of the reference sequence set
    self.outf = outf                  # output stream
    self.outfmt = outfmt
    self.flank_size = flank_size
    self.current_id = None            # id of the hit group being accumulated
    self.current_hits = []
    self.serializer = SeqNameSerializer()
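A construction sketch: the enclosing class name is not shown in this excerpt, so HitProcessor below is a placeholder, and the ref_tag value and output stream are illustrative:

import sys
processor = HitProcessor('hg19', sys.stdout, flank_size=125)  # hypothetical class name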
Example #4
import csv
import shelve

def write_output(logger, args):
    # Look up each original marker against the shelve index, decide its status
    # (confirmed / replaced / added / no match ...) and write an annotated
    # copy of the input table.
    serializer = SeqNameSerializer()
    index = None
    fields = MARKER_DEF_FIELDS + ("status", "extended_mask")
    try:
        index = shelve.open(args.index_file, "r")
        logger.info("getting extracted sequences")
        extracted_seqs = get_extracted_seqs(args.input_file)
        if args.align_file:
            logger.info("getting sorting order from %r" % args.align_file)
            idx_map = get_sort_idx(args.align_file)
            max_idx = max(idx_map.values())
            fields += ("marker_indx",)
        with open(args.orig_file) as f, open(args.output_file, 'w') as outf:
            outf.write("\t".join(fields) + "\n")
            reader = csv.DictReader(f, delimiter="\t")
            logger.info("looking up against %r" % args.index_file)
            i = -1  # keeps the final count correct for an empty input
            for i, r in enumerate(reader):
                label = r['label']
                old_rs_label = r['rs_label']
                mask = r['mask']
                try:
                    seq, alleles = extracted_seqs[label]
                except KeyError:
                    rs_label = extended_mask = 'None'
                    status = Status.NO_INFO
                else:
                    extended_mask = build_mask(seq, alleles)
                    key = build_index_key(seq)
                    tags = index.get(key, [])
                    n_matches = len(tags)
                    if n_matches != 1:
                        logger.warning("%r maps to %d tags: %r" %
                                       (label, n_matches, tags))
                        rs_label = 'None'
                        status = (Status.NO_MATCH if n_matches == 0
                                  else Status.MULTI_MATCH)
                    else:
                        rs_label, _, _, _ = serializer.deserialize(tags[0])
                        if old_rs_label == "None":
                            status = Status.ADDED
                        else:
                            status = (Status.CONFIRMED if rs_label == old_rs_label
                                      else Status.REPLACED)
                if rs_label == 'None':
                    rs_label = label  # fall back to the original label
                out_r = [
                    label, rs_label, mask, r['allele_a'], r['allele_b'],
                    status, extended_mask
                ]
                if args.align_file:
                    try:
                        idx = idx_map[label]
                    except KeyError:
                        # unseen label: append after the known sorting order
                        max_idx += 1
                        idx = max_idx
                    out_r.append(str(idx))
                outf.write("%s\n" % "\t".join(out_r))
            logger.info("processed %d records overall" % (i + 1))
    finally:
        if index is not None:  # an empty shelf is falsy, so test identity
            index.close()
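A driver sketch, assuming args is an argparse-style namespace whose attribute names mirror those read inside write_output; all file names are illustrative:

import argparse
import logging

logging.basicConfig(level=logging.INFO)
args = argparse.Namespace(
    index_file='markers.shelve',      # shelve db mapping index keys to tags
    input_file='extracted_seqs.bed',  # parsed by get_extracted_seqs
    align_file=None,                  # optional; enables the marker_indx column
    orig_file='markers_orig.tsv',
    output_file='markers_annotated.tsv',
)
write_output(logging.getLogger('lookup'), args)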