Example #1
def main():
    import sys
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Convert SAM to GFF3 format using BCBio GFF")
    parser.add_argument("sam_filename")
    parser.add_argument(
        "-i",
        "--input_fasta",
        default=None,
        help="(Optional) input fasta. If given, coverage will be calculated.")

    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        print("Only accepts files ending in .sam. Abort!", file=sys.stderr)
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff3 = prefix + '.gff3'

    q_dict = None
    if args.input_fasta is not None:
        q_dict = dict((r.id, len(r.seq))
                      for r in SeqIO.parse(open(args.input_fasta), 'fasta'))

    with open(output_gff3, 'w') as f:
        recs = [
            convert_sam_rec_to_gff3_rec(r0) for r0 in GMAPSAMReader(
                args.sam_filename, True, query_len_dict=q_dict)
        ]
        BCBio_GFF.write([rec for rec in recs if rec is not None], f)

    print("Output written to {0}.".format(output_gff3), file=sys.stderr)
Example #2
def main():
    import sys
    from argparse import ArgumentParser

    parser = ArgumentParser("Convert SAM to collapsed GFF format")
    parser.add_argument("sam_filename")

    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        print("Only accepts files ending in .sam. Abort!", file=sys.stderr)
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff = prefix + '.collapsed.gff'

    with open(output_gff, 'w') as f:
        reader = GMAPSAMReader(args.sam_filename, True)
        for r in reader:
            if r.sID == '*':  # skip unmapped records
                continue
            r.strand = r.flag.strand
            r.seqid = r.qID
            r.chr = r.sID
            r.ref_exons = r.segments
            r.start = r.sStart
            r.end = r.sEnd
            r.cds_exons = None
            write_collapseGFF_format(f, r)

    print("Output written to {0}.".format(output_gff), file=sys.stderr)
Example #3
def convert_sam_to_gff3(sam_filename, output_gff3, source, q_dict=None):
    qid_index_dict = Counter()  # per-query-ID counter, passed through to the record converter
    with open(output_gff3, 'w') as f:
        recs = [
            convert_sam_rec_to_gff3_rec(r0, source, qid_index_dict)
            for r0 in GMAPSAMReader(sam_filename, True, query_len_dict=q_dict)
        ]
        BCBio_GFF.write([rec for rec in recs if rec is not None], f)
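
A hedged usage sketch for convert_sam_to_gff3, assuming the function and its dependencies (GMAPSAMReader, BCBio_GFF) are importable from this module; the file names and source tag are placeholders:

from Bio import SeqIO

# Optional: query lengths let the reader report per-alignment coverage.
q_dict = {rec.id: len(rec.seq)
          for rec in SeqIO.parse("transcripts.fasta", "fasta")}

convert_sam_to_gff3("aligned.sam", "aligned.gff3",
                    source="minimap2", q_dict=q_dict)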
Example #4
def regroup_sam_to_gff(pooled_sam, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_sam: SAM file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None: type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(set)  # pbid --> set of tissues it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id': continue
            if int(v) > 0: in_tissue[r['id']].add(k)

    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None: handles_fafq[g] = open("{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq), 'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None: fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GMAPSAMReader(pooled_sam, True)
    for r in reader:
        if r.sID == '*':
            print("Ignore {0} because unmapped.".format(r.qID), file=sys.stderr)
            continue
        m = rex_pbid.match(r.qID)
        if m is not None: pbid = m.group(1)
        else: pbid = r.qID
        # convert SAM record to GFF record type
        r.seqid = pbid
        r.chr = r.sID
        r.start, r.end = r.sStart, r.sEnd
        r.strand = r.flag.strand
        r.ref_exons = r.segments
        r.cds_exons = None

        groups_to_write_in = set()
        if pbid not in in_tissue:
            print("WARNING: {0} does not belong to any group indicated by outgroup_dict".format(pbid), file=sys.stderr)
            continue
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)

    # close output handles so buffered GFF/fasta output is flushed
    for h in handles.values():
        h.close()
    for h in handles_fafq.values():
        h.close()
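
A minimal, hypothetical invocation of regroup_sam_to_gff; the paths are placeholders and the barcode-to-group mapping mirrors the docstring's example:

out_group_dict = {'EM1': 'EM', 'EM2': 'EM', 'END1': 'END', 'R1': 'R'}
regroup_sam_to_gff("pooled.sam",
                   "demux_counts.csv",
                   output_prefix="regrouped",
                   out_group_dict=out_group_dict,
                   in_fafq="pooled.fastq")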
Example #5
    def phase_variant(self,
                      sam_filename,
                      input_fa_or_fq,
                      output_prefix,
                      partial_ok=False):
        """
        :param sam_filename: CCS SAM filename. Can be unsorted.
        :param input_fa_or_fq: Input CCS fasta/fastq filename.
        :param output_prefix: Output prefix. Writes to xxx.log.
        :param partial_ok: default False. If True, (CCS) reads don't need to cover all SNP positions.

        For each alignment:
        1. discard if it did not map to the expected strand
        2. discard if it did not cover the full range of variants (unless <partial_ok> is True)
        3. discard if any variant position carries a non-called base (outlier)
        """
        f_log = open(output_prefix + '.log', 'w')

        seq_dict = SeqIO.to_dict(
            SeqIO.parse(open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq)))
        for r in GMAPSAMReader(sam_filename,
                               True,
                               query_len_dict=dict((k, len(seq_dict[k].seq))
                                                   for k in seq_dict)):
            if r.sID == '*':
                f_log.write("Ignore {0} because: unmapped.\n".format(r.qID))
                continue
            if r.flag.strand != self.vc.expected_strand:
                f_log.write("Ignore {0} because: strand is {1}.\n".format(
                    r.qID, r.flag.strand))
                continue  # ignore
            if not partial_ok and (r.sStart > self.min_var_pos
                                   or r.sEnd < self.max_var_pos):
                f_log.write(
                    "Ignore {0} because: aln too short, from {1}-{2}.\n".
                    format(r.qID, r.sStart + 1, r.sEnd))
                continue

            i, msg = self.match_haplotype(r,
                                          str(seq_dict[r.qID].seq).upper(),
                                          partial_ok)
            if i is None:  # read is rejected for reason listed in <msg>
                f_log.write("Ignore {0} because: {1}.\n".format(r.qID, msg))
                continue
            else:
                f_log.write("{0} phased: haplotype {1}={2}\n".format(
                    r.qID, i, self.haplotypes[i]))
                print "{0} has haplotype {1}:{2}".format(
                    r.qID, i, self.haplotypes[i])
                self.seq_hap_info[r.qID] = i
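
Criterion 2 in the docstring reduces to a small interval check; a self-contained sketch (names and coordinates are illustrative, assuming sStart/sEnd-style positions as used above):

def covers_variant_range(aln_start, aln_end, min_var_pos, max_var_pos,
                         partial_ok=False):
    # True if the alignment spans every variant position, or if partial
    # coverage is acceptable (mirrors the partial_ok branch above).
    if partial_ok:
        return True
    return aln_start <= min_var_pos and aln_end >= max_var_pos

assert covers_variant_range(100, 500, 150, 450) is True
assert covers_variant_range(200, 500, 150, 450) is False  # misses 5' variants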
Example #6
def process_sam_to_wig(sam_filename,
                       output_wig,
                       cov_threshold=200,
                       meta_info=None):
    cov = np.zeros(REF_LENGTH, dtype=int)  # per-base coverage over the reference
    reader = GMAPSAMReader(sam_filename, True)

    f_sam = open(sam_filename[:sam_filename.rfind('.')] + '.metainfo.sam', 'w')
    f_sam.write(reader.header)
    bad_count = 0
    for r in reader:
        tags = ''
        for e in r.segments:  # accumulate coverage over every aligned segment
            cov[e.start:e.end] += 1
        if len(r.segments) > 1:  # flag spliced/multi-segment alignments
            bad_count += 1
        tags += "\tsg:i:{0}".format(len(r.segments))  # sg: number of segments
        if meta_info is not None:
            seqid = r.qID.split('|')[0]
            if seqid in meta_info:
                tags += "\tst:A:{0}".format(
                    meta_info[seqid]['Sequencing technology']
                    [0])  # st: sequencing technology
            else:
                print("WARNING: Could not find {0} in metadata. Skipping.".
                      format(seqid))
        f_sam.write(r.record_line + tags + '\n')
    f_sam.close()

    cov[cov < cov_threshold] = 0  # zero out positions below the coverage threshold

    f = open(output_wig, 'w')
    f.write("variableStep chrom=NC_045512v2\n")
    for i in range(len(cov)):
        f.write("{0} {1}\n".format(i + 1, cov[i]))
    f.close()
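
For reference, a self-contained sketch of the variableStep WIG layout written above: a declaration line, then one "<1-based position> <value>" pair per line. The chromosome name and coverage values here are made up:

import numpy as np

cov = np.array([0, 0, 5, 7, 7, 3, 0])  # toy per-base coverage
with open("toy.wig", "w") as f:
    f.write("variableStep chrom=chrToy\n")
    for pos0, depth in enumerate(cov):
        if depth > 0:  # variableStep only needs covered positions
            f.write("{0} {1}\n".format(pos0 + 1, depth))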
Example #7
def sqanti_filter_lite(args):

    fafq_type = 'fasta'
    with open(args.isoforms) as h:
        if h.readline().startswith('@'): fafq_type = 'fastq'

    prefix = args.sqanti_class[:args.sqanti_class.rfind('.')]

    fcsv = open(prefix + '.filtered_lite_reasons.txt', 'w')
    fcsv.write("# classification: {0}\n".format(args.sqanti_class))
    fcsv.write("# isoform: {0}\n".format(args.isoforms))
    fcsv.write("# intrapriming cutoff: {0}\n".format(args.intrapriming))
    fcsv.write("# min_cov cutoff: {0}\n".format(args.min_cov))
    fcsv.write("filtered_isoform,reason\n")

    fout = open(prefix + '.filtered_lite.' + fafq_type, 'w')

    seqids_to_keep = set()
    total_count = 0
    for r in DictReader(open(args.sqanti_class), delimiter='\t'):
        total_count += 1
        filter_flag, filter_msg = False, ""
        percA = float(r['perc_A_downstream_TTS']) / 100
        assert 0 <= percA <= 1
        min_cov = float(r['min_cov']) if r['min_cov'] != 'NA' else None
        num_exon = int(r['exons'])
        is_RTS = r['RTS_stage'] == 'TRUE'
        is_canonical = r['all_canonical'] == 'canonical'

        cat = CATEGORY_DICT[r['structural_category']]

        if cat in ['FSM', 'ISM', 'NIC']:
            if (percA >= args.intrapriming and r['polyA_motif'] == 'NA'):
                filter_flag, filter_msg = True, "IntraPriming"
        else:
            if (percA >= args.intrapriming and r['polyA_motif'] == 'NA'):
                filter_flag, filter_msg = True, "IntraPriming"
            elif is_RTS:
                filter_flag, filter_msg = True, "RTSwitching"
            elif (not is_canonical) and (min_cov is None
                                         or min_cov < args.min_cov):
                filter_flag, filter_msg = True, "LowCoverage/Non-Canonical"

        if not filter_flag:
            seqids_to_keep.add(r['isoform'])
        else:
            fcsv.write("{0},{1}\n".format(r['isoform'], filter_msg))

    print("{0} isoforms read from {1}. {2} to be kept.".format(
        total_count, args.sqanti_class, len(seqids_to_keep)), file=sys.stdout)

    for r in SeqIO.parse(open(args.isoforms), fafq_type):
        if r.id in seqids_to_keep:
            SeqIO.write(r, fout, fafq_type)
    fout.close()
    print("Output written to: {0}".format(fout.name), file=sys.stdout)

    # write out a new .classification.txt, .junctions.txt
    outputClassPath = prefix + '.filtered_lite_classification.txt'
    with open(outputClassPath, 'w') as f:
        reader = DictReader(open(args.sqanti_class), delimiter='\t')
        writer = DictWriter(f, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for r in reader:
            if r['isoform'] in seqids_to_keep:
                writer.writerow(r)
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    outputJuncPath = prefix + '.filtered_lite_junctions.txt'
    with open(outputJuncPath, 'w') as f:
        reader = DictReader(open(
            args.sqanti_class.replace('_classification', '_junctions')),
                            delimiter='\t')
        writer = DictWriter(f, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for r in reader:
            if r['isoform'] in seqids_to_keep:
                writer.writerow(r)
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    outputSam = prefix + '.filtered_lite.sam'
    with open(outputSam, 'w') as f:
        reader = GMAPSAMReader(args.sam_file, True)
        f.write(reader.header)
        for r in reader:
            if r.qID in seqids_to_keep:
                f.write(r.record_line + '\n')
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    print("**** Generating SQANTI report....", file=sys.stderr)
    cmd = RSCRIPTPATH + " {d}/{f} {c} {j}".format(
        d=utilitiesPath, f=RSCRIPT_REPORT, c=outputClassPath, j=outputJuncPath)
    # subprocess.check_call raises on failure instead of returning nonzero,
    # so use subprocess.call to make the error branch reachable
    if subprocess.call(cmd, shell=True) != 0:
        print("ERROR running command: {0}".format(cmd), file=sys.stderr)
        sys.exit(-1)
Example #8
def evaluate_alignment_sam(input_fa_or_fq,
                           sam_filename,
                           genome_d,
                           output_prefix,
                           junction_info=None):

    h1 = open(output_prefix + '.alignment_report.txt', 'w')
    h2 = open(output_prefix + '.junction_report.txt', 'w')

    w1 = DictWriter(h1, fieldnames=fieldnames_report1)
    w2 = DictWriter(h2, fieldnames=fieldnames_report2)
    w1.writeheader()
    w2.writeheader()

    #fieldnames_report1 = ['seqid', 'coverage', 'identity', 'num_sub', 'num_ins', 'num_del', 'num_exons']
    #fieldnames_report2 = ['seqid', 'donor_pos', 'donor_seq', 'donor_dist', 'acceptor_pos', 'acceptor_seq', 'acceptor_dist']

    query_len_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(
        open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq)))
    for r in GMAPSAMReader(sam_filename, True, query_len_dict=query_len_dict):
        if r.sID == '*':  # unaligned
            rec1 = {
                'seqid': r.qID,
                'coverage': 'NA',
                'identity': 'NA',
                'num_sub': 'NA',
                'num_ins': 'NA',
                'num_del': 'NA',
                'num_exons': 'NA'
            }
            w1.writerow(rec1)
            continue
        rec1 = {
            'seqid': r.qID,
            'coverage': r.qCoverage,
            'identity': r.identity,
            'num_sub': r.num_nonmatches - r.num_del - r.num_ins,
            'num_ins': r.num_ins,
            'num_del': r.num_del,
            'num_exons': len(r.segments)
        }
        w1.writerow(rec1)
        for i in range(len(r.segments) - 1):
            rec2 = {'seqid': r.qID}
            seq1, seq2 = get_donor_acceptor(genome_d, r.sID, r.flag.strand,
                                            r.segments[i].end - 1,
                                            r.segments[i + 1].start)
            if r.flag.strand == '+':
                rec2['donor_pos'] = "{0}:+:{1}".format(r.sID,
                                                       r.segments[i].end - 1)
                rec2['acceptor_pos'] = "{0}:+:{1}".format(
                    r.sID, r.segments[i + 1].start)
            else:
                rec2['donor_pos'] = "{0}:-:{1}".format(r.sID,
                                                       r.segments[i + 1].start)
                rec2['acceptor_pos'] = "{0}:-:{1}".format(
                    r.sID, r.segments[i].end - 1)
            rec2['donor_seq'] = seq1
            rec2['acceptor_seq'] = seq2
            if junction_info is not None:
                rec2['donor_dist'], rec2[
                    'acceptor_dist'] = get_closest_junction_dist(
                        junction_info, r.sID, r.flag.strand,
                        r.segments[i].end - 1, r.segments[i + 1].start)
            else:
                rec2['donor_dist'] = 'NA'
                rec2['acceptor_dist'] = 'NA'
            w2.writerow(rec2)
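
The strand handling above encodes a convention worth isolating: on '+', the donor is the last base of the upstream exon (segments[i].end - 1) and the acceptor is the start of the downstream exon (segments[i+1].start); on '-', the two roles swap. A toy sketch of that bookkeeping (hypothetical names, half-open segments assumed, as the end - 1 arithmetic above suggests):

def junction_coords(seg_end, next_seg_start, strand):
    # Returns (donor_pos, acceptor_pos) for one intron, mirroring
    # evaluate_alignment_sam above.
    donor, acceptor = seg_end - 1, next_seg_start
    if strand == '-':
        donor, acceptor = acceptor, donor
    return donor, acceptor

assert junction_coords(100, 200, '+') == (99, 200)
assert junction_coords(100, 200, '-') == (200, 99)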
Example #9
def sqanti_filter_lite(args):

    fafq_type = 'fasta'
    with open(args.isoforms) as h:
        if h.readline().startswith('@'): fafq_type = 'fastq'

    prefix = args.sqanti_class[:args.sqanti_class.rfind('.')]

    fcsv = open(prefix + '.filtered_lite_reasons.txt', 'w')
    fcsv.write("# classification: {0}\n".format(args.sqanti_class))
    fcsv.write("# isoform: {0}\n".format(args.isoforms))
    fcsv.write("# intrapriming cutoff: {0}\n".format(args.intrapriming))
    fcsv.write("# min_cov cutoff: {0}\n".format(args.min_cov))
    fcsv.write("filtered_isoform,reason\n")

    fout = open(prefix + '.filtered_lite.' + fafq_type, 'w')

    seqids_to_keep = set()
    total_count = 0
    for r in DictReader(open(args.sqanti_class), delimiter='\t'):
        total_count += 1
        filter_flag, filter_msg = False, ""
        percA = float(r['perc_A_downstream_TTS']) / 100
        assert 0 <= percA <= 1
        # length of the leading run of A's immediately downstream of the TTS
        runA = 0
        while runA < len(r['seq_A_downstream_TTS']):
            if r['seq_A_downstream_TTS'][runA] != 'A':
                break
            runA += 1
        min_cov = float(r['min_cov']) if r['min_cov'] != 'NA' else None
        num_exon = int(r['exons'])
        is_RTS = r['RTS_stage'] == 'TRUE'
        is_canonical = r['all_canonical'] == 'canonical'
        is_monoexonic = (num_exon == 1)

        cat = CATEGORY_DICT[r['structural_category']]

        potential_intrapriming = (percA >= args.intrapriming or runA >= args.runAlength) and \
                                 r['polyA_motif'] == 'NA' and \
                                 (r['diff_to_gene_TTS'] == 'NA' or abs(
                                     int(r['diff_to_gene_TTS'])) > args.max_dist_to_known_end)

        if cat == 'FSM':
            if potential_intrapriming:
                filter_flag, filter_msg = True, "IntraPriming"
            elif args.filter_mono_exonic and is_monoexonic:
                filter_flag, filter_msg = True, "Mono-Exonic"
        else:
            if potential_intrapriming:
                filter_flag, filter_msg = True, "IntraPriming"
            elif args.filter_mono_exonic and is_monoexonic:
                filter_flag, filter_msg = True, "Mono-Exonic"
            elif is_RTS:
                filter_flag, filter_msg = True, "RTSwitching"
            elif (not is_canonical) and (min_cov is None
                                         or min_cov < args.min_cov):
                filter_flag, filter_msg = True, "LowCoverage/Non-Canonical"

        if not filter_flag:
            seqids_to_keep.add(r['isoform'])
        else:
            fcsv.write("{0},{1}\n".format(r['isoform'], filter_msg))

    print("{0} isoforms read from {1}. {2} to be kept.".format(
        total_count, args.sqanti_class, len(seqids_to_keep)),
          file=sys.stdout)

    if not args.skipFaFq:
        for r in SeqIO.parse(open(args.isoforms), fafq_type):
            if r.id in seqids_to_keep:
                SeqIO.write(r, fout, fafq_type)
        fout.close()
        print("Output written to: {0}".format(fout.name), file=sys.stdout)

    # write out a new .classification.txt, .junctions.txt
    outputClassPath = prefix + '.filtered_lite_classification.txt'
    with open(outputClassPath, 'w') as f:
        reader = DictReader(open(args.sqanti_class), delimiter='\t')
        writer = DictWriter(f, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for r in reader:
            if r['isoform'] in seqids_to_keep:
                writer.writerow(r)
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    if not args.skipJunction:
        outputJuncPath = prefix + '.filtered_lite_junctions.txt'
        with open(outputJuncPath, 'w') as f:
            reader = DictReader(open(
                args.sqanti_class.replace('_classification', '_junctions')),
                                delimiter='\t')
            writer = DictWriter(f, reader.fieldnames, delimiter='\t')
            writer.writeheader()
            for r in reader:
                if r['isoform'] in seqids_to_keep:
                    writer.writerow(r)
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if not args.skipGTF:
        outputGTF = prefix + '.filtered_lite.gtf'
        with open(outputGTF, 'w') as f:
            for r in collapseGFFReader(args.gtf_file):
                if r.seqid in seqids_to_keep:
                    write_collapseGFF_format(f, r)
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if args.sam is not None:
        outputSam = prefix + '.filtered_lite.sam'
        with open(outputSam, 'w') as f:
            reader = GMAPSAMReader(args.sam, True)
            f.write(reader.header)
            for r in reader:
                if r.qID in seqids_to_keep:
                    f.write(r.record_line + '\n')
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if args.faa is not None:
        outputFAA = prefix + '.filtered_lite.faa'
        with open(outputFAA, 'w') as f:
            for r in SeqIO.parse(open(args.faa), 'fasta'):
                if r.id in seqids_to_keep:
                    f.write(">{0}\n{1}\n".format(r.description, r.seq))
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    print("**** Generating SQANTI3 report....", file=sys.stderr)
    cmd = RSCRIPTPATH + " {d}/{f} {c} {j} {p} {d}".format(d=utilitiesPath,
                                                          f=RSCRIPT_REPORT,
                                                          c=outputClassPath,
                                                          j=outputJuncPath,
                                                          p="mock")
    if subprocess.call(cmd, shell=True) != 0:  # check_call raises instead of returning nonzero
        print("ERROR running command: {0}".format(cmd), file=sys.stderr)
        sys.exit(-1)
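
The intrapriming test in both filter versions combines three signals: the downstream A fraction (or leading A-run length), the absence of a known polyA motif, and the distance to an annotated 3' end. A standalone restatement of that predicate; the default thresholds and names are illustrative, not SQANTI's:

def looks_like_intrapriming(perc_a, run_a, polyA_motif, diff_to_gene_TTS,
                            intrapriming=0.6, runA_length=6,
                            max_dist_to_known_end=50):
    # Mirrors the potential_intrapriming expression above; inputs are
    # already parsed (diff_to_gene_TTS is None when the table says 'NA').
    a_signal = perc_a >= intrapriming or run_a >= runA_length
    no_motif = polyA_motif == 'NA'
    far_from_known_end = (diff_to_gene_TTS is None
                          or abs(diff_to_gene_TTS) > max_dist_to_known_end)
    return a_signal and no_motif and far_from_known_end

assert looks_like_intrapriming(0.9, 2, 'NA', None) is True
assert looks_like_intrapriming(0.9, 2, 'AATAAA', None) is False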
Example #10
def minimap2_against_ref2(sam_filename,
                          query_len_dict,
                          ref_len_dict,
                          is_FL,
                          sID_starts_with_c,
                          ece_penalty=1,
                          ece_min_len=20,
                          same_strand_only=True,
                          max_missed_start=200,
                          max_missed_end=50,
                          full_missed_start=50,
                          full_missed_end=30):
    """
    Excluding criteria:
    (1) self hit
    (2) opposite strand hit  (should already be in the same orientation;
        can override with <same_strand_only> set to False)
    """
    for r in GMAPSAMReader(sam_filename,
                           True,
                           query_len_dict=query_len_dict,
                           ref_len_dict=ref_len_dict):
        missed_q = r.qStart + r.qLen - r.qEnd  # unaligned query bases at both ends
        missed_t = r.sStart + r.sLen - r.sEnd  # unaligned target bases at both ends

        if sID_starts_with_c:
            # because all consensus should start with
            # c<cluster_index>
            assert r.sID.startswith('c')
            if r.sID.find('/') > 0:
                r.sID = r.sID.split('/')[0]
            if r.sID.endswith('_ref'):
                # probably c<cid>_ref
                cID = int(r.sID[1:-4])
            else:
                cID = int(r.sID[1:])
        else:
            cID = r.sID

        # self hit, useless!
        # opposite strand not allowed!
        if (cID == r.qID or (r.flag.strand == '-' and same_strand_only)):
            yield HitItem(qID=r.qID, cID=cID)
            continue

        # regardless of whether the read is full-length (is_FL),
        # the query MUST be mapped fully (based on full_missed_start/end)
        if r.qStart > full_missed_start or (r.qLen - r.qEnd) > full_missed_end:
            yield HitItem(qID=r.qID, cID=cID)
            continue

        # full-length case: allow up to max_missed_start bp of 5' not aligned
        # and max_missed_end bp of 3' not aligned
        # non-full-length case: not really tested...don't use
        if is_FL and not alignment_missed_start_end_less_than_threshold(
                r, max_missed_start, max_missed_end,
                full_missed_start, full_missed_end):
            yield HitItem(qID=r.qID, cID=cID)
        else:
            ece_arr = eval_sam_alignment(r)

            if alignment_has_large_nonmatch(ece_arr, ece_penalty, ece_min_len):
                yield HitItem(qID=r.qID, cID=cID)
            else:
                yield HitItem(qID=r.qID,
                              cID=cID,
                              qStart=r.qStart,
                              qEnd=r.qEnd,
                              missed_q=missed_q * 1. / r.qLen,
                              missed_t=missed_t * 1. / r.sLen,
                              fakecigar=r.cigar,
                              ece_arr=ece_arr)
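
The missed_q/missed_t bookkeeping above is simply the unaligned fraction at either end of the query and target; a toy check of that arithmetic with a stand-in record type (field names mimic the reader's attributes):

from collections import namedtuple

Aln = namedtuple('Aln', 'qStart qEnd qLen sStart sEnd sLen')

def missed_fractions(r):
    # Unaligned fraction of the query and of the target, as computed in
    # minimap2_against_ref2 above.
    missed_q = r.qStart + (r.qLen - r.qEnd)  # bases outside the aligned query span
    missed_t = r.sStart + (r.sLen - r.sEnd)
    return missed_q / r.qLen, missed_t / r.sLen

r = Aln(qStart=10, qEnd=90, qLen=100, sStart=0, sEnd=80, sLen=200)
assert missed_fractions(r) == (0.2, 0.6)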