# Example #1
# 0
def main():
    """Read a samtools mpileup file and report coverage gaps/dips per molecule.

    Reports spans whose depth falls below --min_coverage_depth via print_spans(),
    then prints summary stats (molecules, bases, average depth) to stderr.
    """
    parser = argparse.ArgumentParser( description='Report coverage gaps/dips from a samtools mpileup file')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input mpileup file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output file to be created' )
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Reference fasta file, against which reads were aligned.  Needed for low 3-prime end coverage' )
    parser.add_argument('-mcd', '--min_coverage_depth', type=int, required=True, help='Min coverage depth, below which is reported' )
    parser.add_argument('-mcs', '--min_coverage_span', type=int, required=False, help='Coverage window size, the avg of which is calculated for depth cutoff' )
    parser.add_argument('-eb', '--end_buffer', type=int, required=False, default=0, help='If specified, gaps this far from either end of the molecule will not be reported.' )
    args = parser.parse_args()

    # Not implemented yet; fail fast rather than silently ignoring the option.
    if args.min_coverage_span is not None:
        raise Exception("ERROR: Sorry, --min_coverage_span not yet implemented.")

    if args.output_file is None:
        out_fh = sys.stdout
    else:
        out_fh = open(args.output_file, 'wt')

    lengths = utils.fasta_sizes_from_file(args.fasta_file)

    stats = {'total_molecules': 0, 'total_bases': 0, 'depth_sum': 0}

    # In mpileup, gaps are reported either as positions with coverage 0 OR as
    # omitted rows, so each molecule's depth list is pre-initialized to zeros
    # for its full reference length.
    depths = list()
    current_seq_id = None

    with open(args.input_file) as mpileup_fh:
        for line in mpileup_fh:
            contig, this_coord, base, depth = line.split("\t")[0:4]

            # new molecule encountered: flush the previous one, then reset
            if contig != current_seq_id:
                stats['total_molecules'] += 1

                if current_seq_id is not None:
                    print_spans(current_seq_id, depths, args.min_coverage_depth, out_fh, stats, lengths, args.end_buffer)

                depths = [0] * lengths[contig]
                current_seq_id = contig

            # BUG FIX: depth must be converted to int; the raw string from
            # split() breaks numeric comparisons against min_coverage_depth.
            depths[int(this_coord) - 1] = int(depth)

    # flush the final molecule (guard against a completely empty input file)
    if current_seq_id is not None:
        print_spans(current_seq_id, depths, args.min_coverage_depth, out_fh, stats, lengths, args.end_buffer)

    print("INFO: Total molecules: {0}".format(stats['total_molecules']), file=sys.stderr)
    print("INFO: Total bases    : {0}".format(stats['total_bases']), file=sys.stderr)

    # Guard the division: an empty input would otherwise raise ZeroDivisionError.
    if stats['total_bases'] > 0:
        print("INFO: Avg cov depth  : {0}x".format(int(stats['depth_sum'] / stats['total_bases'])), file=sys.stderr)

    if out_fh is not sys.stdout:
        out_fh.close()
def _report_query_coverage(query_id, covs_all, match_segments, qsizes, all_fh, longest_fh):
    """Write the two coverage rows for one finished query.

    Writes the combined ('all') coverage percentage to all_fh, and the
    coverage percentage of the single best-covering subject ('longest')
    to longest_fh, along with that subject's ID.
    """
    qlen = qsizes[query_id]

    # 'all' coverage: fraction of query positions hit by any subject
    all_cov_perc = (qlen - covs_all.count(0)) / qlen
    all_fh.write("{0}\t{1:.1f}\n".format(query_id, all_cov_perc * 100))

    # 'longest' coverage: the single subject covering the largest fraction of the query
    longest_cov_transcript_id = None
    longest_cov_perc = None
    for tid in match_segments:
        this_cov = [0] * qlen
        for seg in match_segments[tid]:
            for i in range(seg[0] - 1, seg[1]):
                this_cov[i] += 1

        this_cov_perc = (len(this_cov) - this_cov.count(0)) / len(this_cov)
        if longest_cov_perc is None or this_cov_perc > longest_cov_perc:
            longest_cov_perc = this_cov_perc
            longest_cov_transcript_id = tid

    # Guard: every hit for this query may have been filtered out (-slpc),
    # in which case there is no best subject to report.
    if longest_cov_transcript_id is not None:
        print("LOG: transcript {0} covers {1} (len:{3}) best at {2:.1f}%".format(longest_cov_transcript_id, query_id, longest_cov_perc * 100, qlen))
        # BUG FIX: include the transcript ID column, matching the format used
        # for every other query (it was missing from the final row only).
        longest_fh.write("{0}\t{1:.1f}\t{2}\n".format(query_id, longest_cov_perc * 100, longest_cov_transcript_id))


def main():
    """Parse tabular BLAST output (-m 8 / -m 9) and report per-query coverage.

    Creates two files under --output_base: <base>.cov.all.perc.txt with the
    coverage across all hits per query, and <base>.cov.longest.perc.txt with
    the coverage from the single best-covering subject.
    """
    parser = argparse.ArgumentParser( description='Parse BLAST output and report coverage of queries')

    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='FASTA file of query.fasta (for lengths)' )
    parser.add_argument('-b', '--blast_file', type=str, required=True, help='BLAST output using -m 8 or -m 9 options' )
    #parser.add_argument('-e', '--evalue_cutoff', type=str, required=False, help='E-value cutoff' )
    # NOTE: '%%' is required because argparse %-formats help strings; a bare
    # '%' raises ValueError when -h is used.
    parser.add_argument('-slpc', '--subject_length_percentage_cutoff', type=int, required=False, help='Ignore hits from transcripts of length > N%% relative to the query transcript' )
    parser.add_argument('-sf', '--subject_fasta', type=str, required=False, help='Only required if -slpc is passed.  FASTA file of the sub' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='base name/path for output.  Two files starting with this will be created' )
    args = parser.parse_args()

    # -slpc needs subject lengths; fail early with a clear message instead of
    # a NameError deep inside the parse loop.
    if args.subject_length_percentage_cutoff is not None and args.subject_fasta is None:
        parser.error('--subject_fasta is required when --subject_length_percentage_cutoff is used')

    qsizes = utils.fasta_sizes_from_file(args.fasta_file)

    ssizes = None
    if args.subject_fasta is not None:
        ssizes = utils.fasta_sizes_from_file(args.subject_fasta)

    all_fh = open("{0}.cov.all.perc.txt".format(args.output_base), 'wt')
    longest_fh = open("{0}.cov.longest.perc.txt".format(args.output_base), 'wt')

    current_query_id = None
    covs_all = None

    # subject id -> list of [qstart, qend] segments on the current query
    match_segments = dict()

    with open(args.blast_file) as blast_fh:
        for line in blast_fh:
            # skip -m 9 comment lines
            if line.startswith('#'): continue

            cols = line.rstrip().split("\t")
            query_id = cols[0]
            subj_id = cols[1]
            qstart = int(cols[6])
            qend = int(cols[7])

            # optionally skip subjects much longer than the query
            if args.subject_length_percentage_cutoff is not None:
                perc_length_diff = (ssizes[subj_id] / qsizes[query_id]) * 100
                if perc_length_diff > args.subject_length_percentage_cutoff:
                    continue

            # new query encountered: flush the previous one, then reset state
            if query_id != current_query_id:
                if current_query_id is not None:
                    _report_query_coverage(current_query_id, covs_all, match_segments, qsizes, all_fh, longest_fh)

                current_query_id = query_id
                covs_all = [0] * qsizes[query_id]
                match_segments = dict()

            match_segments.setdefault(subj_id, list()).append([qstart, qend])

            for i in range(qstart - 1, qend):
                covs_all[i] += 1

    # flush the final query (guard against empty or fully-filtered input)
    if current_query_id is not None:
        _report_query_coverage(current_query_id, covs_all, match_segments, qsizes, all_fh, longest_fh)

    all_fh.close()
    longest_fh.close()
# Example #3
# 0
def main():
    """Parse tabular BLAST output (-m 8 / -m 9) and report per-query coverage.

    Creates two files under --output_base: <base>.cov.all.perc.txt with the
    coverage across all hits per query, and <base>.cov.longest.perc.txt with
    the coverage from the single best-covering subject.
    """
    parser = argparse.ArgumentParser(
        description='Parse BLAST output and report coverage of queries')

    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='FASTA file of query.fasta (for lengths)')
    parser.add_argument('-b',
                        '--blast_file',
                        type=str,
                        required=True,
                        help='BLAST output using -m 8 or -m 9 options')
    #parser.add_argument('-e', '--evalue_cutoff', type=str, required=False, help='E-value cutoff' )
    # NOTE: '%%' is required because argparse %-formats help strings; a bare
    # '%' raises ValueError when -h is used.
    parser.add_argument(
        '-slpc',
        '--subject_length_percentage_cutoff',
        type=int,
        required=False,
        help=
        'Ignore hits from transcripts of length > N%% relative to the query transcript'
    )
    parser.add_argument(
        '-sf',
        '--subject_fasta',
        type=str,
        required=False,
        help='Only required if -slpc is passed.  FASTA file of the sub')
    parser.add_argument(
        '-o',
        '--output_base',
        type=str,
        required=True,
        help=
        'base name/path for output.  Two files starting with this will be created'
    )
    args = parser.parse_args()

    # -slpc needs subject lengths; fail early with a clear message instead of
    # a NameError deep inside the parse loop.
    if args.subject_length_percentage_cutoff is not None and args.subject_fasta is None:
        parser.error(
            '--subject_fasta is required when --subject_length_percentage_cutoff is used'
        )

    qsizes = utils.fasta_sizes_from_file(args.fasta_file)

    ssizes = None
    if args.subject_fasta is not None:
        ssizes = utils.fasta_sizes_from_file(args.subject_fasta)

    all_fh = open("{0}.cov.all.perc.txt".format(args.output_base), 'wt')
    longest_fh = open("{0}.cov.longest.perc.txt".format(args.output_base),
                      'wt')

    current_query_id = None
    covs_all = None

    # subject id -> list of [qstart, qend] segments on the current query
    match_segments = dict()

    with open(args.blast_file) as blast_fh:
        for line in blast_fh:
            # skip -m 9 comment lines
            if line.startswith('#'): continue

            cols = line.rstrip().split("\t")
            query_id = cols[0]
            subj_id = cols[1]
            qstart = int(cols[6])
            qend = int(cols[7])

            # optionally skip subjects much longer than the query
            if args.subject_length_percentage_cutoff is not None:
                perc_length_diff = (ssizes[subj_id] / qsizes[query_id]) * 100
                if perc_length_diff > args.subject_length_percentage_cutoff:
                    continue

            # new query encountered: flush the previous one, then reset state
            if query_id != current_query_id:
                if current_query_id is not None:
                    _write_query_coverage(current_query_id, covs_all,
                                          match_segments, qsizes, all_fh,
                                          longest_fh)

                current_query_id = query_id
                covs_all = [0] * qsizes[query_id]
                match_segments = dict()

            match_segments.setdefault(subj_id, list()).append([qstart, qend])

            for i in range(qstart - 1, qend):
                covs_all[i] += 1

    # flush the final query (guard against empty or fully-filtered input)
    if current_query_id is not None:
        _write_query_coverage(current_query_id, covs_all, match_segments,
                              qsizes, all_fh, longest_fh)

    all_fh.close()
    longest_fh.close()


def _write_query_coverage(query_id, covs_all, match_segments, qsizes, all_fh,
                          longest_fh):
    """Write the two coverage rows for one finished query.

    Writes the combined ('all') coverage percentage to all_fh, and the
    coverage percentage of the single best-covering subject ('longest')
    to longest_fh, along with that subject's ID.
    """
    qlen = qsizes[query_id]

    # 'all' coverage: fraction of query positions hit by any subject
    all_cov_perc = (qlen - covs_all.count(0)) / qlen
    all_fh.write("{0}\t{1:.1f}\n".format(query_id, all_cov_perc * 100))

    # 'longest' coverage: the single subject covering the largest fraction
    # of the query
    longest_cov_transcript_id = None
    longest_cov_perc = None
    for tid in match_segments:
        this_cov = [0] * qlen
        for seg in match_segments[tid]:
            for i in range(seg[0] - 1, seg[1]):
                this_cov[i] += 1

        this_cov_perc = (len(this_cov) - this_cov.count(0)) / len(this_cov)
        if longest_cov_perc is None or this_cov_perc > longest_cov_perc:
            longest_cov_perc = this_cov_perc
            longest_cov_transcript_id = tid

    # Guard: every hit for this query may have been filtered out (-slpc),
    # in which case there is no best subject to report.
    if longest_cov_transcript_id is not None:
        print(
            "LOG: transcript {0} covers {1} (len:{3}) best at {2:.1f}%".format(
                longest_cov_transcript_id, query_id, longest_cov_perc * 100,
                qlen))
        # BUG FIX: include the transcript ID column, matching the format used
        # for every other query (it was missing from the final row only).
        longest_fh.write("{0}\t{1:.1f}\t{2}\n".format(query_id,
                                                      longest_cov_perc * 100,
                                                      longest_cov_transcript_id))
def main():
    """Read a samtools mpileup file and report coverage gaps/dips per molecule.

    Reports spans whose depth falls below --min_coverage_depth via
    print_spans(), then prints summary stats (molecules, bases, average
    depth) to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Report coverage gaps/dips from a samtools mpileup file')

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input mpileup file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-f',
        '--fasta_file',
        type=str,
        required=True,
        help=
        'Reference fasta file, against which reads were aligned.  Needed for low 3-prime end coverage'
    )
    parser.add_argument('-mcd',
                        '--min_coverage_depth',
                        type=int,
                        required=True,
                        help='Min coverage depth, below which is reported')
    parser.add_argument(
        '-mcs',
        '--min_coverage_span',
        type=int,
        required=False,
        help=
        'Coverage window size, the avg of which is calculated for depth cutoff'
    )
    parser.add_argument(
        '-eb',
        '--end_buffer',
        type=int,
        required=False,
        default=0,
        help=
        'If specified, gaps this far from either end of the molecule will not be reported.'
    )
    args = parser.parse_args()

    # Not implemented yet; fail fast rather than silently ignoring the option.
    if args.min_coverage_span is not None:
        raise Exception(
            "ERROR: Sorry, --min_coverage_span not yet implemented.")

    if args.output_file is None:
        out_fh = sys.stdout
    else:
        out_fh = open(args.output_file, 'wt')

    lengths = utils.fasta_sizes_from_file(args.fasta_file)

    stats = {'total_molecules': 0, 'total_bases': 0, 'depth_sum': 0}

    # In mpileup, gaps are reported either as positions with coverage 0 OR as
    # omitted rows, so each molecule's depth list is pre-initialized to zeros
    # for its full reference length.
    depths = list()
    current_seq_id = None

    with open(args.input_file) as mpileup_fh:
        for line in mpileup_fh:
            contig, this_coord, base, depth = line.split("\t")[0:4]

            # new molecule encountered: flush the previous one, then reset
            if contig != current_seq_id:
                stats['total_molecules'] += 1

                if current_seq_id is not None:
                    print_spans(current_seq_id, depths,
                                args.min_coverage_depth, out_fh, stats,
                                lengths, args.end_buffer)

                depths = [0] * lengths[contig]
                current_seq_id = contig

            # BUG FIX: depth must be converted to int; the raw string from
            # split() breaks numeric comparisons against min_coverage_depth.
            depths[int(this_coord) - 1] = int(depth)

    # flush the final molecule (guard against a completely empty input file)
    if current_seq_id is not None:
        print_spans(current_seq_id, depths, args.min_coverage_depth, out_fh,
                    stats, lengths, args.end_buffer)

    print("INFO: Total molecules: {0}".format(stats['total_molecules']),
          file=sys.stderr)
    print("INFO: Total bases    : {0}".format(stats['total_bases']),
          file=sys.stderr)

    # Guard the division: an empty input would otherwise raise
    # ZeroDivisionError.
    if stats['total_bases'] > 0:
        print("INFO: Avg cov depth  : {0}x".format(
            int(stats['depth_sum'] / stats['total_bases'])),
              file=sys.stderr)

    if out_fh is not sys.stdout:
        out_fh.close()