Example #1
0
def read_cogent2_aligned_to_genome_sam(input, filename):
    """
    Read cogent2 mapped to a genome.

    :param input: path to the FASTA file whose records were aligned; the length
        of each record's sequence is handed to the SAM reader as `query_len_dict`
    :param filename: path to the SAM file to read

    Return: dict of {cogent path} --> list of SAM Record; set of mapped genome contigs
    If `filename` does not exist, an empty dict and an empty set are returned.
    Records whose subject ID is '*' (unmapped) are skipped.

    NOTE: (minimap2 was run with --secondary=no so if multiple must be chimeric)
    """
    if not os.path.exists(filename):
        return {}, set()

    d = defaultdict(list)
    contigs_seen = set()

    try:
        # Collect the query lengths inside a `with` so the FASTA handle is
        # closed instead of being left for the garbage collector.
        with open(input) as handle:
            query_len_dict = dict(
                (rec.id, len(rec.seq)) for rec in SeqIO.parse(handle, 'fasta'))
        for r in BioReaders.GMAPSAMReader(filename,
                                          True,
                                          query_len_dict=query_len_dict):
            if r.sID == '*': continue  # unmapped
            d[r.qID].append(r)
            contigs_seen.add(r.sID)
    except IndexError:
        # Deliberate best-effort kept from the original: whatever was read
        # before the error is still returned.
        # NOTE(review): presumably raised by the reader on an empty or
        # truncated SAM file -- confirm against BioReaders.
        pass
    return dict(d), contigs_seen
Example #2
0
def post_minimap2_processing(ref='cogent.fa',
                             sam='in.trimmed.fa.sam',
                             output_prefix='cogent2',
                             seqrecs=()):
    """
    Pick the subset of Cogent paths to keep and write them to <output_prefix>.fa.

    :param ref: FASTA of candidate paths; record IDs must look like path<N>
    :param sam: SAM file of the input sequences aligned against `ref`
    :param output_prefix: output FASTA is written to output_prefix + '.fa'
    :param seqrecs: iterable of input sequence records (each with .id and .seq)

    A path is "good for" an input sequence when the alignment has both query
    coverage and identity >= 0.98. The (sequence --> good paths) relation is
    passed to make_into_lp_problem and every variable solved to 1 is kept.
    Input sequences with no good path are written out as themselves under new
    path<N> names so that they are not lost.
    """
    # seqrecs is walked twice (query lengths, then the unmapped fallback), so
    # materialise it once; otherwise a generator argument would be exhausted
    # before the second pass and the fallback would silently write nothing.
    seqrecs = list(seqrecs)

    good_for = defaultdict(list)
    reader = BioReaders.GMAPSAMReader(sam,
                                      True,
                                      query_len_dict=dict(
                                          (r.id, len(r.seq)) for r in seqrecs))
    for r in reader:
        if r.sID == '*': continue  # not mapped
        assert r.sID.startswith('path')  # chr should be path0, path1, etc
        assert 0 < r.qCoverage <= 1
        assert 0 < r.identity <= 1
        if r.qCoverage >= 0.98 and r.identity >= 0.98:
            good_for[r.qID].append(int(r.sID[4:]))

    touse = []
    if len(good_for) == 0:
        log.warning(
            "[BUG] good_for in post_minimap2_processing is empty. Probably from cycles. CHECK!"
        )
    else:
        # .values()/.items() work on both Python 2 and 3 (itervalues does not).
        N = max(max(v) for v in good_for.values()) + 1
        try:
            prob = make_into_lp_problem(list(good_for.items()), N, add_noise=False)
            prob.solve()
        except Exception:
            # Retry once with noise added. Catch Exception rather than a bare
            # except so Ctrl-C / SystemExit are not swallowed by the retry.
            prob = make_into_lp_problem(list(good_for.items()), N, add_noise=True)
            prob.solve()
        for v in prob.variables():
            log.debug("{0} = {1}".format(v.name, v.varValue))
            if v.varValue == 1: touse.append(int(v.name))

    chosen = set(touse)  # O(1) membership inside the write loop
    with open(output_prefix + '.fa', 'w') as f:
        with open(ref) as ref_handle:
            for r in SeqIO.parse(ref_handle, 'fasta'):
                if int(r.id[4:]) in chosen:
                    f.write(">{0}\n{1}\n".format(r.id, r.seq))
        # if there are some sequences that didn't map (possibly from cycles)
        # then just use THEMSELVES
        fake_path_i = max(touse) + 1 if touse else 0
        for r in seqrecs:
            if r.id not in good_for:
                log.warning(
                    "[BUG] {0} is not fully mapped to cogent in minimap2. \
                Likely cycle issues. Use itself in output.".format(r.id))
                f.write(">path{0}\n{1}\n".format(fake_path_i, r.seq))
                fake_path_i += 1
Example #3
0
def tally_for_a_Cogent_dir(dirname,
                           writer1,
                           writer2,
                           genome1,
                           genome2=None,
                           blastn_filename=None):
    """
    Summarise one Cogent output directory into two row-oriented reports.

    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.sam)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (skipped entirely when genome2 is None)
    4. read blastn results (only when blastn_filename is given)

    :param dirname: Cogent output directory; nothing is done unless it
        contains a COGENT.DONE file
    :param writer1: receives one dict per directory through .writerow()
        (presumably a csv.DictWriter -- confirm against caller)
    :param writer2: receives one dict per input sequence through .writerow()
    :param genome1: genome name used in '<file>.<genome1>.sam' file names
    :param genome2: optional second genome name, used the same way
    :param blastn_filename: optional blastn output file name inside dirname

    Returns None. Raises AssertionError if a sequence in in.fa has no
    alignment record in in.trimmed.fa.cogent2.sam.
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return

    # Query lengths are needed by both the SAM reader and read_blastn; parse
    # the FASTA once and close the handle right away.
    with open(os.path.join(dirname, 'in.trimmed.fa')) as handle:
        qlen_dict = dict(
            (r.id, len(r.seq)) for r in SeqIO.parse(handle, 'fasta'))

    seq_info = defaultdict(list)
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.sam')
    reader = BioReaders.GMAPSAMReader(filename, True, query_len_dict=qlen_dict)
    for r in reader:
        seq_info[r.qID].append(r)
        contigs_seen.add(r.sID)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    with open(os.path.join(dirname, 'in.fa')) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            assert r.id in seq_info

    cogent_fasta = os.path.join(dirname, 'cogent2.fa')
    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_sam(
        cogent_fasta, os.path.join(dirname, 'cogent2.fa.' + genome1 + '.sam'))
    if genome2 is not None:
        d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_sam(
            cogent_fasta,
            os.path.join(dirname, 'cogent2.fa.' + genome2 + '.sam'))

    if blastn_filename is not None:
        # Hand over a copy so read_blastn gets its own dict, as before.
        best_of = read_blastn(os.path.join(dirname, blastn_filename),
                              dict(qlen_dict))

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    rec1 = {
        'gene_family': dirname,
        'input_size': len(seq_info),
        'num_Cogent_contigs': len(contigs_seen),
        'num_genome_contig': len(contig_genome1),
        'genome_cov': "{0:.2f}".format(cov1),
        'genome_acc': "{0:.2f}".format(acc1),
        'genome_chimeric': has_chimeric1,
        'genome_contigs': ",".join(contig_genome1)
    }

    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    if genome2 is not None:
        cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
        rec1['num_genome2_contig'] = len(contig_genome2)
        rec1['genome2_cov'] = "{0:.2f}".format(cov2)
        rec1['genome2_acc'] = "{0:.2f}".format(acc2)
        rec1['genome2_chimeric'] = has_chimeric2
        rec1['genome2_contigs'] = ",".join(contig_genome2)
    # (for blastn, optional) best name with best e-value
    if blastn_filename is not None:
        if len(best_of) == 0:
            rec1['num_blastn'] = 0
            rec1['blastn_best'] = 'NA'
        else:
            hits = list(best_of.values())  # list of (e-value, name)
            rec1['num_blastn'] = sum(_n != 'NA' for _e, _n in hits)
            # min() picks the same tuple that sorting and taking [0] would.
            rec1['blastn_best'] = '"' + min(hits)[1] + '"'
    writer1.writerow(rec1)

    in_aligned_to_genome1 = os.path.join(dirname,
                                         'in.trimmed.fa.' + genome1 + '.sam')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_sam(
            os.path.join(dirname, 'in.trimmed.fa'), in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.items():
        contigs = [x.sID for x in v]
        # identity averaged over the records, weighted by query coverage
        acc = sum(x.identity * x.qCoverage for x in v) / sum(x.qCoverage
                                                             for x in v)

        rec2 = {
            'seqid': seqid,
            'gene_family': dirname,
            'Cogent_contig': ",".join(contigs),
            'Cogent_contig_acc': acc
        }

        if seqid not in d3:
            rec2['scaffold'] = 'NA'
            rec2['num_scaffold'] = 0
            rec2['scaffold_coverage'] = 'NA'
            rec2['scaffold_acc'] = 'NA'
            if blastn_filename is not None:
                rec2['blastn_best'] = 'NA'
        else:
            hits_on_genome = d3[seqid]
            scaffolds = [x.sID for x in hits_on_genome]
            # calculate cov and acc
            c = ClusterTree(0, 0)
            for x in hits_on_genome:
                qlen = x.qLen
                c.insert(x.qStart, x.qEnd, -1)
            # percentage of the query covered by the ClusterTree regions
            cov = sum(_e - _s
                      for _s, _e, _junk in c.getregions()) * 100. / qlen
            acc = sum(x.identity * x.qCoverage
                      for x in hits_on_genome) * 1. / sum(x.qCoverage
                                                          for x in hits_on_genome)
            rec2['scaffold'] = ",".join(scaffolds)
            rec2['num_scaffold'] = len(scaffolds)
            rec2['scaffold_coverage'] = cov
            rec2['scaffold_acc'] = acc
            if blastn_filename is not None:
                rec2['blastn_best'] = best_of[seqid][1]
        writer2.writerow(rec2)
Example #4
0
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2, blastn_filename=None):
    """
    Summarise one Cogent output directory as tab-delimited text.

    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.sam)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (always read here; genome2 is required)
    4. read blastn results (only when blastn_filename is given)

    :param dirname: Cogent output directory; nothing is done unless it
        contains a COGENT.DONE file
    :param f1: writable handle; gets one line per directory
    :param f2: writable handle; gets one line per input sequence
    :param genome1: genome name used in '<file>.<genome1>.sam' file names
    :param genome2: second genome name, used the same way
    :param blastn_filename: optional blastn output file name inside dirname

    Returns None. Raises AssertionError if a sequence in in.fa has no
    alignment record in in.trimmed.fa.cogent2.sam.
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return

    # Query lengths are needed by both the SAM reader and read_blastn; parse
    # the FASTA once and close the handle right away.
    with open(os.path.join(dirname, 'in.trimmed.fa')) as handle:
        qlen_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(handle, 'fasta'))

    seq_info = defaultdict(list)
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.sam')
    reader = BioReaders.GMAPSAMReader(filename, True, query_len_dict=qlen_dict)
    for r in reader:
        seq_info[r.qID].append(r)
        contigs_seen.add(r.sID)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    with open(os.path.join(dirname, 'in.fa')) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            assert r.id in seq_info

    cogent_fasta = os.path.join(dirname, 'cogent2.fa')
    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_sam(cogent_fasta, os.path.join(dirname, 'cogent2.fa.'+genome1+'.sam'))
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_sam(cogent_fasta, os.path.join(dirname, 'cogent2.fa.'+genome2+'.sam'))

    if blastn_filename is not None:
        # Hand over a copy so read_blastn gets its own dict, as before.
        best_of = read_blastn(os.path.join(dirname, blastn_filename), dict(qlen_dict))

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info), len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(len(contig_genome1), cov1, acc1, has_chimeric1, ",".join(contig_genome1)))
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}".format(len(contig_genome2), cov2, acc2, has_chimeric2, ",".join(contig_genome2)))
    # (for blastn, optional) best name with best e-value
    if blastn_filename is not None:
        if len(best_of) == 0:
            f1.write("\t0\tNA\n")
        else:
            # Copy into a list: on Python 3 dict.values() is a view with no
            # .sort(); min() picks the same tuple that sorting and taking [0] would.
            hits = list(best_of.values())  # list of (e-value, name)
            f1.write("\t{0}\t\"{1}\"\n".format(sum(_n != 'NA' for _e, _n in hits), min(hits)[1]))
    else:
        f1.write("\n")

    in_aligned_to_genome1 = os.path.join(dirname, 'in.trimmed.fa.'+genome1+'.sam')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_sam(os.path.join(dirname, 'in.trimmed.fa'), in_aligned_to_genome1)
    else:
        d3 = {}

    # .items() works on both Python 2 and 3 (iteritems does not).
    for seqid, v in seq_info.items():
        contigs = [x.sID for x in v]
        # identity averaged over the records, weighted by query coverage
        acc = sum(x.identity*x.qCoverage for x in v)/sum(x.qCoverage for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname, ",".join(contigs), acc))

        if seqid not in d3:
            f2.write("NA\t0\tNA\tNA")
            if blastn_filename is not None: f2.write("\tNA\n")
            else: f2.write("\n")
        else:
            hits_on_genome = d3[seqid]
            scaffolds = [x.sID for x in hits_on_genome]
            # calculate cov and acc
            c = ClusterTree(0,0)
            for x in hits_on_genome:
                qlen = x.qLen
                c.insert(x.qStart, x.qEnd, -1)
            # percentage of the query covered by the ClusterTree regions
            cov = sum(_e-_s for _s,_e,_junk in c.getregions())*100./qlen
            acc = sum(x.identity*x.qCoverage for x in hits_on_genome)*1./sum(x.qCoverage for x in hits_on_genome)
            f2.write("{0}\t{1}\t{2}\t{3}".format(",".join(scaffolds), len(scaffolds), cov, acc))
            if blastn_filename is not None: f2.write("\t{0}\n".format(best_of[seqid][1]))
            else: f2.write("\n")