Beispiel #1
0
def read_cogent2_aligned_to_genome_gff(filename):
    """
    Load the GMAP GFF of cogent2 paths aligned to a genome.

    Returns a 2-tuple:
      * dict of {cogent path (record seqid)} --> list of gmap records, in
        the order the reader yields them
      * set of the genome contigs (record ``chr`` values) that were hit

    A missing file gives ``({}, set())``.  If the reader raises IndexError,
    reading stops and whatever was collected up to that point is returned.

    NOTE: gmap was run with -n 0, so a path with multiple records must be
    chimeric.
    """
    if not os.path.exists(filename):
        return {}, set()

    records_by_path = defaultdict(list)
    genome_contigs = set()
    try:
        for rec in GFF.gmapGFFReader(filename):
            records_by_path[rec.seqid].append(rec)
            genome_contigs.add(rec.chr)
    except IndexError:
        # Best effort: keep the records gathered before the reader failed.
        pass
    return dict(records_by_path), genome_contigs
Beispiel #2
0
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2):
    """
    Tally alignment statistics for one finished Cogent output directory.

    Steps:
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1 (cogent2.fa.<genome1>.gff)
    3. read cogent2 mapped to genome2 (cogent2.fa.<genome2>.gff)
    4. read input mapped to genome1 (in.trimmed.fa.<genome1>.gff)

    The GFF files of steps 2-4 are optional: a missing file is treated as
    "no alignments".  The directory is skipped entirely (nothing is read or
    written) unless it contains a ``COGENT.DONE`` marker file.

    NOTE: this function does not substitute genome1 for genome2 itself;
    presumably a caller with only one genome passes the same name twice.

    Parameters:
      dirname  -- Cogent output directory to summarize
      f1       -- writable handle; gets ONE tab-separated line for the
                  directory: dirname, # of input seqs, # of cogent contigs,
                  then for genome1 and again for genome2: # of genome
                  contigs, coverage, accuracy, has-chimeric flag (the three
                  values returned by calculate_cov_acc), and a
                  comma-separated list of genome contigs
      f2       -- writable handle; gets one tab-separated line PER input
                  sequence: seqid, dirname, comma-separated cogent contigs,
                  coverage-weighted identity against cogent2, then for the
                  genome1 alignment: comma-separated scaffolds, # of
                  scaffolds, summed coverage, coverage-weighted identity
                  (or ``NA 0 NA NA`` when the sequence has no genome1 hit)
      genome1, genome2 -- genome names used to build the GFF file names

    Returns None.

    Raises AssertionError if a sequence in in.fa has no record in
    in.trimmed.fa.cogent2.gff.
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return

    # input mapped to Cogent contigs
    seq_info = defaultdict(list)
    contigs_seen = set()
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.gff')
    for r in GFF.gmapGFFReader(filename):
        seq_info[r.seqid].append(r)
        contigs_seen.add(r.chr)

    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    # (the handle is closed deterministically instead of being leaked)
    with open(os.path.join(dirname, 'in.fa')) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            assert r.id in seq_info, \
                "{0} from in.fa has no record in {1}".format(r.id, filename)

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'cogent2.fa.' + genome1 + '.gff'))
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'cogent2.fa.' + genome2 + '.gff'))

    # write:
    # dirname, # of input, # of cogent contig
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info),
                                      len(contigs_seen)))
    # (for genome1), # of contig, cov, acc, is_chimeric, comma-separated list of contigs
    # NOTE: the contig lists come from sets, so their order is unspecified.
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
        len(contig_genome1), cov1, acc1, has_chimeric1,
        ",".join(contig_genome1)))
    # (for genome2), # of contig, cov, acc, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\n".format(
        len(contig_genome2), cov2, acc2, has_chimeric2,
        ",".join(contig_genome2)))

    # input mapped directly to genome1; the reader already returns an empty
    # dict when the file is absent, so no separate existence check is needed
    d3, _ = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'in.trimmed.fa.' + genome1 + '.gff'))

    # .items() (not the Python 2-only .iteritems()) works on Python 2 and 3
    for seqid, v in seq_info.items():
        contigs = [x.chr for x in v]
        # coverage-weighted mean identity over this sequence's cogent2 hits
        acc = sum(x.identity * x.coverage for x in v) / sum(x.coverage
                                                            for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname,
                                               ",".join(contigs), acc))

        if seqid not in d3:
            f2.write("NA\t0\tNA\tNA\n")
        else:
            hits = d3[seqid]
            scaffolds = [x.chr for x in hits]
            cov = sum(x.coverage for x in hits)
            acc = sum(x.identity * x.coverage for x in hits) / cov
            f2.write("{0}\t{1}\t{2}\t{3}\n".format(",".join(scaffolds),
                                                   len(scaffolds), cov, acc))