Esempio n. 1
0
    def __init__(self, top_level_directory, reference_fname, assembler_name, protein_identity, alignment_length, min_gene_coverage, do_not_trim=False, testmode='none') :
        """Prepare directories, project metadata and filter settings.

        Args:
            top_level_directory: project directory. Alignments are expected in
                its 'alignments' subdirectory; output goes to 'postprocessing'
                and the 'scaffolds', 'genefamily_msa' and 'gene_msa'
                directories below it.
            reference_fname: passed to GluttonDB to open the reference.
            assembler_name: passed to AssemblerOutput.
            protein_identity: stored as self.protein_identity.
            alignment_length: stored as self.alignment_length.
            min_gene_coverage: stored as self.min_gene_coverage.
            do_not_trim: self.trim is set to the negation of this flag.
            testmode: stored as self.testmode.

        The process exits with status 1 when the parameters object reports
        that the alignments were made against a different reference.
        """
        self.alignments_dir     = join(top_level_directory, 'alignments')
        self.output_dir         = join(top_level_directory, 'postprocessing')
        self.protein_identity   = protein_identity
        self.alignment_length   = alignment_length
        self.min_gene_coverage  = min_gene_coverage
        self.trim               = not do_not_trim
        self.testmode           = testmode

        self.scaffold_dir       = join(self.output_dir, 'scaffolds')
        self.genefamily_msa_dir = join(self.output_dir, 'genefamily_msa')
        self.gene_msa_dir       = join(self.output_dir, 'gene_msa')

        # parent first, so the sub-directories can be created inside it
        for directory in (self.output_dir,
                          self.scaffold_dir,
                          self.genefamily_msa_dir,
                          self.gene_msa_dir) :
            check_dir(directory, create=True)

        self.log = get_log()

        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.info = GluttonInformation(self.alignments_dir, self.param, self.db)

        # check reference was the same
        if not self.param.same_reference(self.db) :
            self.log.error("current reference is %s, alignments were performed using %s" % (reference_fname, self.db.filename))
            exit(1)

        # perhaps slightly overambitious to exit, just stick to a warning
        pending, _failures = self.info.num_alignments_not_done()
        if pending != 0 :
            self.log.warn("%d alignments were not run!" % pending)

        self.assembler = AssemblerOutput(assembler_name)

        # e.g. query39806_orf1
        # raw string: the backslashes belong to the regex, not to Python
        self.orfname_regex = re.compile(r"^(query\d+)\_orf(\d)$")
Esempio n. 2
0
class Scaffolder(object) :
    def __init__(self, top_level_directory, reference_fname, assembler_name, protein_identity, alignment_length, min_gene_coverage, do_not_trim=False, testmode='none') :
        """Prepare directories, project metadata and filter settings.

        Args:
            top_level_directory: project directory. Alignments are read from
                its 'alignments' subdirectory; output goes to 'postprocessing'
                and the 'scaffolds', 'genefamily_msa' and 'gene_msa'
                directories below it.
            reference_fname: passed to GluttonDB to open the reference.
            assembler_name: passed to AssemblerOutput, whose match() is used
                to extract gene names from contig ids.
            protein_identity: minimum protein identity; alignments scoring
                below it are dropped in read_alignment/process_alignments.
            alignment_length: minimum number of overlapping bases required by
                process_alignments.
            min_gene_coverage: minimum gene coverage required by
                process_alignments.
            do_not_trim: when True, sequences are not truncated at stop
                codons (self.trim is the negation of this flag).
            testmode: 'none' selects consensus_for_msa_glutton; any other
                value makes consensus_for_msa pick a single best alignment
                ('length', 'identity', otherwise coverage).

        The process exits with status 1 when the parameters object reports
        that the alignments were made against a different reference.
        """
        self.alignments_dir     = join(top_level_directory, 'alignments')
        self.output_dir         = join(top_level_directory, 'postprocessing')
        self.protein_identity   = protein_identity
        self.alignment_length   = alignment_length
        self.min_gene_coverage  = min_gene_coverage
        self.trim               = not do_not_trim
        self.testmode           = testmode

        self.scaffold_dir       = join(self.output_dir, 'scaffolds')
        self.genefamily_msa_dir = join(self.output_dir, 'genefamily_msa')
        self.gene_msa_dir       = join(self.output_dir, 'gene_msa')

        # parent first, so the sub-directories can be created inside it
        for directory in (self.output_dir,
                          self.scaffold_dir,
                          self.genefamily_msa_dir,
                          self.gene_msa_dir) :
            check_dir(directory, create=True)

        self.log = get_log()

        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.info = GluttonInformation(self.alignments_dir, self.param, self.db)

        # check reference was the same
        if not self.param.same_reference(self.db) :
            self.log.error("current reference is %s, alignments were performed using %s" % (reference_fname, self.db.filename))
            exit(1)

        # perhaps slightly overambitious to exit, just stick to a warning
        pending, _failures = self.info.num_alignments_not_done()
        if pending != 0 :
            self.log.warn("%d alignments were not run!" % pending)

        self.assembler = AssemblerOutput(assembler_name)

        # e.g. query39806_orf1
        # raw string: the backslashes belong to the regex, not to Python
        self.orfname_regex = re.compile(r"^(query\d+)\_orf(\d)$")

    def stop(self):
        """Do nothing; the method body is intentionally empty."""
        return None

    def _orf_to_query_name(self, name) :
        m = self.orfname_regex.match(name)

        if not m :
            raise ScaffolderError("unexpected query name (%s)" % name)

        return m.group(1)

    def _assembler_gene_name(self, name) :
        m = self.assembler.match(name)

        if not m :
            raise ScaffolderError("unexpected gene name (%s)" % name)

        if m.group(1) :
            return m.group(1)
        else :
            return m.group(2)

    # read the alignment file and return a dictionary keyed on the gene name
    #   - there might be multiple orfs for a single query sequence, so keep a track of the best one
    #@profile
    def read_alignment(self, fname) :
        """Read one alignment FASTA file and collect the usable contig hits.

        Records whose description does not start with 'query' are reference
        genes. Every other record is a query ORF and is evaluated against the
        most recently read reference gene. A query is kept only if its
        alignment is non-empty, overlaps the gene by at least 100 bases and
        reaches self.protein_identity. Several ORFs may exist per contig;
        only the one with the highest identity is kept for each gene.

        Args:
            fname: path of the alignment file (parsed with SeqIO as FASTA).

        Returns:
            (alignments, genes) where alignments maps a gene name to a list
            of Alignment objects and genes is the list of
            (gene_name, gene_seq) tuples in file order.

        Raises:
            ScaffolderError: if a query or contig name has an unexpected
                format, or a usable query record appears before any
                reference gene record.
        """
        best_hits = defaultdict(dict)
        genes = []

        # state of the most recently read reference gene
        have_reference = False
        gene_name = gene_prot = None
        gene_start = gene_end = None

        for s in SeqIO.parse(fname, 'fasta') :
            if not s.description.startswith('query') :
                gene_id     = s.description
                gene_name   = self.db.get_genename_from_geneid(gene_id)
                gene_seq    = str(s.seq)
                gene_prot   = translate(gene_seq)
                gene_start,gene_end = sequence_limits(gene_prot)
                have_reference = True

                genes.append((gene_name, gene_seq))
                continue

            query_id        = self._orf_to_query_name(s.description)
            contig_id,label = self.info.get_contig_from_query(query_id)
            species         = self.param.get_species(label)
            assembler_geneid = self._assembler_gene_name(contig_id)

            seq = str(s.seq).replace('N', '-')

            contig_start,contig_end = sequence_limits(seq)

            # pagan bug?
            if (contig_end - contig_start) == 0 :
                continue

            # without this the code below fails with an UnboundLocalError
            if not have_reference :
                raise ScaffolderError("query %s found before any reference gene in %s" % (s.description, fname))

            # contig limits are in nucleotides, gene limits in amino acids;
            # floor division keeps the values usable as slice indices
            overlap_start = max(contig_start // 3, gene_start)
            overlap_end   = min(contig_end   // 3, gene_end)

            # require all alignments to firmly overlap gene
            if ((overlap_end - overlap_start) * 3) < 100 :
                continue

            ref_identity = self.protein_similarity(gene_prot,
                                                   translate(seq),
                                                   overlap_start,
                                                   overlap_end)

            # user defined identity in protein space
            if ref_identity < self.protein_identity :
                continue

            # if we have seen this contig before, keep the better ORF
            # (identity is the last element of the stored tuple)
            if (contig_id in best_hits[gene_name]) and (ref_identity < best_hits[gene_name][contig_id][-1]) :
                continue

            # first three need to be seq, contig_start, contig_end
            best_hits[gene_name][contig_id] = (seq, contig_start, contig_end, label, species, assembler_geneid, s.description, ref_identity)


        # convert from a dict of dicts to a dict of lists
        alignments = defaultdict(list)

        self.log.debug("read %s" % fname)
        for gene in best_hits :
            self.log.debug("\tgene = %s" % gene)

            for contig in best_hits[gene] :
                seq,start,end,label,species,assembler_geneid,queryid,ref_identity = best_hits[gene][contig]
                self.log.debug("\t\tquery id = %s (%d,%d)" % (queryid,start,end))

                try :
                    alignments[gene].append(Alignment(contig, assembler_geneid, gene, start, end, seq, label, species))

                except ScaffolderError :
                    self.log.debug("empty sequence %s in %s" % (queryid, fname))
                    continue

        return alignments, genes

    def group_alignments(self, alignments):
        """Cluster alignments that overlap and come from the same input file.

        Each alignment joins the first existing group containing a member it
        overlaps with (member.overlaps) and shares a file with
        (member.from_same_file); otherwise it starts a new group.

        Returns:
            A list of groups, each group being a list of alignments.
        """
        groups = []

        for align in alignments:
            for group in groups:
                joins = any(member.overlaps(align) and member.from_same_file(align)
                            for member in group)
                if joins:
                    group.append(align)
                    break
            else:
                # no existing group accepted this alignment
                groups.append([align])

        return groups

    def group_alignments_by_file(self, alignments):
        """Bucket alignments by their label attribute.

        Returns:
            A defaultdict(list) mapping each label to its alignments, in
            input order.
        """
        by_label = defaultdict(list)

        for item in alignments:
            bucket = by_label[item.label]
            bucket.append(item)

        return by_label

    def group_cannot_be_merged_isoforms(self, group) :
        return self.group_cannot_be_merged(group, consider_isoforms=True)

    def group_cannot_be_merged(self, group, consider_isoforms=False):
        """Tell whether any overlapping pair in the group blocks a merge.

        Every unordered pair of group members is inspected. For pairs that
        overlap, the blocking test is a.isoforms(b) when consider_isoforms is
        set, and "not a.mergeable(b)" otherwise.

        Returns:
            True as soon as one blocking pair is found, else False.
        """
        for position, first in enumerate(group):
            for second in group[position + 1:]:
                if not first.overlaps(second):
                    continue

                if consider_isoforms:
                    blocked = first.isoforms(second)
                else:
                    blocked = not first.mergeable(second)

                if blocked:
                    return True

        return False

    def merge_alignments(self, alignments) :
        global DEBUG

        #if self.testmode != 'none' :
        #    return alignments

        # perform merges of overlaps first
        unmerged_labels = set()
        merged_groups = []

        def label_all(group, label) :
            for i in group :
                i.desc = label

        for group in self.group_alignments(alignments) :
            no_print = False

            # if the assembler thinks these two contigs are isoforms of the
            # same gene then we should not attempt to merge them
            if self.group_cannot_be_merged_isoforms(group) :
                unmerged_labels.add(group[0].label)

                if len(set([ g.gene_id for g in group ])) == 1 :
                    label_all(group, 'single_gene_multiple_isoform')
                else :
                    label_all(group, 'multiple_gene_multiple_isoform')

                merged_groups += group

                no_print = True

            # some contigs mapping to the same region of the gene cannot be
            # merged because the differences between them is too large
            # (defined in Alignment class)
            elif self.group_cannot_be_merged(group) :
                unmerged_labels.add(group[0].label)
                label_all(group, 'conflict')
                merged_groups += group

            # these can be merged trivially
            else :
                merged_groups.append( reduce(operator.add, sorted(group, key=lambda x : x.start)) )


            #if DEBUG and (not no_print) and (len(group) > 1) :
            #    self.print_alignments(group)


        # if there were any conflicts for a given gene in an alignment, then 
        # just output what has been merged, if there were no conflicts for 
        # a gene then concatenate the islands of alignments with N's
        tmp = []
        grouped_by_file = self.group_alignments_by_file(merged_groups)
        for label in grouped_by_file :
            if label in unmerged_labels :
                tmp += grouped_by_file[label]
            else :
                tmp.append( reduce(operator.add, sorted(grouped_by_file[label], key=lambda x : x.start)) )


        return tmp

    def print_alignments(self, alignments) :
        """Append a debug dump of the alignments to glutton_debug_scaffolds.txt.

        One "<id>TAB<seq>" line is written per alignment, followed by a
        "difference" line with one character per column and a blank line.
        The difference characters are:
          'X'  the column mixes a gap ('-') with other characters,
          ' '  the column has fewer than two distinct non-empty characters,
          'M'  the column has several distinct characters and no gap.

        Args:
            alignments: objects with id and seq attributes that can be
                indexed by column. Nothing is written for an empty list.
        """
        if not alignments :
            return

        lines = [ "%s\t%s" % (a.id, a.seq) for a in alignments ]

        alignment_diff = []

        for column in range(len(alignments[0].seq)) :
            # falsy cells are ignored, as before
            cells = [ a[column] for a in alignments ]
            chars = set(c for c in cells if c)

            if ('-' in chars) and (len(chars) > 1) :
                alignment_diff.append('X')
            elif len(chars) < 2 :
                alignment_diff.append(' ')
            else :
                alignment_diff.append('M')

        lines.append("difference       \t%s" % "".join(alignment_diff))
        lines.append("")

        # 'with' closes the file even if the write fails
        with open('glutton_debug_scaffolds.txt', 'a') as f :
            f.write("\n".join(lines) + "\n")

    # XXX now in Alignment class
    def trim_at_ATG(self, seq, pos):
        """Blank out everything before the last start codon at or before pos.

        Codon offsets 0, 3, 6, ... below pos + 3 are scanned from the highest
        downwards. The first offset whose codon equals start_codon becomes
        the cut point; if none matches, pos itself is used. The prefix before
        the cut point is replaced by '-' characters.

        Returns:
            The sequence with its prefix replaced by gaps.
        """
        candidates = reversed(range(0, pos + 3, 3))
        cut = next((offset for offset in candidates
                    if seq[offset : offset + 3] == start_codon),
                   pos)

        masked_prefix = '-' * cut
        return masked_prefix + seq[cut:]

    def consensus_for_msa(self, reference, alignments, bamfiles) :
        """Build the sequence that represents one species in the gene MSA.

        With a single alignment, that alignment is used. With several:
          - testmode 'none' delegates to consensus_for_msa_glutton;
          - any other testmode picks exactly one alignment: the longest
            ('length'), the one with the highest protein identity to the
            reference ('identity'), or otherwise the one with the highest
            read count per base.
        The chosen alignment is trimmed at the start codon and, when
        self.trim is set, truncated at its stop codon.

        Args:
            reference: reference alignment providing seq, start and end.
            alignments: the alignments of one species for this gene.
            bamfiles: mapping from label to an object whose count(name)
                returns a read count; labels may be missing.

        Returns:
            An Alignment2 built from the chosen alignment.
        """
        def finish(a) :
            a.trim_at_ATG(reference.start)
            if self.trim :
                a.truncate_at_stop_codon()
            return Alignment2(a.species, a.gene_name, a.seq, [a.contig_id])

        if len(alignments) == 1 :
            return finish(alignments[0])

        if self.testmode == 'none' :
            return self.consensus_for_msa_glutton(reference, alignments, bamfiles)

        gene_prot = translate(reference.seq)
        ranking = []

        for index, a in enumerate(alignments) :
            length = len(a.seq.replace('-', ''))

            # identity; limits are nucleotide positions, hence the // 3
            overlap_start = max(a.start, reference.start)
            overlap_end   = min(a.end  , reference.end  )

            identity = self.protein_similarity(gene_prot, translate(a.seq), overlap_start // 3, overlap_end // 3)

            # coverage (depth); default to one read so that every alignment
            # gets a value even when its label has no bam file (previously
            # nothing was recorded, shifting the values onto the wrong
            # alignments)
            reads = 1
            if a.label in bamfiles :
                try :
                    reads = bamfiles[a.label].count(str(a.id).split()[0])
                except Exception :
                    # best effort, same fallback as before
                    reads = 1

            coverage = reads / float(length) if length else 0.0

            # the index comes last: it breaks ties and identifies the winner
            if self.testmode == 'length' :
                ranking.append((length, identity, coverage, index))
            elif self.testmode == 'identity' :
                ranking.append((identity, length, coverage, index))
            else :
                ranking.append((coverage, identity, length, index))

        top_hit = sorted(ranking)[-1][-1]

        return finish(alignments[top_hit])

    #@profile
    def consensus_for_msa_glutton(self, reference, alignments, bamfiles) :
        """Overlay the alignments of one species into a single sequence.

        Each alignment gets a coverage value: reads (bamfiles[label].count of
        its contig_id, or 1 when there is no bam file or count raises
        ValueError) divided by its ungapped length. Alignments are then
        written over a gap-only sequence in order of increasing coverage, so
        better covered contigs overwrite less covered ones. 'N' characters
        never overwrite what is already there. Alignments with equal coverage
        are applied in input order.

        Every alignment is trimmed at the start codon and, when self.trim is
        set, truncated at its stop codon before being overlaid.

        Args:
            reference: reference alignment; only its start is used.
            alignments: the alignments of one species for this gene.
            bamfiles: mapping from label to an object with count(name).

        Returns:
            An Alignment2 holding the overlaid sequence and the contig ids of
            all alignments.
        """
        if len(alignments) == 1 :
            a = alignments[0]
            a.trim_at_ATG(reference.start)
            if self.trim :
                a.truncate_at_stop_codon()
            return Alignment2(a.species, a.gene_name, a.seq, [a.contig_id])

        # this is buggy if within a species there are FAKE and real bam files
        coverage = []

        for a in alignments :
            numerator = 1
            denominator = len(a.seq.replace('-', ''))

            if a.label in bamfiles :
                try :
                    # BWA only uses the fasta id, but we need to store the complete
                    # description line as an id because soapdenovotrans does not provide
                    # the locus information in the first token, but the second
                    numerator = bamfiles[a.label].count(a.contig_id)

                except ValueError :
                    pass

            coverage.append(numerator / float(denominator))


        s = "-" * len(alignments[0].seq)

        # sort on coverage only: comparing the alignment objects themselves
        # on ties is arbitrary (and not possible on Python 3)
        ranked = sorted(zip(coverage, alignments), key=lambda pair : pair[0])

        for cov,a in ranked :
            a.trim_at_ATG(reference.start)
            if self.trim :
                a.truncate_at_stop_codon()
            subseq = a.seq[a.start:a.end]

            if 'N' in subseq :
                # keep what is already in the consensus wherever this contig
                # only has an N
                subseq = "".join([ (c if c != 'N' else s[a.start + ind])
                                   for ind,c in enumerate(subseq) ])

            s = s[:a.start] + subseq + s[a.end:]

        return Alignment2(alignments[0].species, alignments[0].gene_name, s, [ a.contig_id for a in alignments ])

    def remove_common_gaps(self, alignment):
        """Ask every row to drop the columns that are empty in all rows.

        A column is dropped when each row has '-' or 'N' there. The sorted
        list handed to remove_chars also contains the sentinels -1 and the
        length of the first row's sequence.

        Returns:
            The same alignment list, after calling remove_chars on each row.
        """
        row_count = len(alignment)
        width = len(alignment[0].seq)

        columns = zip(*[row.seq for row in alignment])
        shared_gaps = [position
                       for position, column in enumerate(columns)
                       if column.count('-') + column.count('N') == row_count]

        indices = sorted([-1, width] + shared_gaps)

        for row in alignment:
            row.remove_chars(indices)

        return alignment

    def nucleotide_overlap(self, ref, query, start, end):
        """Count the informative columns of query within [start, end).

        A column is skipped when both sequences have a gap there, or when
        the query has an 'N'. Returns 0 when end <= start.
        """
        if end <= start:
            return 0

        window = zip(query[start:end], ref[start:end])

        return sum(1
                   for q_char, r_char in window
                   if q_char != 'N' and not (q_char == '-' and r_char == '-'))

    def gene_coverage(self, ref, query) :
        """Return the fraction of the reference span covered by the query.

        Columns from ref.start up to (not including) ref.end are inspected.
        Columns where both sequences have a gap are ignored. A column counts
        as covered when it lies inside [query.start, query.end) and the query
        character is not 'N'.

        Args:
            ref: object with seq, start and end.
            query: object with seq, start and end.

        Returns:
            covered / counted columns as a float; 0.0 when no column is
            counted (empty span or gap-only columns).
        """
        total = 0
        covered = 0

        for i in range(ref.start, ref.end) :
            cq = query.seq[i]
            cr = ref.seq[i]

            if cq == '-' and cr == '-' :
                continue

            if (query.start <= i < query.end) and (cq != 'N') :
                covered += 1

            total += 1

        # nothing to measure against: report no coverage instead of
        # dividing by zero
        if total == 0 :
            return 0.0

        return covered / float(total)

    def protein_similarity(self, ref, query, start, end) :
        """Return the fraction of identical residues within [start, end).

        Columns where both sequences have a gap, and columns where the query
        has an 'X', are left out of the comparison.

        Args:
            ref: reference protein sequence.
            query: query protein sequence, aligned to ref.
            start: first column to compare.
            end: column after the last one to compare.

        Returns:
            identical / compared columns as a float; 0.0 when end <= start
            or when no column is left to compare.
        """
        if end <= start :
            return 0.0

        identical = 0
        compared = 0

        for cq,cr in zip(query[start:end], ref[start:end]) :
            if cq == '-' and cr == '-' :
                continue

            if cq == 'X' :
                continue

            if cq == cr :
                identical += 1

            compared += 1

        # every column was skipped: avoid dividing by zero
        if compared == 0 :
            return 0.0

        return identical / float(compared)

    def process_alignments(self, output_files, bam_files) :
        """Turn every alignment file into scaffolds and sequence alignments.

        For each 'glutton*.nucleotide' file in self.alignments_dir:
          1. the file is read with read_alignment;
          2. per gene, the contigs are merged with merge_alignments and each
             result is written (format_contig) to output_files[label];
          3. per gene and species, a consensus is built with
             consensus_for_msa and kept only if it overlaps the reference by
             at least self.alignment_length bases, covers at least
             self.min_gene_coverage of the gene and reaches
             self.protein_identity;
          4. if at least one consensus was kept, the gene family alignment
             is written to self.genefamily_msa_dir as msa<N>.fasta, and each
             gene with more than one sequence is written to
             self.gene_msa_dir as <gene_name>.fasta.
        Progress is reported on stderr.

        Args:
            output_files: mapping from label to a writable file object.
            bam_files: passed through to consensus_for_msa.

        Returns:
            A defaultdict(set) mapping each label to the ids of the
            alignments that read_alignment returned for it.
        """
        progress = "\rINFO processed %d / %d alignments "

        counter = -1
        aligned_contigs = defaultdict(set)

        alignment_files = glob(join(self.alignments_dir, 'glutton*.nucleotide'))

        complete_files = 0
        total_files = len(alignment_files)

        stderr.write(progress % (complete_files, total_files))
        stderr.flush()

        for fname in alignment_files :
            contigs, genes = self.read_alignment(fname)
            merged_contigs = defaultdict(dict)

            # for each gene, merge the contigs from the same input file
            # and write to output
            for gene_name in contigs :
                for a in contigs[gene_name] :
                    aligned_contigs[a.label].add(a.id)
                    merged_contigs[gene_name].setdefault(a.species, []).append(a)

                for a in self.merge_alignments(contigs[gene_name]) :
                    # one record per line, as 'print' did
                    output_files[a.label].write("%s\n" % (a.format_contig(),))

            # merge sequences from the same species
            # find stop codon and truncate sequences
            # delete columns with only gaps
            # then write out to a file in self.output_dir
            # in MSA have >species_name contents=gluttonX,gluttonY,gluttonZ
            new_alignment = []
            non_reference_seq = 0

            for gene_name,gene_seq in genes :
                ref = Alignment2(self.db.species, gene_name, gene_seq, [gene_name])
                new_alignment.append(ref)

                ref.prot_id = 1.0
                ref.coverage = 1.0

                for species in merged_contigs[gene_name] :
                    try :
                        tmp = self.consensus_for_msa(ref, merged_contigs[gene_name][species], bam_files)

                    except ScaffolderError :
                        continue

                    overlap_start = max(tmp.start, ref.start)
                    overlap_end   = min(tmp.end  , ref.end  )

                    overlap_bases = self.nucleotide_overlap(ref.seq, tmp.seq, overlap_start, overlap_end)

                    if overlap_bases < self.alignment_length :
                        continue

                    coverage = self.gene_coverage(ref, tmp)

                    if coverage < self.min_gene_coverage :
                        continue

                    # limits are nucleotide positions; floor division turns
                    # them into protein positions usable as slice indices
                    prot_identity = self.protein_similarity(translate(ref.seq),
                                                            translate(tmp.seq),
                                                            overlap_start // 3,
                                                            overlap_end // 3)

                    if prot_identity < self.protein_identity :
                        continue

                    tmp.prot_id = prot_identity
                    tmp.coverage = coverage

                    new_alignment.append(tmp)
                    non_reference_seq += 1


            if non_reference_seq != 0 :
                counter += 1

                self.write_alignment(join(self.genefamily_msa_dir, "msa%d.fasta" % counter), new_alignment)

                subalignments = defaultdict(list)

                for a in new_alignment :
                    subalignments[a.gene_name].append(a)

                for k,v in subalignments.items() :
                    # a gene with only its reference sequence is not written
                    if len(v) > 1 :
                        self.write_alignment(join(self.gene_msa_dir, "%s.fasta" % k), v)


            complete_files += 1
            stderr.write(progress % (complete_files, total_files))
            stderr.flush()

        stderr.write((progress + "\n") % (complete_files, total_files))
        stderr.flush()

        self.log.info("created %d multiple sequence alignments" % (counter + 1))

        return aligned_contigs