Beispiel #1
0
    def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
        """Store the alignment thresholds and load the project state.

        The trailing comment on each threshold names the tool/stage the
        original author associated it with (glutton, blast, pagan).
        All alignment output lives in '<top_level_directory>/alignments'.
        """
        self.directory = join(top_level_directory, 'alignments')
        self.min_length = min_length # glutton
        self.min_hitidentity = min_hitidentity # blast 
        self.min_hitlength = min_hitlength # blast
        self.max_evalue = max_evalue # blast
        self.min_alignidentity = min_alignidentity # pagan
        self.min_alignoverlap = min_alignoverlap # pagan

        # presumably creates the directory when it is missing -- confirm in check_dir
        check_dir(self.directory, create=True)

        self.search = All_vs_all_search(batch_size)
        self.cleanup_files = []
        # no work queue yet
        self.q = None

        # lock and job counters start at zero; their users are not visible
        # in this snippet
        self.lock = threading.Lock()
        self.complete_jobs = 0
        self.total_jobs = 0

        self.log = get_log()

        # the reference is registered with the parameters *before* asking
        # whether a resume is possible, and the checksum is only set after
        # the information object has been built -- keep this order
        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.param.set_reference(self.db)

        self.resume = self.param.able_to_resume()

        self.info = GluttonInformation(self.directory, self.param, self.db, resume=self.resume)
        self.param.set_full_checksum()
Beispiel #2
0
    def __init__(self, top_level_directory, reference_fname, assembler_name, protein_identity, alignment_length, min_gene_coverage, do_not_trim=False, testmode='none') :
        """Store the post-processing thresholds and load the project state.

        Reads from '<top_level_directory>/alignments' and prepares the
        'postprocessing' output directory with its three sub-directories.
        Terminates the process with status 1 when the parameters report
        that a different reference was used for the alignments.
        """
        self.alignments_dir     = join(top_level_directory, 'alignments')
        self.output_dir         = join(top_level_directory, 'postprocessing')
        self.protein_identity   = protein_identity
        self.alignment_length   = alignment_length
        self.min_gene_coverage  = min_gene_coverage
        # the flag is stored inverted: trim unless told not to
        self.trim               = not do_not_trim
        self.testmode           = testmode

        self.scaffold_dir       = join(self.output_dir, 'scaffolds')
        self.genefamily_msa_dir = join(self.output_dir, 'genefamily_msa')
        self.gene_msa_dir       = join(self.output_dir, 'gene_msa')

        # presumably creates each directory when missing -- confirm in check_dir
        check_dir(self.output_dir, create=True)
        check_dir(self.scaffold_dir, create=True)
        check_dir(self.genefamily_msa_dir, create=True)
        check_dir(self.gene_msa_dir, create=True)

        self.log = get_log()

        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.info = GluttonInformation(self.alignments_dir, self.param, self.db)

        # check reference was the same
        # NOTE(review): self.db was built from reference_fname, so both
        # values in the message may name the same file -- confirm which
        # attribute holds the reference used for the alignments
        if not self.param.same_reference(self.db) :
            self.log.error("current reference is %s, alignments were performed using %s" % (reference_fname, self.db.filename))
            # NOTE(review): bare exit() with a stray ';' -- sys.exit(1)
            # would not depend on how 'exit' is brought into scope
            exit(1);

        # perhaps slightly overambitious to exit, just stick to a warning      
        # 'failures' is unpacked but not used below
        pending,failures = self.info.num_alignments_not_done()
        if pending != 0 :
            self.log.warn("%d alignments were not run!" % pending)

        self.assembler = AssemblerOutput(assembler_name)

        # e.g. query39806_orf1
        # group 1 = query name, group 2 = single-digit ORF number
        self.orfname_regex = re.compile("^(query\d+)\_orf(\d)$")
Beispiel #3
0
class Scaffolder(object) :
    def __init__(self, top_level_directory, reference_fname, assembler_name, protein_identity, alignment_length, min_gene_coverage, do_not_trim=False, testmode='none'):
        """Store the post-processing thresholds and load the project state.

        Parameters
        ----------
        top_level_directory : project directory; alignments are read from
            its 'alignments' sub-directory, results go to 'postprocessing'.
        reference_fname : file name handed to GluttonDB.
        assembler_name : name handed to AssemblerOutput.
        protein_identity, alignment_length, min_gene_coverage : thresholds
            stored for later filtering.
        do_not_trim : stored inverted as ``self.trim``.
        testmode : stored as-is; 'none' is the default.

        Exits the process with status 1 when the stored parameters report
        that the alignments were made against a different reference.
        """
        # local import: only needed on the fatal-error path and keeps this
        # method independent of how the module imports 'exit'
        import sys

        self.alignments_dir = join(top_level_directory, 'alignments')
        self.output_dir = join(top_level_directory, 'postprocessing')
        self.protein_identity = protein_identity
        self.alignment_length = alignment_length
        self.min_gene_coverage = min_gene_coverage
        self.trim = not do_not_trim
        self.testmode = testmode

        self.scaffold_dir = join(self.output_dir, 'scaffolds')
        self.genefamily_msa_dir = join(self.output_dir, 'genefamily_msa')
        self.gene_msa_dir = join(self.output_dir, 'gene_msa')

        # the parent directory must be handled first
        for directory in (self.output_dir,
                          self.scaffold_dir,
                          self.genefamily_msa_dir,
                          self.gene_msa_dir):
            check_dir(directory, create=True)

        self.log = get_log()

        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.info = GluttonInformation(self.alignments_dir, self.param, self.db)

        # check reference was the same
        # NOTE(review): self.db was built from reference_fname, so both
        # values in the message may name the same file -- confirm which
        # attribute holds the reference used for the alignments
        if not self.param.same_reference(self.db):
            self.log.error("current reference is %s, alignments were performed using %s" % (reference_fname, self.db.filename))
            sys.exit(1)

        # perhaps slightly overambitious to exit, just stick to a warning
        pending, _failures = self.info.num_alignments_not_done()
        if pending != 0:
            self.log.warn("%d alignments were not run!" % pending)

        self.assembler = AssemblerOutput(assembler_name)

        # e.g. query39806_orf1
        # raw string: same pattern as before, without relying on Python
        # passing unknown string escapes (\d, \_) through unchanged
        self.orfname_regex = re.compile(r"^(query\d+)\_orf(\d)$")

    def stop(self):
        """Do nothing; there is no work in progress to stop here."""
        return None

    def _orf_to_query_name(self, name) :
        m = self.orfname_regex.match(name)

        if not m :
            raise ScaffolderError("unexpected query name (%s)" % name)

        return m.group(1)

    def _assembler_gene_name(self, name) :
        m = self.assembler.match(name)

        if not m :
            raise ScaffolderError("unexpected gene name (%s)" % name)

        if m.group(1) :
            return m.group(1)
        else :
            return m.group(2)

    # read the alignment file and return a dictionary keyed on the gene name
    #   - there might be multiple orfs for a single query sequence, so keep a track of the best one
    #@profile
    def read_alignment(self, fname):
        """Read one alignment FASTA file and pick the best ORF per contig.

        Records whose description does not start with 'query' are treated
        as reference genes; every following query record is evaluated
        against the most recently read reference gene.

        Returns a pair ``(alignments, genes)``:
          - alignments: defaultdict mapping gene name -> list of Alignment
          - genes: list of ``(gene_name, gene_sequence)`` in file order

        Raises ScaffolderError for a malformed query/gene name, or when a
        usable query record appears before any reference gene.
        """
        best = defaultdict(dict)
        genes = []

        # state of the most recently read reference gene
        gene_name = gene_prot = gene_start = gene_end = None

        for s in SeqIO.parse(fname, 'fasta'):
            if not s.description.startswith('query'):
                gene_name = self.db.get_genename_from_geneid(s.description)
                gene_seq = str(s.seq)
                gene_prot = translate(gene_seq)
                gene_start, gene_end = sequence_limits(gene_prot)

                genes.append((gene_name, gene_seq))
                continue

            query_id = self._orf_to_query_name(s.description)
            contig_id, label = self.info.get_contig_from_query(query_id)
            species = self.param.get_species(label)
            assembler_geneid = self._assembler_gene_name(contig_id)

            seq = str(s.seq).replace('N', '-')

            contig_start, contig_end = sequence_limits(seq)

            # pagan bug?
            if (contig_end - contig_start) == 0:
                continue

            # without a reference gene there is nothing to compare against;
            # fail with a clear message instead of an UnboundLocalError
            if gene_prot is None:
                raise ScaffolderError("query %s appears before any reference gene in %s" % (s.description, fname))

            # nucleotide positions -> codon positions (explicit floor division)
            overlap_start = max(contig_start // 3, gene_start)
            overlap_end = min(contig_end // 3, gene_end)

            # require all alignments to firmly overlap gene
            if ((overlap_end - overlap_start) * 3) < 100:
                continue

            ref_identity = self.protein_similarity(gene_prot,
                                                   translate(seq),
                                                   overlap_start,
                                                   overlap_end)

            # user defined identity in protein space
            if ref_identity < self.protein_identity:
                continue

            # if we have seen this contig before, keep the better ORF
            # (the identity is the last element of the stored tuple)
            if (contig_id in best[gene_name]) and (ref_identity < best[gene_name][contig_id][-1]):
                continue

            # first three need to be seq, contig_start, contig_end
            best[gene_name][contig_id] = (seq, contig_start, contig_end, label, species, assembler_geneid, s.description, ref_identity)

        # convert from a dict of dicts to a dict of lists
        alignments = defaultdict(list)

        self.log.debug("read %s" % fname)
        for gene in best:
            self.log.debug("\tgene = %s" % gene)

            for contig in best[gene]:
                seq, start, end, label, species, assembler_geneid, queryid, ref_identity = best[gene][contig]
                self.log.debug("\t\tquery id = %s (%d,%d)" % (queryid, start, end))

                try:
                    alignments[gene].append(Alignment(contig, assembler_geneid, gene, start, end, seq, label, species))

                except ScaffolderError:
                    self.log.debug("empty sequence %s in %s" % (queryid, fname))
                    continue

        return alignments, genes

    def group_alignments(self, alignments):
        """Greedily cluster alignments that overlap within the same file.

        Each alignment joins the first existing cluster that has a member
        which both overlaps it and comes from the same input file;
        otherwise it starts a new cluster. Returns a list of lists.
        """
        clusters = []

        for candidate in alignments:
            for cluster in clusters:
                if any(m.overlaps(candidate) and m.from_same_file(candidate) for m in cluster):
                    cluster.append(candidate)
                    break
            else:
                # no cluster accepted it
                clusters.append([candidate])

        return clusters

    def group_alignments_by_file(self, alignments):
        """Bucket alignments by their ``label`` attribute.

        Returns a ``defaultdict(list)``; input order is kept inside each
        bucket.
        """
        by_label = defaultdict(list)

        for aln in alignments:
            bucket = by_label[aln.label]
            bucket.append(aln)

        return by_label

    def group_cannot_be_merged_isoforms(self, group) :
        return self.group_cannot_be_merged(group, consider_isoforms=True)

    def group_cannot_be_merged(self, group, consider_isoforms=False):
        """Return True when some overlapping pair in *group* blocks a merge.

        With ``consider_isoforms`` a pair blocks when ``a.isoforms(b)`` is
        true; otherwise a pair blocks when ``a.mergeable(b)`` is false.
        Pairs that do not overlap are ignored.
        """
        for pos, first in enumerate(group):
            for second in group[pos + 1:]:
                if not first.overlaps(second):
                    continue

                if consider_isoforms:
                    blocked = first.isoforms(second)
                else:
                    blocked = not first.mergeable(second)

                if blocked:
                    return True

        return False

    def merge_alignments(self, alignments) :
        global DEBUG

        #if self.testmode != 'none' :
        #    return alignments

        # perform merges of overlaps first
        unmerged_labels = set()
        merged_groups = []

        def label_all(group, label) :
            for i in group :
                i.desc = label

        for group in self.group_alignments(alignments) :
            no_print = False

            # if the assembler thinks these two contigs are isoforms of the
            # same gene then we should not attempt to merge them
            if self.group_cannot_be_merged_isoforms(group) :
                unmerged_labels.add(group[0].label)

                if len(set([ g.gene_id for g in group ])) == 1 :
                    label_all(group, 'single_gene_multiple_isoform')
                else :
                    label_all(group, 'multiple_gene_multiple_isoform')

                merged_groups += group

                no_print = True

            # some contigs mapping to the same region of the gene cannot be
            # merged because the differences between them is too large
            # (defined in Alignment class)
            elif self.group_cannot_be_merged(group) :
                unmerged_labels.add(group[0].label)
                label_all(group, 'conflict')
                merged_groups += group

            # these can be merged trivially
            else :
                merged_groups.append( reduce(operator.add, sorted(group, key=lambda x : x.start)) )


            #if DEBUG and (not no_print) and (len(group) > 1) :
            #    self.print_alignments(group)


        # if there were any conflicts for a given gene in an alignment, then 
        # just output what has been merged, if there were no conflicts for 
        # a gene then concatenate the islands of alignments with N's
        tmp = []
        grouped_by_file = self.group_alignments_by_file(merged_groups)
        for label in grouped_by_file :
            if label in unmerged_labels :
                tmp += grouped_by_file[label]
            else :
                tmp.append( reduce(operator.add, sorted(grouped_by_file[label], key=lambda x : x.start)) )


        return tmp

    def print_alignments(self, alignments):
        """Append a debug dump of *alignments* to 'glutton_debug_scaffolds.txt'.

        One "<id>TAB<seq>" line is written per alignment, followed by a
        'difference' line with one marker per column of the first
        alignment's sequence:
          'X' - a gap next to at least one other character
          'M' - two or more different characters, none of them a gap
          ' ' - at most one distinct character
        and then an empty line. Nothing is written for an empty list.
        """
        if not alignments:
            return

        markers = []

        for column in range(len(alignments[0].seq)):
            chars = set(a[column] for a in alignments if a[column])

            if ('-' in chars) and (len(chars) > 1):
                markers.append('X')
            elif len(chars) < 2:
                markers.append(' ')
            else:
                markers.append('M')

        # the with-block closes the file even if a write fails
        with open('glutton_debug_scaffolds.txt', 'a') as f:
            for a in alignments:
                f.write("%s\t%s\n" % (a.id, a.seq))

            f.write("difference       \t%s\n" % "".join(markers))
            f.write("\n")

    # XXX now in Alignment class
    def trim_at_ATG(self, seq, pos):
        """Blank out *seq* before the closest start codon at or before *pos*.

        Codon offsets 0, 3, 6, ... up to (but excluding) ``pos + 3`` are
        scanned from the highest to the lowest; the first one whose codon
        equals ``start_codon`` becomes the trim position, otherwise *pos*
        itself is used. Everything before the trim position is replaced
        by '-'.
        """
        codon_offsets = reversed(range(0, pos + 3, 3))
        trim_pos = next((ind for ind in codon_offsets if seq[ind : ind + 3] == start_codon), pos)

        return ('-' * trim_pos) + seq[trim_pos:]

    def consensus_for_msa(self, reference, alignments, bamfiles):
        """Build the sequence that represents one species in the MSA.

        A single alignment is used as it is. With several alignments the
        result depends on ``self.testmode``:
          'none'     - delegate to consensus_for_msa_glutton
          'length'   - pick the alignment with the most non-gap characters
          'identity' - pick the one with the highest protein identity
          otherwise  - pick the one with the highest read count per base
        Ties are broken by the remaining criteria and finally by position
        in *alignments* (later wins).

        The chosen alignment is trimmed in place (start codon, and stop
        codon when ``self.trim`` is set) and returned as an Alignment2.

        An alignment whose sample has no entry in *bamfiles*, or whose
        read count cannot be obtained, is given a read count of 1. This
        keeps every alignment in the ranking; previously such alignments
        got no coverage entry at all, which shifted or truncated the
        ranking.
        """
        if len(alignments) > 1 and self.testmode == 'none':
            return self.consensus_for_msa_glutton(reference, alignments, bamfiles)

        if len(alignments) == 1:
            chosen = alignments[0]

        else:
            gene_prot = translate(reference.seq)
            ranking = []

            for index, a in enumerate(alignments):
                length = len(a.seq.replace('-', ''))

                # identity, measured over the part shared with the reference
                overlap_start = max(a.start, reference.start)
                overlap_end = min(a.end, reference.end)

                # nucleotide positions -> codon positions
                identity = self.protein_similarity(gene_prot,
                                                   translate(a.seq),
                                                   overlap_start // 3,
                                                   overlap_end // 3)

                # coverage (depth)
                reads = 1
                if a.label in bamfiles:
                    try:
                        # only the first token of the id is passed to count()
                        reads = bamfiles[a.label].count(str(a.id).split()[0])
                    except Exception:
                        # best effort, as before: fall back to one read
                        reads = 1

                coverage = reads / float(length) if length else 0.0

                if self.testmode == 'length':
                    key = (length, identity, coverage, index)
                elif self.testmode == 'identity':
                    key = (identity, length, coverage, index)
                else:
                    key = (coverage, identity, length, index)

                ranking.append(key)

            # the index is the last element of every key
            chosen = alignments[max(ranking)[-1]]

        chosen.trim_at_ATG(reference.start)
        if self.trim:
            chosen.truncate_at_stop_codon()

        return Alignment2(chosen.species, chosen.gene_name, chosen.seq, [chosen.contig_id])

    #@profile
    def consensus_for_msa_glutton(self, reference, alignments, bamfiles):
        """Layer several alignments of one species into a single sequence.

        Alignments are applied in ascending order of reads-per-base, so
        the best covered alignment is written last and wins wherever
        alignments overlap. Each alignment is trimmed in place first
        (start codon, and stop codon when ``self.trim`` is set). An 'N'
        never overwrites what is already in the consensus.

        A single alignment is simply trimmed and returned.
        """
        if len(alignments) == 1:
            only = alignments[0]
            only.trim_at_ATG(reference.start)
            if self.trim:
                only.truncate_at_stop_codon()
            return Alignment2(only.species, only.gene_name, only.seq, [only.contig_id])

        # this is buggy if within a species there are FAKE and real bam files
        coverage = []

        for aln in alignments:
            bases = len(aln.seq.replace('-', ''))
            reads = 1

            if aln.label in bamfiles:
                try:
                    # the count is looked up by contig_id (a property of
                    # Alignment), not by the complete description line
                    reads = bamfiles[aln.label].count(aln.contig_id)
                except ValueError:
                    pass

            coverage.append(reads / float(bases))

        consensus = "-" * len(alignments[0].seq)

        for _cov, aln in sorted(zip(coverage, alignments)):
            aln.trim_at_ATG(reference.start)
            if self.trim:
                aln.truncate_at_stop_codon()

            lo, hi = aln.start, aln.end
            piece = aln.seq[lo:hi]

            if 'N' in piece:
                # keep the existing consensus character under every 'N'
                piece = "".join(c if c != 'N' else consensus[lo + offset]
                                for offset, c in enumerate(piece))

            consensus = consensus[:lo] + piece + consensus[hi:]

        first = alignments[0]
        return Alignment2(first.species, first.gene_name, consensus, [ aln.contig_id for aln in alignments ])

    def remove_common_gaps(self, alignment):
        """Remove the columns that are '-' or 'N' in every row.

        Each row's ``remove_chars`` receives the sorted column list,
        bracketed by the sentinels -1 and the length of the first row.
        The rows are changed in place and *alignment* is returned.
        """
        row_length = len(alignment[0].seq)
        sequences = [row.seq for row in alignment]

        empty_columns = [col
                         for col, chars in enumerate(zip(*sequences))
                         if all(c in ('-', 'N') for c in chars)]

        # columns come out in ascending order, so adding the sentinels at
        # both ends yields the sorted list directly
        indices = [-1] + empty_columns + [row_length]

        for row in alignment:
            row.remove_chars(indices)

        return alignment

    def nucleotide_overlap(self, ref, query, start, end):
        """Count the columns in ``[start, end)`` that the query covers.

        A column is not counted when both sequences have a gap there or
        when the query character is 'N'. Returns 0 for an empty range.
        """
        if end <= start:
            return 0

        window = zip(query[start:end], ref[start:end])

        return sum(1 for cq, cr in window
                   if cq != 'N' and (cq, cr) != ('-', '-'))

    def gene_coverage(self, ref, query):
        """Return the fraction of the reference span covered by the query.

        Columns ``ref.start`` .. ``ref.end - 1`` are inspected. Columns
        where both sequences have a gap are ignored. A column counts as
        covered when it lies inside ``[query.start, query.end)`` and the
        query character is not 'N'.

        Returns 0.0 when no column is counted at all (empty reference
        span or gaps only) instead of dividing by zero.
        """
        total = 0
        covered = 0

        for i in range(ref.start, ref.end):
            cq = query.seq[i]
            cr = ref.seq[i]

            if cq == '-' and cr == '-':
                continue

            total += 1

            if (query.start <= i < query.end) and (cq != 'N'):
                covered += 1

        if total == 0:
            return 0.0

        return covered / float(total)

    def protein_similarity(self, ref, query, start, end):
        """Return the fraction of identical residues within ``[start, end)``.

        Columns where both sequences have a gap, and columns where the
        query residue is 'X', are left out of the comparison.

        Returns 0.0 for an empty range, and also when every column was
        left out (previously a ZeroDivisionError).
        """
        if end <= start:
            return 0.0

        identical = 0
        compared = 0

        for cq, cr in zip(query[start:end], ref[start:end]):
            if cq == '-' and cr == '-':
                continue

            if cq == 'X':
                continue

            compared += 1

            if cq == cr:
                identical += 1

        if compared == 0:
            return 0.0

        return identical / float(compared)

    def process_alignments(self, output_files, bam_files):
        """Turn every alignment file into scaffolds and multiple alignments.

        For each 'glutton*.nucleotide' file in ``self.alignments_dir``:
          1. read it and merge the contigs of every gene per input file;
             each merged contig is written, one per line, to
             ``output_files[label]``
          2. build one consensus per species and gene, and keep it when
             it passes the overlap length, gene coverage and protein
             identity thresholds
          3. if anything besides the references was kept, write the gene
             family alignment ('msa<N>.fasta') and one file per gene that
             has more than one sequence

        Progress is reported on stderr. Returns a defaultdict mapping each
        sample label to the set of contig ids that were aligned.

        Written so that it parses on Python 2.6+ and Python 3 alike.
        """
        progress = "\rINFO processed %d / %d alignments "

        counter = -1
        aligned_contigs = defaultdict(set)

        alignment_files = glob(join(self.alignments_dir, 'glutton*.nucleotide'))

        complete_files = 0
        total_files = len(alignment_files)

        stderr.write(progress % (complete_files, total_files))
        stderr.flush()

        for fname in alignment_files:
            contigs, genes = self.read_alignment(fname)
            merged_contigs = defaultdict(dict)

            # for each gene, merge the contigs from the same input file
            # and write to output
            for gene_name in contigs:
                for a in contigs[gene_name]:
                    aligned_contigs[a.label].add(a.id)
                    merged_contigs[gene_name].setdefault(a.species, []).append(a)

                for a in self.merge_alignments(contigs[gene_name]):
                    output_files[a.label].write("%s\n" % (a.format_contig(),))

            # merge sequences from the same species
            # find stop codon and truncate sequences
            # delete columns with only gaps
            # then write out to a file in self.output_dir
            # in MSA have >species_name contents=gluttonX,gluttonY,gluttonZ
            new_alignment = []
            non_reference_seq = 0

            for gene_name, gene_seq in genes:
                ref = Alignment2(self.db.species, gene_name, gene_seq, [gene_name])
                new_alignment.append(ref)

                # the reference matches itself perfectly
                ref.prot_id = 1.0
                ref.coverage = 1.0

                for species in merged_contigs[gene_name]:
                    try:
                        tmp = self.consensus_for_msa(ref, merged_contigs[gene_name][species], bam_files)

                    except ScaffolderError:
                        # no usable consensus for this species
                        continue

                    overlap_start = max(tmp.start, ref.start)
                    overlap_end = min(tmp.end, ref.end)

                    overlap_bases = self.nucleotide_overlap(ref.seq, tmp.seq, overlap_start, overlap_end)

                    if overlap_bases < self.alignment_length:
                        continue

                    coverage = self.gene_coverage(ref, tmp)

                    if coverage < self.min_gene_coverage:
                        continue

                    # nucleotide positions -> codon positions
                    prot_identity = self.protein_similarity(translate(ref.seq),
                                                            translate(tmp.seq),
                                                            overlap_start // 3,
                                                            overlap_end // 3)

                    if prot_identity < self.protein_identity:
                        continue

                    tmp.prot_id = prot_identity
                    tmp.coverage = coverage

                    new_alignment.append(tmp)
                    non_reference_seq += 1

            if non_reference_seq != 0:
                counter += 1

                self.write_alignment(join(self.genefamily_msa_dir, "msa%d.fasta" % counter), new_alignment)

                subalignments = defaultdict(list)

                for a in new_alignment:
                    subalignments[a.gene_name].append(a)

                # a gene with only its reference sequence is not written
                for k, v in subalignments.items():
                    if len(v) > 1:
                        self.write_alignment(join(self.gene_msa_dir, "%s.fasta" % k), v)

            complete_files += 1
            stderr.write(progress % (complete_files, total_files))
            stderr.flush()

        stderr.write((progress + "\n") % (complete_files, total_files))
        stderr.flush()

        self.log.info("created %d multiple sequence alignments" % (counter + 1))

        return aligned_contigs
Beispiel #4
0
class Aligner(object) :
    def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity, min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap):
        """Store the alignment thresholds and load the project state.

        Alignment output lives in '<top_level_directory>/alignments'.
        ``self.resume`` holds the answer of the project parameters to
        ``able_to_resume()``, asked after the reference was registered.
        """
        # thresholds, grouped by the tool the original comments name
        self.min_length = min_length                  # glutton
        self.min_hitidentity = min_hitidentity        # blast
        self.min_hitlength = min_hitlength            # blast
        self.max_evalue = max_evalue                  # blast
        self.min_alignidentity = min_alignidentity    # pagan
        self.min_alignoverlap = min_alignoverlap      # pagan

        # plain bookkeeping
        self.cleanup_files = []
        self.q = None
        self.complete_jobs = 0
        self.total_jobs = 0

        self.directory = join(top_level_directory, 'alignments')
        check_dir(self.directory, create=True)

        self.search = All_vs_all_search(batch_size)
        self.lock = threading.Lock()
        self.log = get_log()

        # order matters from here on: register the reference, ask about
        # resuming, build the information object, then set the checksum
        self.param = GluttonParameters(top_level_directory)
        self.db = GluttonDB(reference_fname)
        self.param.set_reference(self.db)

        self.resume = self.param.able_to_resume()

        self.info = GluttonInformation(self.directory,
                                       self.param,
                                       self.db,
                                       resume=self.resume)
        self.param.set_full_checksum()

    def _read_contigs(self):
        """Load the contigs of every sample, keyed by query id.

        Records shorter than ``self.min_length`` are left out. One log
        line per sample file reports how many records were read and how
        many were rejected.
        """
        contigs = {}

        for label in self.param.get_sample_ids():
            fname = self.param.get_contigs(label)
            kept = 0
            too_short = 0

            for record in SeqIO.parse(fname, 'fasta'):
                if len(record) < self.min_length:
                    too_short += 1
                    continue

                qid = self.info.get_query_from_contig(label, record.description)

                contigs[qid] = biopy_to_gene(record, qid)
                kept += 1

            summary = "%s: read %d contigs (rejected %d due to length < %d)" % (
                fname, kept, too_short, self.min_length)
            self.log.info(summary)

        return contigs

    def stop(self):
        """Stop the running work and save what has been done so far.

        The search is stopped first and its intermediate results are
        stored, then the work queue (if one exists) is stopped, the
        files in ``self.cleanup_files`` are removed and the project
        state is flushed.
        """
        self.search.stop()
        partial_results = self.search.get_intermediate_results()
        self.info.update_query_gene_mapping(partial_results)

        queue = self.q
        if queue:
            queue.stop()

        rm_f(self.cleanup_files)

        for store in (self.info, self.param):
            store.flush()

    def _correct_strand(self, contig, strand) :
        if strand == '-' :
            contig.reverse_complement()

        return contig

    def align(self) :
        """Assign contigs to genes and queue one alignment job per family.

        Steps, as far as they are visible here:
          1. read the contigs and search the still unassigned ones
             against the extracted database
          2. flush the intermediate results
          3. build the gene family -> contigs map; return early when
             every family already has an output file
          4. enqueue a PaganJob per remaining gene family; when the
             alignment files of a family are missing, enqueue one job
             per gene instead

        NOTE(review): nothing in this method waits for the queue to
        finish -- confirm whether the rest of the method is missing from
        this listing.
        """
        self.log.info("starting alignment procedure")

        # convert the names of the contigs to something no program can complain about
        # + filter out the ones that could never have a long enough alignment
        contigs = self._read_contigs()

        pending_contigs = [ contigs[i] for i in self.info.pending_queries() ]

        self.log.info("%d contigs have not been assigned to genes..." % len(pending_contigs))

        # depending on when the program was terminated this step may be complete or partially
        # complete 
        if pending_contigs :
            db_fname = self.db.extract_all()
            # remembered so that stop() can remove it as well
            self.cleanup_files.append(db_fname)

            # do an all vs all search of contigs vs database of transcripts
            # return a dict of tmp ids with gene ids
            self.info.update_query_gene_mapping(
                self.search.process(
                    db_fname, 
                    pending_contigs,
                    self.db.nucleotide,
                    self.min_hitidentity,
                    self.min_hitlength,
                    self.max_evalue)
                )

            rm_f(db_fname)

        # save intermediate results
        self.info.flush()

        # use the database to convert the mapping from tmp id -> gene
        # to gene family -> list of (tmp id, strands)
        genefamily_contig_map = self.info.build_genefamily2contigs()
        
        self.log.info("%d contigs assigned to %d gene families" % 
                (sum([ len(i) for i in genefamily_contig_map.values() ]), len(genefamily_contig_map)))
        self.log.info("(%d have already been run)" % self.info.len_genefamily2filename())

        # every family already has an output file: nothing left to do
        if self.info.len_genefamily2filename() == len(genefamily_contig_map) :
            self.log.info("alignment already done, exiting early...")
            return
        else :
            self.log.info("starting alignments...")


        # queue all the alignments up using a work queue and pagan
        self.q = WorkQueue()

        self.total_jobs = len(genefamily_contig_map) - self.info.len_genefamily2filename()
        # presumably _progress() increments complete_jobs, so that the
        # first report shows 0 -- confirm in _progress
        self.complete_jobs = -1
        self._progress()

        for famid in self.sort_keys_by_complexity(genefamily_contig_map) :
            # ignore the jobs that have already been run
            if self.info.in_genefamily2filename(famid) :
                continue

            try :
                # get the alignment and tree from the database
                alignment = self.db.get_alignment(famid)
                tree = alignment.get_tree()

                # get contigs
                job_contigs = [ self._correct_strand(contigs[contigid], strand) for contigid,strand in genefamily_contig_map[famid] ]

                # queue the job
                self.q.enqueue(
                    PaganJob(
                        self.job_callback,
                        job_contigs,
                        famid,
                        alignment,
                        tree,
                        self.min_alignidentity,
                        self.min_alignoverlap)
                    )

                # avoid the split code later in the loop...
                continue

            except GluttonDBError, gde :
                # this means we have never heard of this gene family
                self.log.warn(str(gde))
                continue

            except GluttonDBFileError, gdfe :
                # this means we have heard of the gene family, but the
                # alignment files were missing...
                # (no 'continue' here: fall through to the per-gene code)
                self.log.warn(str(gdfe))

            # okay, the gene family was not aligned for some reason
            # instead we will split the gene family into constituent genes
            # and handle each one separately...

            self.log.warn("gene family was not aligned, breaking down into separate genes...")
            # NOTE(review): the job total grows by the number of contigs
            # minus one, while the jobs queued below are one per gene --
            # confirm that this is intended
            self.total_jobs += (len(genefamily_contig_map[famid]) - 1)

            # collect contigs by gene
            gene2contigs = collections.defaultdict(list)

            for contigid,strand in genefamily_contig_map[famid] :
                try :
                    geneid = self.info.query_to_gene(contigid)

                except KeyError : # this should be impossible
                    self.log.warn("no gene assignment for %s" % contigid)
                    continue

                gene2contigs[geneid].append((contigid, strand))

            # run each gene separately
            for geneid in gene2contigs :
                try :
                    # a one-element list takes the place of the family alignment
                    alignment = [ self.db.get_gene(geneid) ]

                except GluttonDBError, gde :
                    self.log.warn(str(gde))
                    continue

                # queue the job
                # (the gene id is passed where the family id went above,
                # and there is no tree)
                self.q.enqueue(
                    PaganJob(
                        self.job_callback,
                        [ self._correct_strand(contigs[contigid], strand) for contigid,strand in gene2contigs[geneid] ],
                        geneid,
                        alignment,
                        None,
                        self.min_alignidentity,
                        self.min_alignoverlap)
                    )
Beispiel #5
0
def setup_command(args):
    """Run the 'setup' sub-command selected by ``args.setupcmd``.

    'add'    - add a sample to the project (created if necessary), save
               the project and report the new sample count on stderr
    'remove' - remove a sample, save the project and report on stderr
    'list'   - call the project's ``list()`` method

    Any other value is ignored. Always returns 0.

    Messages are written with ``stderr.write`` rather than the
    Python-2-only ``print >>`` statement; the output is unchanged.
    """
    command = args.setupcmd

    if command == 'add':
        gp = GluttonParameters(args.project, create=True)
        gp.add(args.contigs, args.sample, args.species, args.bam, args.assembler, copy=args.copy)
        gp.flush()

        stderr.write("added %s (%s contains %d samples)\n" % (args.sample, args.project, gp.count()))

    elif command == 'remove':
        gp = GluttonParameters(args.project, create=False)
        gp.remove(args.sample)
        gp.flush()

        stderr.write("removed %s (%s contains %d samples)\n" % (args.sample, args.project, gp.count()))

    elif command == 'list':
        gp = GluttonParameters(args.project, create=False)
        gp.list()

    return 0