Example #1
0
 def _filter_homologues(get_all_homologues, seqs, min_identity, keep_ids=None, nucleotide=False):
     print 'Filtering out close homologues. This will take a wile:'
     command = 'blastn' if nucleotide else 'blastp'
     dbname = ''
     try:
         with user_message('Formatting blast DB', '\n'):
             dbname = BlastCLI.format_tmp_db(seqs, nucleotide)
             if not dbname:
                 print 'Unable to make temporary BLAST database.'
                 return None
         with ProgressCounter('Searching for homologues using local blastp...', len(seqs)) as prg:
             homologues = get_all_homologues(seqs, min_identity, dbname, command, prg)
     except Exception as e:
         print '%s\n' % str(e)
         return None
     finally:
         if dbname:
             shutil.rmtree(os.path.dirname(dbname), ignore_errors=True)
     if not homologues: return seqs
     with user_message('Removing all homologs from each group except the first one...'):
         remove = set()
         if keep_ids: keep_ids = set(keep_ids)
         for seq in seqs:
             if seq.id in remove: continue
             h = homologues.pop(seq.id, set())
             if h:
                 if keep_ids:
                     nhoms = len(h)
                     h -= keep_ids
                     if nhoms != len(h) and seq.id not in keep_ids:
                         h.add(seq.id)
                 remove.update(h)
         return [seq for seq in seqs if seq.id not in remove]
Example #2
0
 def hmmsearch_genes(self,
                     hmms,
                     genome,
                     table='Standard',
                     decorate=False,
                     **kwargs):
     '''Search translated genes/CDS of a genome with one or more HMM profiles.
     @param hmms: a profile HMM filename or a list of such filenames
     @param genome: annotated sequence record whose gene features are searched
     @param table: translation table passed to the Translator
     @param decorate: if True, also append hmm hits to genome.features
     @param kwargs: passed through to hmmsearch_recs
     @return dict mapping feature index -> (hit, translation record),
     accumulated over all hmms, or None if nothing was found or a search failed
     '''
     #get indexes of all gene features and tag each with stable ids
     genes = get_indexes_of_all_genes(genome)
     if not genes: return None
     for gene_id, gi in enumerate(genes):
         genome.features[gi].qualifiers['feature_id'] = gi
         genome.features[gi].qualifiers['gene_id'] = gene_id
     #translate the tagged genes/CDS
     with user_message('Translating _genes/CDS of %s' % genome.description,
                       '\n'):
         translator = Translator(self._abort_event)
         translation = translator.translate_features(genome, genes, table)
     if not translation: return None
     if isinstance(hmms, str): hmms = [hmms]
     results = dict()
     for hmm in hmms:
         with user_message('Performing hmm search.'):
             hmm_results = self.hmmsearch_recs(hmm, translation, **kwargs)
         #NOTE(review): a single failed/empty search aborts the whole run,
         #discarding results of previous hmms -- confirm this is intended
         if not hmm_results: return None
         with user_message('Parsing search results...'):
             #get hit_ids of hmm matches
             hits = dict()
             for result in hmm_results:
                 for hit in result.iterhits():
                     hits[hit.id] = hit
             #get indexes of features where hmm hit
             hit_features = dict()
             for t in translation:
                 if t.id in hits:
                     fid = t.features[0].qualifiers.get('feature_id')
                     if fid is None: continue
                     hit_features[fid] = hits[t.id], t
             #NOTE(review): later hmms overwrite earlier entries for the
             #same feature index
             if hit_features: results.update(hit_features)
         #decorate genome with hit locations mapped back onto the genome
         if decorate:
             with user_message('Adding results as annotations...'):
                 hmm_name = os.path.basename(hmm)
                 for f in hit_features:
                     feature = genome.features[f]
                     for hsp in hit_features[f][0]:
                         #hit coordinates are in amino acids; *3 converts
                         #them to nucleotide offsets within the feature
                         if feature.strand == 1:
                             hmm_location = FeatureLocation(
                                 feature.location.start + hsp.hit_start * 3,
                                 feature.location.start + hsp.hit_end * 3,
                                 feature.strand)
                         else:
                             hmm_location = FeatureLocation(
                                 feature.location.end - hsp.hit_end * 3,
                                 feature.location.end - hsp.hit_start * 3,
                                 feature.strand)
                         hmm_feature = self.hsp2feature(
                             hmm_name, 'HMM_annotations', hmm_location, hsp)
                         genome.features.append(hmm_feature)
     return results if results else None
Example #3
0
 def hmmsearch_genome(self, hmm, genome, table='Standard', decorate=False, **kwargs):
     #get genes
     genes = get_indexes_of_genes(genome)
     if not genes: return None
     for gene_id, gi in enumerate(genes):
         genome.features[gi].qualifiers['feature_id'] = gi
         genome.features[gi].qualifiers['gene_id'] = gene_id
     #translate genes
     with user_message('Translating genes/CDS of %s' % genome.description, '\n'):
         translator = Translator(self._abort_event)
         translation = translator.translate(genome, genes, table)
     if not translation: return None
     with user_message('Performing hmm search.'):
         results = self.hmmsearch_recs(hmm, translation)
     if not results: return None
     with user_message('Parsing search results...'):
         #get hit_ids of hmm matches
         hits = dict()
         for result in results:
             for hit in result.iterhits():
                 hits[hit.id] = hit
         #get indexes of features where hmm hit
         hit_features = dict()
         for t in translation:
             if t.id in hits:
                 fid = t.features[0].qualifiers.get('feature_id')
                 if fid is None: continue
                 hit_features[fid] = hits[t.id], t
     #decorate genome
     if decorate:
         with user_message('Adding results as annotations...'):
             hmm_name = os.path.basename(hmm)
             for f in hit_features:
                 feature = genome.features[f] 
                 for hsp in hit_features[f][0]:
                     if feature.strand == 1:
                         hmm_location = FeatureLocation(feature.location.start+hsp.hit_start*3, 
                                                        feature.location.start+hsp.hit_end*3, 
                                                        feature.strand)
                     else:
                         hmm_location = FeatureLocation(feature.location.end-hsp.hit_end*3, 
                                                        feature.location.end-hsp.hit_start*3, 
                                                        feature.strand)
                     hmm_feature = SeqFeature(hmm_location, type='misc_feature')
                     hmm_feature.qualifiers['hmm_model'] = hmm_name
                     hmm_feature.qualifiers['bitscore'] = hsp.bitscore
                     hmm_feature.qualifiers['psi_evalue'] = hsp.psi_evalue
                     hmm_feature.qualifiers['evalue_cond'] = hsp.evalue_cond
                     hmm_feature.qualifiers['acc_average'] = hsp.acc_avg
                     hmm_feature.qualifiers['bias'] = hsp.bias
                     genome.features.append(hmm_feature)
     print 'Done.\n'
     return hit_features 
Example #4
0
    def _main(self):
        '''Scan a 16S alignment for columns that discriminate the main group
        (first entry of group_names) from the other groups, and print letter
        statistics for those columns and for several predefined reference
        positions.
        NOTE(review): input path, group names and positions are hard-coded.
        @return 1 if the alignment could not be loaded
        '''
        alifile = '/home/allis/Documents/INMI/Fervidicoccales-signature/arb-silva.de_2016-04-06_id331139.fasta'
        group_names = ['Fervidicoccales', 'Acidilobales', 'Desulfurococcales', 'Thermoproteales:Sulfolobales', 'Other']
        predefined_positions = [34, 501, 544, 1244, 1293]
        ref_name = 'Escherichia'
        reference = None
        groups = ListDB()
        with user_message('Loadding initial alignment...', '\n'):
            ali = AlignmentUtils.load_first(alifile)
            if not ali: return 1
        with user_message('Sorting alignment into subgroups...', '\n'):
            for rec in ali:
                #the reference record is kept aside, not sorted into groups
                if ref_name in rec.description:
                    reference = rec
                    continue
                found = False
                #group names may hold several keywords separated by ':'
                for g in group_names:
                    for k in g.split(':'):
                        if k in rec.description:
                            groups[g] = rec
                            found = True
                            break
                    #NOTE(review): this break exits only the keyword loop; a
                    #record matching several groups is added to each of them
                    #-- confirm this is intended
                if not found: groups['Other'] = rec
            groups = dict((n, AlignmentExt(groups[n])) for n in groups)
        ali_len = ali.get_alignment_length()
        #convert reference-sequence positions to alignment column indexes
        predefined_positions = [self._col_index(i, reference) for i in predefined_positions]

        print ('\nReference sequence:\n>%s\n%s' %
               (reference.description, str(reference.seq).replace('.', '').replace('-', '')))
        print '\nAlignment: %d seqs, %d columns' % (len(ali), ali_len)
        print print_table([(g, '%d sequences' % len(groups[g])) for g in group_names])
        print

        main_group = group_names[0]
        main_ali = groups[main_group]
        others = group_names[1:]
        for ci in xrange(ali_len):
            main_letter = self.LetterStats(main_ali[:,ci])
            predef = ci in predefined_positions
            #report a column if it is predefined, or well conserved in the
            #main group (>=95% ignoring gaps and >50% overall)
            if predef or main_letter.freq_no_gaps >= 0.95 and main_letter.freq > 0.5:
                other_letters = [self.LetterStats(groups[g][:,ci]) for g in others]
                if predef or any(l.letter != main_letter.letter for l in other_letters):
                    print ('------------------ E.coli position: %d ---------------------' %
                           (self._ref_index(ci, reference)+1))
                    print print_table([(main_group, str(main_letter))]+
                                      [(g, str(l)) for g, l in zip(others, other_letters)])
                    print
        print 'Done'
Example #5
0
 def blastn_annotate(self, tag_sequences, subject_record, min_identity, evalue=0.001, **kwargs):
     '''Run blastn of every tag sequence against subject_record and attach
     each alignment above min_identity to subject_record.features.
     @param tag_sequences: nucleotide records to search with
     @param subject_record: record to annotate
     @param min_identity: minimal fraction of identities (0..1) for a hit
     @param evalue: blastn E-value cutoff
     @param kwargs: passed through to s2s_blast_batch
     @return True if at least one annotation was added, False otherwise
     '''
     results = self.s2s_blast_batch(tag_sequences, [subject_record], evalue=evalue, command='blastn', **kwargs)
     if results is None: return False
     annotated = False
     with user_message('Adding results as annotations...'):
         for i, tag in enumerate(tag_sequences):
             result = results[i]
             if not result: continue
             record = result[0]
             if not record: continue
             #human-readable tag name, disambiguated with the raw id
             tag_name = pretty_rec_name(tag)
             if tag_name != tag.id:
                 tag_name += ' (%s)' % tag.id
             for hit in record:
                 for ali in hit.alignments:
                     for hsp in ali.hsps:
                         identity = hsp.identities / float(hsp.align_length)
                         if identity < min_identity: continue
                         #blast reports 1-based inclusive subject coordinates,
                         #inverted for minus-strand hits
                         if hsp.sbjct_start < hsp.sbjct_end:
                             location = FeatureLocation(hsp.sbjct_start-1,
                                                        hsp.sbjct_end,
                                                        1)
                         else:
                             location = FeatureLocation(hsp.sbjct_end-1,
                                                        hsp.sbjct_start,
                                                        -1)
                         feature = self.hsp2feature(tag_name, 'blastn_annotations', location, hsp)
                         self.add_program(feature, 'blastn')
                         subject_record.features.append(feature)
                         annotated = True
     return annotated
Example #6
0
 def hmmsearch_genome(self,
                      hmms,
                      genome,
                      table='Standard',
                      decorate=False,
                      **kwargs):
     '''Search the six-frame translation of a genome with one or more HMM
     profiles.
     @param hmms: a profile HMM filename or a list of such filenames
     @param genome: sequence record to translate and search
     @param table: translation table passed to the Translator
     @param decorate: if True, also append hmm hits to genome.features
     @param kwargs: passed through to hmmsearch_recs
     @return list of per-frame hmm results, or None if nothing was found
     '''
     #translate whole genome in six reading frames
     with user_message('Translating whole genome in 6 reading frames',
                       '\n'):
         translator = Translator(self._abort_event)
         translation = translator.translate_six_frames(genome, table)
     if not translation: return None
     if isinstance(hmms, str): hmms = [hmms]
     #BUGFIX: build the id->record index once, outside the loop; the old
     #code rebound `translation` itself to a dict inside the loop, so with
     #decorate=True the second hmm was searched against a dict of ids and
     #the index rebuild failed
     translation_index = dict((t.id, t) for t in translation)
     results = []
     for hmm in hmms:
         with user_message('Performing hmm search.'):
             hmm_results = self.hmmsearch_recs(hmm, translation, **kwargs)
         if not any(len(r) for r in hmm_results): continue
         results += hmm_results
         #decorate genome
         if decorate:
             with user_message('Adding results as annotations...'):
                 hmm_name = os.path.basename(hmm)
                 glen = len(genome)
                 for frame in hmm_results:
                     for hit in frame:
                         frec = translation_index[hit.id]
                         #each frame record carries its nucleotide offset
                         #and strand in annotations
                         start = frec.annotations['start']
                         strand = frec.annotations['strand']
                         for hsp in hit:
                             #hit coordinates are in amino acids; *3 maps
                             #them back to genome nucleotide positions
                             if strand == 1:
                                 hmm_location = FeatureLocation(
                                     start + hsp.hit_start * 3,
                                     start + hsp.hit_end * 3, strand)
                             else:
                                 hmm_location = FeatureLocation(
                                     glen - start - hsp.hit_end * 3,
                                     glen - start - hsp.hit_start * 3,
                                     strand)
                             hmm_feature = self.hsp2feature(
                                 hmm_name, 'HMM_annotations', hmm_location,
                                 hsp)
                             genome.features.append(hmm_feature)
     return results if results else None
Example #7
0
 def replace_node_labels(cls, treefile, labels, schema=None, outfile=None):
     '''Reads a tree from file and replaces node labels
     according to provided mapping. The modified tree is
     returned as DendroPy.Tree object or is written to the 
     provided output file.
     @param labels: dict, replacement table
     @param outfile: the name of the file to write the modified tree
     '''
     with user_message('Loading tree file...', '\n'):
         tree = cls.load(treefile, schema)
         if not tree:
             print 'No tree loaded.'
             return None
     with user_message('Processing tree...', '\n'):
         for leaf in tree.leaf_node_iter():
             label = leaf.taxon.label.replace(' ', '_')
             if label in labels:
                 leaf.taxon.label = labels[label]
     if outfile: tree.write(path=outfile, schema=schema)
     return tree
Example #8
0
 def safe_load(cls, files):
     with user_message("Loading sequences..."):
         try:
             view = cls()
             view.load(files)
         except Exception as e:
             print str(e)
             return None
     if len(view) == 0:
         print 'No sequences were loaded from:\n%s' % files
         view.close()
         return None
     return view
Example #9
0
 def blastp_annotate(self, tag_sequences, subject_record, min_identity, evalue=0.001, table=11, **kwargs):
     '''Blast protein tag sequences against the six-frame translation of
     subject_record and add hits above min_identity to
     subject_record.features.
     @param tag_sequences: protein records to search with
     @param subject_record: nucleotide record to annotate
     @param min_identity: minimal fraction of identities (0..1) for a hit
     @param evalue: blastp E-value cutoff
     @param table: translation table used for the six-frame translation
     @param kwargs: passed through to s2s_blast_batch
     @return True if at least one annotation was added, False otherwise
     '''
     # translate subject in six frames
     with user_message('Translating whole genome in 6 reading frames', '\n'):
         translator = Translator(self._abort_event)
         translation = translator.translate_six_frames(subject_record, table)
     if not translation: return False
     results = self.s2s_blast_batch(tag_sequences, translation, evalue=evalue, command='blastp', **kwargs)
     if results is None: return False
     with user_message('Adding results as annotations...'):
         annotated = False
         subj_len = len(subject_record)
         for i, tag in enumerate(tag_sequences):
             if not results[i]: continue
             #human-readable tag name, disambiguated with the raw id
             tag_name = pretty_rec_name(tag)
             if tag_name != tag.id:
                 tag_name += ' (%s)' % tag.id
             for frame, record in enumerate(results[i]):
                 if not record: continue
                 #each frame record carries its nucleotide offset and
                 #strand in annotations
                 frec = translation[frame]
                 start = frec.annotations['start']
                 strand = frec.annotations['strand']
                 for hit in record:
                     for ali in hit.alignments:
                         for hsp in ali.hsps:
                             if hsp.identities / float(hsp.align_length) < min_identity: continue
                             #map protein hit coordinates (*3) back to
                             #nucleotide positions on the original strand
                             if strand == 1:
                                 location = FeatureLocation(start+(hsp.sbjct_start-1)*3,
                                                            start+hsp.sbjct_end*3,
                                                            strand)
                             else:
                                 location = FeatureLocation(subj_len-start-hsp.sbjct_end*3,
                                                            subj_len-start-hsp.sbjct_start*3,
                                                            strand)
                             feature = self.hsp2feature(tag_name, 'blastp_annotations', location, hsp)
                             self.add_program(feature, 'blastp')
                             subject_record.features.append(feature)
                             annotated = True
     return annotated
Example #10
0
 def process_sequences(seqs, _depth):
     '''Recursively expand a set of sequences by BLAST "rings" of homologues.
     NOTE(review): closure -- relies on depth, extended_set,
     blast_filter_fetch, from_shelf, check_sequences and self from the
     enclosing scope.
     @param seqs: sequences of the current ring to expand from
     @param _depth: remaining recursion depth; 0 stops the recursion
     '''
     if _depth == 0: return
     with user_message('RingBlast: processing %d sequences of the %d ring.' 
                       % (len(seqs), depth-_depth+1), '\n'): 
         next_ring = blast_filter_fetch(seqs)
         if not next_ring: return
         to_check = []
         next_to_process = []
         for n in next_ring:
             next_seqs = from_shelf(n)
             if not next_seqs: continue 
             for ns in next_seqs:
                 sid = self.base_sid(ns)
                 if sid in extended_set:
                     #FIXME: need to merge sequences properly, instead of replacing 
                     #for now, keep the longest variant of a seen sequence
                     if len(extended_set[sid]) < len(ns):
                         extended_set[sid] = ns 
                 else: to_check.append(ns)
     #check_sequences is expected to fill next_to_process as a side effect
     if not to_check or not check_sequences(to_check, next_to_process): return
     if next_to_process: process_sequences(next_to_process, _depth-1)
Example #11
0
 def process_sequences(seqs, _depth):
     '''Recursively expand a set of sequences by BLAST "rings" of homologues.
     NOTE(review): closure -- relies on depth, extended_set,
     blast_filter_fetch, from_shelf, check_sequences and self from the
     enclosing scope.
     @param seqs: sequences of the current ring to expand from
     @param _depth: remaining recursion depth; 0 stops the recursion
     '''
     if _depth == 0: return
     with user_message('RingBlast: processing %d sequences of the %d ring.' 
                       % (len(seqs), depth-_depth+1), '\n'): 
         next_ring = blast_filter_fetch(seqs)
         if not next_ring: return
         to_check = []
         next_to_process = []
         for n in next_ring:
             next_seqs = from_shelf(n)
             if not next_seqs: continue 
             for ns in next_seqs:
                 sid = self.base_sid(ns)
                 if sid in extended_set:
                     #FIXME: need to merge sequences properly, instead of replacing 
                     #for now, keep the longest variant of a seen sequence
                     if len(extended_set[sid]) < len(ns):
                         extended_set[sid] = ns 
                 else: to_check.append(ns)
     #check_sequences is expected to fill next_to_process as a side effect
     if not to_check or not check_sequences(to_check, next_to_process): return
     if next_to_process: process_sequences(next_to_process, _depth-1)
Example #12
0
 def annotate_tree(cls, treefile, organisms, outfile=None, schema=None, **kwargs):
     '''
     Annotate input tree with taxonomy information using edge labels and colors.
     @param treefile : a file containing the tree to be annotated
     @param organisms: organisms database
     @param outfile: optional : basename for output file; Note: the last extension will be stripped
     @param schema: data format of the treefile 
     Accepted kwargs:
     @param beautify leafs: bool (True) : replaces IDs in leafs' labels with organism names
     @param mark_leafs: list(str) : mark nodes with the specified labels in bold
     @param collapse_taxa : list(str) : collapses all subtrees belonging to given taxa
     @param collapse_last : bool (False) : changes display method of genus subtrees in Dendroscope to trapezium nodes
     @param collapse_hard: bool (False) : removes collapsed subtrees, leaving a single node
     @param collapse_min_nodes : int (3) : only collapse subtrees with number of leafs greater or equal than this
     @param min_support : float (0: disabled) : nodes with support less that this will be removed from the tree, children being relinked to parents
     @param reroot_at : string ('') : reroot the tree at specified leaf; special value 'midpoint' reroots the at midpoint; special value 'unroot' unroots the tree
     @param lineage_colors : dict : a dictionary of colors as (r, g, b) tuples with lowercase taxons as keys; special value 'auto' causes to automatically assign colors
     @param top_lineage: a Lineage object to be subtracted from lineages of organisms on the tree; if not provided, it is computed automatically 
     '''
     with user_message('Processing tree file...', '\n'):
         tree = cls.load(treefile, schema)
         if not tree:
             print 'No tree loaded.'
             return False
         #need to get the root before beautifying
         new_root = None
         root_name = kwargs.pop('reroot_at', '')
         min_support = kwargs.pop('min_support', False)
         beautify_leafs = kwargs.pop('beautify_leafs', False)
         mark_leafs = kwargs.pop('mark_leafs', set())
         for leaf in tree.leaf_node_iter():
             label = leaf.taxon.label.replace(' ', '_')
             if label in mark_leafs:
                 cls._add_format(leaf, " x=0.0 y=0.0  ft='Ubuntu-BOLDITALIC-14' ll=7;")
             if not new_root and label == root_name: new_root = leaf
             org = organisms.get(label)
             if not org: continue
             leaf.edge.lineage = org.lineage
             if beautify_leafs:
                 leaf.taxon.label = '%s' % (org.description or org.id)
                 if org.id != leaf.taxon.label:
                     leaf.taxon.label += ' (%s)' % org.id
                 leaf.taxon.label = leaf.taxon.label.replace('_', ' ')
             leaf.label = leaf.taxon.label
         if min_support:
             for node in tree.postorder_internal_node_iter(exclude_seed_node=True):
                 try: support = float(node.label)
                 except ValueError: pass
                 if support < min_support and node.edge:
                     node.edge.collapse(adjust_collapsed_head_children_edge_lengths=True)
     #reroot the tree before traversing
     if root_name == 'unroot':
         with user_message('Unrooting tree...'):
             tree.deroot()
     if root_name == 'midpoint':
         with user_message('Rerooting tree at midpoint...'):
             tree.reroot_at_midpoint(update_bipartitions=True)
     elif new_root:
         with user_message('Rerooting tree at %s...' % root_name):
             tree.to_outgroup_position(new_root, update_bipartitions=True)
     else: print 'Node for rerooting not found: %s' % root_name
     #annotate the tree
     with user_message('Adding taxonomy information to the tree...', '\n'):
         top_lineage = kwargs.pop('top_lineage', None) 
         if not isinstance(top_lineage, Lineage): top_lineage = organisms.common_lineage
         colors = kwargs.pop('lineage_colors', None)
         if colors == 'auto': pass#TODO
         cls._set_node_taxonomy(tree.seed_node, top_lineage, None,
                                kwargs.pop('collapse_taxa', []), 
                                kwargs.pop('collapse_last', False), 
                                kwargs.pop('collapse_min_nodes', 3), 
                                kwargs.pop('collapse_hard', False),
                                colors)
     with user_message('Saving resulting tree...'):
         if not outfile: outfile = cls.strip_ext(treefile)+'.out'
         xtreefile = outfile+'.nexml'
         tree.write(path=outfile+'.tre', schema='newick')
         tree.write(path=xtreefile, schema='nexml')
         with open(outfile+'.dot', 'w') as out:
             tree.write_as_dot(out, edge_formatter=lambda e: e.label or '')
     with user_message('Tuning nexml file for Dendroscope...'):
         cls._postprocess_nexml(xtreefile)
     return True
Example #13
0
 def _main(self):
     '''Design discriminating primer pairs for a 16S alignment: optionally
     augment the alignment with sequences fetched from the SILVA database,
     trim it, search for forward and reverse primers, compile pairs and
     save the annotated alignments.
     NOTE(review): all paths, IDs and search parameters are hard-coded.
     @return nonzero error code on failure (1: no alignment, 2: realign
     failed, 3: outgroup missing or no primer pairs found)
     '''
     min_prod = 400
     silva_db = '/home/allis/Documents/INMI/SILVA-DB/SILVA_123_SSURef_Nr99_tax_silva.fasta'
     alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.fasta'
     add_filename = FilenameParser.strip_ext(
         alifile) + '.with_additions.fasta'
     outgroups = [
         'Thermococcus_chitonophagus', 'SMTZ1-55',
         'contig72135_1581_sunspring_meta'
     ]
     add = ['KF836721.1.1270', 'EU635905.1.1323']
     exclude = [
     ]  #['Thermococcus_chitonophagus', 'SMTZ1-55', 'BA1-16S', 'contig72135_1581_sunspring_meta']
     #load alignment; if the augmented file already exists, use it and
     #skip the addition step entirely
     if os.path.isfile(add_filename):
         alifile = add_filename
         add_filename = ''
     with user_message('Loadding initial alignment...', '\n'):
         orig_ali = AlignmentUtils.load_first(alifile)
         if not orig_ali: return 1
     #load homologs
     if add_filename:
         with user_message('Loadding additional sequences...', '\n'):
             add_seqs = []
             db = SeqView()
             if db.load(silva_db):
                 for sid in add:
                     seq = db.get(sid)
                     if seq: add_seqs.append(seq)
                     else: print '%s not found in %s' % (sid, silva_db)
         #realign data if needed
         if add_seqs:
             with user_message('Realigning data...', '\n'):
                 add_filename = FilenameParser.strip_ext(
                     alifile) + '.with_additions.fasta'
                 AlignmentUtils.align(
                     list(orig_ali) + add_seqs, add_filename)
                 orig_ali = AlignmentUtils.load_first(add_filename)
                 if not orig_ali: return 2
     #process the alignment
     ali = orig_ali.remove(*exclude).trim()
     for out in outgroups:
         if not ali.index(out):
             print '%s not found in the alignment' % out
             return 3
     #push outgroup records to the end of the alignment
     ali.sort(key=lambda r: 'zzzzzzzz' if r.id in outgroups else r.id)
     AlignmentUtils.save(
         ali,
         '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.trimmed.fasta'
     )
     args = dict(plen=(20, 40),
                 max_mismatches=8,
                 min_match_mismatches=1,
                 first_match_mismatches=1,
                 first_may_match=1,
                 AT_first=True,
                 outgroup=len(outgroups))
     fprimers = PrimerFinder.find_discriminating_primers(ali, **args)
     rprimers = PrimerFinder.find_discriminating_primers(ali,
                                                         reverse=True,
                                                         **args)
     pairs = PrimerFinder.compile_pairs(fprimers, rprimers, min_prod,
                                        'SSBa')
     #NOTE(review): this return code duplicates the "outgroup missing"
     #code above -- confirm intended
     if not pairs:
         print '\nNo suitable primer pairs found'
         return 3
     PrimerFinder.print_pairs(pairs)
     orig_ali = PrimerFinder.add_pairs_to_alignment(pairs, orig_ali)
     AlignmentUtils.save(
         orig_ali,
         '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.with_primers.aln.fasta'
     )
     print 'Done'
Example #14
0
    abort_event.set(); sleep(0.1)
    clean_tmp_files()
#end def

if __name__ == '__main__':
    # Script entry point: scan a set of Thermococcus genomes with a CoxL HMM.
    # NOTE: Python 2 code; relies on module-level names (os, sys, signal,
    # sig_handler, Hmmer) imported/defined earlier in the file.
    from multiprocessing import Event
    from BioUtils.Tools.Output import user_message
    from BioUtils.SeqUtils import load_files, load_dir
    _pid = os.getpid()
    #setup signal handler
    # install the same handler for INT/TERM/QUIT so any of them triggers
    # a clean abort (presumably via abort_event; sig_handler is defined above)
    signal.signal(signal.SIGINT,  sig_handler)
    signal.signal(signal.SIGTERM, sig_handler)
    signal.signal(signal.SIGQUIT, sig_handler)
    
    # shared flag checked by worker processes to stop early
    abort_event = Event()
    with user_message('Loading genomes...', '\n'):
        genomes_dir = u'/home/allis/Documents/INMI/Aerobic-CODH/genomes/'
        # genome_names is currently unused: the explicit load_files call
        # below is commented out and the whole directory is loaded instead
        genome_names = ['Thermococcus_barophilus_Ch5-complete.gb', 
                        'Thermococcus_onnurineus_NA1-complete-genome.gb',
                        'Thermococcus_sp._ES1.gb',
                        'Thermococcus-DS1-preliminary.gb'] 
        genomes = load_dir(abort_event, genomes_dir, 'gb', r'.*\.gb')
        if not genomes: sys.exit(1)
#        load_files(abort_event, [os.path.join(genomes_dir, f) for f in genome_names], 'gb') 
    
    hmm = u'/home/allis/Documents/INMI/Aerobic-CODH/COX-EC/COX-EC_1.2.99.2_CoxL.hmm'
    
    hmmer = Hmmer(abort_event)
    
    # run the HMM search against each genome; table=11 is the bacterial/
    # archaeal translation table. Results of each iteration are overwritten —
    # NOTE(review): only the last genome's results survive the loop; confirm
    # whether hmmsearch_genome(decorate=True) annotates genomes in place.
    for g in genomes:
        results = hmmer.hmmsearch_genome(hmm, g, table=11, decorate=True)
Example #15
0
 def g2g_blastp(self, reference, subjects, table='Standard', 
                evalue=0.001, max_rlen=0, features_of_interest=None):
     '''
     Perform blastp of each coding sequence of the reference against each 
     subject, which is first translated gene-by-gene.
     Parameters
     @param reference: SeqRecord object of the reference genome
     @param subjects: a list of SeqRecord objects of subject genomes
     @param table: translation table number (see NCBI site for description)
     @param evalue: filter out blastp results with E-value grater than this
     @param max_rlen: filter out blastp results which are shorter than this 
     fraction of target gene length
     @param features_of_interest: list of dictionaries of the form 
     {qualifier_name : qualifier_value}
     to mark features denoting known clusters that should be analyzed one 
     against the other
     @return: list of pairs (CDS, (blast_result1, blast_result2, ...)) 
     where CDS is a gene/CDS feature from the reference.features list 
     and blast_resultN is a list of results for the N-th  
     subject, containing following information:
     (hit_feature, align_length, percent_identity, evalue)
     where hit_feature is a SeqFeature object of the gene/CDS of the subject
     where top blast hit is located, align_length is the length of the hit,
     percent_identity is the ratio of number of identities and align_length [0; 1]
     and evalue is the E-value of the top hit.
     '''
     if not reference or not subjects:
         print 'No reference or subject sequences provided' 
         return None
     #get list of features to query
     with user_message('Searching for gene/CDS features in provided sequences...'):
         all_records = [reference]+subjects
         num_records = len(all_records)
         features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]), 
                                          range(num_records), 
                                          all_records)
         if self.aborted():
             print '\nAborted'
             return None
         if not features or not features[0]:
             print ('\nReference sequence does not contain annotated genes:\n%s %s' 
                    % (reference.id, reference.description))
             return None
         if len([f for f in features if f]) < 2:
             print '\nSubject sequences do not contain annotated genes'
             return None
         #add gene ids
         for ri, genes in enumerate(features):
             if not genes: continue
             r = all_records[ri]
             for gene_id, gi in enumerate(genes):
                 r.features[gi].qualifiers['feature_id'] = gi
                 r.features[gi].qualifiers['gene_id'] = gene_id
     #get features of interest if requested
     fois = None
     if features_of_interest:
         with user_message('Searching for features of interest...'):
             fois = []
             for foi in features_of_interest:
                 foi = self._get_fois(all_records, foi)
                 if foi and foi[0]: fois.append(foi)
                 if self.aborted():
                     print '\nAborted'
                     return None
     #translate features to proteins
     with Progress('Translating genes found in the reference and subjects...', num_records) as prg:
         translator = Translator(self._abort_event)
         translations = [None]*num_records
         foi_translations = [[None]*num_records for _f in fois]
         for i, (f, rec) in enumerate(zip(features, all_records)):
             if not f:
                 prg.step(i) 
                 continue
             translation = translator.translate(rec, f, table)
             if not translation: return None 
             if i > 0: 
                 translations[i] = cat_records(translation)
                 if fois:
                     for ifoi, foi in enumerate(fois):
                         foi_loc = [0, 0]
                         for foi_var in foi[i]: 
                             if not foi_var: continue
                             for gid in foi_var:
                                 l = translations[i].features[gid].location
                                 foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1
                                 foi_loc[1] = max(int(l.end), foi_loc[1])
                         if foi_loc[0] > 0: foi_translations[ifoi][i] = foi_loc 
             else: 
                 translations[i] = translation
                 if fois: 
                     for ifoi, foi in enumerate(fois):
                         foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var] for foi_var in foi[i]]
             prg.step(i)
     #blast features against subjects
     with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
         stranslations = translations[1:]
         blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue, 
                                               command='blastp', task='blastp')
         if self.aborted():
             print '\nAborted'
             return None
         if not blast_results:
             print '\nBlast have not returned any results.' 
             return None
     if fois: #redo blast for fois and replace the results
         with user_message('Rerunning blast for FOIs...', '\n'):
             for ifoi, foi in enumerate(foi_translations):
                 sfoi_locs = foi[1:]
                 for i, foi_var in enumerate(foi[0]):
                     foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue, 
                                                       command='blastp', task='blastp')
                     if self.aborted():
                         print '\nAborted'
                         return None
                     if not foi_blast: continue
                     for gi, gid in enumerate(fois[ifoi][0][i]):
                         if foi_blast[gi]:
                             blast_results[gid] = foi_blast[gi]
     #process blast results
     pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations))))
     with ProgressCounter('Searching for genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
         work = self.Work()
         work.start_work(self._find_features_by_hsps, pairs,
                         None, stranslations, blast_results)
         @MultiprocessingBase.results_assembler
         def assembler(index, result, blast_results, pairs, prg):
             qs = pairs[index]
             blast_results[qs[0]][qs[1]] = result
             prg.count()
         work.assemble(assembler, blast_results, pairs, prg)
         if not work.wait(): return None
     return zip((reference.features[f] for f in features[0]), blast_results)
Example #16
0
 def ring_blast(self, query, db='nr', evalue=0.001, blast_filter=None, depth=1, command='blastn', **kwargs):
     '''Perform a blast search with the given query to obtain the core set of hits.
     Make another search with each hit as a query.
     If results of the second search contain new hits,
     check if these are reciprocal by yet another search with them
     and checking that results contain hits from the core set and if they are,
     add them to the final set.
     @param query: a SeqRecord or a list of SeqRecords used as initial queries
     @param db: blast database to search against
     @param evalue: E-value cutoff for the initial/ring searches
     @param blast_filter: optional callable applied to each blast record to
     filter its alignments in place before fetching
     @param depth: number of rings to expand; if <= 0, only the core set
     is returned
     @param command: blast executable to use (e.g. blastn, blastp)
     @return: list of unique SeqRecords of the extended set, or None on failure
     '''
     if isinstance(query, SeqRecord): query = [query]
     #blast each sequence against db, filter results, fetch hit alignments;
     #each worker shelves its results and returns the shelf key
     def blast_filter_fetch(seqs):
         @MultiprocessingBase.data_mapper
         @shelf_result
         def worker(s):
             r = self.blast_seq(s, db, evalue, command)
             if r and blast_filter: blast_filter(r)
             if r: return self.fetch_results(r, db, what='alignment')
             return None
         results = []
         total = len(seqs)
         prg = ProgressCounter('Performing blast search for %d sequences:' % total, total)
         @MultiprocessingBase.results_assembler
         def assembler(i, res):
             if res: results.append(res)
             prg.count()
         with prg:
             if not self.parallelize2(1, worker, assembler, seqs): return None
             return results
     
     with user_message('RingBlast: building a core set of sequences.', '\n'):
         core_seqs = blast_filter_fetch(query)
         if not core_seqs: return None
         #deduplicate fetched sequences; extended_set maps base seq-id -> record
         core_seqs = self.unique_seqs(chain.from_iterable(from_shelf(r) for r in core_seqs))
         extended_set = dict((self.base_sid(s), s) for s in core_seqs)
         if depth <= 0: return core_seqs
         #temporary blast DB of the core set used for reciprocity checks below
         core_db = self.format_tmp_db(core_seqs, command.endswith('n'))
         
     #reciprocity check: keep a new sequence only if it hits the core DB
     #(permissive evalue 100); accepted sequences are queued for the next ring
     def check_sequences(seqs, next_to_process):
         total = len(seqs)
         prg = ProgressCounter('RingBlast: checking %d new sequences:' % total, total)
         @MultiprocessingBase.data_mapper
         def worker(seq):
             res = self.blast_seq(seq, core_db, 100, command)
             if res and blast_filter: blast_filter(res)
             return bool(res), seq
         @MultiprocessingBase.results_assembler
         def assembler(i, res):
             prg.count()
             if not res[0]: return 
             seq = res[1]
             extended_set[self.base_sid(seq)] = seq
             next_to_process.append(seq)
         with prg: return self.parallelize2(1, worker, assembler, seqs)
         
     #recursive ring expansion: blast current ring, collect unseen hits,
     #verify them against the core, then recurse with the accepted ones
     def process_sequences(seqs, _depth):
         if _depth == 0: return
         with user_message('RingBlast: processing %d sequences of the %d ring.' 
                           % (len(seqs), depth-_depth+1), '\n'): 
             next_ring = blast_filter_fetch(seqs)
             if not next_ring: return
             to_check = []
             next_to_process = []
             for n in next_ring:
                 next_seqs = from_shelf(n)
                 if not next_seqs: continue 
                 for ns in next_seqs:
                     sid = self.base_sid(ns)
                     if sid in extended_set:
                         #FIXME: need to merge sequences properly, instead of replacing 
                         #currently the longer variant of a duplicate wins
                         if len(extended_set[sid]) < len(ns):
                             extended_set[sid] = ns 
                     else: to_check.append(ns)
         if not to_check or not check_sequences(to_check, next_to_process): return
         if next_to_process: process_sequences(next_to_process, _depth-1)
         
     process_sequences(core_seqs, depth)
     return extended_set.values()
Example #17
0
    def _main(self):
        query = simple_rec('AAACTGGGGCTAATACCCGATGGGTGAGGAGGCCTGGAATGGTTCTTCACCGAAAAGACGTTGAGACCATGCTTTTCAACGTTGCCTAAGGATGGGGCCGCGTCCGATCAGGTTGTTGGTGGGGTAACGGCTCACCAAGCCTATAACCGGTACGGGCCGTGGGAGCGGAAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGTCGCGAAAACTCCGCAATGCGCGAAAGCGTGACGGGGCTACCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCGGTGTAATGAGCCTGGGGAATAAGGAGAGGGCAAGCCTGGTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTAGGGATGATTATTGGGCTTAAAGCGTCCGTAGCCAGCCCGGCAAGTCTCCCGTTAAATCCAGCGACCTAATCGTTGGGCTGCGGAAGATACTGTTGGGCTAGGGGGCGGGAGAGGCCGACGGTATTCCCGGGGTAGGGGTGAAATCCTATAATCCTGGGAGGACCACCAGTGGCGAAGGCTGTCGGCTAGAACGCGCTCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACTGGATTAGATACCCGGGTAGTCCCAGCTGTAAACGATGCGGGCTAGGTGTTGGGGTGGCTACGAGCCACCTCAGTGCCGCAGGGAAGCCATTAAGCCCGCCGCCTGGGAAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGCGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGGAACCTTACCGGGGGCGACAGCAGGATGAGGGCCAGATTGAAGGTCTTGCTTGACAAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAGGCAACGATCGAGACCCGCACCCTTAGTTGCAACCCCTGCGGAACCCGCAGGGGGCACACTACGGGAACTGCCGCCGATAAGGCGGAGGAAGGAGCGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGGGCCACACGCGAGCTGCAATGGCAGAGACAATGGGTTCCAACCTTGAAAGAGGGAGGTAATCCCTAAACCCTGCCTCAGTTGGGATCGAGGGCTGCAACCCGCCCTCGTGAACATGGAATGCCTAGTAATCGCGTGTCATCATCGCGCGGTGAATACGTCCCCGCTCCTTGCACACACCGCCCGTCGCTCCATCCGAGTGGGGTTTGGGTGAGGCGTGGTCTGTTGGCCGCGTCGAATCTAGGCTTCGCGAGGAGGGAGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTCCT',
                           'BA2-16S')
        suns_db = '/home/allis/Documents/INMI/SunS-metagenome/BlastDB-big/sunspring_meta'
        silva_db = '/home/allis/Documents/INMI/SILVA-DB/SILVA_123_SSURef_Nr99_tax_silva'

        additions = [simple_rec('AAACTGGGGCTAATCCCCCATAGGCCTGGGGTACTGGAAGGTCCCCAGGCCGAAAGGG------GACCGTA-----AGGTCCCGCCCGAGGATGGGCCGGCGGCCGATTAGGTAGTTGGTGGGGTAACGGCCCACCAAG--CCGAAGATCGGTACGGGCC-GTGAGAGCGGGAGCCCGGAGATGGACA---CTGAGACACGGGTCCAGGCCCTACGGGGCGCAGCAGGCGCGAAACC-TCCGCAATGCGGGAAACCGCGACGGGGGGACCCCCAGTGCCGTGCCTCTGGC-----ACGGCTTTTCCGGAGTG-TAAAAAGCTCCGGGAATAAGGGCTGGGCAAGGCCGGTGGC-AGCCGCCGCGGTAATACCGGCGGCCCGAGTGGTGGCCACTATTATTGGGCCTAAAGCGGCCGTAGCCGGGCCCGTAAGTCCCTGGCG-AAATCCCACGGCTCAACCGTGGGGCTCGCTGGGGATACTGCGG-GCCTTGGGACCGGGAGAGGCCGGGGGTACC-CCCGGGGTAGGGGTGAAATCCTATAATCCCGGGGGGACCGCCAGT-GGCGAAGGCGCCC--GGCTGGAACGGGTCCGACGGTGAGGGCCGAAGGCC-AGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCTGGCTGTAAAGGATGCGGGCTAGGTGTCGGGCGAG-CTTCGAGCTCGC-CCGGTGCCGTAGGGAAGCCGTTAAGCCCGCCGCC-TGGGGAGTACGGCCGCAAGGCT-GAAACTTAAAGGAATT-GGCGGGGGAGC-ACTACAAGGGGTGGAGCGTGCGGTTTAATTGGATTCAACGCCGGGAACCTCACCGGGGGCGACGGCAGGATGAA-GGCCAGGCTGAAGGTCTTGCCGGACGCGCCGAGAGGAG-----------------------------------GTGCATGGCCGCCGTCAGCTCGTACCGTGAGGCGTCCA-CTTAAGTGTGGTAACGAGCGAGACCCGC--GCCCCCAGTTGCCAGTCCCTCCCGCTGGGA---GGGAGGC-ACTCTGGGGGG-ACTGCCGGCGAT-AAGCCGGAGGAAGGGGCGGGCGACGGTAGGTCAGTATG-CCCCGAAACCC-CCGGGCT-ACACGCGCGCTACAATGGGCGGGACAATGGGA-CCCGACCCCGAAAGGGGAAGGGAATCCCCTAAACCCGCCCTCAGTTCGGATCGCGGGCTG-CAACTCGCCCGCGTGAAGC-TGGAAT-CCCTAGTACCCGCGCGTCATCATCGCGCGGCGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCACTCCACCCGAG-CGGGGCCC-GGGTGAGGCCCGATCTCCTTCGGGAGGTCGGGTCGAGCCTGGGCTC-CGTGAGGGGGG-AGAAGTCGTAACAAGGTAGCC------------------------------'.replace('-', ''),
                                'Thermococcus_chitonophagus'),
                     simple_rec('AAACTGGGATTAATACCCACTAAATGATAATACCTGGAATGGCTTATCATTGAAAGAC-TCTGGAAACATGCTTC-CAGCGTCGCCCAAGG-------------------------------------------------------------------------------GGAGCCCGGAGATGGAAA---CTGAGACAAGGTTCCAGGCCCTACGGGGCGCAGCAGGCGCGAAACC-TCCACAATGCGCGAAAGCGTGATGGGGTTATCCCGAGTGCCGTCCGATGAGG-----ATGGCTTTTCCTCGGTG-TAAGGATCCGAGGGAATAAAGGGGGGGCAAGACTGGTGTC-AGCCGCCGCGGTAATACCAGCTCCCTGAGTGGTAAGGACGATTATTTGGCCTAAAGCGTCCGTAGCCGGCTTATCAAGTCTCTTGTT-AAACCCAGTGATTCAATCATTGACCT-GCAAGAGATACTGTTA-TGCTAGAGGACGGGAGAGGTCGACGG---------GGGTAGGGGTGAAATCCTATAATCCTTGGAGGACCACCAGT-GGCGAAGGCGGTC--GACTAGAACGTGCCTGACGGTGAGGGACGAAAGCT-GGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCTGTAAACGATGCGGGCTAGGTGTTGGGGTAG-CTACGAGCTACT-CCAGTGCCGCAGAGAAGTTGTTAAGCCCGCCGCC-TGGGGAGTACGGCCGCAAGGCT-GAAACTTAAAGGAATT-GGCGGGGGAGC-ACCACAAGGGGTGAAGGCTGCGGTTTAATTGGAGTCAACGCCGGGAACCTTACCGGGGCTGACAGCAGAGTGAA-GGCCAGACTGAAGATCTTGCCAGACAAGCTGAGAGGAGGTGCATGAAGATCTTGCCAGACAAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCT-GTTAAGTCAGGCAACGAACGAGACCCCC--ACTGTTAGTTGCCAGCGAATTCCAACGGAAT--GTCGGGC-ACACTAACAGG-ACTGCCACCGAT-AAGGTGGAGGAAGGAGGGGGCAACGGCAGGTCAGTATG-CCCC--------------------------------------------------------------------------------------------------------------GAACTCGCCCTCATGAACA-TGGAAT-CCCTAGTAACCGCGTGTCATCATCGCGCGGTGAATACGTCCCCGCTCCTTGCACACACCGCCCGTCGCTCCATCCAAG-TCGGGTCT-AGATGAGGCGCAGTCTTCT-----TGGCTACGTCGAATCTGGGTTC-GGTGAGGGGGG-AGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTCCT'.replace('-', ''),
                                'SMTZ1-55'),
                     simple_rec('ACTCCGGTTGATCCTGCCGGACCCCACTGCTATCGGGGTAGGACTTAACCATGCGAGTTGTGCGTCCCCAAGCCATGGTGGGGGCGCGGCATACGGCTCAGTAACACGTGGCTAACCTAGCCTTTGGACGGGGACAACCCCGGGAAACTGGGGCTAATCCCCGATGGGTGGGAAGGCCTGGAATGGTTTCCCACCGAAAGGGCGTCTGAACCATGCTTCAGGCGTTGCCGAAGGATGGGGCCGCGGCCGATCAGGTTGTTGGTGAGGTAACGGCTCACCAAGCCTATAACCGGTACGGGCCGTGAGAGCGGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGGTGCGAAAACTCCGCGATGCGCGAAAGCGTGACGGGGCTATCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCGGTGTAGGGAGCCGGGGGAATAAGGAGAGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTGGGGACAATTATTGGGCTTAAAGCGTCCGTAGCCGGCCCATCAAGTCTCTTGTTAAATCCAGCGATCCAATCGCTGGACTGCGGGAGATACTGCTGGGCTAGGGGGCGGGAGAAGCCGATGGTATTCTCGGGGTAGGGGTGAAATCCTATAATCCCGGGAGGACCACCAGTGGCGTAGGCGGTCGGCTAGAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCCGTAAACGATGCGGGCTAGGTGTTGGGGTGGCTACGAGCCACCCCAGTGCCGCATGGAAGCAATTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGGAAAGGAACAGCGTTTTGTTGTTCCTCTGGATACCTTACCGGGGGCGACAGCAGGATGAAGGCCAGATTGAAGGTCTTGCTGGACGAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAGGTAACGATCGAGACCCACACCCCCAGTTGCTACCTCTTCGGAGGGCACTCTAGGGGTACTGCCGCCGATAAGGCGGAGGAAGGAGTGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGGGCCACACGCGAGCTGCAATGGCAAGGACAATGGGTTCTGACCCCGAGAGGGGAAGGTAATCCCGAAACCCTGCCTCAGTTGGGATCGAGGGCTGAAACCCGCCCTCGTGAACATGGAATCCCTAGTAATCGCGGGTCACCAGCCCGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCGCTCCATCCGAGTGGGGTTTAGGTGAGGCGTGGTCCTTGTGGCTGTGTCGAATCTAGGCTTCGCGAGGAGGGAGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTC',
                                'BA1-16S')
                     ]
        #prepare filter
        filt = BlastFilter(lambda a: a.hsps[0].align_length > 1100)
        filt.AND = BlastFilter(lambda a: all(hsp.score > 500 for hsp in a.hsps))
        filt.AND.AND = BlastFilter(lambda a: all(hsp.identities/float(hsp.align_length) > 0.8 for hsp in a.hsps))
        #make ring-blast
        blast = BlastCLI(self.abort_event)
        orig_seqs = blast.ring_blast(query, suns_db, 100, filt, 3)
        if not orig_seqs:
            print 'No blast results.'
            return 1
        nseqs = len(orig_seqs)
        print 'RingBlast to:\n%s\nreturned %d sequences.\n' % (suns_db, nseqs)
        #save an initial alignment
        self.fix_ids(orig_seqs)
        alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.fasta'
        with user_message('Aligning retrieved sequences...', '\n'):
            if not AlignmentUtils.align(orig_seqs+[query]+additions, outfile=alifile): return 3
        #search for additional homologs
        add_seqs = blast.ring_blast(orig_seqs, silva_db, 100, filt, 0)
        if add_seqs:
            self.fix_ids(add_seqs)
            print 'RingBlast to:\n%s\nreturned %d additional sequences.\n' % (silva_db, len(add_seqs))
        #build an alignment
        seqs = orig_seqs+add_seqs+[query]+additions
        alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.big.aln.fasta'
        with user_message('Aligning retrieved sequences...', '\n'):
            if not AlignmentUtils.align(seqs, outfile=alifile): return 3
        #build a tree 
        treefile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.big.aln.tre'
        if not PhyloUtils.build_fast_tree(alifile, treefile): return 4
        #annotate the tree
        with open('/home/allis/Documents/INMI/16S/SSBaF4-SSBaR4-1_243072232-iPCR-report.txt') as inp:
#            SSBaF4-SSBaR4_65397396-iPCR-report.txt
            sids = set()
            len_re = re.compile(r'(\s|^)(\d+)(\sbp|\\s*:)?', re.MULTILINE)
            entry = False
            cur_sid = None
            cur_len = -1
            for l in inp:
                if l == '========= histograms and electrophorograms of PCR products of each hit =========': break
                if l.startswith('---'): 
                    entry = False
                    if cur_sid and cur_len > 0 and abs(cur_len-920) < 60:
                        sids.add(cur_sid)
                    cur_sid = None
                    cur_len = -1
                    continue
                if entry or '#' in l:
                    entry = True
                    plen = len_re.search(l)
                    if plen: cur_len = int(plen.group(2))
                    sid = BlastID.extract(l)[0]
                    if sid: cur_sid = sid
        organisms = Organisms.from_records(seqs)
        if PhyloUtils.annotate_tree(treefile, organisms, 
                                    reroot_at='Thermococcus_chitonophagus',
#                                    beautify_leafs=True,
#                                    collapse_taxa=['miscellaneous crenarchaeotic group', 'thaumarchaeota'],
#                                    collapse_last=True,
#                                    collapse_hard=True,
                                    mark_leafs=sids,
#                                    [r.id for r in orig_seqs+[query]+additions],
                                    lineage_colors={'miscellaneous crenarchaeotic group':(0, 0, 255),
                                                    'thaumarchaeta':(255,0,0)},
                                    top_lineage=Lineage('archaea')): return 0
        return 2
Example #18
0
    from multiprocessing import Event
    from BioUtils.Tools.Output import user_message
    from BioUtils.SeqUtils import load_files
    _pid = os.getpid()
    #setup signal handler
    # route INT/TERM/QUIT through sig_handler (defined earlier in the file)
    signal.signal(signal.SIGINT, sig_handler)
    signal.signal(signal.SIGTERM, sig_handler)
    signal.signal(signal.SIGQUIT, sig_handler)

    # 'if True' is a manual toggle to enable/disable this test scenario
    if True:
        #    from DegenPrimer import MultiprocessingBase
        #    MultiprocessingBase.cpu_count = 1
        abort_event = Event()
        lb = BlastCLI(abort_event)

        with user_message('Loading genomes...', '\n'):
            genomes_dir = u'/home/allis/Dropbox/Science/Микра/Thermococcus/sequence/GenBank/Thermococcus'
            genome_names = [
                'Thermococcus_barophilus_Ch5-complete.gb',
                'Thermococcus_onnurineus_NA1-complete-genome.gb',
                'Thermococcus_sp._ES1.gb', 'Thermococcus-DS1-preliminary.gb'
            ]
            # NOTE(review): no emptiness check here — if loading fails,
            # genomes[0] below will raise; compare with the guarded variant
            # elsewhere in this file
            genomes = load_files(
                abort_event,
                [os.path.join(genomes_dir, f) for f in genome_names], 'gb')

        # first genome is the reference, the rest are subjects
        ref = genomes[0]
        subj = genomes[1:]
        @shelf_result
        def g2g2shelf():
Example #19
0
 def _main(self):
     # Hard-coded primer-design pipeline: load/extend a 16S alignment, trim it,
     # search for discriminating primers against the outgroups, and append the
     # found primers back into the original alignment for visual inspection.
     # Returns 0-like on success (falls through), non-zero error codes on failure.
     min_prod = 400
     silva_db = '/home/allis/Documents/INMI/SILVA-DB/SILVA_123_SSURef_Nr99_tax_silva.fasta'
     alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.fasta'
     add_filename = FilenameParser.strip_ext(alifile)+'.with_additions.fasta'
     outgroups = ['Thermococcus_chitonophagus', 'SMTZ1-55', 'contig72135_1581_sunspring_meta']
     add = ['KF836721.1.1270','EU635905.1.1323']
     exclude = []#['Thermococcus_chitonophagus', 'SMTZ1-55', 'BA1-16S', 'contig72135_1581_sunspring_meta']
     #load alignment
     # if an extended alignment already exists, use it and skip the
     # addition/realignment step below (empty add_filename disables it)
     if os.path.isfile(add_filename): 
         alifile = add_filename
         add_filename = ''
     with user_message('Loadding initial alignment...', '\n'):
         orig_ali = AlignmentUtils.load_first(alifile)
         if not orig_ali: return 1
     #load homologs
     # fetch the extra SILVA sequences listed in 'add' by their ids
     if add_filename:
         with user_message('Loadding additional sequences...', '\n'):
             add_seqs = []
             db = SeqView()
             if db.load(silva_db):
                 for sid in add:
                     seq = db.get(sid)
                     if seq: add_seqs.append(seq)
                     else: print '%s not found in %s' % (sid, silva_db)
         #realign data if needed
         if add_seqs:
             with user_message('Realigning data...', '\n'):
                 add_filename = FilenameParser.strip_ext(alifile)+'.with_additions.fasta'
                 AlignmentUtils.align(list(orig_ali)+add_seqs, add_filename)
                 orig_ali = AlignmentUtils.load_first(add_filename)
                 if not orig_ali: return 2
     #process the alignment
     ali = orig_ali.remove(*exclude).trim()
     for out in outgroups:
         if not ali.index(out):
             print '%s not found in the alignment' % out
             return 3
     # push outgroups to the end of the alignment (primer search below
     # presumably expects them last; see outgroup=len(outgroups))
     ali.sort(key=lambda r: 'zzzzzzzz' if r.id in outgroups else r.id)
     ali_len = ali.get_alignment_length()
     AlignmentUtils.save(ali, '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.trimmed.fasta')
     # primer search parameters: length 20-40 bp, mismatch policy for
     # discriminating against the trailing outgroup sequences
     args = dict(plen = (20,40),
                 max_mismatches = 8,
                 min_match_mismatches = 1,
                 first_match_mismatches = 1,
                 first_may_match = 1,
                 AT_first=True,
                 outgroup=len(outgroups))
     # forward primers on the alignment, reverse on its reverse complement
     fprimers = self._find_primers(ali, **args)
     rprimers = self._find_primers(ali.reverse_complement(), **args)
     pairs = []
     for i, (fs, fp) in enumerate(fprimers):
         start = fs
         # fp[:-1]/rp[:-1]: last element is presumably not a sequence — TODO confirm
         fprimer = Primer.from_sequences(fp[:-1], 1, 'SSBaF%d' % fs)
         for _j, (rs, rp) in enumerate(rprimers):
             # rs is counted from the reverse-complement, so the product end
             # in forward coordinates is ali_len-rs
             end = ali_len-rs
             if end-start <= min_prod: continue
             pairs.append((fprimer, Primer.from_sequences(rp[:-1], 1, 'SSBaR%d' % (ali_len-rs+1))))
     if not pairs:
         print '\nNo suitable primer pairs found'
         return 3
     # print the pairs and append each unique primer (gap-padded to the
     # alignment length) to the original alignment
     added = set()
     for i, (fp, rp) in enumerate(pairs):
         print '\npair %d' % (i+1)
         print '%s: %s' % (fp.id, fp)
         print '%s: %s' % (rp.id, rp)
         if fp.id not in added:
             orig_ali.append(fp.master_sequence+'-'*(orig_ali.get_alignment_length()-len(fp)))
             added.add(fp.id)
         if rp.id not in added:
             # reverse primers are stored reverse-complemented to match the
             # forward orientation of the alignment
             orig_ali.append(copy_attrs(rp.master_sequence,
                                        rp.master_sequence.reverse_complement())+
                             '-'*(orig_ali.get_alignment_length()-len(rp)))
             added.add(rp.id)
     print
     orig_ali = AlignmentUtils.align(orig_ali)
     AlignmentUtils.save(orig_ali, '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.with_primers.aln.fasta')
     print 'Done'
Example #20
0
    def _main(self):
        query = simple_rec(
            'AAACTGGGGCTAATACCCGATGGGTGAGGAGGCCTGGAATGGTTCTTCACCGAAAAGACGTTGAGACCATGCTTTTCAACGTTGCCTAAGGATGGGGCCGCGTCCGATCAGGTTGTTGGTGGGGTAACGGCTCACCAAGCCTATAACCGGTACGGGCCGTGGGAGCGGAAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGTCGCGAAAACTCCGCAATGCGCGAAAGCGTGACGGGGCTACCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCGGTGTAATGAGCCTGGGGAATAAGGAGAGGGCAAGCCTGGTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTAGGGATGATTATTGGGCTTAAAGCGTCCGTAGCCAGCCCGGCAAGTCTCCCGTTAAATCCAGCGACCTAATCGTTGGGCTGCGGAAGATACTGTTGGGCTAGGGGGCGGGAGAGGCCGACGGTATTCCCGGGGTAGGGGTGAAATCCTATAATCCTGGGAGGACCACCAGTGGCGAAGGCTGTCGGCTAGAACGCGCTCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACTGGATTAGATACCCGGGTAGTCCCAGCTGTAAACGATGCGGGCTAGGTGTTGGGGTGGCTACGAGCCACCTCAGTGCCGCAGGGAAGCCATTAAGCCCGCCGCCTGGGAAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGCGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGGAACCTTACCGGGGGCGACAGCAGGATGAGGGCCAGATTGAAGGTCTTGCTTGACAAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAGGCAACGATCGAGACCCGCACCCTTAGTTGCAACCCCTGCGGAACCCGCAGGGGGCACACTACGGGAACTGCCGCCGATAAGGCGGAGGAAGGAGCGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGGGCCACACGCGAGCTGCAATGGCAGAGACAATGGGTTCCAACCTTGAAAGAGGGAGGTAATCCCTAAACCCTGCCTCAGTTGGGATCGAGGGCTGCAACCCGCCCTCGTGAACATGGAATGCCTAGTAATCGCGTGTCATCATCGCGCGGTGAATACGTCCCCGCTCCTTGCACACACCGCCCGTCGCTCCATCCGAGTGGGGTTTGGGTGAGGCGTGGTCTGTTGGCCGCGTCGAATCTAGGCTTCGCGAGGAGGGAGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTCCT',
            'BA2-16S')
        suns_db = '/home/allis/Documents/INMI/SunS-metagenome/BlastDB-big/sunspring_meta'
        silva_db = '/home/allis/Documents/INMI/SILVA-DB/SILVA_123_SSURef_Nr99_tax_silva'

        additions = [
            simple_rec(
                'AAACTGGGGCTAATCCCCCATAGGCCTGGGGTACTGGAAGGTCCCCAGGCCGAAAGGG------GACCGTA-----AGGTCCCGCCCGAGGATGGGCCGGCGGCCGATTAGGTAGTTGGTGGGGTAACGGCCCACCAAG--CCGAAGATCGGTACGGGCC-GTGAGAGCGGGAGCCCGGAGATGGACA---CTGAGACACGGGTCCAGGCCCTACGGGGCGCAGCAGGCGCGAAACC-TCCGCAATGCGGGAAACCGCGACGGGGGGACCCCCAGTGCCGTGCCTCTGGC-----ACGGCTTTTCCGGAGTG-TAAAAAGCTCCGGGAATAAGGGCTGGGCAAGGCCGGTGGC-AGCCGCCGCGGTAATACCGGCGGCCCGAGTGGTGGCCACTATTATTGGGCCTAAAGCGGCCGTAGCCGGGCCCGTAAGTCCCTGGCG-AAATCCCACGGCTCAACCGTGGGGCTCGCTGGGGATACTGCGG-GCCTTGGGACCGGGAGAGGCCGGGGGTACC-CCCGGGGTAGGGGTGAAATCCTATAATCCCGGGGGGACCGCCAGT-GGCGAAGGCGCCC--GGCTGGAACGGGTCCGACGGTGAGGGCCGAAGGCC-AGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCTGGCTGTAAAGGATGCGGGCTAGGTGTCGGGCGAG-CTTCGAGCTCGC-CCGGTGCCGTAGGGAAGCCGTTAAGCCCGCCGCC-TGGGGAGTACGGCCGCAAGGCT-GAAACTTAAAGGAATT-GGCGGGGGAGC-ACTACAAGGGGTGGAGCGTGCGGTTTAATTGGATTCAACGCCGGGAACCTCACCGGGGGCGACGGCAGGATGAA-GGCCAGGCTGAAGGTCTTGCCGGACGCGCCGAGAGGAG-----------------------------------GTGCATGGCCGCCGTCAGCTCGTACCGTGAGGCGTCCA-CTTAAGTGTGGTAACGAGCGAGACCCGC--GCCCCCAGTTGCCAGTCCCTCCCGCTGGGA---GGGAGGC-ACTCTGGGGGG-ACTGCCGGCGAT-AAGCCGGAGGAAGGGGCGGGCGACGGTAGGTCAGTATG-CCCCGAAACCC-CCGGGCT-ACACGCGCGCTACAATGGGCGGGACAATGGGA-CCCGACCCCGAAAGGGGAAGGGAATCCCCTAAACCCGCCCTCAGTTCGGATCGCGGGCTG-CAACTCGCCCGCGTGAAGC-TGGAAT-CCCTAGTACCCGCGCGTCATCATCGCGCGGCGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCACTCCACCCGAG-CGGGGCCC-GGGTGAGGCCCGATCTCCTTCGGGAGGTCGGGTCGAGCCTGGGCTC-CGTGAGGGGGG-AGAAGTCGTAACAAGGTAGCC------------------------------'
                .replace('-', ''), 'Thermococcus_chitonophagus'),
            simple_rec(
                'AAACTGGGATTAATACCCACTAAATGATAATACCTGGAATGGCTTATCATTGAAAGAC-TCTGGAAACATGCTTC-CAGCGTCGCCCAAGG-------------------------------------------------------------------------------GGAGCCCGGAGATGGAAA---CTGAGACAAGGTTCCAGGCCCTACGGGGCGCAGCAGGCGCGAAACC-TCCACAATGCGCGAAAGCGTGATGGGGTTATCCCGAGTGCCGTCCGATGAGG-----ATGGCTTTTCCTCGGTG-TAAGGATCCGAGGGAATAAAGGGGGGGCAAGACTGGTGTC-AGCCGCCGCGGTAATACCAGCTCCCTGAGTGGTAAGGACGATTATTTGGCCTAAAGCGTCCGTAGCCGGCTTATCAAGTCTCTTGTT-AAACCCAGTGATTCAATCATTGACCT-GCAAGAGATACTGTTA-TGCTAGAGGACGGGAGAGGTCGACGG---------GGGTAGGGGTGAAATCCTATAATCCTTGGAGGACCACCAGT-GGCGAAGGCGGTC--GACTAGAACGTGCCTGACGGTGAGGGACGAAAGCT-GGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCTGTAAACGATGCGGGCTAGGTGTTGGGGTAG-CTACGAGCTACT-CCAGTGCCGCAGAGAAGTTGTTAAGCCCGCCGCC-TGGGGAGTACGGCCGCAAGGCT-GAAACTTAAAGGAATT-GGCGGGGGAGC-ACCACAAGGGGTGAAGGCTGCGGTTTAATTGGAGTCAACGCCGGGAACCTTACCGGGGCTGACAGCAGAGTGAA-GGCCAGACTGAAGATCTTGCCAGACAAGCTGAGAGGAGGTGCATGAAGATCTTGCCAGACAAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCT-GTTAAGTCAGGCAACGAACGAGACCCCC--ACTGTTAGTTGCCAGCGAATTCCAACGGAAT--GTCGGGC-ACACTAACAGG-ACTGCCACCGAT-AAGGTGGAGGAAGGAGGGGGCAACGGCAGGTCAGTATG-CCCC--------------------------------------------------------------------------------------------------------------GAACTCGCCCTCATGAACA-TGGAAT-CCCTAGTAACCGCGTGTCATCATCGCGCGGTGAATACGTCCCCGCTCCTTGCACACACCGCCCGTCGCTCCATCCAAG-TCGGGTCT-AGATGAGGCGCAGTCTTCT-----TGGCTACGTCGAATCTGGGTTC-GGTGAGGGGGG-AGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTCCT'
                .replace('-', ''), 'SMTZ1-55'),
            simple_rec(
                'ACTCCGGTTGATCCTGCCGGACCCCACTGCTATCGGGGTAGGACTTAACCATGCGAGTTGTGCGTCCCCAAGCCATGGTGGGGGCGCGGCATACGGCTCAGTAACACGTGGCTAACCTAGCCTTTGGACGGGGACAACCCCGGGAAACTGGGGCTAATCCCCGATGGGTGGGAAGGCCTGGAATGGTTTCCCACCGAAAGGGCGTCTGAACCATGCTTCAGGCGTTGCCGAAGGATGGGGCCGCGGCCGATCAGGTTGTTGGTGAGGTAACGGCTCACCAAGCCTATAACCGGTACGGGCCGTGAGAGCGGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGGTGCGAAAACTCCGCGATGCGCGAAAGCGTGACGGGGCTATCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCGGTGTAGGGAGCCGGGGGAATAAGGAGAGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTGGGGACAATTATTGGGCTTAAAGCGTCCGTAGCCGGCCCATCAAGTCTCTTGTTAAATCCAGCGATCCAATCGCTGGACTGCGGGAGATACTGCTGGGCTAGGGGGCGGGAGAAGCCGATGGTATTCTCGGGGTAGGGGTGAAATCCTATAATCCCGGGAGGACCACCAGTGGCGTAGGCGGTCGGCTAGAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCCGTAAACGATGCGGGCTAGGTGTTGGGGTGGCTACGAGCCACCCCAGTGCCGCATGGAAGCAATTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGGAAAGGAACAGCGTTTTGTTGTTCCTCTGGATACCTTACCGGGGGCGACAGCAGGATGAAGGCCAGATTGAAGGTCTTGCTGGACGAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAGGTAACGATCGAGACCCACACCCCCAGTTGCTACCTCTTCGGAGGGCACTCTAGGGGTACTGCCGCCGATAAGGCGGAGGAAGGAGTGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGGGCCACACGCGAGCTGCAATGGCAAGGACAATGGGTTCTGACCCCGAGAGGGGAAGGTAATCCCGAAACCCTGCCTCAGTTGGGATCGAGGGCTGAAACCCGCCCTCGTGAACATGGAATCCCTAGTAATCGCGGGTCACCAGCCCGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCGCTCCATCCGAGTGGGGTTTAGGTGAGGCGTGGTCCTTGTGGCTGTGTCGAATCTAGGCTTCGCGAGGAGGGAGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTC',
                'BA1-16S'),
            simple_rec(
                'CTGGTGGAAATATAGAAGAGGCCAAATCCGGGGTTCAGGCCGCCCGGGGTAATTACCCGTTGTCGGAGTGGGGGGGGGACGCTATTGGGGCTTAAGCCATCGTTAGCCCGTTTGACCAGGTCTCTTGTTAAATCAGGCGGATTTATTGGTCGATTGCAGGAGATTATGTTCGTCTTAGGGGCCGGAGGAGTCAACAGTATTCCCGGGGTAGGAGTGAATGCCTATATTCCCGGAGGTACCACCAGTGGGGACGCCGTTGGTATAGAACGCGCCGGCCGGTGATGGAATGAAAGTGAGGGAACCGACCCGAATTAGATACCGGGGTATTGCTACCGTTAACCGATGCAGCTTAGGTGTTCGGGTGGTTACTAGCCATTCGAGTGCGCCAGGGAAGCTGTCAGGCTTACCGCTTGGGAAGTGCGGCTGCAGGGCCAAAACTTAAGGAAATCGCCGGGGAAGCACCCCAGGGGGTGAAGCTTGCGCTTTAATGGAATTCACCGCGGTAATTCTCACCGGGGGAGCCACCAGGAGGAAAGCCAGATTAAAGTTCTTGTTGGCGGAGTGGAGAGGAGGTGCATGCCGTTCGCCAGTTCTTCCCGGGAGGTTCTTGTTAGTTCAGCCACCGATGAGGACCGCCATCCCCTGTTGTTATTGGCCTTGCGCCAGGCACACTGGGGAGACCGCCGCCGATAAGGCGGAGGAAGGAGCGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGTCCACACGCGAGGGGCAATG',
                '155a'),
            simple_rec(
                'CAAGTCCTATAACCGGTACGGGCCGTGGGAGCGGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGTCGCGAAACCTCCGCAATACGCGAAAGCGTGACGGGGTCATCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCAGTGTAGACAGCTGGGGGAATAAGGAGAGGGCAAGTCGGGTGTCAGCCGCCGCGGTAATACCCGCTCTCCGAGTGGTGGGGACGCTTATTGGGCCTAAAGCATCCGTAGCCGGCTGGACAAGTCCCCTGTTAAATCCAGCGATTTAATCGTTGGACTGCGGGGGATACTGTCCGGCTAGGGGGCGGGAGAGGCCGACGGTATTTCCGGGGTAGGGGTGAAATCCTATAATCCCGGGAGGACCACCAGTGGCGAAGGCTGTCGGCTAGAACGCGCCCGACGGTGAGGGATGAAAGCTGGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCCGTAAACGATGCAGGCTAGGTGTTCGGGTGGCTACGTGCCACTCGAGTGCCGCAGGGAAGCTGTTAAGCCTGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGAAATCTCACCGGGGGAGACAGCAGGATGAAAGCCAGATTAAAGGTCTTGCTAGACGAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAAGGCAACGATCGAGACTCGCATCCTCTGTTGCTACTACCCTTGCGCCAAGGCACACTGGGGGAGACCGCCGCTCGATAAGGCGGAAGGAAGGAGCGGCCCACGGCAGTCAGTATGCCCCGAATTCCCTCGGCCACACGCAAGCTGCAATG',
                '156a'),
            simple_rec(
                'GGGGATCGGGGCATACTGACCTGCCGTGGCCCGCTCCTTCCTCCGCCTTATCGGCGGCGGTCTCCCCAGTGTGCCTGGCGCAAGGGCAGTAACAACAGGGGATGGGGGTCTCGATCGGTGGCTGGCTTAACAGGAAACCTCACGGGACGAACTGGCGAACGGCATGGACCTTCTCTCAACTTGGCTAAGAAGAACTTTAATCTGGCTTTCATTCTGGTGGCTTCCCCGGTGAGAATTCCGGCGGTGACTCCCAATAAAACGCAAGCTTCACCCCTTGGGGTGGTTCCCCGGCCATTTCTTTAAGGTTCAAGCTTTGCGGCGGTATTCCCAAGCGGCAAGGTTAACAGCTTCCCTGCCGCACTCGAGTGGCACGTAACCACCCGAACAACTAACCTGCATCCGTTACCGGTTGGACTAACCCGGTATCTAATCCGGGTCGCTCCCCCAGCCTTCATTCCTTCACCGTCCGGCGCGGTTCTAAGCGACCGGCTTTCGCACTTGTGGTTCCTCCCGGGGATTATAAGAATTCACCCCTACCCCGGAAATTACGGTCCGGCTCCTCCGGCCCCTAACCCGACACGTAATCCCCCGCCAGTTCAACCGATTAAATCCGCTTGAATTTAACAAGGGGGACCTTGTCCAGCCGGCCTACGGATGCTTTAAGGCCCAATAAGCCGTCCCCACCACTCCGAGAGCGGGTAATAACCGCGGCCGGCCTGACAACCGACCTGGCCTCTCCTAAATCCCCCAGCTGTTCACACTTGGGAAAGGGCATTCCTCAGCGAACGGCACTTCGGGATGAACCCGTCACGCTTTCGCGTAATTGCGGGAAGGTTTCGCGAACTGCTGCGCCCCGTAAAGGCCTGGGTCCTTGTGTCTCAAATTGCCCCATCTCCGGGCTATACGCTCTCCACGGGCCCGTACC',
                '157a')
        ]
        #prepare filter
        filt = BlastFilter(lambda a, r: a.hsps[0].align_length > 1100)
        filt.AND = BlastFilter(
            lambda a, r: all(hsp.score > 500 for hsp in a.hsps))
        filt.AND.AND = BlastFilter(lambda a, r: all(
            hsp.identities / float(hsp.align_length) > 0.8 for hsp in a.hsps))
        #make ring-blast
        blast = BlastCLI(self.abort_event)
        orig_seqs = blast.ring_blast(query, suns_db, 100, filt, 3)
        if not orig_seqs:
            print 'No blast results.'
            return 1
        nseqs = len(orig_seqs)
        print 'RingBlast to:\n%s\nreturned %d sequences.\n' % (suns_db, nseqs)
        #save an initial alignment
        self.fix_ids(orig_seqs)
        alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.fasta'
        with user_message('Aligning retrieved sequences...', '\n'):
            if not AlignmentUtils.align(orig_seqs + [query] + additions,
                                        outfile=alifile):
                return 3
        #search for additional homologs
        add_seqs = blast.ring_blast(orig_seqs, silva_db, 100, filt, 0)
        if add_seqs:
            self.fix_ids(add_seqs)
            print 'RingBlast to:\n%s\nreturned %d additional sequences.\n' % (
                silva_db, len(add_seqs))
        #build an alignment
        seqs = orig_seqs + add_seqs + [query] + additions
        alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.big.aln.fasta'
        with user_message('Aligning retrieved sequences...', '\n'):
            if not AlignmentUtils.align(seqs, outfile=alifile): return 3
        #build a tree
        treefile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.big.aln.tre'
        if not PhyloUtils.build_fast_tree(alifile, treefile): return 4
        #annotate the tree
        if False:
            with open(
                    '/home/allis/Documents/INMI/16S/SSBaF4-SSBaR4-1_243072232-iPCR-report.txt'
            ) as inp:
                #            SSBaF4-SSBaR4_65397396-iPCR-report.txt
                sids = set()
                len_re = re.compile(r'(\s|^)(\d+)(\sbp|\\s*:)?', re.MULTILINE)
                entry = False
                cur_sid = None
                cur_len = -1
                for l in inp:
                    if l == '========= histograms and electrophorograms of PCR products of each hit =========':
                        break
                    if l.startswith('---'):
                        entry = False
                        if cur_sid and cur_len > 0 and abs(cur_len - 920) < 60:
                            sids.add(cur_sid)
                        cur_sid = None
                        cur_len = -1
                        continue
                    if entry or '#' in l:
                        entry = True
                        plen = len_re.search(l)
                        if plen: cur_len = int(plen.group(2))
                        sid = BlastID.extract(l)[0]
                        if sid: cur_sid = sid
        organisms = Organisms.from_records(seqs)
        if PhyloUtils.annotate_tree(
                treefile,
                organisms,
                reroot_at='Thermococcus_chitonophagus',
                #                                    beautify_leafs=True,
                #                                    collapse_taxa=['miscellaneous crenarchaeotic group', 'thaumarchaeota'],
                #                                    collapse_last=True,
                #                                    collapse_hard=True,
                #                                    mark_leafs=sids,
                mark_leafs=[r.id for r in orig_seqs + [query] + additions],
                lineage_colors={
                    'miscellaneous crenarchaeotic group': (0, 0, 255),
                    'thaumarchaeta': (255, 0, 0)
                },
                top_lineage=Lineage('archaea')):
            return 0
        return 2
Example #21
0
 def ring_blast(self, query, db='nr', evalue=0.001, blast_filter=None, depth=1, command='blastn', **kwargs):
     '''Perform a blast search with the given query to obtain the core set of hits.
     Make another search with each hit as a query.
     If results of the second search contain new hits,
     check if these are reciprocal by yet another search with them
     and checking that results contain hits from the core set and if they are,
     add them to the final set.
     @param query: a SeqRecord or a list of SeqRecords to start the search from
     @param db: blast database to search against
     @param evalue: E-value cutoff passed to blast_seq for the ring searches
     @param blast_filter: optional callable applied to each blast result in place
     to discard unwanted alignments before fetching sequences
     @param depth: number of additional rings to explore; if <= 0 only the
     core search is performed and its unique hits are returned
    @param command: blast program to run (e.g. 'blastn' or 'blastp'); a trailing
     'n' marks the temporary core DB as nucleotide
     @return: a list of unique SeqRecords (the extended hit set), or None if the
     initial search produced no results or was aborted
     '''
     if isinstance(query, SeqRecord): query = [query]
     def blast_filter_fetch(seqs):
         #blast each sequence against db, filter the raw results, then fetch
         #the matching alignments; each worker's result is shelved
         #(see shelf_result/from_shelf) rather than returned directly
         @MultiprocessingBase.data_mapper
         @shelf_result
         def worker(s):
             r = self.blast_seq(s, db, evalue, command, **kwargs)
             if r and blast_filter: blast_filter(r)
             if r: return self.fetch_results(r, db, what='alignment')
             return None
         results = []
         total = len(seqs)
         prg = ProgressCounter('Performing blast search for %d sequences:' % total, total)
         @MultiprocessingBase.results_assembler
         def assembler(i, res):
             if res: results.append(res)
             prg.count()
         with prg:
             #returns None on failure/abort; otherwise the list of shelf keys
             if not self.parallelize2(1, worker, assembler, seqs): return None
             return results
     
     with user_message('RingBlast: building a core set of sequences.', '\n'):
         core_seqs = blast_filter_fetch(query)
         if not core_seqs: return None
         #unshelve the per-query hit lists and deduplicate them
         core_seqs = self.unique_seqs(chain.from_iterable(from_shelf(r) for r in core_seqs))
         #extended_set maps base sequence id -> SeqRecord; seeded with the core
         extended_set = dict((self.base_sid(s), s) for s in core_seqs)
         if depth <= 0: return core_seqs
         #temporary local DB of the core set, used for the reciprocity check
         core_db = self.format_tmp_db(core_seqs, command.endswith('n'))
         
     def check_sequences(seqs, next_to_process):
         #reciprocity check: keep a new sequence only if blasting it against
         #the core DB yields hits that pass blast_filter
         total = len(seqs)
         prg = ProgressCounter('RingBlast: checking %d new sequences:' % total, total)
         @MultiprocessingBase.data_mapper
         def worker(seq):
             res = self.blast_seq(seq, core_db, 100, command)
             if res and blast_filter: blast_filter(res)
             return bool(res), seq
         @MultiprocessingBase.results_assembler
         def assembler(i, res):
             prg.count()
             if not res[0]: return 
             seq = res[1]
             #accepted sequence joins the result set and seeds the next ring
             extended_set[self.base_sid(seq)] = seq
             next_to_process.append(seq)
         with prg: return self.parallelize2(1, worker, assembler, seqs)
         
     def process_sequences(seqs, _depth):
         #recursively expand one ring at a time until _depth is exhausted
         if _depth == 0: return
         with user_message('RingBlast: processing %d sequences of the %d ring.' 
                           % (len(seqs), depth-_depth+1), '\n'): 
             next_ring = blast_filter_fetch(seqs)
             if not next_ring: return
             to_check = []
             next_to_process = []
             for n in next_ring:
                 next_seqs = from_shelf(n)
                 if not next_seqs: continue 
                 for ns in next_seqs:
                     sid = self.base_sid(ns)
                     if sid in extended_set:
                         #FIXME: need to merge sequences properly, instead of replacing 
                         if len(extended_set[sid]) < len(ns):
                             extended_set[sid] = ns 
                     else: to_check.append(ns)
         if not to_check or not check_sequences(to_check, next_to_process): return
         if next_to_process: process_sequences(next_to_process, _depth-1)
         
     process_sequences(core_seqs, depth)
     return extended_set.values()
Example #22
0
 def g2g_blastp(self, reference, subjects, table='Standard', 
                evalue=0.001, max_rlen=0, features_of_interest=None):
     '''
     Perform blastp of each coding sequence of the reference against each 
     subject, which is first translated gene-by-gene.
     Parameters
     @param reference: SeqRecord object of the reference genome
     @param subjects: a list of SeqRecord objects of subject genomes
     @param table: translation table number (see NCBI site for description)
     @param evalue: filter out blastp results with E-value grater than this
     @param max_rlen: filter out blastp results which are shorter than this 
     fraction of target gene length
     @param features_of_interest: list of dictionaries of the form 
     {qualifier_name : qualifier_value}
     to mark features denoting known clusters that should be analyzed one 
     against the other
     @return: list of pairs (CDS, (blast_result1, blast_result2, ...)) 
     where CDS is a gene/CDS feature from the reference.features list 
     and blast_resultN is a list of results for the N-th  
     subject, containing following information:
     (hit_feature, align_length, percent_identity, evalue)
     where hit_feature is a SeqFeature object of the gene/CDS of the subject
     where top blast hit is located, align_length is the length of the hit,
     percent_identity is the ratio of number of identities and align_length [0; 1]
     and evalue is the E-value of the top hit.
     '''
     if not reference or not subjects:
         print 'No reference or subject sequences provided' 
         return None
     #get list of features to query
     with user_message('Searching for gene/CDS features in provided sequences...'):
         all_records = [reference]+subjects
         num_records = len(all_records)
         features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]), 
                                          range(num_records), 
                                          all_records)
         if self.aborted():
             print '\nAborted'
             return None
         if not features or not features[0]:
             print ('\nReference sequence does not contain annotated _genes:\n%s %s'
                    % (reference.id, reference.description))
             return None
         if len([f for f in features if f]) < 2:
             print '\nSubject sequences do not contain annotated _genes'
             return None
         #add gene ids
         for ri, genes in enumerate(features):
             if not genes: continue
             r = all_records[ri]
             for gene_id, gi in enumerate(genes):
                 r.features[gi].qualifiers['feature_id'] = gi
                 r.features[gi].qualifiers['gene_id'] = gene_id
     #get features of interest if requested
     fois = None
     if features_of_interest:
         with user_message('Searching for features of interest...'):
             fois = []
             for foi in features_of_interest:
                 foi = self._get_fois(all_records, foi)
                 if foi and foi[0]: fois.append(foi)
                 if self.aborted():
                     print '\nAborted'
                     return None
     #translate features to proteins
     with Progress('Translating _genes found in the reference and subjects...', num_records) as prg:
         translator = Translator(self._abort_event)
         translations = [None]*num_records
         foi_translations = [[None]*num_records for _f in fois]
         for i, (f, rec) in enumerate(zip(features, all_records)):
             if not f:
                 prg.step(i) 
                 continue
             translation = translator.translate_features(rec, f, table)
             if not translation: return None 
             if i > 0: 
                 translations[i] = cat_records(translation)
                 if fois:
                     for ifoi, foi in enumerate(fois):
                         foi_loc = [0, 0]
                         for foi_var in foi[i]: 
                             if not foi_var: continue
                             for gid in foi_var:
                                 l = translations[i].features[gid].location
                                 foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1
                                 foi_loc[1] = max(int(l.end), foi_loc[1])
                         if foi_loc[0] > 0: foi_translations[ifoi][i] = foi_loc 
             else: 
                 translations[i] = translation
                 if fois: 
                     for ifoi, foi in enumerate(fois):
                         foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var] for foi_var in foi[i]]
             prg.step(i)
     #blast features against subjects
     with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
         stranslations = translations[1:]
         blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue, 
                                               command='blastp', task='blastp')
         if self.aborted():
             print '\nAborted'
             return None
         if not blast_results:
             print '\nBlast have not returned any results.' 
             return None
     if fois: #redo blast for fois and replace the results
         with user_message('Rerunning blast for FOIs...', '\n'):
             for ifoi, foi in enumerate(foi_translations):
                 sfoi_locs = foi[1:]
                 for i, foi_var in enumerate(foi[0]):
                     foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue, 
                                                       command='blastp', task='blastp')
                     if self.aborted():
                         print '\nAborted'
                         return None
                     if not foi_blast: continue
                     for gi, gid in enumerate(fois[ifoi][0][i]):
                         if foi_blast[gi]:
                             blast_results[gid] = foi_blast[gi]
     #process blast results
     pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations))))
     with ProgressCounter('Searching for _genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
         work = self.Work()
         work.start_work(self._find_features_by_hsps, pairs,
                         None, stranslations, blast_results)
         @MultiprocessingBase.results_assembler
         def assembler(index, result, blast_results, pairs, prg):
             qs = pairs[index]
             blast_results[qs[0]][qs[1]] = result
             prg.count()
         work.assemble(assembler, blast_results, pairs, prg)
         if not work.wait(): return None
     return zip((reference.features[f] for f in features[0]), blast_results)