Beispiel #1
0
    def setUp(self):
        """Create the temporary fixture files used by the RTAX tests.

        Five temp paths are generated with get_tmp_filename, recorded in
        self._paths_to_clean_up, and each file is then filled from the
        matching module-level rtax_* fixture string.
        """
        # None tells unittest not to truncate diffs in failure messages.
        self.maxDiff = None

        self.id_to_taxonomy_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.txt')
        self.input_seqs_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
        self.reference_seqs_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
        self.read_1_seqs_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
        self.read_2_seqs_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.fasta')

        # Registered up front so the paths are known even if a write fails.
        self._paths_to_clean_up = [
            self.id_to_taxonomy_fp, self.input_seqs_fp, self.reference_seqs_fp,
            self.read_1_seqs_fp, self.read_2_seqs_fp
        ]

        fixtures = (
            (self.id_to_taxonomy_fp, rtax_reference_taxonomy),
            (self.reference_seqs_fp, rtax_reference_fasta),
            (self.input_seqs_fp, rtax_test_repset_fasta),
            (self.read_1_seqs_fp, rtax_test_read1_fasta),
            (self.read_2_seqs_fp, rtax_test_read2_fasta),
        )
        for path, contents in fixtures:
            # The context manager closes the handle even if write() raises.
            with open(path, 'w') as handle:
                handle.write(contents)
Beispiel #2
0
    def setUp(self):
        """Write the protein db, DNA db and query fixtures to temp files.

        Real files are used rather than file-like objects because the
        external application needs actual files on disk.
        """
        self.test_db_prot_filename = get_tmp_filename().replace('"', '')
        self.test_db_prot = open(self.test_db_prot_filename, 'w')
        self.test_db_dna_filename = get_tmp_filename().replace('"', '')
        self.test_db_dna = open(self.test_db_dna_filename, 'w')
        self.test_query_filename = get_tmp_filename().replace('"', '')
        self.test_query = open(self.test_query_filename, 'w')

        # Pair every open handle with the module-level fixture (a sequence
        # of lines) that belongs in it.
        handle_and_lines = (
            (self.test_db_prot, test_db_prot),
            (self.test_db_dna, test_db_dna),
            (self.test_query, test_query),
        )
        for handle, lines in handle_and_lines:
            handle.write('\n'.join(lines))
        for handle, _ in handle_and_lines:
            handle.close()

        # Path reserved for the output; nothing is written to it here.
        self.testout = get_tmp_filename().replace('"', '')

        created_paths = [
            self.test_db_prot_filename,
            self.test_db_dna_filename,
            self.test_query_filename,
            self.testout,
        ]
        # NOTE(review): files_to_remove must already exist on self or the
        # class before this runs -- confirm where it is initialised.
        self.files_to_remove += created_paths
Beispiel #3
0
    def test_call_log_file(self):
        """GenericRepSetPicker.__call__ writes log when expected
        """
        log_path = get_tmp_filename(
            prefix='GenericRepSetPickerTest.test_call_output_to_file_l_',
            suffix='.txt')
        result_path = get_tmp_filename(
            prefix='GenericRepSetPickerTest.test_call_output_to_file_r_',
            suffix='.txt')

        picker = GenericRepSetPicker(params=self.params)
        picker(self.tmp_seq_filepath, self.tmp_otu_filepath,
               result_path=result_path, log_path=log_path)

        log_handle = open(log_path)
        observed_lines = log_handle.read().splitlines()
        log_handle.close()

        # Delete the temp files before asserting so they are cleaned up
        # even when an assertion below fails.
        remove(log_path)
        remove(result_path)

        expected_lines = [
            "GenericRepSetPicker parameters:",
            'Algorithm:first',
            "Application:None",
            'ChoiceF:first',
            'ChoiceFRequiresSeqs:False',
            "Result path: %s" % result_path,
        ]
        # Compare line by line; zip stops at the shorter of the two lists.
        for observed, expected in zip(observed_lines, expected_lines):
            if observed.startswith('ChoiceF:'):
                # can't test, different each time
                continue
            self.assertEqual(observed, expected)
    def setUp(self):
        """Create the taxonomy, query and reference fixtures for the tests.

        Writes three temp files (id-to-taxonomy map, input sequences and
        reference sequences), records their paths in
        self._paths_to_clean_up, and stores the expected assignments in
        self.expected1.
        """
        self.id_to_taxonomy_fp = get_tmp_filename(
            prefix='BlastTaxonAssignerTests_', suffix='.txt')
        self.input_seqs_fp = get_tmp_filename(
            prefix='BlastTaxonAssignerTests_', suffix='.fasta')
        self.reference_seqs_fp = get_tmp_filename(
            prefix='BlastTaxonAssignerTests_', suffix='.fasta')

        self._paths_to_clean_up = [
            self.id_to_taxonomy_fp, self.input_seqs_fp, self.reference_seqs_fp
        ]

        # Context managers guarantee the data is flushed and the handle is
        # closed before any test reads the file; a bare open(...).write(...)
        # leaves that to the garbage collector.
        with open(self.id_to_taxonomy_fp, 'w') as taxonomy_f:
            taxonomy_f.write(id_to_taxonomy_string)
        with open(self.input_seqs_fp, 'w') as input_seqs_f:
            input_seqs_f.write(test_seq_coll.toFasta())
        self.test_seqs = test_seq_coll.items()
        with open(self.reference_seqs_fp, 'w') as reference_seqs_f:
            reference_seqs_f.write(test_refseq_coll.toFasta())

        # Expected result per query id; s6 is the "no hit" case.
        self.expected1 = {
            's1': ('Archaea;Euryarchaeota;Halobacteriales;uncultured', 0.0,
                   "AY800210"),
            's2':
            ('Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.',
             0.0, "EU883771"),
            's3':
            ('Archaea;Crenarchaeota;uncultured;uncultured', 0.0, "EF503699"),
            's4': ('Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium',
                   0.0, "DQ260310"),
            's5':
            ('Archaea;Crenarchaeota;uncultured;uncultured', 0.0, "EF503697"),
            's6': ('No blast hit', None, None),
        }
Beispiel #5
0
    def setUp(self):
        """Create the temporary taxonomy and FASTA files for the RTAX tests.

        Generates five temp paths, lists them in self._paths_to_clean_up,
        and writes the module-level rtax_* fixture strings into them.
        """
        # None disables unittest's truncation of long assertion diffs.
        self.maxDiff = None

        self.id_to_taxonomy_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.txt')
        self.input_seqs_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
        self.reference_seqs_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
        self.read_1_seqs_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
        self.read_2_seqs_fp = get_tmp_filename(
            prefix='RtaxTaxonAssignerTests_', suffix='.fasta')

        self._paths_to_clean_up = [
            self.id_to_taxonomy_fp,
            self.input_seqs_fp,
            self.reference_seqs_fp,
            self.read_1_seqs_fp,
            self.read_2_seqs_fp,
        ]

        path_and_contents = (
            (self.id_to_taxonomy_fp, rtax_reference_taxonomy),
            (self.reference_seqs_fp, rtax_reference_fasta),
            (self.input_seqs_fp, rtax_test_repset_fasta),
            (self.read_1_seqs_fp, rtax_test_read1_fasta),
            (self.read_2_seqs_fp, rtax_test_read2_fasta),
        )
        for fixture_path, fixture_text in path_and_contents:
            # 'with' closes the file even when the write fails part-way.
            with open(fixture_path, 'w') as fixture_f:
                fixture_f.write(fixture_text)
Beispiel #6
0
    def setUp(self):
        """Write the tree and trait-table fixtures to temporary files.

        Each fixture's path is stored as self.<name>_fp and its (closed)
        file object as self.<name>_file; all five paths are collected in
        self.files_to_remove.
        """

        def write_fixture(suffix, contents):
            # Create one 'AceTests' temp file and return (path, closed handle).
            path = get_tmp_filename(prefix='AceTests', suffix=suffix)
            handle = open(path, 'w')
            handle.write(contents)
            handle.close()
            return path, handle

        # tree file
        self.in_tree1_fp, self.in_tree1_file = write_fixture('.nwk', in_tree1)

        # tree file with underscores in tip names
        self.in_tree2_fp, self.in_tree2_file = write_fixture('.nwk', in_tree2)

        # trait file
        self.in_trait1_fp, self.in_trait1_file = \
            write_fixture('.tsv', in_trait1)

        # second trait file (a single-column table is tested separately)
        self.in_trait2_fp, self.in_trait2_file = \
            write_fixture('.tsv', in_trait2)

        # trait file with underscores in tip names
        self.in_trait3_fp, self.in_trait3_file = \
            write_fixture('.tsv', in_trait3)

        self.files_to_remove = [
            self.in_tree1_fp,
            self.in_trait1_fp,
            self.in_trait2_fp,
            self.in_trait3_fp,
            self.in_tree2_fp,
        ]
Beispiel #7
0
    def setUp(self):
        """Write the unsorted, sorted and .uc uclust fixtures to temp files."""

        def dump_lines(suffix, lines):
            # Create a 'uclust_test' temp file holding the newline-joined
            # lines and return its path.
            path = get_tmp_filename(prefix="uclust_test", suffix=suffix)
            handle = open(path, "w")
            handle.write('\n'.join(lines))
            handle.close()
            return path

        self.tmp_unsorted_fasta_filepath = dump_lines(".fasta", raw_dna_seqs)
        self.tmp_sorted_fasta_filepath = dump_lines(".fasta", sorted_dna_seqs)
        self.tmp_uc_filepath = dump_lines(".uc", uc_dna_clusters)

        # Only the path is reserved here; setUp writes nothing to it.
        self.tmp_clstr_filepath = get_tmp_filename(
            prefix="uclust_test", suffix=".clstr")

        self.WorkingDir = '/tmp/uclust_test'
        self.tmpdir = '/tmp/'

        self.files_to_remove = [
            self.tmp_unsorted_fasta_filepath,
            self.tmp_sorted_fasta_filepath,
            self.tmp_uc_filepath,
            self.tmp_clstr_filepath,
        ]
Beispiel #8
0
    def setUp(self):
        """Create the input, template, result and log files for the tests.

        Also builds the InfernalAligner under test (configured with the
        template file) and loads the expected alignment.
        """
        self.infernal_test1_input_fp = get_tmp_filename(
            prefix='InfernalAlignerTests_', suffix='.fasta')
        # 'with' flushes and closes the handle before the aligner reads it;
        # a bare open(...).write(...) leaves that to the garbage collector.
        with open(self.infernal_test1_input_fp, 'w') as input_f:
            input_f.write(infernal_test1_input_fasta)

        self.infernal_test1_template_fp = get_tmp_filename(
            prefix='InfernalAlignerTests_', suffix='template.sto')
        with open(self.infernal_test1_template_fp, 'w') as template_f:
            template_f.write(infernal_test1_template_stockholm)

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = get_tmp_filename(
            prefix='InfernalAlignerTests_', suffix='.fasta')
        open(self.result_fp, 'w').close()

        self.log_fp = get_tmp_filename(
            prefix='InfernalAlignerTests_', suffix='.log')
        open(self.log_fp, 'w').close()

        self._paths_to_clean_up = [
            self.infernal_test1_input_fp,
            self.result_fp,
            self.log_fp,
            self.infernal_test1_template_fp,
        ]

        self.infernal_test1_aligner = InfernalAligner({
            'template_filepath': self.infernal_test1_template_fp,
        })
        self.infernal_test1_expected_aln = LoadSeqs(
            data=infernal_test1_expected_alignment, aligned=Alignment,
            moltype=DNA)
Beispiel #9
0
    def setUp(self):
        """Prepare the Infernal aligner fixtures.

        Writes the input FASTA and Stockholm template to temp files, touches
        the result and log files, records all four paths in
        self._paths_to_clean_up, then builds the aligner and the expected
        alignment.
        """
        self.infernal_test1_input_fp = get_tmp_filename(
            prefix='InfernalAlignerTests_', suffix='.fasta')
        # Closing explicitly (via 'with') ensures the content is on disk
        # before it is used; the original never closed these handles.
        with open(self.infernal_test1_input_fp, 'w') as fasta_out:
            fasta_out.write(infernal_test1_input_fasta)

        self.infernal_test1_template_fp = get_tmp_filename(
            prefix='InfernalAlignerTests_', suffix='template.sto')
        with open(self.infernal_test1_template_fp, 'w') as template_out:
            template_out.write(infernal_test1_template_stockholm)

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = get_tmp_filename(prefix='InfernalAlignerTests_',
                                          suffix='.fasta')
        open(self.result_fp, 'w').close()

        self.log_fp = get_tmp_filename(prefix='InfernalAlignerTests_',
                                       suffix='.log')
        open(self.log_fp, 'w').close()

        self._paths_to_clean_up = [
            self.infernal_test1_input_fp,
            self.result_fp,
            self.log_fp,
            self.infernal_test1_template_fp,
        ]

        self.infernal_test1_aligner = InfernalAligner({
            'template_filepath':
            self.infernal_test1_template_fp,
        })
        self.infernal_test1_expected_aln = LoadSeqs(
            data=infernal_test1_expected_alignment, aligned=Alignment,
            moltype=DNA)
Beispiel #10
0
def ace_for_picrust(tree_path,trait_table_path,method='pic',HALT_EXEC=False):
    '''Run the Ace application controller and load its two output tables.

    tree_path: path to the tree file (double-quoted before being passed on)
    trait_table_path: path to the trait table (double-quoted likewise)
    method: passed to Ace as the third space-separated argument
    HALT_EXEC: forwarded to the Ace controller constructor

    Returns a tuple (asr_table, asr_prob_table), both loaded with LoadTable
    from tab-separated temp files whose paths are handed to Ace.

    Raises RuntimeError (carrying the controller's StdErr) when the first
    output table cannot be read.
    '''
    #initialize Ace app controller
    ace=Ace(HALT_EXEC=HALT_EXEC)

    tmp_output_count_path=get_tmp_filename()
    tmp_output_prob_path=get_tmp_filename()

    #quote file names
    tree_path='"{0}"'.format(tree_path)
    trait_table_path='"{0}"'.format(trait_table_path)

    as_string = " ".join([tree_path,trait_table_path,method,
                          tmp_output_count_path,tmp_output_prob_path])
    try:
        #Run ace here
        result = ace(data=as_string)

        #Load the output into Table objects
        try:
            asr_table=LoadTable(filename=tmp_output_count_path,header=True,
                                sep='\t')
        except IOError:
            # Call-style raise is valid on both Python 2 and Python 3.
            raise RuntimeError(
                "R reported an error on stderr:"
                " %s" % "\n".join(result["StdErr"].readlines()))

        asr_prob_table=LoadTable(filename=tmp_output_prob_path,header=True,
                                 sep='\t')
    finally:
        #Remove tmp files, also on the error path where they used to leak
        for tmp_path in (tmp_output_count_path, tmp_output_prob_path):
            try:
                remove(tmp_path)
            except OSError:
                # The file was never created; nothing to clean up.
                pass

    return asr_table,asr_prob_table
Beispiel #11
0
    def setUp(self):
        """Write the tree and trait fixtures for the tests to temp files.

        All four created paths are listed in self.files_to_remove; the
        original list omitted in_tree2_fp and in_trait3_fp, so those two
        files were never registered for removal.
        """
        #create a tmp tree file
        self.in_tree1_fp = get_tmp_filename(prefix='CountTests',suffix='.nwk')
        self.in_tree1_file = open(self.in_tree1_fp,'w')
        self.in_tree1_file.write(in_tree1)
        self.in_tree1_file.close()

        #create a tmp tree file (with quoted tip names)
        self.in_tree2_fp = get_tmp_filename(prefix='CountTests',suffix='.nwk')
        self.in_tree2_file = open(self.in_tree2_fp,'w')
        self.in_tree2_file.write(in_tree2)
        self.in_tree2_file.close()

        #create a tmp trait file
        self.in_trait1_fp = get_tmp_filename(prefix='CountTests',suffix='.tsv')
        self.in_trait1_file=open(self.in_trait1_fp,'w')
        self.in_trait1_file.write(in_trait1)
        self.in_trait1_file.close()

        #create a tmp trait file (with quoted tip names)
        self.in_trait3_fp = get_tmp_filename(prefix='CountTests',suffix='.tsv')
        self.in_trait3_file=open(self.in_trait3_fp,'w')
        self.in_trait3_file.write(in_trait3)
        self.in_trait3_file.close()

        # Every file created above must be listed here.
        # NOTE(review): assumes tearDown deletes files_to_remove -- confirm.
        self.files_to_remove = [
            self.in_tree1_fp,
            self.in_trait1_fp,
            self.in_tree2_fp,
            self.in_trait3_fp,
        ]
Beispiel #12
0
 def setUp(self):
     """Create the uclust input fixtures (unsorted, sorted, .uc) on disk."""

     def write_joined(suffix, lines):
         # Write the newline-joined lines to a fresh 'uclust_test' temp
         # file and hand back its path.
         fixture_path = get_tmp_filename(prefix="uclust_test", suffix=suffix)
         fixture_f = open(fixture_path, "w")
         fixture_f.write('\n'.join(lines))
         fixture_f.close()
         return fixture_path

     self.tmp_unsorted_fasta_filepath = write_joined(".fasta", raw_dna_seqs)
     self.tmp_sorted_fasta_filepath = write_joined(".fasta", sorted_dna_seqs)
     self.tmp_uc_filepath = write_joined(".uc", uc_dna_clusters)

     # Path only -- no file is written for the .clstr output in setUp.
     self.tmp_clstr_filepath = get_tmp_filename(
         prefix="uclust_test", suffix=".clstr")

     self.WorkingDir = '/tmp/uclust_test'
     self.tmpdir = '/tmp/'

     self.files_to_remove = [
         self.tmp_unsorted_fasta_filepath,
         self.tmp_sorted_fasta_filepath,
         self.tmp_uc_filepath,
         self.tmp_clstr_filepath,
     ]
Beispiel #13
0
    def setUp(self):
        """Create the temp tree and trait files the tests read.

        For every fixture the path is kept as self.<name>_fp and the closed
        file object as self.<name>_file; self.files_to_remove lists all
        five paths.
        """

        def make_tmp_file(suffix, text):
            # Write text to a new 'AceTests' temp file; return (path, handle).
            tmp_path = get_tmp_filename(prefix='AceTests', suffix=suffix)
            tmp_file = open(tmp_path, 'w')
            tmp_file.write(text)
            tmp_file.close()
            return tmp_path, tmp_file

        # two trees: the second has underscores in its tip names
        self.in_tree1_fp, self.in_tree1_file = make_tmp_file('.nwk', in_tree1)
        self.in_tree2_fp, self.in_tree2_file = make_tmp_file('.nwk', in_tree2)

        # three trait tables: the second covers the single-column case and
        # the third has underscores in its tip names
        self.in_trait1_fp, self.in_trait1_file = \
            make_tmp_file('.tsv', in_trait1)
        self.in_trait2_fp, self.in_trait2_file = \
            make_tmp_file('.tsv', in_trait2)
        self.in_trait3_fp, self.in_trait3_file = \
            make_tmp_file('.tsv', in_trait3)

        self.files_to_remove = [
            self.in_tree1_fp,
            self.in_trait1_fp,
            self.in_trait2_fp,
            self.in_trait3_fp,
            self.in_tree2_fp,
        ]
Beispiel #14
0
def ace_for_picrust(tree_path,trait_table_path,method='pic',HALT_EXEC=False):
    '''Run Ace on a tree and trait table; return the two result tables.

    tree_path: path to the tree file; wrapped in double quotes for Ace
    trait_table_path: path to the trait table; wrapped in double quotes
    method: third argument on the Ace input string (default 'pic')
    HALT_EXEC: passed to the Ace controller constructor

    Returns (asr_table, asr_prob_table), each read with LoadTable from a
    tab-separated temp file that Ace is asked to write.

    Raises RuntimeError, including the captured StdErr, if the first output
    table cannot be opened.
    '''
    #initialize Ace app controller
    ace=Ace(HALT_EXEC=HALT_EXEC)

    tmp_output_count_path=get_tmp_filename()
    tmp_output_prob_path=get_tmp_filename()

    #quote file names
    tree_path='"{0}"'.format(tree_path)
    trait_table_path='"{0}"'.format(trait_table_path)

    as_string = " ".join([tree_path,trait_table_path,method,
                          tmp_output_count_path,tmp_output_prob_path])
    try:
        #Run ace here
        result = ace(data=as_string)

        #Load the output into Table objects
        try:
            asr_table=LoadTable(filename=tmp_output_count_path,header=True,
                                sep='\t')
        except IOError:
            # "raise X, msg" is Python-2-only syntax; the call form works
            # on both major versions and produces the same exception.
            raise RuntimeError(
                "R reported an error on stderr:"
                " %s" % "\n".join(result["StdErr"].readlines()))

        asr_prob_table=LoadTable(filename=tmp_output_prob_path,header=True,
                                 sep='\t')
    finally:
        #Remove tmp files whether or not loading succeeded
        for tmp_path in (tmp_output_count_path, tmp_output_prob_path):
            try:
                remove(tmp_path)
            except OSError:
                # Output was never written, so there is nothing to delete.
                pass

    return asr_table,asr_prob_table
Beispiel #15
0
def uclust_search_and_align_from_fasta_filepath(
    query_fasta_filepath,
    subject_fasta_filepath,
    percent_ID=0.75,
    enable_rev_strand_matching=True,
    max_accepts=8,
    max_rejects=32,
    HALT_EXEC=False):
    """ query seqs against subject fasta using uclust,

       return global pw alignment of best match

       This is a generator: it yields each item produced by
       process_uclust_pw_alignment_results for the uclust output. The temp
       files held by the application result are cleaned up when iteration
       finishes, when the generator is closed early, or when an error is
       raised while results are being produced.
    """

    # Explanation of parameter settings
    #  id - min percent id to count a match
    #  maxaccepts = 8 , searches for best match rather than first match
    #                   (0 => infinite accepts, or good matches before
    #                    quitting search)
    #  maxrejects = 32,
    #  libonly = True , does not add sequences to the library if they don't
    #                   match something there already. this effectively makes
    #                   uclust a search tool rather than a clustering tool

    params = {'--id':percent_ID,
              '--maxaccepts':max_accepts,
              '--maxrejects':max_rejects,
              '--libonly':True,
              '--lib':subject_fasta_filepath}

    if enable_rev_strand_matching:
        params['--rev'] = True

    # instantiate the application controller
    app = Uclust(params,HALT_EXEC=HALT_EXEC)

    # apply uclust
    alignment_filepath = \
     get_tmp_filename(prefix='uclust_alignments',suffix='.fasta')
    uc_filepath = \
     get_tmp_filename(prefix='uclust_results',suffix='.uc')
    input_data = {'--input':query_fasta_filepath,
                  '--fastapairs':alignment_filepath,
                  '--uc':uc_filepath}
    app_result = app(input_data)

    try:
        # yield the pairwise alignments
        for result in process_uclust_pw_alignment_results(
         app_result['PairwiseAlignments'],app_result['ClusterFile']):
            yield result
    finally:
        # clean up the temp files that were generated. A finally block also
        # runs on GeneratorExit (early close) and on parser errors, which
        # the previous "except GeneratorExit: break" did not cover.
        app_result.cleanUp()
Beispiel #16
0
def cdhit_clusters_from_seqs(seqs, moltype, params=None):
    """Returns the CD-HIT clusters given seqs

    seqs        : dict like collection of sequences
    moltype     : cogent.core.moltype object
    params      : cd-hit parameters; the dict passed in is not modified

    Returns a list of clusters, each a list of the original sequence ids.

    NOTE: This method will call CD_HIT if moltype is PROTEIN,
        CD_HIT_EST if moltype is RNA/DNA, and raise ValueError if any
        other moltype is passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seqs.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)

    # setup params and make sure the output argument is set. Work on a copy
    # so the caller's dict never silently gains a '-o' entry that a later
    # call would then reuse.
    params = {} if params is None else dict(params)
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # call the correct version of cd-hit base on moltype
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA or moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        # Call-style raise works on both Python 2 and Python 3.
        raise ValueError("Moltype must be either PROTEIN, RNA, or DNA")

    # grab result
    res = app(int_map.toFasta())
    clusters = parse_cdhit_clstr_file(res['CLSTR'].readlines())

    # translate the abbreviated ids back to the caller's original ids
    remapped_clusters = [[int_keys[i] for i in c] for c in clusters]

    # perform cleanup
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return remapped_clusters
Beispiel #17
0
    def setUp(self):
        """Build the tree and trait fixtures and write the trait files."""
        # Trees parsed from Newick strings.
        self.SimpleTree = DndParser(
            "((A:0.02,B:0.01)E:0.05,(C:0.01,D:0.01)F:0.05)root;")

        self.SimplePolytomyTree = DndParser(
            "((A:0.02,B:0.01,B_prime:0.03)E:0.05,(C:0.01,D:0.01)F:0.05)root;")

        self.SimpleTreeTraits = {
            "A": [1.0, 1.0],
            "E": [1.0, 1.0],
            "F": [0.0, 1.0],
            "D": [0.0, 0.0],
        }

        self.PartialReconstructionTree = DndParser(
            "((((B:0.01,C:0.01)I3:0.01,A:0.01)I2:0.01,D:0.01)I1:0.01)root;")

        self.CloseToI3Tree = DndParser(
            "((((B:0.01,C:0.95)I3:0.01,A:0.01)I2:0.95,D:0.05)I1:0.95)root;")

        self.CloseToI1Tree = DndParser(
            "((((B:0.95,C:0.95)I3:0.95,A:0.01)I2:0.02,D:0.05)I1:0.05)root;")

        self.BetweenI3AndI1Tree = DndParser(
            "((((B:0.01,C:0.1)I3:0.02,A:0.01)I2:0.02,D:0.05)I1:0.02)root;")

        self.PartialReconstructionTraits = {
            "B": [1.0, 1.0],
            "C": [1.0, 1.0],
            "I3": [1.0, 1.0],
            "I1": [0.0, 1.0],
            "D": [0.0, 1.0],
        }

        self.GeneCountTraits = {
            "B": [1.0, 1.0],
            "C": [1.0, 2.0],
            "I3": [1.0, 1.0],
            "I1": [0.0, 3.0],
            "D": [0.0, 5.0],
        }

        def write_trait_file(contents):
            # Write contents to a new temp .tsv; return (path, closed handle).
            path = get_tmp_filename(prefix='Predict_Traits_Tests',
                                    suffix='.tsv')
            handle = open(path, 'w')
            handle.write(contents)
            handle.close()
            return path, handle

        # trait file
        self.in_trait1_fp, self.in_trait1_file = write_trait_file(in_trait1)

        # trait file with the columns in a different order
        self.in_trait2_fp, self.in_trait2_file = write_trait_file(in_trait2)

        # trait file with an incorrect trait name
        self.in_bad_trait_fp, self.in_bad_trait_file = \
            write_trait_file(in_bad_trait)

        self.files_to_remove = [
            self.in_trait1_fp,
            self.in_trait2_fp,
            self.in_bad_trait_fp,
        ]
Beispiel #18
0
    def setUp(self):
        """Write the sequence and alignment fixtures, then load them back."""

        def write_fasta(contents):
            # Write contents to a new temp .fasta; return (path, closed handle).
            path = get_tmp_filename(prefix="FormatDbTests", suffix=".fasta")
            handle = open(path, "w")
            handle.write(contents)
            handle.close()
            return path, handle

        # unaligned sequences
        self.in_seqs1_fp, self.in_seqs1_file = write_fasta(in_seqs1)
        self.in_seqs1 = LoadSeqs(self.in_seqs1_fp, aligned=False)
        self.test_seq = test_seq

        # alignment, loaded with LoadSeqs' default 'aligned' setting
        self.in_aln1_fp, self.in_aln1_file = write_fasta(in_aln1)
        self.in_aln1 = LoadSeqs(self.in_aln1_fp)

        self.files_to_remove = [self.in_seqs1_fp, self.in_aln1_fp]
    def setUp(self):
        """Prepare input files, output locations and a timeout for a test.

        Copies the Fasting_subset.sff support file into a fresh temp
        directory, writes the mapping file, parses the workflow parameters
        and arms a SIGALRM alarm. Every file and directory that should be
        deleted afterwards is collected in self.files_to_remove and
        self.dirs_to_remove.
        """
        
        self.qiime_config = load_qiime_config()
        self.dirs_to_remove = []
        self.files_to_remove = []
        
        #this is specific to the web-apps only
        # The sff fixture lives in 'support_files' next to this test module.
        test_dir = abspath(dirname(__file__))
        sff_original_fp = os.path.join(test_dir, 'support_files', \
                                        'Fasting_subset.sff')

        # copy sff file to working directory
        self.sff_dir = tempfile.mkdtemp()
        self.dirs_to_remove.append(self.sff_dir)
        
        self.sff_fp = os.path.join(self.sff_dir, 'Fasting_subset.sff')
        copy(sff_original_fp, self.sff_fp)
        self.files_to_remove.append(self.sff_fp)
        
        # Fall back to '/tmp/' when the config has no (or an empty) temp_dir.
        tmp_dir = self.qiime_config['temp_dir'] or '/tmp/'
        if not exists(tmp_dir):
            makedirs(tmp_dir)
            # if test creates the temp dir, also remove it
            self.dirs_to_remove.append(tmp_dir)
        
        # Only a name is generated here; setUp does not create wf_out.
        # NOTE(review): presumably the workflow under test creates this
        # directory -- it is registered as a directory to remove.
        self.wf_out = get_tmp_filename(tmp_dir=tmp_dir,
         prefix='qiime_wf_out',suffix='',result_constructor=str)
        self.dirs_to_remove.append(self.wf_out)
        
        # Write the module-level fasting_map string to a temp mapping file.
        self.fasting_mapping_fp = get_tmp_filename(tmp_dir=tmp_dir,
         prefix='qiime_wf_mapping',suffix='.txt')
        fasting_mapping_f = open(self.fasting_mapping_fp,'w')
        fasting_mapping_f.write(fasting_map)
        
        fasting_mapping_f.close()
        self.files_to_remove.append(self.fasting_mapping_fp)
        
        # Fall back to './' when the config has no (or an empty) working_dir.
        working_dir = self.qiime_config['working_dir'] or './'
        jobs_dir = join(working_dir,'jobs')
        if not exists(jobs_dir):
            # only clean up the jobs dir if it doesn't already exist
            self.dirs_to_remove.append(jobs_dir)
        self.params = parse_qiime_parameters(qiime_parameters_f.split('\n'))

        # Install `timeout` as the SIGALRM handler before arming the alarm.
        signal.signal(signal.SIGALRM, timeout)
        # set the 'alarm' to go off in allowed_seconds_per_test seconds
        signal.alarm(allowed_seconds_per_test)
Beispiel #20
0
def check_chimera(refseqs, target_id, target_seq):
    """Run Bel3 on refseqs plus the target and return the parsed result.

    refseqs : something like a dict {id:seq}; it is copied, not modified
    target_id : target sequence id, string; must not already be in refseqs
    target_seq : the actual target sequence

    expects the refseqs and target seq to both be aligned against same ref

    Returns whatever parse_bel3_result produces for the 'B3out' output.
    """
    assert target_id not in refseqs

    # Add the target to a copy so the caller's mapping stays untouched.
    all_seqs = refseqs.copy()
    all_seqs[target_id] = target_seq

    bel3_params = {
        '-o': get_tmp_filename(),
        '-w': 400,
        '-t': target_id,
        '-f': 'full',
        '-c': 'Huber-Hugenholtz',
    }
    bel3 = Bel3(InputHandler='_input_as_seqs', params=bel3_params,
                HALT_EXEC=False)
    bel3_result = bel3(all_seqs)

    # Parse before cleanUp(), which is called on the same result object.
    chimera_score = parse_bel3_result(bel3_result['B3out'])
    bel3_result.cleanUp()

    return chimera_score
Beispiel #21
0
    def test_create_bwa_index_from_fasta_file(self):
        """Test create_bwa_index_from_fasta_file

        Makes sure that the file paths are as expected.
        """

        # get a new temp file for the input fasta
        fasta_in = get_tmp_filename(suffix=".fna")
        # write the test fasta (see end of this file) to the temp file
        with open(fasta_in, 'w') as fasta:
            fasta.write(test_fasta)

        # make sure to remove this fasta file upon tearDown
        self.files_to_remove.append(fasta_in)

        # run the function
        results = create_bwa_index_from_fasta_file(fasta_in, {})

        # for each of the output files (not counting stdout, stderr, and
        # the exitStatus), make sure the file paths are as expected.
        # .items() works on both Python 2 and Python 3.
        for filetype, result in results.items():
            # ('ExitStatus') is just a parenthesised string, so the old
            # "not in" was a substring test; compare for equality instead.
            if filetype == 'ExitStatus':
                continue
            # everything else has a file name that must be removed
            self.files_to_remove.append(result.name)
            if filetype not in ('StdOut', 'StdErr'):
                self.assertEqual(fasta_in + filetype, result.name)
Beispiel #22
0
 def setUp(self):
     """Write the seqs_for_muscle fixture to a temp FASTA file."""
     self.input_fp = get_tmp_filename(
         prefix='CogentAlignerTests_', suffix='.fasta')
     # 'with' closes (and therefore flushes) the file before it is used;
     # the bare open(...).write(...) never closed the handle.
     with open(self.input_fp, 'w') as input_f:
         input_f.write(seqs_for_muscle)

     self._paths_to_clean_up = [self.input_fp]
Beispiel #23
0
    def setUp(self):
        """Create the temp FASTA input file from seqs_for_muscle."""
        self.input_fp = get_tmp_filename(
            prefix='CogentAlignerTests_', suffix='.fasta')
        # Close the handle deterministically so the content is on disk
        # before any test reads it.
        with open(self.input_fp, 'w') as fasta_out:
            fasta_out.write(seqs_for_muscle)

        self._paths_to_clean_up = [self.input_fp]
Beispiel #24
0
def build_blast_db_from_fasta_file(fasta_file,is_protein=False,\
    output_dir=None,HALT_EXEC=False):
    """Build blast db from fasta_file; return db name and list of files created

        **If using to create temporary blast databases, you can call
        cogent.util.misc.remove_files(db_filepaths) to clean up all the
        files created by formatdb when you're done with the database.

        fasta_file: iterable of fasta lines (e.g. an open file); each line
         is stripped and copied to a temporary fasta file
        is_protein: True if working on protein seqs (default: False)
        output_dir: directory where the temporary fasta file is written
         (default: '.')
        HALT_EXEC: halt just before running the formatdb command and
         print the command -- useful for debugging

        The temporary fasta path is appended to the returned file list.
    """
    target_dir = output_dir or '.'
    tmp_fasta_path = get_tmp_filename(
        tmp_dir=target_dir, prefix="BLAST_temp_db_", suffix=".fasta")

    # Copy the input into the temp file, one stripped line per row.
    tmp_fasta = open(tmp_fasta_path, 'w')
    tmp_fasta.writelines('%s\n' % line.strip() for line in fasta_file)
    tmp_fasta.close()

    # output_dir=None is passed on deliberately unchanged, leaving the
    # location of the db files to build_blast_db_from_fasta_path.
    db_name, created_paths = build_blast_db_from_fasta_path(
        tmp_fasta_path, is_protein=is_protein, output_dir=None,
        HALT_EXEC=HALT_EXEC)

    created_paths.append(tmp_fasta_path)

    return db_name, created_paths
Beispiel #25
0
def build_blast_db_from_seqs(seqs,is_protein=False,\
    output_dir='./',HALT_EXEC=False):
    """Build blast db from seqs; return db name and list of files created

        **If using to create temporary blast databases, you can call
        cogent.util.misc.remove_files(db_filepaths) to clean up all the
        files created by formatdb when you're done with the database.

        seqs: sequence collection or alignment object
        is_protein: True if working on protein seqs (default: False)
        output_dir: directory where output should be written
         (default: current directory)
        HALT_EXEC: halt just before running the formatdb command and
         print the command -- useful for debugging

        The intermediate fasta file is removed before returning, and also
        when writing it or building the database raises.
    """

    # Build a temp filepath
    tmp_fasta_filepath = get_tmp_filename(\
     prefix='Blast_tmp_db',suffix='.fasta')
    # open the temp file (outside the try: if this fails, no file exists)
    tmp_fasta_file = open(tmp_fasta_filepath,'w')
    try:
        # write the sequence collection to file
        with tmp_fasta_file:
            tmp_fasta_file.write(seqs.toFasta())

        # build the blast database
        db_name, db_filepaths = build_blast_db_from_fasta_path(\
         tmp_fasta_filepath,is_protein=is_protein,\
         output_dir=output_dir,HALT_EXEC=HALT_EXEC)
    finally:
        # clean-up the temporary file, even on failure
        remove(tmp_fasta_filepath)

    # return the results
    return db_name, db_filepaths
Beispiel #26
0
def build_blast_db_from_seqs(seqs,is_protein=False,\
    output_dir='./',HALT_EXEC=False):
    """Build blast db from seqs; return db name and list of files created

        The sequences are written to a temporary fasta file, the database
        is built from that file with build_blast_db_from_fasta_path, and
        the temporary fasta file is removed afterwards.

        **If using to create temporary blast databases, you can call
        cogent.util.misc.remove_files(db_filepaths) to clean up all the
        files created by formatdb when you're done with the database.

        seqs: sequence collection or alignment object (its toFasta()
         method is used to serialise the sequences)
        is_protein: True if working on protein seqs (default: False)
        output_dir: directory where output should be written
         (default: current directory)
        HALT_EXEC: halt just before running the formatdb command and
         print the command -- useful for debugging

        Returns the (db_name, db_filepaths) pair produced by
        build_blast_db_from_fasta_path.
    """
    tmp_fasta_filepath = get_tmp_filename(\
     prefix='Blast_tmp_db',suffix='.fasta')

    # the context manager closes the handle even when toFasta() or the
    # write itself raises
    with open(tmp_fasta_filepath, 'w') as tmp_fasta_file:
        tmp_fasta_file.write(seqs.toFasta())

    # build the blast database
    db_name, db_filepaths = build_blast_db_from_fasta_path(\
     tmp_fasta_filepath,is_protein=is_protein,\
     output_dir=output_dir,HALT_EXEC=HALT_EXEC)

    # the fasta file was only needed as input for the database build
    remove(tmp_fasta_filepath)

    return db_name, db_filepaths
def make_torque_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """Write one qsub job script per command; return the script paths.

    commands: list of commands; each command gets its own job script

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submission scripts are written
              (handed to create_dir before any script is written)

    walltime: the maximal walltime

    ncpus: number of cpus

    nodes: number of nodes

    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither

    Returns the list of script file paths, in the order of commands.
    """
    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        job_name = get_tmp_filename(tmp_dir=jobs_dir, prefix=job_prefix+"_",
                                    suffix=".txt")
        # format first, so a formatting error does not leave an empty
        # script file behind
        job_text = QSUB_TEXT % (walltime, ncpus, nodes, queue, job_prefix,
                                keep_output, command)
        with open(job_name, "w") as out_fh:
            out_fh.write(job_text)
        filenames.append(job_name)
    return filenames
    def test_call_output_to_file(self):
        """BlastTaxonAssigner.__call__ writes assignments to result_path
        """
        out_fp = get_tmp_filename(prefix='BlastTaxonAssignerTests_',
                                  suffix='.fasta')
        self._paths_to_clean_up.append(out_fp)

        assigner_params = {
            'reference_seqs_filepath': self.reference_seqs_fp,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        }
        assigner = BlastTaxonAssigner(assigner_params)
        return_value = assigner(self.input_seqs_fp, result_path=out_fp)

        # lines are compared as sets, so their order in the file is not
        # part of the check
        expected_lines = set([
            's1\tArchaea;Euryarchaeota;Halobacteriales;uncultured\t0.0\tAY800210\n',
            's2\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.\t0.0\tEU883771\n',
            's3\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503699\n',
            's4\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium\t0.0\tDQ260310\n',
            's5\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503697\n',
            's6\tNo blast hit\tNone\tNone\n',
        ])
        result_f = open(out_fp)
        observed_lines = set(result_f.readlines())
        result_f.close()
        self.assertEqual(observed_lines, expected_lines)

        # When result_path is given the call returns None (it is still
        # undecided whether returning the results as well would be
        # preferable, e.g. for logging)
        self.assertEqual(return_value, None)
Beispiel #29
0
def build_blast_db_from_fasta_file(fasta_file,is_protein=False,\
    output_dir=None,HALT_EXEC=False):
    """Build blast db from fasta_file; return db name and list of files created

        **If using to create temporary blast databases, you can call
        cogent.util.misc.remove_files(db_filepaths) to clean up all the
        files created by formatdb when you're done with the database.

        fasta_file: open file (or any iterable of lines) holding the
         fasta-formatted sequences to build the database from
        is_protein: True if working on protein seqs (default: False)
        output_dir: directory where the temporary fasta copy is written
         (default: current directory)
        HALT_EXEC: halt just before running the formatdb command and
         print the command -- useful for debugging

        The lines of fasta_file are copied, stripped of surrounding
        whitespace, into a temporary fasta file. The path of that file is
        appended to the returned list of files so that it can be removed
        together with the database files.
    """
    output_dir = output_dir or '.'
    fasta_path = get_tmp_filename(\
     tmp_dir=output_dir, prefix="BLAST_temp_db_", suffix=".fasta")

    # the context manager closes the copy even if reading fasta_file fails
    with open(fasta_path, 'w') as fasta_f:
        for line in fasta_file:
            fasta_f.write('%s\n' % line.strip())

    # NOTE(review): output_dir is not forwarded here; presumably the
    # path-based builder then writes next to fasta_path -- confirm
    blast_db, db_filepaths = build_blast_db_from_fasta_path(\
     fasta_path, is_protein=is_protein, output_dir=None, HALT_EXEC=HALT_EXEC)

    db_filepaths.append(fasta_path)

    return blast_db, db_filepaths
    def test_assign_taxonomy_file_output(self):
        """ assign_taxonomy wrapper writes correct file output when requested

            This function tests for successful completion of assign_taxonomy
             when writing to file, and that each line of the file starts
             with the expected seq id. Lines and expected ids are paired
             with zip, which stops at the shorter of the two, so the number
             of lines written is not itself checked.
             Actual testing of taxonomy data is performed elsewhere.

        """
        output_fp = get_tmp_filename(\
         prefix='RDPAssignTaxonomyTests',suffix='.txt')
        # sorted seq ids, to be matched against the sorted file lines
        expected_file_headers = sorted(self.expected_assignments1.keys())

        actual_return_value = assign_taxonomy(\
         self.test_input1,min_confidence=0.95,output_fp=output_fp)

        # read through a context manager so the handle is closed before
        # the file is removed
        with open(output_fp) as output_f:
            actual_file_output = sorted(output_f)

        # remove the output_fp before running the tests, so if they
        # fail the output file is still cleaned-up
        remove(output_fp)

        # None return value on write to file
        self.assertEqual(actual_return_value,None)

        # check that each line starts with the correct seq_id -- not
        # checking the taxonomies or confidences here as these are variable and
        # tested elsewhere
        for a,e in zip(actual_file_output,expected_file_headers):
            self.assertTrue(a.startswith(e))
Beispiel #31
0
def dotur_from_file(distance_matrix_file_path, params=None):
    """Returns dotur results given a distance matrix file.

        - distance_matrix_file_path:  Path to distance matrix file.  This file
             must be a PHYLIP formatted square distance matrix.  This format
             is available in cogent.format.table.
             - IMPORTANT NOTE:  This distance matrix format allows only 10
                characters for the row labels in the distance matrix.  Also,
                the IDs must be unique and ungapped to be useful when using
                dotur.
        - params:  parameters handed to the Dotur app controller.
        - NOTE:  This function will only return the parsed *.list file, as
            it contains the OTU identities.
            Dotur generates 23 output files, so if this is not the one you
            are looking for, check out the documentation and add the others
            to the result path.
    """
    # Read out the data from the distance_matrix_file_path.
    # This is important so we can run dotur in a temp directory and avoid
    # having to handle all 23 output files.
    with open(distance_matrix_file_path, 'U') as d_matrix_f:
        d_matrix_string = d_matrix_f.read()

    working_dir = get_tmp_filename(suffix='')
    app = Dotur(InputHandler='_input_as_multiline_string',\
        WorkingDir=working_dir,params=params)

    res = app(d_matrix_string)

    otu_list = OtuListParser(res['List'].readlines())

    # the whole working directory is discarded once the list is parsed
    shutil.rmtree(app.WorkingDir)

    return otu_list
Beispiel #32
0
    def setUp(self):
        """Load the query seqs and write reference seqs and tree to /tmp."""
        # paths and directories registered for cleanup
        self._paths_to_clean_up = []
        self._dirs_to_clean_up = []

        # query sequences
        self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

        # temp filename; its extension-less form is the stem of both files
        tmp_dir = '/tmp'
        self.outfile = get_tmp_filename(tmp_dir)
        stem = splitext(self.outfile)[0]

        # reference sequence file
        self.outfasta = stem + '.fasta'
        ref_seqs_f = open(self.outfasta, 'w')
        ref_seqs_f.write(REF_SEQS)
        ref_seqs_f.close()
        self._paths_to_clean_up.append(self.outfasta)

        # starting tree file
        self.outtree = stem + '.tree'
        ref_tree_f = open(self.outtree, 'w')
        ref_tree_f.write(REF_TREE)
        ref_tree_f.close()
        self._paths_to_clean_up.append(self.outtree)
Beispiel #33
0
def R_format_otu_table(otu_filepath, output_dir=None, write_to_tmp_file=True):
    """Formats OTU table for R (remove comments & column 1 header)

       otu_filepath: path to the OTU table, parsed with parse_otu_table
       output_dir: if given, the formatted table is written to
        'otus_R_format.txt' in this directory; otherwise a temp file
        name is generated
       write_to_tmp_file: if True, write the formatted table to file and
        return its path; else return the list of lines without writing
    """
    # close the input handle as soon as the table has been parsed
    with open(otu_filepath,'U') as otu_f:
        sample_ids, otu_ids, otu_matrix, lineages = \
            parse_otu_table(otu_f.readlines())
    # first line is sample ids, no header for first column (how R likes it)
    lines = ['\t'.join(sample_ids)]
    for i, otu_id in enumerate(otu_ids):
        # note: casting array as a string and calling "split" is much faster
        # than mapping "str" onto the array
        # NOTE(review): if otu_matrix is a numpy array, str() abbreviates
        # rows above numpy's print threshold with '...' -- confirm tables
        # never have that many samples
        array_as_strings = str(otu_matrix[i, :])[1:-1].split()
        lines.append(otu_id + '\t' + '\t'.join(array_as_strings))
    if not write_to_tmp_file:
        return lines

    if output_dir is None:
        tmp_fp = get_tmp_filename(prefix='otus_R_format', suffix='.txt')
    else:
        tmp_fp = join(output_dir, 'otus_R_format.txt')
    with open(tmp_fp, 'w') as fout:
        fout.write('\n'.join(lines))
    return tmp_fp
 def test_single_file_upgma(self):
     """ single_file_upgma should throw no errors"""

     titles = ['hi','ho']
     distdata = numpy.array([[0,.5],[.5,0.]])
     fname = get_tmp_filename(prefix='upgma_',suffix='.txt')
     self._paths_to_clean_up.append(fname)
     with open(fname,'w') as f:
         f.write(format_distance_matrix(titles, distdata))

     fname2 = get_tmp_filename(prefix='upgma_',suffix='.txt',
         result_constructor=str)
     self._paths_to_clean_up.append(fname2)
     single_file_upgma(fname,fname2)
     # assertTrue instead of a bare assert, which is stripped under -O
     self.assertTrue(os.path.exists(fname2))
    def setUp(self):
        """Prepare config, output directories, parameters and the timeout."""
        self.qiime_config = load_qiime_config()
        self.dirs_to_remove = []
        self.files_to_remove = []

        #this is specific to the web-apps only
        here = abspath(dirname(__file__))
        self.fna_original_fp = os.path.join(here, 'support_files', 'test.fna')

        scratch_dir = self.qiime_config['temp_dir'] or '/tmp/'
        if not exists(scratch_dir):
            makedirs(scratch_dir)
            # the temp dir is only removed again if this test created it
            self.dirs_to_remove.append(scratch_dir)

        # workflow output directory, created (and registered for removal)
        # only when it does not exist yet
        self.wf_out = get_tmp_filename(tmp_dir=scratch_dir,
                                       prefix='qiime_wf_out',
                                       suffix='',
                                       result_constructor=str)
        if not exists(self.wf_out):
            makedirs(self.wf_out)
            self.dirs_to_remove.append(self.wf_out)

        work_dir = self.qiime_config['working_dir'] or './'
        jobs_path = join(work_dir, 'jobs')
        if not exists(jobs_path):
            # a pre-existing jobs dir is left alone at cleanup
            self.dirs_to_remove.append(jobs_path)

        self.params = parse_qiime_parameters(qiime_parameters_f.split('\n'))

        # arm the alarm that goes off after allowed_seconds_per_test seconds
        signal.signal(signal.SIGALRM, timeout)
        signal.alarm(allowed_seconds_per_test)
Beispiel #36
0
def uclust_cluster_from_sorted_fasta_filepath(
    fasta_filepath,
    uc_save_filepath=None,
    percent_ID=0.97,
    max_accepts=1,
    max_rejects=8,
    optimal=False,
    exact=False,
    suppress_sort=False,
    enable_rev_strand_matching=False,
    subject_fasta_filepath=None,
    suppress_new_clusters=False,
    stable_sort=False,
    HALT_EXEC=False):
    """Run uclust on a sorted fasta file; return the app result.

    The .uc output goes to uc_save_filepath when one is given, otherwise
    to a generated temp file name.
    """
    if uc_save_filepath:
        output_filepath = uc_save_filepath
    else:
        output_filepath = get_tmp_filename(prefix='uclust_clusters',
                                           suffix='.uc')

    app = Uclust({'--id': percent_ID,
                  '--maxaccepts': max_accepts,
                  '--maxrejects': max_rejects},
                 HALT_EXEC=HALT_EXEC)

    # (requested?, uclust flag, arguments for .on()) for every optional
    # setting, switched on in this order
    optional_settings = (
        (enable_rev_strand_matching, '--rev', ()),
        (optimal, '--optimal', ()),
        (exact, '--exact', ()),
        (suppress_sort, '--usersort', ()),
        (subject_fasta_filepath, '--lib', (subject_fasta_filepath,)),
        (suppress_new_clusters, '--libonly', ()),
        (stable_sort, '--stable_sort', ()),
    )
    for requested, flag, flag_args in optional_settings:
        if requested:
            app.Parameters[flag].on(*flag_args)

    return app({'--input': fasta_filepath, '--uc': output_filepath})
Beispiel #37
0
def make_torque_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """Write one qsub job script per command; return the script paths.

    commands: list of commands; each command gets its own job script

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submission scripts are written
              (handed to create_dir before any script is written)

    walltime: the maximal walltime

    ncpus: number of cpus

    nodes: number of nodes

    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither

    Returns the list of script file paths, in the order of commands.
    """
    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        job_name = get_tmp_filename(tmp_dir=jobs_dir, prefix=job_prefix+"_",
                                    suffix=".txt")
        # format first, so a formatting error does not leave an empty
        # script file behind
        job_text = QSUB_TEXT % (walltime, ncpus, nodes, queue, job_prefix,
                                keep_output, command)
        with open(job_name, "w") as out_fh:
            out_fh.write(job_text)
        filenames.append(job_name)
    return filenames
Beispiel #38
0
def dotur_from_file(distance_matrix_file_path,params=None):
    """Returns dotur results given a distance matrix file.

        - distance_matrix_file_path:  Path to distance matrix file.  This file
             must be a PHYLIP formatted square distance matrix.  This format
             is available in cogent.format.table.
             - IMPORTANT NOTE:  This distance matrix format allows only 10
                characters for the row labels in the distance matrix.  Also,
                the IDs must be unique and ungapped to be useful when using
                dotur.
        - params:  parameters handed to the Dotur app controller.
        - NOTE:  This function will only return the parsed *.list file, as
            it contains the OTU identities.
            Dotur generates 23 output files, so if this is not the one you
            are looking for, check out the documentation and add the others
            to the result path.
    """
    # Read out the data from the distance_matrix_file_path.
    # This is important so we can run dotur in a temp directory and avoid
    # having to handle all 23 output files.
    with open(distance_matrix_file_path,'U') as d_matrix_f:
        d_matrix_string = d_matrix_f.read()

    working_dir = get_tmp_filename(suffix='')
    app = Dotur(InputHandler='_input_as_multiline_string',\
        WorkingDir=working_dir,params=params)

    res = app(d_matrix_string)

    otu_list = OtuListParser(res['List'].readlines())

    # the whole working directory is discarded once the list is parsed
    shutil.rmtree(app.WorkingDir)

    return otu_list
Beispiel #39
0
 def setUp(self):
     """Load the query seqs and write reference seqs and tree to /tmp."""
     # paths and directories registered for cleanup
     self._paths_to_clean_up = []
     self._dirs_to_clean_up = []

     # query sequences
     self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

     # temp filename; its extension-less form is the stem of both files
     tmp_dir = '/tmp'
     self.outfile = get_tmp_filename(tmp_dir)
     stem = splitext(self.outfile)[0]

     # reference sequence file
     self.outfasta = stem + '.fasta'
     ref_seqs_f = open(self.outfasta, 'w')
     ref_seqs_f.write(REF_SEQS)
     ref_seqs_f.close()
     self._paths_to_clean_up.append(self.outfasta)

     # starting tree file
     self.outtree = stem + '.tree'
     ref_tree_f = open(self.outtree, 'w')
     ref_tree_f.write(REF_TREE)
     ref_tree_f.close()
     self._paths_to_clean_up.append(self.outtree)
Beispiel #40
0
    def test_create_bwa_index_from_fasta_file(self):
        """Test create_bwa_index_from_fasta_file

        Makes sure that the file paths are as expected.
        """

        # get a new temp file for the input fasta
        fasta_in = get_tmp_filename(suffix=".fna")
        # write the test fasta (see end of this file) to the temp file
        with open(fasta_in, 'w') as fasta:
            fasta.write(test_fasta)

        # make sure to remove this fasta file upon tearDown
        self.files_to_remove.append(fasta_in)

        # run the function
        results = create_bwa_index_from_fasta_file(fasta_in, {})

        # for each of the 5 output files (not counting stdout, stderr, and
        # the exitStatus), make sure the file paths are as expected.
        for filetype, result in results.items():
            # compare against the key itself: "not in ('ExitStatus')" would
            # be a substring test on a plain string, not tuple membership
            if filetype != 'ExitStatus':
                # every entry except the exit status has a .name to remove
                self.files_to_remove.append(result.name)
            if filetype not in ('StdOut', 'ExitStatus', 'StdErr'):
                self.assertEqual(fasta_in + filetype, result.name)
Beispiel #41
0
    def test_assign_taxonomy_file_output(self):
        """ assign_taxonomy wrapper writes correct file output when requested

            This function tests for successful completion of assign_taxonomy
             when writing to file, and that each line of the file starts
             with the expected seq id. Lines and expected ids are paired
             with zip, which stops at the shorter of the two, so the number
             of lines written is not itself checked.
             Actual testing of taxonomy data is performed elsewhere.

        """
        output_fp = get_tmp_filename(\
         prefix='RDPAssignTaxonomyTests',suffix='.txt')
        # sorted seq ids, to be matched against the sorted file lines
        expected_file_headers = sorted(self.expected_assignments1.keys())

        actual_return_value = assign_taxonomy(\
         self.test_input1,min_confidence=0.95,output_fp=output_fp)

        # read through a context manager so the handle is closed before
        # the file is removed
        with open(output_fp) as output_f:
            actual_file_output = sorted(output_f)

        # remove the output_fp before running the tests, so if they
        # fail the output file is still cleaned-up
        remove(output_fp)

        # None return value on write to file
        self.assertEqual(actual_return_value, None)

        # check that each line starts with the correct seq_id -- not
        # checking the taxonomies or confidences here as these are variable and
        # tested elsewhere
        for a, e in zip(actual_file_output, expected_file_headers):
            self.assertTrue(a.startswith(e))
    def test_call_output_to_file(self):
        """BlastTaxonAssigner.__call__ writes assignments to result_path
        """
        out_fp = get_tmp_filename(prefix='BlastTaxonAssignerTests_',
                                  suffix='.fasta')
        self._paths_to_clean_up.append(out_fp)

        assigner_params = {
            'reference_seqs_filepath': self.reference_seqs_fp,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        }
        assigner = BlastTaxonAssigner(assigner_params)
        return_value = assigner(self.input_seqs_fp, result_path=out_fp)

        # lines are compared as sets, so their order in the file is not
        # part of the check
        expected_lines = set([
            's1\tArchaea;Euryarchaeota;Halobacteriales;uncultured\t0.0\tAY800210\n',
            's2\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.\t0.0\tEU883771\n',
            's3\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503699\n',
            's4\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium\t0.0\tDQ260310\n',
            's5\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503697\n',
            's6\tNo blast hit\tNone\tNone\n',
        ])
        result_f = open(out_fp)
        observed_lines = set(result_f.readlines())
        result_f.close()
        self.assertEqual(observed_lines, expected_lines)

        # When result_path is given the call returns None (it is still
        # undecided whether returning the results as well would be
        # preferable, e.g. for logging)
        self.assertEqual(return_value, None)
Beispiel #43
0
    def setUp(self):
        """Write the PyNAST input/template files and build the test aligner.

        Every temporary path created here is listed in
        self._paths_to_clean_up.
        """
        def write_tmp_file(suffix, contents):
            # write contents to a freshly named temp file and return its
            # path; the handle is closed (and flushed) before returning
            fp = get_tmp_filename(
                prefix='PyNastAlignerTests_',suffix=suffix)
            with open(fp,'w') as f:
                f.write(contents)
            return fp

        self.pynast_test1_input_fp = write_tmp_file(
            '.fasta', pynast_test1_input_fasta)

        # the template as is, and variants with '.' gaps, U instead of T,
        # and lower-case characters
        self.pynast_test1_template_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta)
        self.pynast_test_template_w_dots_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.replace('-','.'))
        self.pynast_test_template_w_u_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.replace('T','U'))
        self.pynast_test_template_w_lower_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = write_tmp_file('.fasta', '')
        self.failure_fp = write_tmp_file('.fasta', '')
        self.log_fp = write_tmp_file('.log', '')

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp
            ]

        self.pynast_test1_aligner = PyNastAligner({
                'template_filepath': self.pynast_test1_template_fp,
                'min_len': 15,
                })

        self.pynast_test1_expected_aln = \
         LoadSeqs(data=pynast_test1_expected_alignment,aligned=DenseAlignment)
        self.pynast_test1_expected_fail = \
         LoadSeqs(data=pynast_test1_expected_failure,aligned=False)
Beispiel #44
0
    def setUp(self):
        """Write the unaligned and aligned test seqs to temp fasta files
        and load both back with LoadSeqs."""
        # unaligned sequences
        seqs_fp = get_tmp_filename(prefix='FormatDbTests',suffix='.fasta')
        seqs_file = open(seqs_fp, 'w')
        seqs_file.write(in_seqs1)
        seqs_file.close()
        self.in_seqs1_fp = seqs_fp
        self.in_seqs1_file = seqs_file
        self.in_seqs1 = LoadSeqs(seqs_fp, aligned=False)
        self.test_seq = test_seq

        # aligned sequences
        aln_fp = get_tmp_filename(prefix='FormatDbTests',suffix='.fasta')
        aln_file = open(aln_fp, 'w')
        aln_file.write(in_aln1)
        aln_file.close()
        self.in_aln1_fp = aln_fp
        self.in_aln1_file = aln_file
        self.in_aln1 = LoadSeqs(aln_fp)

        self.files_to_remove = [seqs_fp, aln_fp]
Beispiel #45
0
    def setUp(self):
        """Write the temporary sequence and OTU input files; set the params."""
        # create the temporary input files; the context managers close
        # the handles even if a write fails
        self.tmp_seq_filepath = get_tmp_filename(\
         prefix='GenericRepSetPickerTest_',\
         suffix='.fasta')
        with open(self.tmp_seq_filepath, 'w') as seq_file:
            seq_file.write(dna_seqs)

        self.tmp_otu_filepath = get_tmp_filename(\
         prefix='GenericRepSetPickerTest_',\
         suffix='.otu')
        with open(self.tmp_otu_filepath, 'w') as otu_file:
            otu_file.write(otus)

        self.params = {'Algorithm': 'first', 'ChoiceF': first_id}
    def setUp(self):
        """Stage the input files and output directories for the workflow tests.

        Copies the test sff file into $HOME, records the illumina and
        fasta support files, creates the temp and output directories under
        $HOME, writes the mapping file and arms the SIGALRM timeout.
        """

        self.qiime_config = load_qiime_config()
        self.dirs_to_remove = []
        self.files_to_remove = []

        #this is specific to the web-apps only
        test_dir = abspath(dirname(__file__))
        support_dir = os.path.join(test_dir, 'support_files')
        home = environ['HOME']

        # work on a copy of the sff file placed in $HOME; the copy is
        # registered for removal before it is made
        sff_original_fp = os.path.join(support_dir, 'Fasting_subset.sff')
        self.sff_fp = os.path.join('/%s/' % home, 'Fasting_subset.sff')
        self.files_to_remove.append(self.sff_fp)
        copy(sff_original_fp, self.sff_fp)

        self.illumina_fps = [
            os.path.join(support_dir, 's_8_1_sequence_100_records.txt'),
            os.path.join(support_dir, 's_8_2_sequence_100_records.txt')]
        self.illumina_map_fp = os.path.join(support_dir,
                                            's8_map_incomplete.txt')

        self.fasta_fps = [os.path.join(support_dir,
                                       'test_split_lib_seqs.fasta')]
        self.fasta_map_fp = os.path.join(support_dir,
                                         'fasta_mapping_file.txt')

        # the temp dir is always registered for removal, whether or not it
        # existed before this test
        tmp_dir = "/%s/test_wf" % home
        self.dirs_to_remove.append(tmp_dir)
        if not exists(tmp_dir):
            makedirs(tmp_dir)

        # the same holds for the workflow output dir; gg_97_otus lives
        # inside it and is removed along with it
        self.wf_out = "/%s/test_processed_data" % home
        self.dirs_to_remove.append(self.wf_out)
        self.gg_out = os.path.join(self.wf_out, 'gg_97_otus')
        if not exists(self.gg_out):
            makedirs(self.gg_out)

        self.fasting_mapping_fp = get_tmp_filename(tmp_dir=tmp_dir,
         prefix='qiime_wf_mapping',suffix='.txt')
        # the context manager closes the handle even if the write fails
        with open(self.fasting_mapping_fp,'w') as fasting_mapping_f:
            fasting_mapping_f.write(fasting_map)
        self.files_to_remove.append(self.fasting_mapping_fp)

        self.params = parse_qiime_parameters(qiime_parameters_f)

        signal.signal(signal.SIGALRM, timeout)
        # set the 'alarm' to go off in allowed_seconds seconds
        signal.alarm(allowed_seconds_per_test)
Beispiel #47
0
def cluster_seqs(seqs,
                 neighbor_join=False,
                 params=None,
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_chars=1000000,
                 max_hours=1.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle cluster list of sequences.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    neighbor_join: currently ignored.
    params: dict of extra muscle parameters. It is copied, so neither the
        caller's dict nor later calls see the options set here.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: passed
        through to muscle_seqs.
    max_chars: above this total sequence length the fast-align options
        (-maxiters 2, -diags1, -sv) are switched on.
    max_hours: value for muscle's -maxhours parameter.
    constructor: tree node class handed to DndParser.
    clean_up: if True, call cleanUp() on the muscle result before returning.

    Returns the tree parsed from muscle's Tree1Out output.
    Raises ValueError if fewer than 2 sequences are given.
    """
    num_seqs = len(seqs)
    if num_seqs < 2:
        raise ValueError("Muscle requres 2 or more sequences to cluster.")

    # work on a private copy: with a shared default dict the fast-align
    # options set below would stick for every later call
    params = {} if params is None else dict(params)

    num_chars = sum(map(len, seqs))
    if num_chars > max_chars:
        params["-maxiters"] = 2
        params["-diags1"] = True
        params["-sv"] = True
        print("lots of chars, using fast align", num_chars)

    params["-maxhours"] = max_hours

    params["-cluster"] = True
    params["-tree1"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                 params=params,
                 add_seq_names=add_seq_names,
                 WorkingDir=WorkingDir,
                 SuppressStderr=SuppressStderr,
                 SuppressStdout=SuppressStdout)

    tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)

    if clean_up:
        muscle_res.cleanUp()
    return tree
Beispiel #48
0
def cluster_seqs(seqs,
                 neighbor_join=False,
                 params=None,
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_chars=1000000,
                 max_hours=1.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle cluster list of sequences.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    neighbor_join: currently ignored.
    params: dict of extra muscle parameters. It is copied, so neither the
        caller's dict nor later calls see the options set here.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: passed
        through to muscle_seqs.
    max_chars: above this total sequence length the fast-align options
        (-maxiters 2, -diags1, -sv) are switched on.
    max_hours: value for muscle's -maxhours parameter.
    constructor: tree node class handed to DndParser.
    clean_up: if True, call cleanUp() on the muscle result before returning.

    Returns the tree parsed from muscle's Tree1Out output.
    Raises ValueError if fewer than 2 sequences are given.
    """
    num_seqs = len(seqs)
    if num_seqs < 2:
        raise ValueError("Muscle requres 2 or more sequences to cluster.")

    # work on a private copy: with a shared default dict the fast-align
    # options set below would stick for every later call
    params = {} if params is None else dict(params)

    num_chars = sum(map(len, seqs))
    if num_chars > max_chars:
        params["-maxiters"] = 2
        params["-diags1"] = True
        params["-sv"] = True
        # a single pre-formatted argument prints the same text whether
        # print is a statement or a function
        print("lots of chars, using fast align %s" % num_chars)

    params["-maxhours"] = max_hours

    params["-clusteronly"] = True
    params["-tree1"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                 params=params,
                 add_seq_names=add_seq_names,
                 WorkingDir=WorkingDir,
                 SuppressStderr=SuppressStderr,
                 SuppressStdout=SuppressStdout)

    tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)

    if clean_up:
        muscle_res.cleanUp()
    return tree
    def test_format_blast_db_string_file(self):
        """format_blast_db_string returns the path unchanged for a fasta file"""
        fasta_fp = get_tmp_filename(tmp_dir=self.tmp_dir)
        copyfile(self.refseqs_fp, fasta_fp)

        # the copy is the only file this test leaves behind
        self._paths_to_clean_up = [fasta_fp]

        self.assertEqual(format_blast_db_string(fasta_fp), fasta_fp)
Beispiel #50
0
    def test_insert_sequences_into_tree(self):
        """Inserts sequences into Tree using params - test handles tree-insertion"""

        # generate temp filename for output
        outfname = splitext(get_tmp_filename('/tmp/'))[0]

        # create starting tree
        outtreefname = outfname + '.tre'
        with open(outtreefname, 'w') as outtree:
            outtree.write(REF_TREE)

        # the starting tree is removed in the finally clause, so it is
        # cleaned up even when an assertion below fails
        try:
            # set params for tree-insertion
            params = {}
            params["-w"] = "/tmp/"
            params["-n"] = get_tmp_filename().split("/")[-1]
            params["-f"] = 'v'
            params["-t"] = outtreefname
            params["-m"] = 'GTRGAMMA'

            aln_ref_query = get_align_for_phylip(
                StringIO(PHYLIP_FILE_DNA_REF_QUERY))
            aln = Alignment(aln_ref_query)
            seqs, align_map = aln.toPhylip()

            tree = insert_sequences_into_tree(seqs,
                                              DNA,
                                              params=params,
                                              write_log=False)

            # strip the 'QUERY___' marker and the '___<digits>' part from
            # the tip names, then map them back through align_map
            for node in tree.tips():
                removed_query_str = re.sub('QUERY___', '', str(node.Name))
                new_node_name = re.sub(r'___\d+', '', str(removed_query_str))
                if new_node_name in align_map:
                    node.Name = align_map[new_node_name]

            self.assertTrue(isinstance(tree, PhyloNode))
            self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
            self.assertEqual(len(tree.tips()), 7)
            self.assertRaises(NotImplementedError, build_tree_from_alignment, \
                             self.align1, RNA, True)
        finally:
            remove(outtreefname)
Beispiel #51
0
    def setUp(self):
        """Write the PyNAST input/template files and build the test aligner.

        Every temporary path created here is listed in
        self._paths_to_clean_up.
        """
        def write_tmp_file(suffix, contents):
            # write contents to a freshly named temp file and return its
            # path; the handle is closed (and flushed) before returning
            fp = get_tmp_filename(
                prefix='PyNastAlignerTests_', suffix=suffix)
            with open(fp, 'w') as f:
                f.write(contents)
            return fp

        self.pynast_test1_input_fp = write_tmp_file(
            '.fasta', pynast_test1_input_fasta)

        # the template as is, and variants with '.' gaps, U instead of T,
        # and lower-case characters
        self.pynast_test1_template_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta)
        self.pynast_test_template_w_dots_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.replace('-','.'))
        self.pynast_test_template_w_u_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.replace('T','U'))
        self.pynast_test_template_w_lower_fp = write_tmp_file(
            'template.fasta', pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        self.result_fp = write_tmp_file('.fasta', '')
        self.failure_fp = write_tmp_file('.fasta', '')
        self.log_fp = write_tmp_file('.log', '')

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp, self.result_fp, self.failure_fp,
            self.log_fp, self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp
        ]

        self.pynast_test1_aligner = PyNastAligner({
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        })

        self.pynast_test1_expected_aln = \
         LoadSeqs(data=pynast_test1_expected_alignment,aligned=DenseAlignment)
        self.pynast_test1_expected_fail = \
         LoadSeqs(data=pynast_test1_expected_failure,aligned=False)
Beispiel #52
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree built by RAxML from Alignment object aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object. Only consulted when params
        does not already supply a '-m' model.

    best_tree: best_tree support is currently not implemented; passing a
        true value raises NotImplementedError.

    params: dict of parameters to pass in to the RAxML app controller.
        The dict is copied, so the caller's object is never modified.

    The result is the tree parsed from the RAxML 'Bootstrap' output (built
    from PhyloNode objects); tip names are translated back through the map
    returned by aln.toPhylip().

    Raises ValueError when no '-m' model is supplied and moltype is not one
    of DNA, RNA or PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # Work on a private copy. The previous signature used a shared ``{}``
    # default and wrote into it, so settings (model, output name, seeds)
    # leaked from one call into the next and into the caller's dict.
    if params is None:
        params = {}
    else:
        params = dict(params)

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
            # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            # NOTE(review): 'matrixName' looks like the placeholder from the
            # RAxML usage text rather than a real matrix -- confirm.
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output: RAxML writes into the "-w"
    # directory using the run name given by "-n"
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    # fresh random seeds for every run
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)

    try:
        tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

        # restore the names that toPhylip() replaced
        for node in tree.tips():
            node.Name = align_map[node.Name]
    finally:
        # always release the result's files, even if parsing fails
        raxml_result.cleanUp()

    return tree
Beispiel #53
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree built by RAxML from Alignment object aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object. Only consulted when params
        does not already supply a '-m' model.

    best_tree: best_tree support is currently not implemented; passing a
        true value raises NotImplementedError.

    params: dict of parameters to pass in to the RAxML app controller.
        The dict is copied, so the caller's object is never modified.

    The result is the tree parsed from the RAxML 'Bootstrap' output (built
    from PhyloNode objects); tip names are translated back through the map
    returned by aln.toPhylip().

    Raises ValueError when no '-m' model is supplied and moltype is not one
    of DNA, RNA or PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # Work on a private copy. The previous signature used a shared ``{}``
    # default and wrote into it, so settings (model, output name, seeds)
    # leaked from one call into the next and into the caller's dict.
    if params is None:
        params = {}
    else:
        params = dict(params)

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
            # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            # NOTE(review): 'matrixName' looks like the placeholder from the
            # RAxML usage text rather than a real matrix -- confirm.
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output: RAxML writes into the "-w"
    # directory using the run name given by "-n"
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    # fresh random seeds for every run
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)

    try:
        tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

        # restore the names that toPhylip() replaced
        for node in tree.tips():
            node.Name = align_map[node.Name]
    finally:
        # always release the result's files, even if parsing fails
        raxml_result.cleanUp()

    return tree
Beispiel #54
0
def aln_tree_seqs(seqs,
                 input_handler=None,
                 tree_type='neighborjoining',
                 params=None,
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_hours=5.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle align sequences and report tree from iteration2.

    Unlike cluster_seqs, returns tree2 which is the tree made during the
    second muscle iteration (it should be more accurate than the cluster from
    the first iteration which is made fast based on k-mer words)

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    input_handler: forwarded unchanged to muscle_seqs.
    tree_type: can be either neighborjoining (default) or upgmb for UPGMA.
        A false value leaves "-cluster2" unset.
    params: dict of extra muscle parameters. The dict is copied, so the
        caller's object is never modified.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: forwarded
        unchanged to muscle_seqs.
    max_hours: value passed to muscle as "-maxhours".
    constructor: node class handed to DndParser when parsing the tree.
    clean_up: When true, will clean up output files

    Returns a (tree, aln) tuple where aln is the list of lines read from
    the muscle alignment output.
    """
    # Work on a private copy. The previous signature used a shared ``{}``
    # default and wrote into it, so "-cluster2", the temp output paths, etc.
    # leaked from one call into the next and into the caller's dict.
    if params is None:
        params = {}
    else:
        params = dict(params)

    params["-maxhours"] = max_hours
    if tree_type:
        params["-cluster2"] = tree_type
    # temp files that receive the second-iteration tree and the alignment
    params["-tree2"] = get_tmp_filename(WorkingDir)
    params["-out"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                 input_handler=input_handler,
                 params=params,
                 add_seq_names=add_seq_names,
                 WorkingDir=WorkingDir,
                 SuppressStderr=SuppressStderr,
                 SuppressStdout=SuppressStdout)
    tree = DndParser(muscle_res["Tree2Out"], constructor=constructor)
    # read the alignment fully before the result's files are cleaned up
    aln = list(muscle_res["MuscleOut"])

    if clean_up:
        muscle_res.cleanUp()
    return tree, aln
Beispiel #55
0
def aln_tree_seqs(seqs,
                 input_handler=None,
                 tree_type='neighborjoining',
                 params=None,
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_hours=5.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle align sequences and report tree from iteration2.

    Unlike cluster_seqs, returns tree2 which is the tree made during the
    second muscle iteration (it should be more accurate than the cluster from
    the first iteration which is made fast based on k-mer words)

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    input_handler: forwarded unchanged to muscle_seqs.
    tree_type: can be either neighborjoining (default) or upgmb for UPGMA.
        A false value leaves "-cluster2" unset.
    params: dict of extra muscle parameters. The dict is copied, so the
        caller's object is never modified.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: forwarded
        unchanged to muscle_seqs.
    max_hours: value passed to muscle as "-maxhours".
    constructor: node class handed to DndParser when parsing the tree.
    clean_up: When true, will clean up output files

    Returns a (tree, aln) tuple where aln is the list of lines read from
    the muscle alignment output.
    """
    # Work on a private copy. The previous signature used a shared ``{}``
    # default and wrote into it, so "-cluster2", the temp output paths, etc.
    # leaked from one call into the next and into the caller's dict.
    if params is None:
        params = {}
    else:
        params = dict(params)

    params["-maxhours"] = max_hours
    if tree_type:
        params["-cluster2"] = tree_type
    # temp files that receive the second-iteration tree and the alignment
    params["-tree2"] = get_tmp_filename(WorkingDir)
    params["-out"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                 input_handler=input_handler,
                 params=params,
                 add_seq_names=add_seq_names,
                 WorkingDir=WorkingDir,
                 SuppressStderr=SuppressStderr,
                 SuppressStdout=SuppressStdout)
    tree = DndParser(muscle_res["Tree2Out"], constructor=constructor)
    # read the alignment fully before the result's files are cleaned up
    aln = list(muscle_res["MuscleOut"])

    if clean_up:
        muscle_res.cleanUp()
    return tree, aln