Example #1
    def test_no_clustering(self):
        otu_table = [self.headers,['ribosomal_protein_L11_rplK_gpkg','minimal','GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC','7','4.95','Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
['ribosomal_protein_L11_rplK_gpkg','minimal','GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA','6','4.95','Root; k__Bacteria; p__Firmicutes; c__Bacilli'], #last base only is different to first sequence
['ribosomal_protein_S17_gpkg','minimal','GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT','9','4.95','Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus']]
        otu_table = "\n".join(["\t".join(x) for x in otu_table])

        with tempfile.NamedTemporaryFile() as f:
            f.write(otu_table)
            f.flush()

            with tempdir.TempDir() as d:
                cmd = "{} makedb --db_path {}/db --otu_table {} --clustering_divergence 0".format(
                    path_to_script, d, f.name)
                extern.run(cmd)
                with tempfile.NamedTemporaryFile() as f2:
                    f2.write(">seq1\n")
                    # first sequence with an extra A at the start
                    f2.write("AGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\n")
                    f2.flush()

                    # Querying the smafadb directly should show no clustering
                    cmd = "smafa query {} {}".format(
                        os.path.join(d,'db','ribosomal_protein_L11_rplK_gpkg.smafadb'),
                        f2.name)
                    out = extern.run(cmd)
                    self.assertEqual(
                        out,
                        'seq1\tAGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\tGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA\t2\t60\n'+
                        'seq1\tAGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\tGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\t1\t60\n')
Example #2
 def test_hello_world(self):
     with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f1:
         with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f2:
             f1.write(self.eg1)
             f1.flush()
             extern.run("diamond makedb --in %s --db %s.dmnd" %\
                        (f1.name, f1.name))
             f2.write(self.eg1)
             f2.write(self.eg2)
             f2.flush()
             extern.run("diamond makedb --in %s --db %s.dmnd" %\
                        (f2.name, f2.name))
             with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f3:
                 with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f4:
                     f3.write(self.eg1)
                     f3.flush()
                     ret = DecoyFilter(
                         Diamond(f2.name+".dmnd"),
                         Diamond(f1.name+".dmnd")).filter(f1.name, f4.name)
                     self.assertEqual(True, ret)
                     seqs = SequenceIO().read_fasta_file(f4.name)
                     self.assertEqual(1, len(seqs))
                     self.assertEqual("PROKKA_03952", seqs[0].name)
             # clean up
             os.remove(f1.name+".dmnd")
             os.remove(f2.name+".dmnd")
Example #3
    def run(self, input_sequence_file, input_sequence_type, daa_file_basename=None):
        '''Run input sequences in either blastp or blastx mode against the
        database specified in __init__.
            
        Parameters
        ----------
        input_sequence_file: str
            path to query sequences
        input_sequence_type: either 'nucleotide' or 'protein'
            the input_sequences are this kind of sequence
            
        Returns
        -------
        DiamondSearchResult
        '''
        
        cmd_list = ["diamond"]
        if input_sequence_type == UnpackRawReads.PROTEIN_SEQUENCE_TYPE:
            cmd_list.append('blastp')
        elif input_sequence_type == UnpackRawReads.NUCLEOTIDE_SEQUENCE_TYPE:
            cmd_list.append('blastx')
        else:
            raise Exception("Programming error")
        
        basename = daa_file_basename
        if basename is None:
            with tempfile.NamedTemporaryFile(prefix='graftm_diamond') as t:
                # we are just stealing the name, don't need the file itself
                basename = t.name
            
        for c in ['-k 1',
                  "-d",
                    self._database,
                    "-q",
                    "%s" % input_sequence_file,
                    "-a",
                    basename]:
            cmd_list.append(c)
        if self._threads:
            cmd_list.append("--threads")
            cmd_list.append(str(self._threads))
        if self._evalue:
            cmd_list.append("--evalue")
            cmd_list.append(str(self._evalue))

        cmd = ' '.join(cmd_list)
        extern.run(cmd)
        
        daa_name = "%s.daa" % basename
        res = DiamondSearchResult.import_from_daa_file(daa_name)
        
        if daa_file_basename is None:
            # Diamond makes an extra file, need to remove this
            os.remove(daa_name)
            
        return res
            
            
            
                
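A condensed, standalone sketch of the command-construction pattern in the run() method above, for reference: it builds the same diamond blastx invocation without the surrounding class. The query and database paths are hypothetical; extern and a diamond binary on PATH are assumed.

import tempfile
import extern

def run_diamond_blastx(query_fasta, database, threads=None, evalue=None):
    # Borrow a temporary file name for the .daa output, as the method above does
    with tempfile.NamedTemporaryFile(prefix='diamond_example') as t:
        basename = t.name
    cmd_list = ["diamond", "blastx", "-k 1",
                "-d", database, "-q", query_fasta, "-a", basename]
    if threads:
        cmd_list.append("--threads")
        cmd_list.append(str(threads))
    if evalue:
        cmd_list.append("--evalue")
        cmd_list.append(str(evalue))
    extern.run(' '.join(cmd_list))
    return "%s.daa" % basename  # caller removes this file when finished

# e.g. daa = run_diamond_blastx('reads.fna', 'proteins.dmnd', threads=4)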
Example #4
    def write_krona_plot(self, sample_names, read_taxonomies, output_krona_filename):
        '''Creates krona plot at the given location. Assumes the krona executable
        ktImportText is available on the shell PATH'''
        tempfiles = []
        for n in sample_names:
            tempfiles.append(tempfile.NamedTemporaryFile(prefix='GraftMkronaInput', suffix=n))
        
        delim=u'\t'
        for _, tax, counts in self._iterate_otu_table_rows(read_taxonomies):
            for i, c in enumerate(counts):
                if c != 0:
                    tempfiles[i].write(delim.join((str(c),
                                                  delim.join(tax)
                                                  ))+"\n")
                    
        for t in tempfiles:
            t.flush()
        
        cmd = ["ktImportText",'-o',output_krona_filename]
        for i, tmp in enumerate(tempfiles):
            cmd.append(','.join([tmp.name,sample_names[i]]))

        # run the actual krona
        cmd = ' '.join(cmd)
        extern.run(cmd)

        # close tempfiles
        for t in tempfiles:
            t.close()
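A standalone sketch of the ktImportText invocation pattern used above: one tab-separated count-then-taxonomy file per sample, each appended to the command as "filename,sample_name". The sample data and output path are hypothetical; extern and ktImportText on the shell PATH are assumed.

import tempfile
import extern

samples = {'sample1': [(3, ['Bacteria', 'Firmicutes'])],   # hypothetical counts and taxonomies
           'sample2': [(1, ['Archaea'])]}

handles = []
cmd = ['ktImportText', '-o', 'krona.html']
for name, rows in samples.items():
    t = tempfile.NamedTemporaryFile(prefix='GraftMkronaInput', suffix=name, mode='w')
    for count, taxonomy in rows:
        t.write('\t'.join([str(count)] + taxonomy) + '\n')
    t.flush()
    handles.append(t)                      # keep handles open until krona has run
    cmd.append(','.join([t.name, name]))
extern.run(' '.join(cmd))
for t in handles:
    t.close()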
Example #5
    def test_hello_world(self):
        with tempdir.in_tempdir():
            with tempfile.NamedTemporaryFile() as fasta:
                with tempfile.NamedTemporaryFile() as tax:
                    fasta.write(Tests.extra_mcra_fasta)
                    fasta.flush()
                    tax.write(Tests.extra_mcra_taxonomy)
                    tax.flush()
                    prev_path = os.path.join(path_to_data,'mcrA.10seqs.gpkg')
                    cmd1 = "%s update --graftm_package %s --sequences %s --taxonomy %s --output %s" %(
                        path_to_script,
                        prev_path,
                        fasta.name,
                        tax.name,
                        'updated.gpkg')
                    extern.run(cmd1)

                    prev = GraftMPackage.acquire(prev_path)
                    up = GraftMPackage.acquire('updated.gpkg')
                    prevhash = prev.taxonomy_hash()
                    taxhash = up.taxonomy_hash()
                    self.assertEqual(len(prevhash)+1,
                                     len(taxhash))
                    self.assertEqual(['mcrA','Euryarchaeota_mcrA','Methanofastidiosa'],
                                     taxhash['KYC55281.1'])
                    self.assertEqual(prevhash['638165755'],
                                     taxhash['638165755'])
                    seqio = SequenceIO()
                    self.assertEqual(
                        len(seqio.read_fasta_file(prev.unaligned_sequence_database_path()))+1,
                        len(seqio.read_fasta_file(up.unaligned_sequence_database_path())))
Example #6
    def test_query_with_otu_table_two_samples_same_sequence(self):
        with tempfile.NamedTemporaryFile() as f:
            query = [self.headers,
                     # the same sequence appears in both the 'maximal' and 'minimal' samples
                     ['ribosomal_protein_L11_rplK_gpkg','maximal','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','7','4.95','Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                     ['ribosomal_protein_L11_rplK_gpkg','minimal','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','7','4.95','Root; k__Bacteria; p__Firmicutes; c__Bacilli']
                     ]
            query = "\n".join(["\t".join(x) for x in query])
            f.write(query)
            f.flush()

            with tempdir.TempDir() as d:
                cmd = "{} makedb --db {}/sdb --otu_table {}".format(
                    path_to_script, d, f.name)
                extern.run(cmd)

                cmd = "{} query --query_otu_table {} --db {}/sdb".format(
                    path_to_script,
                    f.name,
                    d)

                expected = [['query_name','query_sequence','divergence','num_hits','sample','marker','hit_sequence','taxonomy'],
                            ['maximal;ribosomal_protein_L11_rplK_gpkg','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','0','7','maximal','ribosomal_protein_L11_rplK_gpkg','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                            ['maximal;ribosomal_protein_L11_rplK_gpkg','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','0','7','minimal','ribosomal_protein_L11_rplK_gpkg','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','Root; k__Bacteria; p__Firmicutes; c__Bacilli'],
                            ['minimal;ribosomal_protein_L11_rplK_gpkg','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','0','7','maximal','ribosomal_protein_L11_rplK_gpkg','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                            ['minimal;ribosomal_protein_L11_rplK_gpkg','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','0','7','minimal','ribosomal_protein_L11_rplK_gpkg','CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA','Root; k__Bacteria; p__Firmicutes; c__Bacilli'],
                            ]
                observed = subprocess.check_output(cmd, shell=True)
                self.assertEqualOtuTable(expected, observed)
Example #7
    def test_alignment_rereplication(self):
        gpkg = os.path.join(path_to_data,'61_otus.gpkg')
        test_sequences=""">FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA_2/1
ACACTGCCCAGACACCTACGGGTGGCTGCAGTCGAGGATCTTCGGCAATGGGCGAAAGCCTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGGT
>FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA/1
ACACTGCCCAGACACCTACGGGTGGCTGCAGTCGAGGATCTTCGGCAATGGGCGAAAGCCTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGGT
>FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA/2
CGGGGTATCTAATCCCGTTCGCTCCCCTAGCTTTCGTGCCTCAGCGTCAGAAAAGACCCAGTGAGCCGCTTTCGCCCCCGGTGTTCCTTAGGATATCAAC
"""
        expected_rereplicated_alignment=""">FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA/2
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TTGATATCCTAAGGAACACCGGGGGCGAAAGCGGCTCACTGGGTCTTCTGACGCTGAGGCACGAAAGCTAGGGGAGCGAACGGGATTAGATACCCC----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA_2/1
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ACACTGCCCAGACACCTACGGGTGGCTGCAGTCGAGGATCTTCGGCAATGGGCGAAAGCCTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA/1
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ACACTGCCCAGACACCTACGGGTGGCTGCAGTCGAGGATCTTCGGCAATGGGCGAAAGCCTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------""".split()
        with tempfile.NamedTemporaryFile(suffix=".fa") as tf:
            tf.write(test_sequences)
            tf.flush()
            
            with tempdir.TempDir() as tmp:
                cmd = "%s graft --forward %s --graftm_package %s --output_directory %s --force" % (path_to_script,
                                                                                                   tf.name,
                                                                                                   gpkg,
                                                                                                   tmp)
                extern.run(cmd)
                
                filename=os.path.splitext(os.path.basename(tf.name))[0]
                observed_rereplicated_alignment = [x.strip() for x in open(os.path.join(tmp, filename, "%s_hits.aln.fa" % filename))]
                
                self.assertEqual(expected_rereplicated_alignment,
                                  observed_rereplicated_alignment)
Example #8
    def _align_sequences(self, input_sequences_path, output_alignment_path,
                         threads):
        '''Align sequences into alignment_file

        Parameters
        ----------
        input_sequences_path: str
            path to input sequences in fasta format
        output_alignment_path: str
            path to output alignment path
        threads: str
            number of threads to use
        Returns
        -------
        Nothing
        '''
        logging.debug("Aligning sequences using mafft")
        cmd = "mafft --anysymbol --thread %s --auto /dev/stdin > %s" % (
            threads,
            output_alignment_path)
        inputs = []
        with open(input_sequences_path) as f:
            for name,seq,_ in SequenceIO().each(f):
                inputs.append('>%s' % name)
                # Do not include * characters in the HMM, as this means tree
                # insertion fails.
                inputs.append(seq.replace('*',''))
        extern.run(cmd, stdin="\n".join(inputs))
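A minimal standalone sketch of the stdin-driven mafft call above, outside the class; the sequences and output path are hypothetical, and mafft on PATH plus extern's stdin keyword (as used in the method) are assumed.

import extern

records = {'seq1': 'MKV*LT', 'seq2': 'MKVALT'}   # hypothetical protein sequences
inputs = []
for name, seq in records.items():
    inputs.append('>%s' % name)
    inputs.append(seq.replace('*', ''))          # strip stop codons, as above
cmd = "mafft --anysymbol --thread %s --auto /dev/stdin > %s" % (2, 'aligned.fasta')
extern.run(cmd, stdin="\n".join(inputs))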
Example #9
    def run(self,
            input_sequence_file,
            input_sequence_type,
            daa_file_basename=None,
            extra_args=''):
        '''Run input sequences in either blastp or blastx mode against the
        database specified in __init__.

        Parameters
        ----------
        input_sequence_file: str
            path to query sequences
        input_sequence_type: either 'nucleotide' or 'protein'
            the input_sequences are this kind of sequence

        Returns
        -------
        DiamondSearchResult
        '''

        cmd_list = ["diamond"]
        if input_sequence_type == UnpackRawReads.PROTEIN_SEQUENCE_TYPE:
            cmd_list.append('blastp')
        elif input_sequence_type == UnpackRawReads.NUCLEOTIDE_SEQUENCE_TYPE:
            cmd_list.append('blastx')
        else:
            raise Exception("Programming error")

        basename = daa_file_basename
        if basename is None:
            with tempfile.NamedTemporaryFile(prefix='graftm_diamond') as t:
                # we are just stealing the name, don't need the file itself
                basename = t.name

        for c in [
                '-k 1', "-d", self._database, "-q",
                "%s" % input_sequence_file, "-a", basename, extra_args
        ]:
            cmd_list.append(c)
        if self._threads:
            cmd_list.append("--threads")
            cmd_list.append(str(self._threads))
        if self._evalue:
            cmd_list.append("--evalue")
            cmd_list.append(str(self._evalue))

        cmd = ' '.join(cmd_list)
        extern.run(cmd)

        daa_name = "%s.daa" % basename
        res = DiamondSearchResult.import_from_daa_file(daa_name)

        if daa_file_basename is None:
            # Diamond makes an extra file, need to remove this
            os.remove(daa_name)

        return res
Example #10
    def summarise(**kwargs):
        '''Summarise an OTU table'''
        krona_output_file = kwargs.pop('krona_output')
        table_collection = kwargs.pop('table_collection')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        # prep the array
        gene_to_sample_to_taxonomy_to_count = Summariser._collapse_otu_table_into_gene_to_sample_to_taxonomy_to_count(
            table_collection)

        # write the output krona files
        sample_name_to_tempfile = OrderedDict()
        logging.info("Writing krona %s" % krona_output_file)
        cmd = 'ktImportText -o %s' % krona_output_file
        sample_tempfiles = []
        sample_to_gene_to_taxonomy_to_count = {}
        all_sample_names = set()
        all_gene_names = set()
        for gene, sample_to_taxonomy_to_count in gene_to_sample_to_taxonomy_to_count.items(
        ):
            all_gene_names.add(gene)
            for sample, taxonomy_to_count in sample_to_taxonomy_to_count.items(
            ):
                all_sample_names.add(sample)
                if sample not in sample_to_gene_to_taxonomy_to_count:
                    sample_to_gene_to_taxonomy_to_count[sample] = {}
                sample_to_gene_to_taxonomy_to_count[sample][
                    gene] = taxonomy_to_count
        is_more_than_one_sample = len(sample_to_gene_to_taxonomy_to_count) > 1
        for sample in sorted(all_sample_names):
            for gene in sorted(all_gene_names):
                if gene in sample_to_gene_to_taxonomy_to_count[sample]:
                    f = tempfile.NamedTemporaryFile(prefix='singlem_for_krona',
                                                    mode='w')
                    sample_tempfiles.append(f)

                    taxonomy_to_count = sample_to_gene_to_taxonomy_to_count[
                        sample][gene]
                    for taxonomy, coverage in taxonomy_to_count.items():
                        tax_split = taxonomy.split('; ')
                        if tax_split[0] == 'Root' and len(tax_split) > 1:
                            tax_split = tax_split[1:]
                        f.write('\t'.join([str(coverage)] + tax_split))
                        f.write('\n')
                    f.flush()
                    if is_more_than_one_sample:
                        display_name = '%s: %s' % (sample, gene)
                    else:
                        display_name = gene
                    cmd += " %s,'%s'" % (f.name, display_name)

        extern.run(cmd)
        for f in sample_tempfiles:
            f.close()
Example #11
 def test_bootstrap_executable(self):
     with tempfile.NamedTemporaryFile() as tf:
         cmd = '%s expand_search --verbosity 5 --contigs %s --output_hmm %s --search_hmm_files %s' % (
             path_to_script,
             os.path.join(path_to_data, 'bootstrapper',
                          'contigs.fna'), tf.name,
             os.path.join(path_to_data, 'bootstrapper', 'DNGNGWU00001.hmm'))
         extern.run(cmd)
         self.assertEqual(
             "HMMER3/f [3.1b2 | February 2015]\n",
             subprocess.check_output("head -n1 %s" % tf.name, shell=True))
         self.assertEqual('NSEQ  2\n', open(tf.name).readlines()[10])
Example #12
    def test_no_clustering(self):
        otu_table = [
            self.headers,
            [
                'ribosomal_protein_L11_rplK_gpkg', 'minimal',
                'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC',
                '7', '4.95',
                'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'
            ],
            [
                'ribosomal_protein_L11_rplK_gpkg', 'minimal',
                'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA',
                '6', '4.95', 'Root; k__Bacteria; p__Firmicutes; c__Bacilli'
            ],  #last base only is different to first sequence
            [
                'ribosomal_protein_S17_gpkg', 'minimal',
                'GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT',
                '9', '4.95',
                'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'
            ]
        ]
        otu_table = "\n".join(["\t".join(x) for x in otu_table])

        with tempfile.NamedTemporaryFile() as f:
            f.write(otu_table)
            f.flush()

            with tempdir.TempDir() as d:
                cmd = "{} makedb --db_path {}/db --otu_table {} --clustering_divergence 0".format(
                    path_to_script, d, f.name)
                extern.run(cmd)
                with tempfile.NamedTemporaryFile() as f2:
                    f2.write(">seq1\n")
                    # first sequence with an extra A at the start
                    f2.write(
                        "AGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\n"
                    )
                    f2.flush()

                    # Querying the smafadb directly should show no clustering
                    cmd = "smafa query {} {}".format(
                        os.path.join(
                            d, 'db',
                            'ribosomal_protein_L11_rplK_gpkg.smafadb'),
                        f2.name)
                    out = extern.run(cmd)
                    self.assertEqual(
                        out,
                        'seq1\tAGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\tGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA\t2\t60\n'
                        +
                        'seq1\tAGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\tGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\t1\t60\n'
                    )
Example #13
 def test_bootstrap_executable(self):
     with tempfile.NamedTemporaryFile() as tf:
         cmd = '%s expand_search --verbosity 5 --contigs %s --output_hmm %s --search_hmm_files %s' % (path_to_script,
                                                                                           os.path.join(path_to_data,'bootstrapper','contigs.fna'),
                                                                                           tf.name,
                                                                                           os.path.join(path_to_data,'bootstrapper','DNGNGWU00001.hmm'))
         extern.run(cmd)
         self.assertTrue(
             subprocess.check_output("head -n1 %s" % tf.name,
                                     shell=True) in
             ["HMMER3/f [3.1b2 | February 2015]\n",
              "HMMER3/f [3.2.1 | June 2018]\n"])
         self.assertEqual('NSEQ  2\n', open(tf.name).readlines()[10])
Example #14
    def test_jplace_output(self):
        expected_jpace = {
            'fields': [
                'classification', 'distal_length', 'edge_num',
                'like_weight_ratio', 'likelihood', 'pendant_length'
            ],
            'metadata':
            'the_metadata',
            'placements': [{
                'nm': [[
                    'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                    2
                ]],
                'p': [[
                    "o__Bacillales", 0.0874346630859, 13, 0.333350512423,
                    -608.20180926, 6.11351501465e-06
                ],
                      [
                          "o__Bacillales", 0.0643521435547, 14, 0.333326884837,
                          -608.201880142, 6.11351501465e-06
                      ],
                      [
                          "p__Firmicutes", 5.97534179688e-06, 15,
                          0.33332260274, -608.201892989, 6.11351501465e-06
                      ]]
            }],
            'tree':
            'tree_thanks',
            'version':
            3
        }

        with tempdir.TempDir() as d:
            cmd = "%s pipe --sequences %s --otu_table /dev/null --output_jplace %s"\
                  " --singlem_packages %s" % (
                      path_to_script,
                      os.path.join(path_to_data,'1_pipe','jplace_test.fna'),
                      os.path.join(d, "my_jplace"),
                      os.path.join(path_to_data,'4.12.22seqs.spkg'))
            extern.run(cmd)
            jplace_path = os.path.join(
                d, 'my_jplace_jplace_test_4.12.22seqs.jplace')
            with open(jplace_path) as f:
                j = json.load(f)
            j['tree'] = 'tree_thanks'
            j['metadata'] = 'the_metadata'
            self.assertEqual(expected_jpace, j)

            # Make sure the guppy sing does not croak
            extern.run("guppy sing -o /dev/null '%s'" % jplace_path)
Example #15
 def test_biom_hello_world(self):
     insert_otu_table = [self.headers,
                         ['4.12.ribosomal_protein_L11_rplK','insert','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG','1','2.44','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                         ['4.12.ribosomal_protein_L11_rplK','insert','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG','2','2.94','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales']]
     with tempdir.TempDir() as tmp:
         with tempfile.NamedTemporaryFile(suffix='.otu_table.csv') as n:
             n.write("\n".join(["\t".join(x) for x in insert_otu_table]+['']))
             n.flush()
             extern.run("%s summarise --biom_prefix '%s' --input_otu_tables '%s'" % (
                 path_to_script, os.path.join(tmp,"mybiom"), n.name))
             self.assertEqual(['mybiom.4.12.ribosomal_protein_L11_rplK.biom'], os.listdir(tmp))
             self.assertEqual(
                 '# Constructed from biom file\n#OTU ID\tinsert\ttaxonomy\nRoot; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG\t1.0\tRoot; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales\nRoot; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG\t2.0\tRoot; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales',
                 extern.run("biom convert -i '%s' -o /dev/stdout --to-tsv --header-key taxonomy" % os.path.join(tmp,'mybiom.4.12.ribosomal_protein_L11_rplK.biom')))
Example #16
 def run(hmm_paths, output_directory, is_protein):
     cmd = self._graftm_command_prefix(is_protein) + \
           "--threads %i "\
           "--forward %s "\
           "--search_only "\
           "--search_hmm_files %s "\
           "--output_directory %s "\
           "--aln_hmm_file %s " % (
               self._num_threads,
               ' '.join(forward_read_files),
               ' '.join(hmm_paths),
               output_directory,
               hmm_paths[0])
     extern.run(cmd)
Example #17
 def test_biom_hello_world(self):
     insert_otu_table = [self.headers,
                         ['4.12.ribosomal_protein_L11_rplK','insert','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG','1','2.44','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                         ['4.12.ribosomal_protein_L11_rplK','insert','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG','2','2.94','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales']]
     with tempdir.TempDir() as tmp:
         with tempfile.NamedTemporaryFile(suffix='.otu_table.csv') as n:
             n.write("\n".join(["\t".join(x) for x in insert_otu_table]+['']))
             n.flush()
             extern.run("%s summarise --biom_prefix '%s' --input_otu_tables '%s'" % (
                 path_to_script, os.path.join(tmp,"mybiom"), n.name))
             self.assertEqual(['mybiom.4.12.ribosomal_protein_L11_rplK.biom'], os.listdir(tmp))
             self.assertEqual(
                 '# Constructed from biom file\n#OTU ID\tinsert\ttaxonomy\nRoot; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG\t1.0\tRoot; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales\nRoot; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG\t2.0\tRoot; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales',
                 extern.run("biom convert -i '%s' -o /dev/stdout --to-tsv --header-key taxonomy" % os.path.join(tmp,'mybiom.4.12.ribosomal_protein_L11_rplK.biom')))
Example #18
    def test_jplace_output(self):
        expected_jpace = {
            u'fields': [
                u'classification', u'distal_length', u'edge_num',
                u'like_weight_ratio', u'likelihood', u'pendant_length'
            ],
            u'metadata':
            'the_metadata',
            u'placements': {
                u'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG':
                {
                    u'nm': [[
                        u'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                        2
                    ]],
                    u'p': [[
                        u'o__Bacillales', 0.0874346630859, 13, 0.333351177694,
                        -631.301684875, 0.150831104822
                    ],
                           [
                               u'o__Bacillales', 0.0643521435547, 14,
                               0.333326655502, -631.301758441, 0.15083915761
                           ],
                           [
                               u'p__Firmicutes', 5.97534179688e-06, 15,
                               0.333322166804, -631.301771907, 0.150839131805
                           ]]
                }
            },
            u'tree':
            'tree_thanks',
            u'version':
            3
        }

        with tempdir.TempDir() as d:
            cmd = "%s pipe --sequences %s --otu_table /dev/null --output_jplace %s"\
                  " --singlem_packages %s" % (
                      path_to_script,
                      os.path.join(path_to_data,'1_pipe','jplace_test.fna'),
                      os.path.join(d, "my_jplace"),
                      os.path.join(path_to_data,'4.12.22seqs.spkg'))
            extern.run(cmd)
            j = json.load(
                open(
                    os.path.join(d,
                                 'my_jplace_jplace_test_4.12.22seqs.jplace')))
            j['tree'] = 'tree_thanks'
            j['metadata'] = 'the_metadata'
            self.assertEqual(expected_jpace, j)
Example #19
    def _build_tree(self, alignment, base, ptype, fasttree):
        log_file = base + ".tre.log"
        tre_file = base + ".tre"
        if ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:  # If it's a nucleotide sequence
            cmd = "%s -quiet -gtr -nt -log %s -out %s %s" % (
                fasttree, log_file, tre_file, alignment)
            extern.run(cmd)
        else:  # Or if its an amino acid sequence
            cmd = "%s -quiet -log %s -out %s %s" % (fasttree, log_file,
                                                    tre_file, alignment)
            extern.run(cmd)

        self.the_trash += [log_file, tre_file]
        return log_file, tre_file
Example #20
    def global_search(self, query_otu_table_collection,
                      subject_otu_table_collection, cluster_identity):
        '''Search a query OTU table against a subject OTU table, yield over
        UCEntry objects that have been modified so that the query
        and subject are the relevant OtuTableEntry objects rather than
        strings. Or they are None if there are no hits, since
        --output_no_hits is used.

        query_otu_table_collection: OtuTableCollection
        subject_otu_table_collection: OtuTableCollection
        cluster_identity: float or str
            reject hits if have lower identity than this (implemented with vsearch --id).
        '''
        logging.info("Caching query OTUs")
        query_otus = list(query_otu_table_collection)
        logging.info("Caching target OTUs")
        subject_otus = list(subject_otu_table_collection)

        def name_to_index(name):
            return int(str.split(name, ';')[0])

        # write out fasta file numbered to corresponding to the OTU info
        with tempfile.NamedTemporaryFile(prefix='singlem_q_for_vsearch',mode='w') as query_f:
            for i, u in enumerate(query_otus):
                query_f.write(">%i;size=%i\n" % (i, u.count))
                query_f.write(u.sequence.replace('-','')+"\n")
            query_f.flush()

            with tempfile.NamedTemporaryFile(prefix='singlem_db_for_vsearch',mode='w') as db_f:
                for i, u in enumerate(subject_otu_table_collection):
                    db_f.write(">%i;size=%i\n" % (i, u.count))
                    db_f.write(u.sequence.replace('-','')+"\n")
                db_f.flush()

                with tempfile.NamedTemporaryFile(prefix='singlem_uc') as uc:
                    command = "vsearch --usearch_global %s --db %s --uc %s --id %s --output_no_hits" % (
                        query_f.name,
                        db_f.name,
                        uc.name,
                        str(cluster_identity))
                    logging.info("Running search")
                    extern.run(command)
                    logging.info("Finished running search")
                    with open(uc.name) as uc_read:
                        for uc_entry in UCFile(uc_read):
                            uc_entry.query = query_otus[name_to_index(uc_entry.query)]
                            if uc_entry.target is not None:
                                uc_entry.target = subject_otus[name_to_index(uc_entry.target)]
                            yield uc_entry
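A standalone sketch of the vsearch step inside global_search above: write numbered FASTA records, run --usearch_global with --output_no_hits, then read the .uc file back. The sequences are hypothetical, vsearch must be on PATH, and the singlem.uc_file module path for the UCFile parser is an assumption.

import tempfile
import extern
from singlem.uc_file import UCFile   # assumed module path for the UCFile parser used above

sequences = ['ATGAAAGGT', 'ATGAACGGT']           # hypothetical ungapped OTU sequences
with tempfile.NamedTemporaryFile(prefix='q_for_vsearch', mode='w') as query_f:
    for i, seq in enumerate(sequences):
        query_f.write(">%i;size=1\n%s\n" % (i, seq))
    query_f.flush()
    with tempfile.NamedTemporaryFile(prefix='uc') as uc:
        # Search the sequences against themselves, purely for illustration
        extern.run("vsearch --usearch_global %s --db %s --uc %s --id 0.9 --output_no_hits" % (
            query_f.name, query_f.name, uc.name))
        with open(uc.name) as uc_read:
            for entry in UCFile(uc_read):
                print(entry.query, entry.target)   # target is None when there is no hit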
Example #21
    def test_jplace_output(self):
        expected_jpace = {u'fields': [u'classification',
                                      u'distal_length',
                                      u'edge_num',
                                      u'like_weight_ratio',
                                      u'likelihood',
                                      u'pendant_length'],
                          u'metadata': 'the_metadata',
                          u'placements':
                          [{
                           u'nm': [[u'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                                     2]],
                            u'p': [[u'o__Bacillales',
                                    0.0874346630859,
                                    13,
                                    0.333351177694,
                                    -631.301684875,
                                    0.150831104822],
                                   [u'o__Bacillales',
                                    0.0643521435547,
                                    14,
                                    0.333326655502,
                                    -631.301758441,
                                    0.15083915761],
                                   [u'p__Firmicutes',
                                    5.97534179688e-06,
                                    15,
                                    0.333322166804,
                                    -631.301771907,
                                    0.150839131805]]}],
                          u'tree': 'tree_thanks',
                          u'version': 3}

        with tempdir.TempDir() as d:
            cmd = "%s pipe --sequences %s --otu_table /dev/null --output_jplace %s"\
                  " --singlem_packages %s" % (
                      path_to_script,
                      os.path.join(path_to_data,'1_pipe','jplace_test.fna'),
                      os.path.join(d, "my_jplace"),
                      os.path.join(path_to_data,'4.12.22seqs.spkg'))
            extern.run(cmd)
            jplace_path = os.path.join(d, 'my_jplace_jplace_test_4.12.22seqs.jplace')
            j = json.load(open(jplace_path))
            j['tree'] = 'tree_thanks'
            j['metadata'] = 'the_metadata'
            self.assertEqual(expected_jpace, j)

            # Make sure the guppy sing does not croak
            extern.run("guppy sing -o /dev/null '%s'" % jplace_path)
Example #22
    def _create_dmnd_database(self, unaligned_sequences_path, daa_output):
        '''
        Build a diamond database using diamond makedb

        Parameters
        ----------
        unaligned_sequences_path: str
            path to a FASTA file containing unaligned sequences
        daa_output: str
            Name of output database.
        '''
        logging.debug("Building diamond database")

        cmd = "diamond makedb --in '%s' -d '%s'" % (unaligned_sequences_path, daa_output)
        extern.run(cmd)
Example #23
    def global_search(self, query_otu_table_collection,
                     subject_otu_table_collection, cluster_identity):
        '''Search a query OTU table against a subject OTU table, yield over
        UCEntry objects that have been modified so that the query
        and subject are the relevant OtuTableEntry objects rather than
        strings. Or they are None if there are no hits, since
        --output_no_hits is used.

        query_otu_table_collection: OtuTableCollection
        subject_otu_table_collection: OtuTableCollection
        cluster_identity: float or str
            reject hits if have lower identity than this (implemented with vsearch --id).
        '''
        logging.info("Caching query OTUs")
        query_otus = list(query_otu_table_collection)
        logging.info("Caching target OTUs")
        subject_otus = list(subject_otu_table_collection)

        def name_to_index(name):
            return int(string.split(name, ';')[0])

        # write out fasta file numbered to corresponding to the OTU info
        with tempfile.NamedTemporaryFile(prefix='singlem_q_for_vsearch') as query_f:
            for i, u in enumerate(query_otus):
                query_f.write(">%i;size=%i\n" % (i, u.count))
                query_f.write(u.sequence.replace('-','')+"\n")
            query_f.flush()

            with tempfile.NamedTemporaryFile(prefix='singlem_db_for_vsearch') as db_f:
                for i, u in enumerate(subject_otu_table_collection):
                    db_f.write(">%i;size=%i\n" % (i, u.count))
                    db_f.write(u.sequence.replace('-','')+"\n")
                db_f.flush()

                with tempfile.NamedTemporaryFile(prefix='singlem_uc') as uc:
                    command = "vsearch --usearch_global %s --db %s --uc %s --id %s --output_no_hits" % (query_f.name,
                                                                               db_f.name,
                                                                               uc.name,
                                                                               str(cluster_identity))
                    logging.info("Running search")
                    extern.run(command)
                    logging.info("Finished running search")
                    with open(uc.name) as uc_read:
                        for uc_entry in UCFile(uc_read):
                            uc_entry.query = query_otus[name_to_index(uc_entry.query)]
                            if uc_entry.target is not None:
                                uc_entry.target = subject_otus[name_to_index(uc_entry.target)]
                            yield uc_entry
Example #24
    def test_paired_reads_hello_world(self):
        # Reads should be merged
        expected = [
            "\t".join(self.headers),
            '4.11.22seqs		TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA	1	2.44	Root; d__Bacteria; p__Firmicutes',
            '']
        inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA
'''
        inseqs_reverse = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
TTCAGCTGCACGACGTACCATAGTGTTTTTGTATACTTTATACTCAACACCAGCTTCACGTAATTGTGAACGTAAGTCAGTAACTTCAGCTACTGTTAAT
''' # reverse complement of the forward, so should collapse.
        with tempfile.NamedTemporaryFile(suffix='.fa') as n:
            n.write(inseqs)
            n.flush()
            with tempfile.NamedTemporaryFile(suffix='.fa') as n2:
                n2.write(inseqs_reverse)
                n2.flush()

                cmd = "{} pipe --sequences {} --otu_table /dev/stdout --singlem_packages {} --reverse {}".format(
                    path_to_script,
                    n.name,
                    os.path.join(path_to_data,'4.11.22seqs.gpkg.spkg'),
                    n2.name)
                self.assertEqualOtuTable(
                    list([line.split("\t") for line in expected]),
                    extern.run(cmd).replace(os.path.basename(n.name).replace('.fa',''),''))
Example #25
    def test_seqs_dna(self):
        aln = '''>s1
ga-------------TATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGATGTG
>s2 asdas
ca---------GAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGA----
>s3
ga-------------TATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGGGCTGATGTG-
>d4
-g----------AGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGATG--
'''
        expected = '''TATGGAGGAACACCAGTGGC
TATGGAGGAACACCAGTGGC
TATGGAGGAACACCAGTGGC
TATGGAGGAACACCAGTGGC
'''
        with tempfile.NamedTemporaryFile() as a:
            a.write(aln)
            a.flush()
            with tempfile.NamedTemporaryFile() as stderr:
                cmd = "%s --debug seqs --alignment %s --alignment_type dna"\
                      " --window_size 20 2>%s" % (
                          path_to_script, a.name, stderr.name)
                self.assertEqual('', extern.run(cmd))
                # This includes ignored columns at the front, which were messing things up.
                self.assertTrue('Found best section of the alignment starting from 14\n' in \
                                open(stderr.name).read())
Example #26
    def _get_hmm_from_alignment(self, alignment, hmm_filename,
                                output_alignment_filename):
        '''Return a HMM file and alignment of sequences to that HMM

        Parameters
        ----------
        alignment: str
            path to aligned proteins
        hmm_filename: str
            write the hmm to this file path
        output_alignment_filename: str
            write the output alignment to this file path

        Returns
        -------
        Return the pipeline type of the HMM.
        '''
        logging.info("Building HMM from alignment")

        with tempfile.NamedTemporaryFile(suffix='.fasta',
                                         prefix='graftm',
                                         mode='w') as tempaln:

            cmd = "hmmbuild -O /dev/stdout -o /dev/stderr '%s' '%s'" % (
                hmm_filename, alignment)
            output = extern.run(cmd)

            SeqIO.write(SeqIO.parse(StringIO(output), 'stockholm'), tempaln,
                        'fasta')
            tempaln.flush()

            ptype, _ = self._pipe_type(hmm_filename)
            SequenceSearcher(hmm_filename).alignment_correcter(
                [tempaln.name], output_alignment_filename)
            return ptype
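A standalone sketch of the hmmbuild trick used above: write the HMM to a file while capturing the Stockholm-format alignment of the input on stdout, then convert it to FASTA. The paths are hypothetical; Biopython's SeqIO, an hmmbuild binary on PATH and extern are assumed.

from io import StringIO
from Bio import SeqIO
import extern

alignment = 'proteins.aln.fasta'     # hypothetical aligned input
hmm_filename = 'proteins.hmm'
stockholm = extern.run(
    "hmmbuild -O /dev/stdout -o /dev/stderr '%s' '%s'" % (hmm_filename, alignment))
SeqIO.write(SeqIO.parse(StringIO(stockholm), 'stockholm'),
            'proteins_realigned.fasta', 'fasta')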
Example #27
    def test_two_nucleotide_packages(self):
        expected = [
            "\t".join(self.headers),
            '61_otus.v3		GGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGACTGACGCTGATGTGCGAAAGCG	2	5.13	Root; k__Bacteria; p__Proteobacteria',
            '61_otus.second.v3		TTAGGTAGTTGCTGGGGTAACGTCCCAACAAGCCGATAATCGGTACGGGTTGTGAGAGCA	1	1.66	Root; k__Archaea; p__Euryarchaeota',
            '']
        inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1105:6981:63483 1:N:0:AAGAGGCAAAGGAGTA
GATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTGGGGATCAAACAGGATTAGATACCCTGGTAGT
>HWI-ST1243:156:D1K83ACXX:7:1105:6981:63483_revcom
ACTACCAGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGCACATCAGCGTCAGTTACAGACCAGAAAGTCGCCTTCGCCACTGGTGTTCCTCCATATC
>NS500333:10:H0V2GAGXX:2:13211:8623:16289 1:N:0:GATCAG
ATTAGGTAGTTGCTGGGGTAACGTCCCAACAAGCCGATAATCGGTACGGGTTGTGAGAGCAAGAGCCCGGAGATGGATTCTGAGACACGAATCCAGGTCCTACGGGGCGCAGCAGGCGCGAAAACTTTACACTGCGCGAAAGCGCGATA
'''
        with tempfile.NamedTemporaryFile(suffix='.fa') as n:
            n.write(inseqs)
            n.flush()

            cmd = "%s pipe --sequences %s --otu_table /dev/stdout --singlem_packages %s %s" % (
                path_to_script,
                n.name,
                os.path.join(path_to_data,'61_otus.v3.gpkg.spkg'),
                os.path.join(path_to_data,'second_packge.spkg'))
            self.assertEqualOtuTable(
                list([line.split("\t") for line in expected]),
                extern.run(cmd).replace(os.path.basename(n.name).replace('.fa',''),''))
Example #28
    def test_known_sequence_taxonomy(self):
        expected = [
            "\t".join(self.headers),
            '4.11.22seqs		TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA	2	4.88	mytax; yeh',
            '']
        inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA
>another
ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA
'''
        with tempfile.NamedTemporaryFile(suffix='.fa') as n:
            n.write(inseqs)
            n.flush()
            with tempfile.NamedTemporaryFile() as taxf:
                taxf.write("HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482\tmytax; yeh\n")
                taxf.write("another\tmytax; yeh; 2\n")
                taxf.flush()

                cmd = "%s pipe --sequences %s --otu_table /dev/stdout --singlem_packages %s "\
                      "--no_assign_taxonomy --known_sequence_taxonomy %s"% (
                          path_to_script, n.name, os.path.join(path_to_data,'4.11.22seqs.gpkg.spkg'),
                          taxf.name)
                self.assertEqual(expected,
                                 extern.run(cmd).replace(
                                     os.path.basename(n.name).replace('.fa',''),
                                     '').split("\n"))
Example #29
    def test_one_read_two_orfs_two_diamond_hits(self):
        # what a pain the real world is
        seq = '''>HWI-ST1240:128:C1DG3ACXX:7:2204:6599:65352 1:N:0:GTAGAGGATAGATCGC
ACCCACAGCTCGGGGTTGCCCTTGCCCGACCCCATGCGTGTCTCGGCGGGCTTCTGGTGACGGGCTTGTCCGGGAAGACGCGGATCCAGACCTTGCCTCCGCGCTTGACGTGCCGGGTCATCGCGATACGGGCCGCCTCGATCTGACGTGC
'''
        expected = [
            self.headers,
            [
                'S1.7.ribosomal_protein_L16_L10E_rplP		CGCGTCTTCCCGGACAAGCCCGTCACCAGAAGCCCGCCGAGACACGCATGGGGTCGGGCA	1	1.64	GCA_000949295.1'
            ]
        ]
        exp = sorted(["\t".join(x) for x in expected] + [''])
        with tempfile.NamedTemporaryFile(mode='w',
                                         prefix='singlem_test',
                                         suffix='.fa') as t:
            t.write(seq)
            t.flush()
            cmd = "%s --quiet pipe --sequences %s --otu_table /dev/stdout --threads 4 --assignment_method diamond_example" % (
                path_to_script, t.name)
            self.assertEqual(
                exp,
                sorted(
                    extern.run(cmd).replace(
                        os.path.basename(t.name).replace('.fa', ''),
                        '').split("\n")))
Example #30
    def test_paired_reads_one_read_each_diamond_example(self):
        # Reads should be merged
        expected = [
            "\t".join(self.headers_with_extras),
            '4.11.22seqs		TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA	2	4.88	2524614704	HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 seq2	60 60	False',
            '']
        inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA
>seq2
AAAAAAAAAAAAAAAAA
'''
        inseqs_reverse = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
AAAAAAAAAAAAAAAAA
>seq2
TTCAGCTGCACGACGTACCATAGTGTTTTTGTATACTTTATACTCAACACCAGCTTCACGTAATTGTGAACGTAAGTCAGTAACTTCAGCTACTGTTAAT
''' # reverse complement of the forward, so should collapse.
        with tempfile.NamedTemporaryFile(suffix='.fa') as n:
            n.write(inseqs)
            n.flush()
            with tempfile.NamedTemporaryFile(suffix='.fa') as n2:
                n2.write(inseqs_reverse)
                n2.flush()

                cmd = "{} pipe --sequences {} --otu_table /dev/stdout --singlem_packages {} --reverse {} --output_extras --assignment_method diamond_example".format(
                    path_to_script,
                    n.name,
                    os.path.join(path_to_data,'4.11.22seqs.gpkg.spkg'),
                    n2.name)
                self.assertEqualOtuTable(
                    list([line.split("\t") for line in expected]),
                    extern.run(cmd).replace(os.path.basename(n.name).replace('.fa',''),''))
Example #31
    def _create_dmnd_database(self, unaligned_sequences_path, daa_output):
        '''
        Build a diamond database using diamond makedb

        Parameters
        ----------
        unaligned_sequences_path: str
            path to a FASTA file containing unaligned sequences
        daa_output: str
            Name of output database.
        '''
        logging.debug("Building diamond database")

        cmd = "diamond makedb --in '%s' -d '%s'" % (unaligned_sequences_path,
                                                    daa_output)
        extern.run(cmd)
Example #32
    def test_query_with_otu_table(self):
        with tempfile.NamedTemporaryFile(mode='w') as f:
            query = [
                self.headers,
                # second sequence with an extra A at the end
                [
                    'ribosomal_protein_L11_rplK_gpkg', 'minimal',
                    'CGTCGTTGGAACCCAAAAATGAAAAAATATATCTTCACTGAGAGAAATGGTATTTATATA',
                    '7', '4.95',
                    'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'
                ]
            ]
            query = "\n".join(["\t".join(x) for x in query])
            f.write(query)
            f.flush()

            cmd = "%s query --query_otu_table %s --db %s" % (
                path_to_script, f.name, os.path.join(path_to_data, 'a.sdb'))

            expected = [
                [
                    'query_name', 'query_sequence', 'divergence', 'num_hits',
                    'sample', 'marker', 'hit_sequence', 'taxonomy'
                ],
                [
                    'minimal;ribosomal_protein_L11_rplK_gpkg',
                    'CGTCGTTGGAACCCAAAAATGAAAAAATATATCTTCACTGAGAGAAATGGTATTTATATA',
                    '1', '6', 'minimal', 'ribosomal_protein_S2_rpsB_gpkg',
                    'CGTCGTTGGAACCCAAAAATGAAAAAATATATCTTCACTGAGAGAAATGGTATTTATATC',
                    'Root; k__Bacteria; p__Firmicutes; c__Bacilli'
                ]
            ]
            expected = ["\t".join(x) for x in expected] + ['']
            self.assertEqual(expected, extern.run(cmd).split('\n'))
Example #33
    def test_paired_reads_hello_world(self):
        # Reads should be merged
        expected = [
            "\t".join(self.headers),
            '4.11.22seqs		TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA	1	2.44	Root; d__Bacteria; p__Firmicutes',
            ''
        ]
        inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA
'''
        inseqs_reverse = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
TTCAGCTGCACGACGTACCATAGTGTTTTTGTATACTTTATACTCAACACCAGCTTCACGTAATTGTGAACGTAAGTCAGTAACTTCAGCTACTGTTAAT
''' # reverse complement of the forward, so should collapse.
        with tempfile.NamedTemporaryFile(suffix='.fa') as n:
            n.write(inseqs)
            n.flush()
            with tempfile.NamedTemporaryFile(suffix='.fa') as n2:
                n2.write(inseqs_reverse)
                n2.flush()

                cmd = "{} pipe --sequences {} --otu_table /dev/stdout --singlem_packages {} --reverse {}".format(
                    path_to_script, n.name,
                    os.path.join(path_to_data, '4.11.22seqs.gpkg.spkg'),
                    n2.name)
                self.assertEqualOtuTable(
                    list([line.split("\t") for line in expected]),
                    extern.run(cmd).replace(
                        os.path.basename(n.name).replace('.fa', ''), ''))
Example #34
    def test_cluster_across_samples_via_script(self):
        e = [['gene','sample','sequence','num_hits','coverage','taxonomy'],
            ['4.11.ribosomal_protein_L10','minimal','TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACT','2','4.88','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'],
            ['4.12.ribosomal_protein_L11_rplK','ma','TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACA','4','9.76','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales']
            ]
        exp = "\n".join(["\t".join(x) for x in e]+[''])

        with tempfile.NamedTemporaryFile(prefix='singlem_cluster') as f:
            cmd = "%s summarise --cluster --cluster_id %f --input_otu_tables %s --output_otu_table /dev/stdout" % (
                path_to_script, 58.5/60, f.name)
            for l in ["\t".join(o) for o in e]:
                f.write(l+"\n")
            f.flush()
            output = extern.run(cmd)
            out_clusters = [o.split("\t") for o in output.split("\n")]
            self.assertEqual(
                [['gene', 'sample', 'sequence', 'num_hits', 'coverage', 'taxonomy'],
                 ['4.12.ribosomal_protein_L11_rplK',
                  'ma',
                  'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACA',
                  '4',
                  '9.76',
                  'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                 ['4.12.ribosomal_protein_L11_rplK',
                  'minimal',
                  'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACA',
                  '2',
                  '4.88',
                  'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                 ['']],
                out_clusters)
Example #35
0
    def test_paired_reads_one_read_each_diamond_example(self):
        # Reads should be merged
        expected = [
            "\t".join(self.headers_with_extras),
            '4.11.22seqs		TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA	2	4.88	2524614704	HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 seq2	60 60	False',
            ''
        ]
        inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA
>seq2
AAAAAAAAAAAAAAAAA
'''
        inseqs_reverse = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
AAAAAAAAAAAAAAAAA
>seq2
TTCAGCTGCACGACGTACCATAGTGTTTTTGTATACTTTATACTCAACACCAGCTTCACGTAATTGTGAACGTAAGTCAGTAACTTCAGCTACTGTTAAT
''' # reverse complement of the forward, so should collapse.
        with tempfile.NamedTemporaryFile(suffix='.fa') as n:
            n.write(inseqs)
            n.flush()
            with tempfile.NamedTemporaryFile(suffix='.fa') as n2:
                n2.write(inseqs_reverse)
                n2.flush()

                cmd = "{} pipe --sequences {} --otu_table /dev/stdout --singlem_packages {} --reverse {} --output_extras --assignment_method diamond_example".format(
                    path_to_script, n.name,
                    os.path.join(path_to_data, '4.11.22seqs.gpkg.spkg'),
                    n2.name)
                self.assertEqualOtuTable(
                    list([line.split("\t") for line in expected]),
                    extern.run(cmd).replace(
                        os.path.basename(n.name).replace('.fa', ''), ''))
Example #36
0
    def test_diamond_assign_taxonomy(self):
        with tempfile.NamedTemporaryFile(suffix='.fasta') as f:
            query = "\n".join([
                '>HWI-ST1243:156:D1K83ACXX:7:1109:18214:9910 1:N:0:TCCTGAGCCTAAGCCT',
                'GTTAAATTACAAATTCCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATCATGGGATTCTGTAAAGAGT',
                ''
            ])
            f.write(query)
            f.flush()

            cmd = "%s --debug pipe --sequences %s --otu_table /dev/stdout --assignment_method diamond --threads 4" % (
                path_to_script, f.name)

            expected = [
                self.headers,
                [
                    'S1.5.ribosomal_protein_L11_rplK',
                    os.path.basename(f.name)[:-6],
                    'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                    '1', '2.44',
                    'Root; d__Bacteria; p__Firmicutes; c__Bacilli_A; o__Thermoactinomycetales; f__Thermoactinomycetaceae'
                ]
            ]
            expected = ["\t".join(x) for x in expected] + ['']
            observed = extern.run(cmd).split("\n")
            r = re.compile(
                '; g__.*'
            )  # Do not test beyond genus level because updated diamond versions change the assignment slightly.
            self.assertEqual([r.sub('', e) for e in expected],
                             [r.sub('', e) for e in observed])
Example #37
0
    def test_two_nucleotide_packages(self):
        expected = [
            "\t".join(self.headers),
            '61_otus.v3		GGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGACTGACGCTGATGTGCGAAAGCG	2	5.13	Root; k__Bacteria; p__Proteobacteria',
            '61_otus.second.v3		TTAGGTAGTTGCTGGGGTAACGTCCCAACAAGCCGATAATCGGTACGGGTTGTGAGAGCA	1	1.66	Root; k__Archaea; p__Euryarchaeota',
            ''
        ]
        inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1105:6981:63483 1:N:0:AAGAGGCAAAGGAGTA
GATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTGGGGATCAAACAGGATTAGATACCCTGGTAGT
>HWI-ST1243:156:D1K83ACXX:7:1105:6981:63483_revcom
ACTACCAGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGCACATCAGCGTCAGTTACAGACCAGAAAGTCGCCTTCGCCACTGGTGTTCCTCCATATC
>NS500333:10:H0V2GAGXX:2:13211:8623:16289 1:N:0:GATCAG
ATTAGGTAGTTGCTGGGGTAACGTCCCAACAAGCCGATAATCGGTACGGGTTGTGAGAGCAAGAGCCCGGAGATGGATTCTGAGACACGAATCCAGGTCCTACGGGGCGCAGCAGGCGCGAAAACTTTACACTGCGCGAAAGCGCGATA
'''
        with tempfile.NamedTemporaryFile(suffix='.fa') as n:
            n.write(inseqs)
            n.flush()

            cmd = "%s pipe --sequences %s --otu_table /dev/stdout --singlem_packages %s %s" % (
                path_to_script, n.name,
                os.path.join(path_to_data, '61_otus.v3.gpkg.spkg'),
                os.path.join(path_to_data, 'second_packge.spkg'))
            self.assertEqualOtuTable(
                list([line.split("\t") for line in expected]),
                extern.run(cmd).replace(
                    os.path.basename(n.name).replace('.fa', ''), ''))
Example #38
0
    def test_known_sequence_taxonomy(self):
        expected = [
            "\t".join(self.headers),
            '4.11.22seqs		TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA	2	4.88	mytax; yeh',
            ''
        ]
        inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT
ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA
>another
ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA
'''
        with tempfile.NamedTemporaryFile(suffix='.fa') as n:
            n.write(inseqs)
            n.flush()
            with tempfile.NamedTemporaryFile() as taxf:
                taxf.write(
                    "HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482\tmytax; yeh\n"
                )
                taxf.write("another\tmytax; yeh; 2\n")
                taxf.flush()

                cmd = "%s pipe --sequences %s --otu_table /dev/stdout --singlem_packages %s "\
                      "--no_assign_taxonomy --known_sequence_taxonomy %s"% (
                          path_to_script, n.name, os.path.join(path_to_data,'4.11.22seqs.gpkg.spkg'),
                          taxf.name)
                self.assertEqual(
                    expected,
                    extern.run(cmd).replace(
                        os.path.basename(n.name).replace('.fa', ''),
                        '').split("\n"))
Example #39
0
    def test_seqs_dna(self):
        aln = '''>s1
ga-------------TATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGATGTG
>s2 asdas
ca---------GAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGA----
>s3
ga-------------TATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGGGCTGATGTG-
>d4
-g----------AGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGATG--
'''
        expected = '''TATGGAGGAACACCAGTGGC
TATGGAGGAACACCAGTGGC
TATGGAGGAACACCAGTGGC
TATGGAGGAACACCAGTGGC
'''
        with tempfile.NamedTemporaryFile(mode='w') as a:
            a.write(aln)
            a.flush()
            with tempfile.NamedTemporaryFile() as stderr:
                cmd = "%s --debug seqs --alignment %s --alignment_type dna"\
                      " --window_size 20 2>%s" % (
                          path_to_script, a.name, stderr.name)
                self.assertEqual('', extern.run(cmd))
                # This includes ignored columns at the front, which were messing things up.
                with open(stderr.name) as stde:
                    self.assertTrue(
                        'Found best section of the alignment starting from 14\n' in \
                        stde.read())
Example #40
0
    def create_diamond_db(self):
        '''Create a diamond database from the unaligned sequences in this package.

        Returns
        -------
        path to the created diamond db e.g. 'my_sequences.dmnd'
        '''
        base = self.unaligned_sequence_database_path()
        cmd = "diamond makedb --in '%s' -d '%s'" % (self.unaligned_sequence_database_path(), base)
        extern.run(cmd)
        diamondb = '%s.dmnd' % base
        # Mostly this moves a file to its current location because Create
        # follows this same logic, but there's a specially crafted
        # test/data/mcrA.gpkg which is slightly different.
        os.rename(diamondb, self.diamond_database_path())
        return self.diamond_database_path()
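As a standalone illustration of the behaviour relied on above ('diamond makedb -d BASE' writes BASE.dmnd), here is a minimal hedged sketch with placeholder paths, not the package's own API:

import os
import extern

def make_diamond_db(unaligned_fasta, final_dmnd_path):
    # Build the database; diamond appends '.dmnd' to the -d argument.
    extern.run("diamond makedb --in '%s' -d '%s'" % (unaligned_fasta, unaligned_fasta))
    built = '%s.dmnd' % unaligned_fasta
    # Relocate it to wherever the caller wants the canonical database to live.
    os.rename(built, final_dmnd_path)
    return final_dmnd_path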
Example #41
0
    def test_known_tax_table(self):
        expected = [
            self.headers,
            [
                '4.12.22seqs', 'small',
                'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                '4', '9.76',
                'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'
            ],
            [
                '4.11.22seqs', 'small',
                'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA',
                '2', '4.88', 'Root; d__Bacteria; p__Firmicutes'
            ]
        ]
        exp = sorted(["\t".join(x) for x in expected] + [''])

        cmd = "%s --quiet pipe --sequences %s/1_pipe/small.fa --otu_table /dev/stdout --threads 4 --singlem_packages %s" % (
            path_to_script, path_to_data, self.two_packages)
        self.assertEqual(exp, sorted(extern.run(cmd).split("\n")))

        expected = [
            self.headers,
            [
                '4.12.22seqs', 'small',
                'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                '4', '9.76', 'some1'
            ],
            [
                '4.11.22seqs', 'small',
                'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA',
                '2', '4.88', 'Root; d__Bacteria; p__Firmicutes'
            ]
        ]
        exp = sorted(["\t".join(x) for x in expected] + [''])

        with tempfile.NamedTemporaryFile(mode='w',
                                         prefix='singlem_test_known') as t:
            t.write('\n'.join(["\t".join(x) for x in expected[:2]]))
            t.flush()

            cmd = "%s --quiet pipe --sequences %s/1_pipe/small.fa --otu_table /dev/stdout --threads 4 --known_otu_tables %s --singlem_packages %s"\
                 % (path_to_script,
                    path_to_data,
                    t.name,
                    self.two_packages)
            self.assertEqual(exp, sorted(extern.run(cmd).split("\n")))
Example #42
0
    def summarise(**kwargs):
        '''Summarise an OTU table'''
        krona_output_file = kwargs.pop('krona_output')
        table_collection = kwargs.pop('table_collection')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        # prep the array
        gene_to_sample_to_taxonomy_to_count = Summariser._collapse_otu_table_into_gene_to_sample_to_taxonomy_to_count(table_collection)

        # write the output krona files
        sample_name_to_tempfile = OrderedDict()
        logging.info("Writing krona %s" % krona_output_file)
        cmd = 'ktImportText -o %s' % krona_output_file
        sample_tempfiles = []
        sample_to_gene_to_taxonomy_to_count = {}
        all_sample_names = set()
        all_gene_names = set()
        for gene, sample_to_taxonomy_to_count in gene_to_sample_to_taxonomy_to_count.items():
            all_gene_names.add(gene)
            for sample, taxonomy_to_count in sample_to_taxonomy_to_count.items():
                all_sample_names.add(sample)
                if sample not in sample_to_gene_to_taxonomy_to_count:
                    sample_to_gene_to_taxonomy_to_count[sample] = {}
                sample_to_gene_to_taxonomy_to_count[sample][gene] = taxonomy_to_count
        is_more_than_one_sample = len(sample_to_gene_to_taxonomy_to_count) > 1
        for sample in sorted(all_sample_names):
            for gene in sorted(all_gene_names):
                if gene in sample_to_gene_to_taxonomy_to_count[sample]:
                    f = tempfile.NamedTemporaryFile(mode='w', prefix='singlem_for_krona')
                    sample_tempfiles.append(f)

                    taxonomy_to_count = sample_to_gene_to_taxonomy_to_count[sample][gene]
                    for taxonomy, coverage in taxonomy_to_count.items():
                        tax_split = taxonomy.split('; ')
                        if tax_split[0] == 'Root' and len(tax_split) > 1: tax_split = tax_split[1:]
                        f.write('\t'.join([str(coverage)]+tax_split))
                        f.write('\n')
                    f.flush()
                    if is_more_than_one_sample:
                        display_name = '%s: %s' % (sample, gene)
                    else:
                        display_name = gene
                    cmd += " %s,'%s'" % (f.name, display_name)
        extern.run(cmd)
        for f in sample_tempfiles:
            f.close()
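A self-contained sketch of the Krona text format the loop above writes, and of the ktImportText call it assembles; the toy taxonomy, output file name and 'sample1: geneA' display name are placeholders:

import tempfile
import extern

taxonomy_to_count = {'Root; d__Bacteria; p__Firmicutes': 4.88}  # toy data
with tempfile.NamedTemporaryFile(mode='w', prefix='singlem_for_krona') as f:
    for taxonomy, coverage in taxonomy_to_count.items():
        tax_split = taxonomy.split('; ')
        if tax_split[0] == 'Root' and len(tax_split) > 1:
            tax_split = tax_split[1:]  # Krona input omits the Root level
        # Each line is: count<TAB>rank1<TAB>rank2<TAB>...
        f.write('\t'.join([str(coverage)] + tax_split) + '\n')
    f.flush()
    extern.run("ktImportText -o krona.html %s,'sample1: geneA'" % f.name)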
Example #43
0
    def hmmsearch(self, input_pipe, hmms, output_files):
        r"""Run HMMsearch with all the HMMs, generating output files

        Parameters
        ----------
        input_pipe: String
            A string which is a partial command line. When this command is run,
            it outputs FASTA-formatted protein sequences to STDOUT, which
            hmmsearch then searches.
        hmms: list of paths
            A list of (string) paths to HMM files which are used to search with.
        output_files: list of paths
            A list of (string) paths to output CSV files to be generated by the
            HMM searching

        Returns
        -------
        N/A

        May raise an exception if hmmsearching went amiss"""

        # Check input and output paths are the same length
        if len(hmms) != len(output_files):
            raise Exception(
                "Programming error: number of supplied HMMs differs from the number of supplied output files"
            )

        # Create queue data structure
        queue = []
        for i, hmm in enumerate(hmms):
            queue.append([hmm, output_files[i]])

        # While there are more things left in the queue
        while len(queue) > 0:
            pairs_to_run = self._munch_off_batch(queue)

            # Run hmmsearches with each of the pairs
            cmd = self._hmm_command(input_pipe, pairs_to_run)
            logging.debug("Running command: %s" % cmd)

            try:
                extern.run(cmd)
            except extern.ExternCalledProcessError as e:
                if e.stderr == b'\nError: Sequence file - is empty or misformatted\n\n':
                    raise NoInputSequencesException(cmd)
                else:
                    raise e
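The _hmm_command helper is not shown above; as a rough illustration, a single HMM searched against the piped sequences amounts to a command of this shape (the input pipe and file names are hypothetical):

import extern

input_pipe = "cat proteins.faa"  # any command that writes protein FASTA to STDOUT
hmm, out_table = "DNGNGWU00001.hmm", "DNGNGWU00001.tsv"  # hypothetical paths
# '-' makes hmmsearch read the sequences from STDIN; the human-readable report
# is discarded, keeping only the tabular output file.
extern.run("%s | hmmsearch --tblout %s %s - > /dev/null" % (input_pipe, out_table, hmm))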
Example #44
0
    def test_clustering(self):
        otu_table = [
            self.headers,
            [
                'ribosomal_protein_L11_rplK_gpkg', 'minimal',
                'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC',
                '7', '4.95',
                'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'
            ],
            [
                'ribosomal_protein_L11_rplK_gpkg', 'minimal',
                'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA',
                '6', '4.95', 'Root; k__Bacteria; p__Firmicutes; c__Bacilli'
            ],  #last base only is different to first sequence
            [
                'ribosomal_protein_S17_gpkg', 'minimal',
                'GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT',
                '9', '4.95',
                'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'
            ]
        ]
        otu_table = "\n".join(["\t".join(x) for x in otu_table])

        with tempfile.NamedTemporaryFile(mode='w') as f:
            f.write(otu_table)
            f.flush()

            with tempdir.TempDir() as d:
                cmd = "{} makedb --db_path {}/db --otu_table {} --clustering_divergence 3".format(
                    path_to_script, d, f.name)
                subprocess.check_call(cmd, shell=True)

                cmd = "%s query --query_sequence %s --db %s/db" % (
                    path_to_script,
                    'AGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC',  # first sequence with its first base changed to A
                    d)

                expected = [
                    [
                        'query_name', 'query_sequence', 'divergence',
                        'num_hits', 'sample', 'marker', 'hit_sequence',
                        'taxonomy'
                    ],
                    [
                        'unnamed_sequence',
                        'AGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC',
                        '1', '7', 'minimal', 'ribosomal_protein_L11_rplK_gpkg',
                        'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC',
                        'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'
                    ],
                    [
                        'unnamed_sequence',
                        'AGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC',
                        '2', '6', 'minimal', 'ribosomal_protein_L11_rplK_gpkg',
                        'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA',
                        'Root; k__Bacteria; p__Firmicutes; c__Bacilli'
                    ]
                ]
                self.assertEqualOtuTable(expected, extern.run(cmd))
Example #45
0
    def get_dmnd(self):
        ''' Create temporary DIAMOND file for search method '''
        fasta_paths = [
            pkg.graftm_package().unaligned_sequence_database_path()
            for pkg in self.singlem_packages
        ]
        temp_dmnd = tempfile.NamedTemporaryFile(
            mode="w",
            prefix='singlem-diamond-prefilter',
            suffix='.dmnd',
            delete=False).name
        cmd = 'cat %s | '\
            'diamond makedb --in - --db %s' % (' '.join(fasta_paths), temp_dmnd)

        extern.run(cmd)

        return temp_dmnd
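Because the temporary database above is created with delete=False, callers are expected to remove it themselves once the prefilter step has finished; a hedged sketch (the 'searcher' argument stands in for an instance of this class):

import os

def run_prefilter_with_cleanup(searcher):
    db = searcher.get_dmnd()
    try:
        pass  # ... run the DIAMOND prefilter against 'db' here ...
    finally:
        os.remove(db)  # the delete=False tempfile is not cleaned up automatically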
Example #46
0
 def test_hello_world(self):
     with tempdir.TempDir() as tmp:
         with tempdir.TempDir() as tmp2:
             cmd1 = "%s create --verbosity 2 --sequences %s --alignment %s --taxonomy %s --rerooted_tree %s --output %s" \
                 %(path_to_script,
                   os.path.join(path_to_data,'create','homologs.trimmed.unaligned.faa'),
                   os.path.join(path_to_data,'create','homologs.trimmed.aligned.faa'),
                   os.path.join(path_to_data,'create','homologs.tax2tree.rerooted.decorated.tree-consensus-strings'),
                   os.path.join(path_to_data,'create','homologstre.tree'),
                   tmp+".gpkg")
             extern.run(cmd1)
             cmd2 = "%s graft --verbosity 2 --graftm_package %s --forward %s --output_directory %s" \
                 % (path_to_script,
                    "%s.gpkg" % tmp,
                    os.path.join(path_to_data,'create','test.faa'),
                    tmp2+"_")
             extern.run(cmd2)
Example #47
0
    def test_print_insert(self):
        expected = [self.headers,['4.12.ribosomal_protein_L11_rplK','insert','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG','1','2.44','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                    ['4.12.ribosomal_protein_L11_rplK','insert','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG','1','2.51','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales']]
        exp = sorted(["\t".join(x) for x in expected]+[''])

        cmd = "%s --debug pipe --sequences %s/1_pipe/insert.fna --otu_table /dev/stdout --threads 4 --include_inserts" % (path_to_script,
                                                                                                    path_to_data)
        self.assertEqual(exp, sorted(extern.run(cmd).split("\n")))
Example #48
0
 def test_bootstrap_executable(self):
     with tempfile.NamedTemporaryFile() as tf:
         cmd = '%s expand_search --verbosity 5 --contigs %s --output_hmm %s --search_hmm_files %s' % (
             path_to_script,
             os.path.join(path_to_data, 'bootstrapper',
                          'contigs.fna'), tf.name,
             os.path.join(path_to_data, 'bootstrapper', 'DNGNGWU00001.hmm'))
         extern.run(cmd)
         with open(tf.name) as tf2:
             lines = tf2.readlines()
             first_line = lines[0]
             self.assertTrue(first_line in [
                 "HMMER3/f [3.1b2 | February 2015]\n",
                 "HMMER3/f [3.2.1 | June 2018]\n"
             ],
                             msg=first_line)
             self.assertEqual('NSEQ  2\n', lines[10])
Example #49
0
    def test_print_insert(self):
        expected = [self.headers,['S1.5.ribosomal_protein_L11_rplK','insert','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG','1','2.44','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                    ['S1.5.ribosomal_protein_L11_rplK','insert','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG','1','2.51','Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales']]
        exp = sorted(["\t".join(x) for x in expected]+[''])

        cmd = "%s --debug pipe --sequences %s/1_pipe/insert.fna --otu_table /dev/stdout --threads 4 --include_inserts" % (path_to_script,
                                                                                                    path_to_data)
        self.assertEqual(exp, sorted(extern.run(cmd).split("\n")))
Example #50
0
 def test_hello_world(self):
     with tempdir.TempDir() as tmp:
         with tempdir.TempDir() as tmp2:
             cmd1 = "%s create --verbosity 2 --sequences %s --alignment %s --taxonomy %s --rerooted_tree %s --output %s" \
                 %(path_to_script,
                   os.path.join(path_to_data,'create','homologs.trimmed.unaligned.faa'),
                   os.path.join(path_to_data,'create','homologs.trimmed.aligned.faa'),
                   os.path.join(path_to_data,'create','homologs.tax2tree.rerooted.decorated.tree-consensus-strings'),
                   os.path.join(path_to_data,'create','homologstre.tree'),
                   tmp+".gpkg")
             extern.run(cmd1)
             cmd2 = "%s graft --verbosity 2 --graftm_package %s --forward %s --output_directory %s" \
                 % (path_to_script,
                    "%s.gpkg" % tmp,
                    os.path.join(path_to_data,'create','test.faa'),
                    tmp2+"_")
             extern.run(cmd2)
Example #51
0
    def hmmsearch(self, input_pipe, hmms, output_files):
        r"""Run HMMsearch with all the HMMs, generating output files

        Parameters
        ----------
        input_pipe: String
            A string which is a partial command line. When this command is run,
            it outputs FASTA-formatted protein sequences to STDOUT, which
            hmmsearch then searches.
        hmms: list of paths
            A list of (string) paths to HMM files which are used to search with.
        output_files: list of paths
            A list of (string) paths to output CSV files to be generated by the
            HMM searching

        Returns
        -------
        N/A

        May raise an exception if hmmsearching went amiss"""

        # Check input and output paths are the same length
        if len(hmms) != len(output_files):
            raise Exception("Programming error: number of supplied HMMs differs from the number of supplied output files")

        # Create queue data structure
        queue = []
        for i, hmm in enumerate(hmms):
            queue.append( [hmm, output_files[i]] )

        # While there are more things left in the queue
        while len(queue) > 0:
            pairs_to_run = self._munch_off_batch(queue)

            # Run hmmsearches with each of the pairs
            cmd = self._hmm_command(input_pipe, pairs_to_run)
            logging.debug("Running command: %s" % cmd)

            try:
                extern.run(cmd)
            except extern.ExternCalledProcessError as e:
                if e.stderr == b'\nError: Sequence file - is empty or misformatted\n\n':
                    raise NoInputSequencesException(cmd)
                else:
                    raise e
Example #52
0
    def _generate_tree_log_file(self, tree, alignment, output_tree_file_path,
                               output_log_file_path, residue_type, fasttree):
        '''Generate the FastTree log file given a tree and the alignment that
        made that tree

        Returns
        -------
        Nothing. The FastTree log is written to output_log_file_path.
        '''
        if residue_type==Create._NUCLEOTIDE_PACKAGE_TYPE:
            cmd = "%s -quiet -gtr -nt -nome -mllen -intree '%s' -log %s -out %s %s" %\
                                       (fasttree, tree, output_log_file_path,
                                        output_tree_file_path, alignment)
        elif residue_type==Create._PROTEIN_PACKAGE_TYPE:
            cmd = "%s -quiet -nome -mllen -intree '%s' -log %s -out %s %s" %\
                                       (fasttree, tree, output_log_file_path,
                                        output_tree_file_path, alignment)
        extern.run(cmd)
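Stripped of the class context, the nucleotide branch above reduces to a FastTree call of this shape, where -intree combined with -nome and -mllen re-estimates branch lengths on the supplied topology rather than searching for a new one (all paths are placeholders):

import extern

fasttree = "FastTree"  # or FastTreeMP, whichever binary is available
extern.run("%s -quiet -gtr -nt -nome -mllen -intree 'fixed_topology.tre' "
           "-log reference.log -out rescaled.tre nucleotide_alignment.fasta" % fasttree)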
Example #53
0
 def test_get_tree_default(self):
     cmd = "{} get_tree".format(path_to_script)
     observed = extern.run(cmd)
     splits = observed.split('\n')
     self.assertEqual('marker\ttree_file', splits[0])
     self.assertEqual('.tre', splits[1][-4:])
     self.assertGreater(len(splits), 10)
     for line in splits[1:-1]:
         self.assertTrue(os.path.exists(line.split('\t')[1]))
Example #54
0
 def test_get_tree_default(self):
     cmd = "{} get_tree".format(path_to_script)
     observed = extern.run(cmd)
     splits = observed.split('\n')
     self.assertEqual('marker\ttree_file', splits[0])
      self.assertEqual('.tre', splits[1][-4:])
     self.assertGreater(len(splits), 10)
     for line in splits[1:-1]:
         self.assertTrue(os.path.exists(line.split('\t')[1]))
Example #55
0
    def _generate_tree_log_file(self, tree, alignment, output_tree_file_path,
                                output_log_file_path, residue_type, fasttree):
        '''Generate the FastTree log file given a tree and the alignment that
        made that tree

        Returns
        -------
        Nothing. The FastTree log is written to output_log_file_path.
        '''
        if residue_type == Create._NUCLEOTIDE_PACKAGE_TYPE:
            cmd = "%s -quiet -gtr -nt -nome -mllen -intree '%s' -log %s -out %s %s" %\
                                       (fasttree, tree, output_log_file_path,
                                        output_tree_file_path, alignment)
        elif residue_type == Create._PROTEIN_PACKAGE_TYPE:
            cmd = "%s -quiet -nome -mllen -intree '%s' -log %s -out %s %s" %\
                                       (fasttree, tree, output_log_file_path,
                                        output_tree_file_path, alignment)
        extern.run(cmd)
Example #56
0
    def _build_tree(self, alignment, base, ptype, fasttree):
        log_file = base + ".tre.log"
        tre_file = base + ".tre"
        if ptype == Create._NUCLEOTIDE_PACKAGE_TYPE: # If it's a nucleotide sequence
            cmd = "%s -quiet -gtr -nt -log %s -out %s %s" % (fasttree,
                                                             log_file,
                                                             tre_file,
                                                             alignment)
            extern.run(cmd)
        else: # Or if its an amino acid sequence
            cmd = "%s -quiet -log %s -out %s %s" % (fasttree,
                                                    log_file,
                                                    tre_file,
                                                    alignment)
            extern.run(cmd)

        self.the_trash += [log_file, tre_file]
        return log_file, tre_file
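Condensed to its essentials, the branching above differs only in the '-gtr -nt' model flags; a hedged standalone sketch with placeholder names:

import extern

def build_tree(alignment, base, is_nucleotide, fasttree="FastTree"):
    log_file, tre_file = base + ".tre.log", base + ".tre"
    model_flags = "-gtr -nt" if is_nucleotide else ""
    extern.run("%s -quiet %s -log %s -out %s %s" %
               (fasttree, model_flags, log_file, tre_file, alignment))
    return log_file, tre_file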
Example #57
0
    def test_dump(self):
        expected = """gene	sample	sequence	num_hits	coverage	taxonomy
ribosomal_protein_L11_rplK_gpkg	minimal	GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC	7	15.1	Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales
ribosomal_protein_S2_rpsB_gpkg	minimal	CGTCGTTGGAACCCAAAAATGAAAAAATATATCTTCACTGAGAGAAATGGTATTTATATC	6	12.4	Root; k__Bacteria; p__Firmicutes; c__Bacilli
ribosomal_protein_S17_gpkg	minimal	GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT	9	19.5	Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus"""
        cmd = "{} query --db {}/a.sdb --dump".format(
            path_to_script, path_to_data)
        self.assertEqualOtuTable(
            list([line.split("\t") for line in expected.split("\n")]),
            extern.run(cmd))
Example #58
0
    def test_diamond_example_assign_taxonomy(self):
        expected = [self.headers,['S1.5.ribosomal_protein_L11_rplK','minimal','CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG','4','9.76','2513237297']
                    ]
        exp = sorted(["\t".join(x) for x in expected]+[''])

        cmd = "%s --debug pipe --sequences %s/1_pipe/minimal.fa --otu_table /dev/stdout --threads 4 --assignment_method diamond_example" % (path_to_script,
                                                                                                    path_to_data)
        observed = sorted(extern.run(cmd).split("\n"))
        r = re.compile('\t.*?$') # Do not test the exact genome number because updated diamond versions change this slightly.
        self.assertEqual([r.sub('',e) for e in exp], [r.sub('',e) for e in observed])
Example #59
0
    def test_query_by_taxonomy(self):
        expected = [
            self.headers,
            ['ribosomal_protein_L11_rplK_gpkg','minimal','GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC','7','15.10','Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
            ['ribosomal_protein_S17_gpkg','minimal','GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT','9','19.50','Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus']]
        expected = ["\t".join(x) for x in expected]+['']

        cmd = "%s query --db %s/a.sdb --taxonomy o__Bacillales" %(path_to_script,
                                                                  path_to_data)
        self.assertEqual(expected, extern.run(cmd).split('\n'))
Example #60
0
    def _concatenate_file(self, file_list, output):
        '''
        Call unix "cat" to concatenate a list of files

        Parameters
        ----------
        file_list: list
            List of strings, each the path to a file. These files are the ones to
            be concatenated together. E.g.:
                ["/path/to/file1", "/path/to/file2"]
        output: str
            Path to the file into which the files in file_list will be
            concatenated.

        '''
        to_cat = ' '.join(file_list)
        logging.debug("Concatenating files: %s" % (to_cat))
        cmd = "cat %s > %s" % (to_cat, output)
        extern.run(cmd)
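The paths above are interpolated into the shell command unquoted; a minimal hedged variant that shell-quotes each path (not the original behaviour) would look like:

import shlex
import extern

def concatenate_files(file_list, output):
    # Quote each path so names containing spaces or shell metacharacters are safe.
    quoted = ' '.join(shlex.quote(p) for p in file_list)
    extern.run("cat %s > %s" % (quoted, shlex.quote(output)))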