def test_blast_seq_against_bad_db(self):
        '''A RuntimeError is expected when the blast database is a bad one.

        A project is created inside a temporary directory, a one-record fasta
        query is written to a temporary file and backbone_blast_runner is
        called with a database path ending in 'uni'. The test fails unless
        RuntimeError is raised.
        '''
        test_dir = NamedTemporaryDir()
        query_fhand = NamedTemporaryFile(mode='w')
        try:
            project_name = 'backbone'
            create_project(directory=test_dir.name, name=project_name)
            project_dir = join(test_dir.name, project_name)

            # some query fasta file; the record ends with a real newline
            # (the previous literal ended in the typo '|n')
            query = ('>seq1\n'
                     'GATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCCTGTAGGGCAGGCT'
                     'CATACCCCTGCCGAACCGCTTTTGTCA\n')
            query_fhand.write(query)
            query_fhand.flush()

            # the blast db; NOTE(review): 'uni' is presumably not a usable
            # database (the sibling test uses 'univec+') -- confirm
            blast_db = join(TEST_DATA_DIR, 'blast', 'uni')

            try:
                backbone_blast_runner(query_fpath=query_fhand.name,
                                      project_dir=project_dir,
                                      blast_program='blastn',
                                      blast_db=blast_db)
            except RuntimeError:
                pass
            else:
                self.fail('RuntimeError expected')
        finally:
            # release the temporary resources even when the test fails
            query_fhand.close()
            test_dir.close()
    def test_blast_seq_against_db():
        '''We can blast a seq file against a database.

        Runs backbone_blast_runner with a fasta query and the 'univec+'
        database from the test data, then checks that the xml written under
        the project blast directory contains the 'vec1' hit definition.
        '''
        test_dir = NamedTemporaryDir()
        query_fhand = NamedTemporaryFile(mode='w')
        try:
            project_name = 'backbone'
            create_project(directory=test_dir.name, name=project_name)
            project_dir = join(test_dir.name, project_name)

            # some query fasta file; the record ends with a real newline
            # (the previous literal ended in the typo '|n')
            query = ('>seq1\n'
                     'GATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCCTGTAGGGCAGGCT'
                     'CATACCCCTGCCGAACCGCTTTTGTCA\n')
            query_fhand.write(query)
            query_fhand.flush()

            # the blast db
            blast_db_fname = 'univec+'
            blast_db = join(TEST_DATA_DIR, 'blast', blast_db_fname)

            blast_program = 'blastn'
            backbone_blast_runner(query_fpath=query_fhand.name,
                                  project_dir=project_dir,
                                  blast_program=blast_program,
                                  blast_db=blast_db)

            # is the blast ok?
            blast_fname = '%s.%s.xml' % (BACKBONE_BASENAMES['blast_basename'],
                                         blast_program)
            blast_fpath = join(project_dir,
                               BACKBONE_DIRECTORIES['blast_dir'],
                               _get_basename(query_fhand.name),
                               blast_db_fname,
                               blast_fname)
            with open(blast_fpath) as blast_fhand:
                assert '<Hit_def>vec1</Hit_def>' in blast_fhand.read()
        finally:
            # release the temporary resources even when an assert fails
            query_fhand.close()
            test_dir.close()
Beispiel #3
0
    def test_go_annotation_analysis():
        """We can annotate gos.

        Creates a project configured for go annotation, drops one fasta
        sequence and a precomputed blast result into it, runs the
        'annotate_gos' analysis (twice) and 'annotation_stats', and checks
        the pickle, the b2g output files and the stats report.
        The test returns early when the blast2go jar directory is not found.
        """
        # Check the jar directory itself: os.path.join always returns a
        # non-empty string, so testing the joined path can never detect a
        # missing jar.
        jar_dir = guess_jar_dir("blast2go.jar")
        if not jar_dir:
            print("Do not run b2gppe tests, blast2go jar file not found ")
            return
        b2gpipe_bin = os.path.join(jar_dir, "blast2go.jar")

        test_dir = NamedTemporaryDir()
        try:
            project_name = "backbone"
            nr_path = os.path.join(TEST_DATA_DIR, "blast", "arabidopsis_genes+")
            b2g = os.path.join(TEST_DATA_DIR, "b2gPipe.properties")
            config = {
                "blast": {"nr": {"path": nr_path, "species": "nr"}},
                "Annotation": {
                    "go_annotation": {
                        "blast_database": "nr",
                        "create_dat_file": True,
                        "java_memory": 2048,
                        "b2g_properties_file": b2g,
                        "blast2go_path": b2gpipe_bin,
                    }
                },
                "General_settings": {"threads": THREADS},
            }

            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=config)
            project_dir = join(test_dir.name, project_name)
            seq = (
                "CTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCAAGCT"
                "AGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCANGACC"
                "AACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCCCACCA"
                "CTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAATTGGGA"
                "AAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACATTCACAGT"
                "GGATTCGTTTACCGTGAGACCCACAGGTCACCAGG"
            )

            annot_input_dir = join(project_dir, "annotations", "input")
            os.makedirs(annot_input_dir)

            # create some seqs to annotate
            fasta_fpath = os.path.join(annot_input_dir,
                                       "seqs.st_nucl.pl_454.fasta")
            with open(fasta_fpath, "w") as fhand:
                fhand.write(">seq1\n%s\n" % seq)

            # a ready-made blast result is copied in place
            bdir = join(project_dir, "annotations", "blast",
                        "seqs.st_nucl.pl_454", "arabidopsis_genes+")
            os.makedirs(bdir)
            shutil.copy(join(TEST_DATA_DIR, "blastResult.xml"),
                        join(bdir, "blast.tblastx.xml"))

            do_analysis(project_settings=settings_path, kind="annotate_gos",
                        silent=True)
            repr_fpath = join(project_dir,
                              BACKBONE_DIRECTORIES["annotation_dbs"],
                              "seqs.st_nucl.pl_454.0.pickle")
            with open(repr_fpath) as fhand:
                result = fhand.read()
            assert "GO:0043094" in result
            features_dir = os.path.join(project_dir, "annotations", "features")
            assert os.path.exists(
                os.path.join(features_dir, "seqs.st_nucl.pl_454.b2g.dat"))
            assert os.path.exists(
                os.path.join(features_dir, "seqs.st_nucl.pl_454.b2g.annot"))

            # the analysis is run a second time on the same project
            do_analysis(project_settings=settings_path, kind="annotate_gos",
                        silent=True)

            do_analysis(project_settings=settings_path,
                        kind="annotation_stats", silent=True)
            stats_fpath = join(project_dir, "annotations", "features", "stats",
                               "seqs.st_nucl.pl_454.txt")
            with open(stats_fpath) as fhand:
                result = fhand.read()
            expected = "Sequences with GOs: 1\nNumber of GOs: 10"
            assert expected in result
        finally:
            # the temporary project was never removed before
            test_dir.close()
Beispiel #4
0
    def test_snv_annot_without_rg():
        '''It tests that we can do snv calling with a bam without rg info.

        The project gets a reference fasta, a copy of the 'merged.0.bam' test
        file (indexed with create_bam_index) and a symlink to the reference in
        the annotation input directory; then 'annotate_snvs' is run. There are
        no asserts: the test passes when the analysis does not raise.
        '''
        test_dir = NamedTemporaryDir()
        try:
            project_name = 'backbone'
            configuration = {'Snvs': {'default_bam_platform': 'sanger'},
                             'General_settings': {'threads': THREADS}}
            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=configuration)
            project_dir = join(test_dir.name, project_name)

            # the reference; both handles are closed before anything reads
            # the copy, so no buffered data can be missing from it
            reference_dir = join(project_dir, 'mapping/reference')
            os.makedirs(reference_dir)
            reference_fpath = join(reference_dir, 'reference.fasta')
            with open(join(TEST_DATA_DIR, 'blast/arabidopsis_genes')) as src:
                with open(reference_fpath, 'w') as out:
                    out.writelines(src)

            bams_dir = join(project_dir, 'mapping', 'bams')
            os.makedirs(bams_dir)
            bam_fpath = join(bams_dir, 'merged.0.bam')
            shutil.copy(join(TEST_DATA_DIR, 'merged.0.bam'), bam_fpath)
            create_bam_index(bam_fpath)

            annot_input_dir = join(project_dir, 'annotations', 'input')
            os.makedirs(annot_input_dir)
            os.symlink(reference_fpath,
                       join(annot_input_dir, 'reference.fasta'))
            do_analysis(project_settings=settings_path, kind='annotate_snvs',
                        silent=True)
        finally:
            # the temporary project was never removed before
            test_dir.close()
    def test_read_stats_analysis2():
        '''Another read stats run, this time with real data.

        Copies one raw and one cleaned sfastq file from the test data into
        the project reads directories and runs the 'read_stats' analysis.
        There are no asserts: the test passes when the analysis does not
        raise.
        '''
        fname = 'lb_sflp2.pl_sanger.sm_t111.sfastq'
        clean_fpath = os.path.join(TEST_DATA_DIR, 'clean_stats', 'cleaned',
                                   fname)
        raw_fpath = os.path.join(TEST_DATA_DIR, 'clean_stats', 'raw', fname)

        test_dir = NamedTemporaryDir()
        try:
            project_name = 'backbone'
            project_dir = join(test_dir.name, project_name)
            settings_path = create_project(directory=test_dir.name,
                                           name=project_name)

            # setup the original and the cleaned reads
            reads_dir = join(project_dir, 'reads')
            original_reads_dir = join(reads_dir, 'raw')
            cleaned_reads_dir = join(reads_dir, 'cleaned')
            for directory in (reads_dir, original_reads_dir,
                              cleaned_reads_dir):
                os.mkdir(directory)

            shutil.copy(clean_fpath, cleaned_reads_dir)
            shutil.copy(raw_fpath, original_reads_dir)
            do_analysis(project_settings=settings_path, kind='read_stats',
                        silent=True)
        finally:
            # the temporary project was never removed before
            test_dir.close()
    def test_remove_output_on_error():
        '''We remove files when we have an error on cleaning.

        The project is configured with a bogus 454 adaptors file
        ('AKHSGDASD'), 'clean_reads' is run (a KeyError is tolerated) and the
        cleaned output file must not exist afterwards.
        '''
        test_dir = NamedTemporaryDir()
        try:
            project_name = 'backbone'
            project_dir = join(test_dir.name, project_name)

            configuration = {'Cleaning': {'adaptors_file_454': 'AKHSGDASD'},
                             'General_settings': {'threads': THREADS}}
            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=configuration)

            # setup the original reads directory; no read files are written
            # into it by this test
            reads_dir = join(project_dir, 'reads')
            original_reads_dir = join(reads_dir, 'raw')
            os.mkdir(reads_dir)
            os.mkdir(original_reads_dir)

            try:
                do_analysis(project_settings=settings_path,
                            kind='clean_reads', silent=True)
            except KeyError:
                # deliberately tolerated: only the absence of output matters
                pass
            output_fpath = join(reads_dir, 'cleaned', 'pl_454.lb_b.sfastq')
            assert not exists(output_fpath)
        finally:
            # the temporary project was never removed before
            test_dir.close()
    def test_blast_seq_against_seq_db():
        '''We can blast a seq file against a sequence file database.

        The database is given as a one-record sfastq file through the
        blast_db_seq argument. The xml written under the project blast
        directory must contain a first hit.
        '''
        test_dir = NamedTemporaryDir()
        query_fhand = NamedTemporaryFile(mode='w')
        blast_db_fhand = NamedTemporaryFile(mode='w', suffix='.sfastq')
        try:
            project_name = 'backbone'
            create_project(directory=test_dir.name, name=project_name)
            project_dir = join(test_dir.name, project_name)

            # some query fasta file
            query = ('>seq1\n'
                     'GATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCCTGTAGGGCAGGCT'
                     'CATACCCCTGCCGAACCGCTTTTGTCA\n')
            query_fhand.write(query)
            query_fhand.flush()

            # the blast db, as a sequence file
            blast = ('@seq\n'
                     'GATCGGCCTTCTTGCGCATCTCACGCGCTCCTGCGGCGGCCTGTAGGGCAGGC\n'
                     '+\n'
                     '11111111111111111111111111111111111111111111111111111\n')
            blast_db_fhand.write(blast)
            blast_db_fhand.flush()

            blast_program = 'blastn'
            backbone_blast_runner(query_fpath=query_fhand.name,
                                  project_dir=project_dir,
                                  blast_program=blast_program,
                                  blast_db_seq=blast_db_fhand.name)

            # is the blast ok?
            blast_fname = '%s.%s.xml' % (BACKBONE_BASENAMES['blast_basename'],
                                         blast_program)
            blast_fpath = join(project_dir,
                               BACKBONE_DIRECTORIES['blast_dir'],
                               _get_basename(query_fhand.name),
                               _get_basename(blast_db_fhand.name),
                               blast_fname)
            with open(blast_fpath) as blast_fhand:
                result = blast_fhand.read()
            assert '<Hit_num>1</Hit_num>' in result
        finally:
            # the temporary files and project were never released before
            blast_db_fhand.close()
            query_fhand.close()
            test_dir.close()
Beispiel #8
0
    def test_orf_annotation_analysis():
        """We can annotate orfs.

        Runs 'annotate_orfs', 'write_annotations' and 'annotation_stats' on a
        one-sequence fasta and checks the pickle, the orf sequence/peptide
        fasta files and the stats report.
        """
        test_dir = NamedTemporaryDir()
        try:
            project_name = "backbone"
            matrix = os.path.join(TEST_DATA_DIR, "At.smat")
            config = {
                "Annotation": {"orf_annotation": {"estscan_matrix": matrix}},
                "General_settings": {"threads": THREADS},
            }

            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=config)
            project_dir = join(test_dir.name, project_name)
            seq = (
                "CTACTTACTAGCTTTAGTAAATCCTTCTAACCCTCGGTAAAAAAAAAAAAGAGGCATCAAATG"
                "GCTTCATCCATTCTCTCATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCTGCTCAAGCT"
                "AGCATGGGGGCACCATTCACTGGCCTAAAATCCGCCGCTGCTTTCCCNGTNACTCGCANGACC"
                "AACGACATCACCACTTTGGTTAGCAATGGGGGAAGAGTTCAGGGCNTGAAGGTGTGCCCACCA"
                "CTTGGATTGAAGAAGTTCGAGACTCTTTCTTACCTTCCTGATATGAGTAACGAGCAATTGGGA"
                "AAGGAAGTTGACTACCTTCTCAGGAAGGGATGGATTCCCTGCATTGAATTCGACATTCACAGT"
                "GGATTCGTTTACCGTGAGACCCACAGGTCACCAGGATACTTCGATGGACGCTACTGGACCATG"
                "TGGAAGCTGCCCATGTTTGGCTGCACCGAT"
            )

            annot_input_dir = join(project_dir, "annotations", "input")
            os.makedirs(annot_input_dir)

            # create some seqs to annotate
            with open(os.path.join(annot_input_dir, "seqs.fasta"), "w") as fhand:
                fhand.write(">seq\n%s\n" % seq)

            do_analysis(project_settings=settings_path, kind="annotate_orfs",
                        silent=True)
            repr_fpath = join(project_dir,
                              BACKBONE_DIRECTORIES["annotation_dbs"],
                              "seqs.0.pickle")
            with open(repr_fpath) as fhand:
                assert "orf" in fhand.read()

            do_analysis(project_settings=settings_path,
                        kind="write_annotations", silent=True)
            features_dir = join(project_dir, "annotations", "features")
            with open(join(features_dir, "seqs.orf_seq.fasta")) as fhand:
                assert "ATCCGCCGNTGTGGCCTTTGNCAACAGGGCTTCCCCT" in fhand.read()
            with open(join(features_dir, "seqs.orf_pep.fasta")) as fhand:
                assert "QASMGAPFTGLKSAAAFPVTRXTNDITTLVSNG" in fhand.read()

            do_analysis(project_settings=settings_path,
                        kind="annotation_stats", silent=True)
            with open(join(features_dir, "stats", "seqs.txt")) as fhand:
                result = fhand.read()
            expected = "Sequences with ORF: 1\nNumber of ORFs: 1"
            assert expected in result
        finally:
            # remove the temporary project even when an assert fails
            test_dir.close()
    def test_create_project():
        '''We can create a project.

        Checks the returned settings path, the project name/path and the
        'strip_n_percent' default stored in the configuration, and that the
        settings file mentions 'strip_n_percent'.
        '''
        test_dir = NamedTemporaryDir()
        try:
            settings_path = create_project(directory=test_dir.name,
                                           name='backbone')
            project_path = join(test_dir.name, 'backbone')

            expected_path = join(project_path,
                                 BACKBONE_DIRECTORIES['config_file'])
            assert settings_path == expected_path

            settings = create_configuration(settings_path)
            general = settings['General_settings']
            assert general['project_name'] == 'backbone'
            assert general['project_path'] == project_path
            assert settings['Cleaning']['strip_n_percent'] == 2.0

            with open(settings_path) as settings_fhand:
                content = settings_fhand.read()
            assert 'strip_n_percent' in content
        finally:
            # remove the temporary project even when an assert fails
            test_dir.close()
Beispiel #10
0
    def test_cleaning_analysis_lucy():
        '''We can clean the reads when a lucy configuration is present.

        A lucy.conf file holding the repr of a small dict is written under
        config_data/lucy, four raw read files are created and 'clean_reads'
        is run. The cleaned directory and some of the cleaned files are then
        checked.
        '''
        test_dir = NamedTemporaryDir()
        try:
            project_name = 'backbone'
            configuration = {'Cleaning': {'vector_database': None}}
            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=configuration)
            project_dir = join(test_dir.name, project_name)

            # setup the original reads
            reads_dir = join(project_dir, 'reads')
            original_reads_dir = join(reads_dir, 'raw')
            os.mkdir(reads_dir)
            os.mkdir(original_reads_dir)

            lucy_dir = join(project_dir, 'config_data', 'lucy')
            os.makedirs(lucy_dir)
            lucy_conf = {'ps': {'vector_file': 'tmp', 'splice_file': 'tmp'}}
            with open(join(lucy_dir, 'lucy.conf'), 'w') as luc_c:
                luc_c.write(repr(lucy_conf))

            fpath_noqual = join(original_reads_dir, 'pl_sanger.lb_ps.fasta')
            fpath_qual = join(original_reads_dir,
                              'pl_sanger.lb_andreas.sfastq')
            fpath_454 = join(original_reads_dir, 'pl_454.lb_ps.sfastq')
            fpath_ill = join(original_reads_dir, 'pl_illumina.lb_psi.sfastq')
            raw_reads = ((fpath_noqual, READS_NOQUAL),
                         (fpath_qual, SANGER_QUAL),
                         (fpath_454, READS_454),
                         (fpath_ill, READS_ILL))
            for fpath, reads in raw_reads:
                with open(fpath, 'w') as fhand:
                    fhand.write(reads)

            do_analysis(project_settings=settings_path, kind='clean_reads',
                        silent=True)
            cleaned_dir = join(project_dir, 'reads', 'cleaned')
            assert exists(cleaned_dir)

            cleaned_qual = join(cleaned_dir, os.path.basename(fpath_qual))
            with open(cleaned_qual) as fhand:
                assert 'SEX' in fhand.read()

            cleaned_454 = join(cleaned_dir, os.path.basename(fpath_454))
            assert exists(cleaned_454)

            cleaned_noqual = join(cleaned_dir, os.path.basename(fpath_noqual))
            with open(cleaned_noqual) as fhand:
                clean_seqs = fhand.read()
            assert clean_seqs.startswith('>FM195262.1\nGCATTCTCG')
        finally:
            # the temporary project was never removed before
            test_dir.close()
Beispiel #11
0
    def test_microsatellite_annoation_analysis():
        """We can annotate microsatellites.

        Runs 'annotate_microsatellites', 'write_annotations' and
        'annotation_stats' on a one-sequence fasta and checks the pickle, the
        written .ssr file and the stats report.
        """
        test_dir = NamedTemporaryDir()
        try:
            project_name = "backbone"
            settings_path = create_project(
                directory=test_dir.name,
                name=project_name,
                configuration={"General_settings": {"threads": THREADS}},
            )
            project_dir = join(test_dir.name, project_name)
            seq = (
                "GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC"
                "AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG"
                "GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA"
                "GAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"
                "GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC"
                "TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG"
                "TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG"
                "CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA"
                "AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG"
                "TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT"
                "GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG"
                "TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT"
            )
            annot_input_dir = join(project_dir, "annotations", "input")
            os.makedirs(annot_input_dir)

            # create some seqs to annotate
            with open(os.path.join(annot_input_dir, "seqs.fasta"), "w") as fhand:
                fhand.write(">seq\n%s\n" % seq)

            do_analysis(project_settings=settings_path,
                        kind="annotate_microsatellites", silent=True)
            pickle_fpath = join(project_dir,
                                BACKBONE_DIRECTORIES["annotation_dbs"],
                                "seqs.0.pickle")
            with open(pickle_fpath) as fhand:
                assert "microsatellite" in fhand.read()

            do_analysis(project_settings=settings_path,
                        kind="write_annotations", silent=True)
            ssr_fpath = join(project_dir, "annotations", "features",
                             "seqs.ssr")
            assert os.path.exists(ssr_fpath)
            with open(ssr_fpath) as fhand:
                assert "Seqname" in fhand.read()

            do_analysis(project_settings=settings_path,
                        kind="annotation_stats", silent=True)
            stats_fpath = join(project_dir, "annotations", "features",
                               "stats", "seqs.txt")
            with open(stats_fpath) as fhand:
                result = fhand.read()
            expected = "Sequences with microsatellites: 1"
            assert expected in result
        finally:
            # remove the temporary project even when an assert fails
            test_dir.close()
Beispiel #12
0
    def test_cdna_intron_annoation_analysis():
        """We can annotate introns.

        The project is configured with the 'tomato_genome2+' test database as
        both genomic_db and genomic_seq_file. 'annotate_introns' and
        'annotation_stats' are run on a one-sequence fasta and the pickle and
        the stats report are checked.
        """
        test_dir = NamedTemporaryDir()
        try:
            project_name = "backbone"
            blast_db_path = os.path.join(TEST_DATA_DIR, "blast")
            genomic_db = os.path.join(blast_db_path, "tomato_genome2+")
            config = {
                "Annotation": {
                    "Cdna_intron_annotation": {
                        "genomic_db": genomic_db,
                        "genomic_seq_file": genomic_db,
                    }
                },
                "General_settings": {"threads": THREADS},
            }
            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=config)
            project_dir = join(test_dir.name, project_name)
            seq = (
                "GAAAAGATGTGATTGGTGAAATAAGTTTGCCTCAATTCTCTTGTGCCGAAGTTCCAAAGAAGC"
                "AGTTGGTGAATGAGCAGCCAGTACCCGAAAAATCGAGCAAAGATTTTGTGATGTATGTTGGAG"
                "GTCTAGCATGGGGGATGGACTGGTGTCCCCAAGCTCATGAAAATAGGGATGCTCCTATGAAAA"
                "GTGAGTTTGTCGCAATTGCTCCTCATCCTCCTGATTCATCATATCACAAGACTGATGCCTCAC"
                "TTACAGGCAGAGGTGTAATTCAGATATGGTGCCTGCCAGATCTCATTCAAAAAGATATAATTG"
                "TGAAAGAAGATTATTTTGCTCAGGTTAACAAAAAACCGTATAGAAATTTGACAAGAAGTGAAG"
                "CAGGTACGGGAGAAGTATCTGGACCTCAAAAACCAAGAGGAAGACCAAAAAAGAACCCTGGTA"
                "AAGCAGTCCAGGCAAAAGCATCTAGACCACAAAATCCAAGAGGAAGACCGAGAAAGAAGCCTG"
                "TTACTGAATCTTTAGGTGATAGAGATAGTGAAGACCACAGTTTACAACCTCTTGCTATAGAGT"
                "GGTCGCTGCAATCAACAGAACTTTCTGTAGATTTGTCTTGTGGAAATATGAATAAAGCCCAAG"
                "TAGATATTGCGCTGAGTCAAGAAAGATGTATTAATGCGGCAT"
            )
            annot_input_dir = join(project_dir, "annotations", "input")
            os.makedirs(annot_input_dir)

            # create some seqs to annotate
            with open(os.path.join(annot_input_dir, "seqs.fasta"), "w") as fhand:
                fhand.write(">seq\n%s\n" % seq)

            do_analysis(project_settings=settings_path,
                        kind="annotate_introns", silent=True)
            pickle_fpath = join(project_dir,
                                BACKBONE_DIRECTORIES["annotation_dbs"],
                                "seqs.0.pickle")
            with open(pickle_fpath) as fhand:
                assert "intron" in fhand.read()

            do_analysis(project_settings=settings_path,
                        kind="annotation_stats", silent=True)
            stats_fpath = join(project_dir, "annotations", "features",
                               "stats", "seqs.txt")
            with open(stats_fpath) as fhand:
                result = fhand.read()
            expected = "Sequences with intron: 1\nNumber of introns: 3"
            assert expected in result
        finally:
            # remove the temporary project even when an assert fails
            test_dir.close()
Beispiel #13
0
    def test_description_annotation_analysis():
        """We can annotate with description.

        Two sequences are written to the annotation input directory,
        'annotate_descriptions' and 'annotation_stats' are run against the
        'arabidopsis_genes+' test database, and the pickle and the stats
        report are checked.
        """
        test_dir = NamedTemporaryDir()
        try:
            project_name = "backbone"
            arab_blastdb = join(TEST_DATA_DIR, "blast", "arabidopsis_genes+")
            config = {
                "blast": {
                    "arabidopsis": {"path": arab_blastdb,
                                    "species": "arabidopsis"}
                },
                "Annotation": {
                    "description_annotation": {
                        "description_databases": ["arabidopsis"]
                    }
                },
                "General_settings": {"threads": THREADS},
            }

            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=config)
            project_dir = join(test_dir.name, project_name)

            # some melon file to annotate
            input_dir = join(project_dir,
                             BACKBONE_DIRECTORIES["annotation_input"])
            os.makedirs(input_dir)
            seq_ = (
                "AGGTGTCACCGTTCACGAGGGCGACTGGGACTCCCACGGGGCCATCAAGTCCTGGAACTACA"
                "CATGCGGTCCTCTATCTCATTCTCTATTTGTATGAATATGTGTTTATTACTAGCTAGGGTTT"
                "CTATTAATGAAAGGTTCATGTAAATATATGAAGATGGGAAGCAAGAGGTGTTCAAGGAGAAG"
                "AGGGAGTTAGACGACCAGAAGAT"
            )
            seq1 = SeqWithQuality(Seq(seq_), id="CUTC021854")
            seq2 = SeqWithQuality(
                Seq("Atagtagcatcagatgagcatcgacttctagctagctagct"),
                id="CUTC021853")
            # the handle is closed before the analysis reads the file
            melon_fpath = join(input_dir, "melon.st_nucl.pl_454.fasta")
            with open(melon_fpath, "a") as melon_fhand:
                write_seqs_in_file([seq1, seq2], melon_fhand)

            do_analysis(project_settings=settings_path,
                        kind="annotate_descriptions", silent=True)

            repr_fpath = join(project_dir,
                              BACKBONE_DIRECTORIES["annotation_dbs"],
                              "melon.st_nucl.pl_454.0.pickle")
            with open(repr_fpath) as fhand:
                result = fhand.read()
            assert "yet another one" in result

            do_analysis(project_settings=settings_path,
                        kind="annotation_stats", silent=True)
            stats_fpath = join(project_dir, "annotations", "features",
                               "stats", "melon.st_nucl.pl_454.txt")
            with open(stats_fpath) as fhand:
                result = fhand.read()
            expected = ("Annotation statistics\n"
                        "---------------------\n"
                        "Number of sequences: 2\n"
                        "Sequences with description: 1")
            assert expected in result
        finally:
            # remove the temporary project even when an assert fails
            test_dir.close()
Beispiel #14
0
    def test_mapping_color():
        '''It tests the mapping of the mapper with color space.

        A solid read file and a reference are placed in the project, then the
        'mapping', 'merge_bams' and 'realign_bam' analyses are run in turn
        and the expected bam files are checked for existence.
        '''
        test_dir = NamedTemporaryDir()
        try:
            project_name = 'backbone'

            blastdb_seq = os.path.join(TEST_DATA_DIR, 'blast',
                                       'arabidopsis_genes+')

            snv_filters = {
                'filter1': {'name': 'uniq_contiguous', 'use': True,
                            'genomic_db': blastdb_seq,
                            'genomic_seqs_fpath': blastdb_seq},
                'filter7': {'name': 'by_kind', 'use': True,
                            'kind': 'SNP'},
                'filter12': {'name': 'ref_not_in_list', 'use': True,
                             'list_path': os.path.join(TEST_DATA_DIR,
                                                       'cos_list')},
                'filter10': {'unique_name': 'variable_in_sm',
                             'name': 'is_variable', 'use': True,
                             'group_kind': 'libraries',
                             'groups': ['hola']},
                'filter11': {'unique_name': 'variable_in_adios',
                             'name': 'is_variable', 'use': True,
                             'group_kind': 'libraries',
                             'groups': ['adios']},
                'filter13': {'unique_name': 'variable_in_caracola',
                             'name': 'is_variable', 'use': True,
                             'group_kind': 'libraries',
                             'groups': ['caracola']},
            }

            configuration = {'Snvs': {'min_quality': 20},
                             'Sam_processing': {'add_default_qualities': True},
                             'snv_filters': snv_filters,
                             'General_settings': {'threads': THREADS}}

            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=configuration)
            project_dir = join(test_dir.name, project_name)

            # setup the cleaned reads
            reads_dir = join(project_dir, 'reads')
            clean_reads_dir = join(reads_dir, 'cleaned')
            os.mkdir(reads_dir)
            os.mkdir(clean_reads_dir)
            shutil.copy(os.path.join(TEST_DATA_DIR, 'solid.fastq'),
                        os.path.join(clean_reads_dir,
                                     'pl_solid.lb_hola.sm_hola.sfastq'))

            # the reference; both handles are closed before the mapping reads
            # the copy, so no buffered data can be missing from it
            reference_dir = join(project_dir, 'mapping/reference')
            os.makedirs(reference_dir)
            reference_fpath = join(reference_dir, 'reference.fasta')
            with open(join(TEST_DATA_DIR, 'samtools_color/reference')) as src:
                with open(reference_fpath, 'w') as out:
                    out.writelines(src)

            do_analysis(project_settings=settings_path, kind='mapping',
                        silent=True)
            mapping_dir = join(project_dir, 'mapping')
            # first entry of the mapping directory in sorted order
            singular_mapping_dir = join(mapping_dir,
                                        sorted(os.listdir(mapping_dir))[0])
            assert exists(join(singular_mapping_dir, 'bams', 'by_readgroup',
                               'pl_solid.lb_hola.sm_hola.bam'))
            result_dir = join(mapping_dir, 'bams')
            assert exists(result_dir)
            result_dir_by_lib = join(result_dir, 'by_readgroup')
            assert exists(result_dir_by_lib)

            do_analysis(project_settings=settings_path, kind='merge_bams',
                        silent=True)
            assert exists(join(result_dir, 'merged.0.bam'))
            assert exists(join(result_dir, 'merged.0.bam.bai'))

            # run the 'realign_bam' analysis on the merged bam
            do_analysis(project_settings=settings_path, kind='realign_bam',
                        silent=True)
            assert exists(join(result_dir, 'merged.1.bam'))
        finally:
            # remove the temporary project even when an assert fails
            test_dir.close()
    def test_read_stats_analysis():
        'It test the read statistics'
        test_dir = NamedTemporaryDir()
        project_name = 'backbone'
        project_dir = join(test_dir.name, project_name)

        settings_path = create_project(directory=test_dir.name,
                                       name=project_name)

        #setup the original reads
        reads_dir = join(project_dir, 'reads')
        original_reads_dir = join(reads_dir, 'raw')
        os.mkdir(reads_dir)
        os.mkdir(original_reads_dir)
        fpath_454 = join(original_reads_dir, 'pl_454.lb_a.sfastq')
        fpath_ill = join(original_reads_dir, 'pl_illumina.lb_b.sfastq')
        open(fpath_454, 'w').write(READS_454)
        open(fpath_ill, 'w').write(READS_ILL)

        #the cleaned reads
        cleaned_reads_dir = join(reads_dir, 'cleaned')
        os.mkdir(cleaned_reads_dir)
        fpath_454 = join(cleaned_reads_dir, 'pl_454.lb_a.sfastq')
        fpath_ill = join(cleaned_reads_dir, 'pl_illumina.lb_no_raw.sfastq')
        open(fpath_454, 'w').write(READS_454)
        open(fpath_ill, 'w').write(READS_ILL)

        do_analysis(project_settings=settings_path, kind='read_stats',
                    silent=True)

        clean_stats_dir = join(cleaned_reads_dir, 'stats')
        clean_fnames = os.listdir(clean_stats_dir)
        expected_fnames = ['pl_illumina.lb_no_raw.qual',
                           'pl_454.lb_a.qual',
                           'pl_illumina.lb_no_raw.length',
                           'pl_454.lb_a.length']
        for fname in expected_fnames:
            assert fname + '.dat' in clean_fnames
            assert fname + '.svg' in clean_fnames

        statistics_fpath = join(clean_stats_dir,
                                BACKBONE_BASENAMES['statistics_file'])
        content = open(statistics_fpath).read()
        assert content == '''statistics for pl_454.lb_a.sfastq
---------------------------------
Num sequences: 4
Total sequence length: 759
Sequence length minimum: 106
Sequence length maximum: 295
Sequence length average: 189.75
Sequence length variance: 4972.69
Sequence qualities minimum: 20
Sequence qualities maximum: 40
Sequence qualities average: 36.99
Sequence qualities variance: 8.19

statistics for pl_illumina.lb_no_raw.sfastq
-------------------------------------------
Num sequences: 6
Total sequence length: 172
Sequence length minimum: 24
Sequence length maximum: 31
Sequence length average: 28.67
Sequence length variance: 10.89
Sequence qualities minimum: 4
Sequence qualities maximum: 34
Sequence qualities average: 29.63
Sequence qualities variance: 47.80

'''

        boxplot_fpath = join(clean_stats_dir,
                             'pl_illumina.lb_no_raw' + '.qual.boxplot.dat')
        exp = 'distrib\tmean\tstd_deviation\t1st_quartile\tmedian\t3rd_qualtile'
        assert exp in open(boxplot_fpath).read()
        freq_nucl_fpath = join(clean_stats_dir, 'pl_454.lb_a.freq_position.svg')
        nucl_freq = open(freq_nucl_fpath).read()
        assert 'style="fill:#0000ff;stroke:#000000;"/>' in nucl_freq
    def test_cleaning_analysis():
        '''We can clean the reads and prepare and run a mira assembly.

        The project is configured with a vector database, a 454 adaptors
        file, short adaptor words and edge removal. After 'clean_reads' the
        cleaned 454 and solid files are checked; then 'prepare_mira_assembly'
        and 'mira_assembly' are run.
        '''
        test_dir = NamedTemporaryDir()
        try:
            project_name = 'backbone'
            project_dir = join(test_dir.name, project_name)
            adaptors_dir = join(project_dir, 'config_data', 'adaptors')
            adaptors_path_454 = join(adaptors_dir, '454_adaptors')
            words = ['^ATGAAC', 'TTGATTTGGT']
            univec = os.path.join(TEST_DATA_DIR, 'blast', 'univec+')
            configuration = {'Cleaning': {'vector_database': univec,
                                          'adaptors_file_454': adaptors_path_454,
                                          'short_adaptors_454': words,
                                          'edge_removal': {'454_left': 3,
                                                           '454_right': 3}},
                             'General_settings': {'threads': THREADS}}

            settings_path = create_project(directory=test_dir.name,
                                           name=project_name,
                                           configuration=configuration)

            # setup the original reads
            reads_dir = join(project_dir, 'reads')
            original_reads_dir = join(reads_dir, 'raw')
            os.mkdir(reads_dir)
            os.mkdir(original_reads_dir)

            os.makedirs(adaptors_dir)
            with open(adaptors_path_454, 'w') as adap_fhand:
                adap_fhand.write('>smart_5_cds_primer_1\n'
                                 'GGTTCAAGGTTTGAGAAAGGATGGGAAG\n')

            fpath_454 = join(original_reads_dir, 'pl_454.lb_a.sfastq')
            fpath_ill = join(original_reads_dir, 'pl_illumina.lb_b.sfastq')
            fpath_solid = join(original_reads_dir, 'pl_solid.lb_prueba.sfastq')
            raw_reads = ((fpath_solid, READS_SOLID),
                         (fpath_454, READS_454),
                         (fpath_ill, READS_ILL))
            for fpath, reads in raw_reads:
                with open(fpath, 'w') as fhand:
                    fhand.write(reads)

            do_analysis(project_settings=settings_path, kind='clean_reads',
                        silent=True)
            cleaned_dir = join(project_dir, 'reads', 'cleaned')
            assert exists(cleaned_dir)
            cleaned_454 = join(cleaned_dir, os.path.basename(fpath_454))
            assert exists(cleaned_454)
            with open(cleaned_454) as fhand:
                seqs = list(seqs_in_file(fhand))

            # It means that the adaptor has been removed
            assert 'GGTTCAAGGTTTGAGAAAGGATGGGAAG' not in seqs[0].seq

            # It means that the starting word has been removed
            assert seqs[2].seq.startswith('TTCCAAGATTCTTCCCACAT')

            # solid
            cleaned_solid = join(cleaned_dir, os.path.basename(fpath_solid))
            with open(cleaned_solid) as fhand:
                clean_seqs = fhand.read()
            assert '10_1824_570_F3' not in clean_seqs

            do_analysis(project_settings=settings_path,
                        kind='prepare_mira_assembly', silent=True)
            assembly_input = join(project_dir, 'assembly', 'input')
            assert exists(assembly_input)
            mira_in_454 = join(assembly_input, 'backbone_in.454.fasta')
            mira_in_qul = join(assembly_input, 'backbone_in.454.fasta.qual')
            assert exists(mira_in_454)
            assert exists(mira_in_qul)

            do_analysis(project_settings=settings_path, kind='mira_assembly',
                        silent=True)
            # The sorted directory listing used to be computed and thrown
            # away; an explicit existence check states what it verified.
            # NOTE(review): an assert on the mira output was probably
            # intended here -- confirm.
            assembly_dir = join(project_dir, 'assembly')
            assert exists(assembly_dir)
        finally:
            # remove the temporary project even when an assert fails
            test_dir.close()
Beispiel #17
0
    def test_mapping_analysis():
        'We can map the reads'
        test_dir = NamedTemporaryDir()
        project_name = 'backbone'
        bed_fhand = NamedTemporaryFile(suffix='.bed')
        bed_fhand.write('AT1G14930.1\t200\t400\nAT1G55265.1\t100\t300\n')
        bed_fhand.flush()

        blastdb_seq = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
        snv_filters = {'filter1':{'name':'uniq_contiguous', 'use':True,
                                  'genomic_db':blastdb_seq,
                                  'genomic_seqs_fpath':blastdb_seq},

                       'filter7':{'name':'by_kind', 'use':True,
                                  'kind':'SNP'},
                       'filter12':{'name':'ref_not_in_list', 'use':True,
                                'list_path':os.path.join(TEST_DATA_DIR, 'cos_list')},
                       'filter10':{'unique_name': 'variable_in_sm',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['hola1']},
                       'filter11':{'unique_name': 'variable_in_adios',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['adios']},
                       'filter13':{'unique_name': 'variable_in_caracola',
                                   'name': 'is_variable', 'use':True,
                                   'group_kind':'libraries',
                                   'groups':['hola2']},
                       'filter14':{'name': 'in_segment_bed', 'use':True,
                                   'bed_fpath':bed_fhand.name,
                                   'edge_avoidance':10}}

        configuration = {'Snvs':{'min_quality':20},
                         'Sam_processing':{'add_default_qualities':True},
                         'snv_filters':snv_filters,
                         'General_settings':{'threads':THREADS},
                         'Mappers':{'keep_unmapped_reads_in_bam':False}}

        settings_path = create_project(directory=test_dir.name,
                                       name=project_name,
                                       configuration=configuration)
        project_dir = join(test_dir.name, project_name)
        #setup the original reads
        reads_dir = join(project_dir, 'reads')
        clean_reads_dir = join(reads_dir, 'cleaned')
        os.mkdir(reads_dir)
        os.mkdir(clean_reads_dir)

        solexa = '@seq1\n'
        solexa += 'TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq2\n'
        solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq14\n'
        solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq15\n'
        solexa += 'ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq12\n'
        solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq13\n'
        solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq16\n'
        solexa += 'ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'
        solexa += '@seq17\n'
        solexa += 'ATGTACTAGCAGTACGATCACACACTGGACAGTACAGACCAGAATGAC\n'
        solexa += '+\n'
        solexa += 'IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n'

        sanger = '>seq3\n'
        sanger += 'GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA'
        sanger += 'GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA'
        sanger += 'TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA'
        sanger += 'TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA'
        sanger += 'AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n'
        sanger += '>seq4\n'
        sanger += 'TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
        sanger += 'CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
        sanger += 'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
        sanger += 'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
        sanger += 'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC'
        sanger += '>seq5\n'
        sanger += 'TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT'
        sanger += 'CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG'
        sanger += 'TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG'
        sanger += 'AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT'
        sanger += 'CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC'

        fpath_sanger = join(clean_reads_dir, 'lb_hola1.pl_sanger.sm_hola.fasta')
        fpath_solexa = join(clean_reads_dir,
                                    'lb_hola2.pl_illumina.sm_hola.sfastq')
        open(fpath_sanger, 'w').write(sanger)
        open(fpath_solexa, 'w').write(solexa)

        fpath_sanger2 = join(clean_reads_dir, 'lb_adios.pl_sanger.fasta')
        fpath_solexa2 = join(clean_reads_dir,
                                    'lb_adios.pl_illumina.sfastq')
        open(fpath_sanger2, 'w').write(sanger)
        open(fpath_solexa2, 'w').write(solexa)

        #the reference
        reference_dir = join(project_dir, 'mapping/reference')
        os.makedirs(reference_dir)
        reference_fpath = join(reference_dir, 'reference.fasta')
        out = open(reference_fpath, 'w')
        for line in open(join(TEST_DATA_DIR, 'blast/arabidopsis_genes')):
            out.write(line)

        do_analysis(project_settings=settings_path, kind='mapping', silent=True)
        mapping_dir = join(project_dir, 'mapping')
        singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
        singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
        assert exists(join(singular_mapping_dir, 'bams',
                            'by_readgroup', 'lb_hola2.pl_illumina.sm_hola.bam'))
        result_dir = join(mapping_dir, 'bams')
        assert exists(result_dir)
        result_dir_by_lib = join(result_dir, 'by_readgroup')
        assert exists(result_dir_by_lib)
        unmapped_fpath = join(mapping_dir, 'unmapped_reads.gz')
        assert exists(unmapped_fpath)
        unmappeds = GzipFile(unmapped_fpath).read()
        assert 'seq17' in unmappeds


        do_analysis(project_settings=settings_path, kind='merge_bams',
                    silent=True)
        assert exists(join(result_dir, 'merged.0.bam'))
        assert exists(join(result_dir, 'merged.0.bam.bai'))

        #we realign the mapping using GATK
        do_analysis(project_settings=settings_path, kind='realign_bam',
                    silent=True)
        assert exists(join(result_dir, 'merged.1.bam'))

        #we calculate BAQ
        do_analysis(project_settings=settings_path, kind='calmd_bam',
                    silent=True)

        assert exists(join(result_dir, 'merged.2.bam'))
        assert exists(join(result_dir, 'merged.2.bam.bai'))


        do_analysis(project_settings=settings_path, kind='mapping_stats',
                    silent=True)
        stats_fname = join(mapping_dir,
                           BACKBONE_DIRECTORIES['mapping_stats'][1],
                           BACKBONE_BASENAMES['statistics_file'])
        result = open(stats_fname).read()
        assert 'Statistics for Coverage for platform sanger' in result

        annot_input_dir = join(project_dir, 'annotations', 'input')
        os.makedirs(annot_input_dir)
        os.symlink(reference_fpath, join(annot_input_dir, 'reference.fasta'))
        do_analysis(project_settings=settings_path, kind='annotate_snvs',
                    silent=True)
        json_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                          'reference.0.pickle')
        assert 'snv' in  open(json_fpath).read()

        do_analysis(project_settings=settings_path, kind='filter_snvs',
                    silent=True)
        json_fpath = join(project_dir, BACKBONE_DIRECTORIES['annotation_dbs'],
                          'reference.1.pickle')
        result = open(json_fpath).read()
        #print result
        assert 'snv' in result
        assert 'adios_sanger' in result

        do_analysis(project_settings=settings_path, kind='write_annotations',
                    silent=True)
        vcf_fpath = join(project_dir, 'annotations', 'features',
                         'reference.vcf')
        vcf = open(vcf_fpath).read()

        assert 'VLB1' in vcf
        assert 'VLB2' in vcf
        assert 'VLB3' in vcf
        assert 'AT1G14930.1' in vcf
        assert 'IS10' in vcf

        do_analysis(project_settings=settings_path, kind='mapping_stats',
                    silent=True)
        stats_dir = join(project_dir, 'mapping', 'bams', 'stats')
        assert exists(join(stats_dir, 'backbone.coverage_illumina.dat'))

        stats_fpath = join(stats_dir, BACKBONE_BASENAMES['statistics_file'])
        result = open(stats_fpath).read()
        expected = '''average: 0.4542
variance: 1.3050
total sequence length: 3941'''
        assert expected in result

        test_dir.close()
Beispiel #18
0
    def test_ortholog_annotation_analysis():
        """We can annotate orthologs and write them and their statistics.

        Canned blast results are copied into the project for the melon
        input against two databases, in both directions. Then the
        annotate_orthologs, write_annotations and annotation_stats
        analyses are run and their outputs checked.
        """
        tmp_dir = NamedTemporaryDir()
        name = "backbone"

        blast_dbs = {
            "arabidopsis": {"path": "/path/to/tair", "species": "arabidopsis", "kind": "nucl"},
            "arabidopsis2": {"path": "/path/to/tair2", "species": "arabidopsis2", "kind": "nucl"},
        }
        ortholog_conf = {"ortholog_databases": ["arabidopsis", "arabidopsis2"]}
        project_conf = {
            "blast": blast_dbs,
            "Annotation": {"ortholog_annotation": ortholog_conf},
            "General_settings": {"threads": THREADS},
        }
        settings = create_project(directory=tmp_dir.name, name=name, configuration=project_conf)
        proj_dir = join(tmp_dir.name, name)

        # the blast results: (directory for the result, xml file to copy)
        melon = "melon.st_nucl.pl_454"
        blast_root = join(proj_dir, "annotations", "blast")
        canned_blasts = [
            (join(blast_root, melon, "tair"), "melon_tair.xml"),
            (join(blast_root, melon, "tair2"), "melon_tair.xml"),
            (join(blast_root, "tair", melon), "tair_melon.xml"),
            (join(blast_root, "tair2", melon), "tair_melon.xml"),
        ]
        for blast_dir, _ in canned_blasts:
            os.makedirs(blast_dir)
        result_fname = BACKBONE_BASENAMES["blast_basename"] + ".tblastx.xml"
        for blast_dir, xml_fname in canned_blasts:
            shutil.copy(join(TEST_DATA_DIR, xml_fname), join(blast_dir, result_fname))

        # the melon sequences that will be annotated
        annot_input_dir = join(proj_dir, BACKBONE_DIRECTORIES["annotation_input"])
        os.makedirs(annot_input_dir)
        melon_seqs = [SeqWithQuality(Seq("A"), id=seq_id) for seq_id in ("melon1", "melon2")]
        write_seqs_in_file(melon_seqs, open(join(annot_input_dir, melon + ".fasta"), "a"))

        do_analysis(project_settings=settings, kind="annotate_orthologs", silent=True)
        db_fpath = join(proj_dir, BACKBONE_DIRECTORIES["annotation_dbs"], melon + ".0.pickle")
        db_content = open(db_fpath).read()
        for annotation in ("arabidopsis-orthologs", "arabidopsis2-orthologs"):
            assert annotation in db_content

        do_analysis(project_settings=settings, kind="write_annotations", silent=True)
        features_dir = join(proj_dir, "annotations", "features")

        # the orthologs file should have been written
        orthologs_fpath = join(features_dir, melon + ".orthologs")
        assert os.path.exists(orthologs_fpath)
        assert "tair1" in open(orthologs_fpath).read()

        # but no orf file should be there
        assert not os.path.exists(join(features_dir, melon + ".orf"))

        do_analysis(project_settings=settings, kind="annotation_stats", silent=True)
        stats = open(join(features_dir, "stats", melon + ".txt")).read()
        expected_lines = [
            "Orthologs",
            "_________",
            "Sequences with arabidopsis orthologs: 2",
            "Number of arabidopsis orthologs: 2",
            "Sequences with arabidopsis2 orthologs: 2",
            "Number of arabidopsis2 orthologs: 2",
        ]
        assert "\n".join(expected_lines) in stats

        tmp_dir.close()
Beispiel #19
0
    def test_protein_change_annotation_analysis():
        """We can annotate protein changes

        A project is created with cleaned sanger and illumina reads for
        two libraries and a reference. The reads are mapped, the bams are
        merged and realigned, and the snvs, the orfs and the protein
        changes are annotated on the reference. Finally the protein change
        of the first feature of the third annotated sequence is checked.
        """
        test_dir = NamedTemporaryDir()
        project_name = "backbone"
        matrix = os.path.join(TEST_DATA_DIR, "At.smat")
        configuration = {
            "Snvs": {"min_quality": 20},
            "Sam_processing": {"add_default_qualities": True},
            "Annotation": {"orf_annotation": {"estscan_matrix": matrix}},
            "General_settings": {"threads": THREADS},
        }

        settings_path = create_project(directory=test_dir.name, name=project_name, configuration=configuration)
        project_dir = join(test_dir.name, project_name)
        # setup the original reads
        reads_dir = join(project_dir, "reads")
        clean_reads_dir = join(reads_dir, "cleaned")
        os.mkdir(reads_dir)
        os.mkdir(clean_reads_dir)

        # Every record has to be appended with +=. A plain assignment on a
        # header line would throw away all the previous records.
        solexa = "@seq1\n"
        solexa += "TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa += "@seq2\n"
        solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa += "@seq14\n"
        solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa += "@seq15\n"
        solexa += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa += "@seq12\n"
        solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa += "@seq13\n"
        solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa += "@seq16\n"
        solexa += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa += "+\n"
        solexa += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"

        solexa2 = "@seq18\n"
        solexa2 += "TCATTGAAAGTTGAAACTGATAGTAGCAGAGTTTTTTCCTCTGTTTGG\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIIIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq19\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq20\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq21\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq22\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq23\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq24\n"
        solexa2 += "ATATGATTGAAGATATTTCTGGACTTTAAGGGTTCTTGAGGATTTATA\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"
        solexa2 += "@seq25\n"
        solexa2 += "ATGTACTAGCAGTACGATCACACACTGGACAGTACAGACCAGAATGAC\n"
        solexa2 += "+\n"
        solexa2 += "IIIIIIHIIIIIIIIIIIIIIIZIIUJUAUGJUUJUDFAOUDJOFSUD\n"

        sanger = ">seq3\n"
        sanger += "GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA"
        sanger += "GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA"
        sanger += "TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA"
        sanger += "TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA"
        sanger += "AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n"
        sanger += ">seq4\n"
        sanger += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
        sanger += "CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
        sanger += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
        sanger += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
        sanger += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"
        # NOTE(review): there is no "\n" between the end of seq4 and the
        # ">seq5" header, so the header is glued to the last seq4 line.
        # Left untouched because the assertions below may depend on it --
        # confirm whether it is intended.
        sanger += ">seq5\n"
        sanger += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
        sanger += "CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
        sanger += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
        sanger += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
        sanger += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"

        sange2 = ">seq6\n"
        sange2 += "GATATGATTGAAGATATTTCTGGGCTTTAAGGGTTCTTGAGGATTTATAGGAGATACTGA"
        sange2 += "GATTCTGGAATCTCTGAGTTTCTGGGTTCAAGTTGCACTGACCATTGTTGGATTTGTAGA"
        sange2 += "TTGTTTCTTCTTTCATTAGGCATTGATTATGGGTAAATGCGTGGGTACATATAATATATA"
        sange2 += "TCTGTTGAATGCAATTTACACATTGACTGAGGAACAACATGAACATGGCAGCTTTCTCAA"
        sange2 += "AATTGAACCACAGAAGGCTTAAAAGCAAAGTCTTTGGAGAATCAGACTAAGCTTGAGA\n"
        sange2 += ">seq7\n"
        sange2 += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
        sange2 += "CTATATATTCTAATGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
        sange2 += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
        sange2 += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
        sange2 += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"
        # NOTE(review): same as above, no "\n" before the ">seq8" header.
        sange2 += ">seq8\n"
        sange2 += "TCCATACTTTACTCTATCTCTTTCTGTGTTTGGTAACACGCGAGGATTGGATGATAT"
        sange2 += "CTATATATTCTAAAGTGGACTAAAAATGTGTGTGTGTGTATGAAGATGGGAAGCCGGAAG"
        sange2 += "TCATCAAGGAGAAAAGAGAGATAGACGACGAGAAGATGGCGTTGACGTTCAGAGGACTAG"
        sange2 += "AGGGTCATGTGATGGAGAAGTACAAGAAGTATGAGGTTATCTTACAGTTCATTCCCAAGT"
        sange2 += "CGAACGAAGGCTGCGTCTGCAAAGTCACTCTGATATGGGAGAATCGCAACGAAGACTCCC"
        fpath_sanger = join(clean_reads_dir, "lb_hola1.pl_sanger.sm_hola.fasta")
        fpath_solexa = join(clean_reads_dir, "lb_hola2.pl_illumina.sm_hola.sfastq")
        open(fpath_sanger, "w").write(sanger)
        open(fpath_solexa, "w").write(solexa)

        fpath_sanger2 = join(clean_reads_dir, "lb_adios.pl_sanger.fasta")
        fpath_solexa2 = join(clean_reads_dir, "lb_adios.pl_illumina.sfastq")
        open(fpath_sanger2, "w").write(sange2)
        open(fpath_solexa2, "w").write(solexa2)

        # the reference
        reference_dir = join(project_dir, "mapping/reference")
        os.makedirs(reference_dir)
        reference_fpath = join(reference_dir, "reference.fasta")
        out = open(reference_fpath, "w")
        for line in open(join(TEST_DATA_DIR, "blast/arabidopsis_genes")):
            out.write(line)
        # the file has to be closed before the mapping, otherwise part of the
        # reference could still be in the write buffer and not in the file
        out.close()

        do_analysis(project_settings=settings_path, kind="mapping", silent=True)
        mapping_dir = join(project_dir, "mapping")
        singular_mapping_dir = sorted(os.listdir(mapping_dir))[0]
        singular_mapping_dir = join(mapping_dir, singular_mapping_dir)
        assert exists(join(singular_mapping_dir, "bams", "by_readgroup", "lb_hola2.pl_illumina.sm_hola.bam"))
        result_dir = join(mapping_dir, "bams")
        assert exists(result_dir)
        result_dir_by_lib = join(result_dir, "by_readgroup")
        assert exists(result_dir_by_lib)

        do_analysis(project_settings=settings_path, kind="merge_bams", silent=True)
        assert exists(join(result_dir, "merged.0.bam"))
        assert exists(join(result_dir, "merged.0.bam.bai"))

        # we realign the mapping using GATK
        do_analysis(project_settings=settings_path, kind="realign_bam", silent=True)
        assert exists(join(result_dir, "merged.1.bam"))

        # the reference is linked as the input for the annotation
        annot_input_dir = join(project_dir, "annotations", "input")
        os.makedirs(annot_input_dir)
        os.symlink(reference_fpath, join(annot_input_dir, "reference.fasta"))
        do_analysis(project_settings=settings_path, kind="annotate_snvs", silent=True)

        do_analysis(project_settings=settings_path, kind="annotate_orfs", silent=True)

        do_analysis(project_settings=settings_path, kind="annotate_prot_change", silent=True)

        result_file = join(project_dir, "annotations", "db", "reference.2.pickle")

        seqs = list(seqs_in_file(open(result_file)))
        snv = seqs[2].features[0]
        assert snv.qualifiers["protein_change"]["kind"] == "substitution"
        assert snv.qualifiers["protein_change"]["location"] == "codon_1"

        test_dir.close()
def main():
    '''The main part

    It takes the project name and the directory from set_parameters and
    creates the project with them.
    '''
    project_name, directory = set_parameters()
    create_project(project_name, directory=directory)