コード例 #1
0
    def test_seqquals_in_file(self):
        'It checks that seqs with quality are read from a seq and a qual file'
        seq_fhand = StringIO.StringIO('>hola\nACGA\n>caracola\nATCG')
        qual_fhand = StringIO.StringIO('>hola\n1 2 3 4\n>caracola\n5 6 7 8')
        expected = [('hola', 'ACGA', [1, 2, 3, 4]),
                    ('caracola', 'ATCG', [5, 6, 7, 8])]
        for position, read in enumerate(seqs_in_file(seq_fhand, qual_fhand)):
            name, residues, quals = expected[position]
            assert read.name == name
            assert str(read.seq) == residues
            assert read.qual == quals

        # The qual file lists the same names in a different order than the
        # seq file, so reading both together should raise a ValueError.
        seq_fhand = StringIO.StringIO('>hola\nACGA\n>caracola\nATCG')
        qual_fhand = StringIO.StringIO('>caracola\n1 2 3 4\n>hola\n5 6 7 8')
        try:
            for read in seqs_in_file(seq_fhand, qual_fhand):
                #pylint: disable-msg=W0104
                read.name
        #pylint: disable-msg=W0704
        except ValueError:
            pass
        else:
            # reaching this point means that no error was raised
            self.fail()
コード例 #2
0
    def test_illumina(self):
        'It tests the cleaning of Illumina reads'
        first_part = create_random_seqwithquality(50, qual_range=35)
        second_part = create_random_seqwithquality(10, qual_range=15)
        seqs = [first_part + second_part]
        in_fhand = create_temp_seq_file(seqs, format='fastq')[0]
        out_fhand = NamedTemporaryFile()
        io_args = ['-i', in_fhand.name, '-o', out_fhand.name]

        # default illumina cleaning of a sanger encoded fastq
        cmd = [CLEAN_READS] + io_args + ['-p', 'illumina', '-f', 'fastq']
        assert _call_python(cmd)[-1] == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                    format='fastq'))
        assert cleaned[0].qual[-2] == 35

        # with -x (quality trimming disabled) the sequence comes out intact
        cmd = [CLEAN_READS] + io_args + ['-p', 'illumina', '-f', 'fastq',
                                         '-x']
        assert _call_python(cmd)[-1] == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                    format='fastq'))
        assert seqs[0].seq == cleaned[0].seq

        # the same cleaning with the fastq-illumina encoding
        in_fhand = create_temp_seq_file(seqs, format='fastq-illumina')[0]
        out_fhand = NamedTemporaryFile()
        cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
               '-p', 'illumina', '-f', 'fastq-illumina']
        assert _call_python(cmd)[-1] == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                    format='fastq-illumina'))
        assert cleaned[0].qual[-2] == 35
コード例 #3
0
    def test_vectordb(self):
        'It removes the vector using a vector database'
        vector_db = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
        # Each case is (vector residues, arguments given to the -d option).
        # The second case passes -d without any database path.
        cases = [
            ('CACTATCTCCGACGACGGCGATTTCACCGTTGACCTGATTTCCAGTTGCTACGTCAAGTTC',
             ['-d', vector_db]),
            ('GGTGCCTCCGGCGGGCCACTCAATGCTTGAGTATACTCACTAGACTTTGCTTCGCAAAG',
             ['-d']),
        ]
        for vector_residues, db_args in cases:
            head = create_random_seqwithquality(5, qual_range=35)
            vector = SeqWithQuality(Seq(vector_residues), name='vect',
                                    qual=[30] * len(vector_residues))
            insert = create_random_seqwithquality(250, qual_range=35)
            in_fhand = create_temp_seq_file([head + vector + insert],
                                            format='fastq')[0]
            out_fhand = NamedTemporaryFile()
            cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
                   '-p', '454', '-f', 'fastq'] + db_args
            assert _call_python(cmd)[-1] == 0
            cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                        format='fastq'))
            # the cleaned read should be, at most, 4 residues shorter than
            # the insert
            assert (len(insert.seq) - len(cleaned[0].seq)) < 5
コード例 #4
0
def _run_general_stat_analysis(analysis, seq_path, out_prefix, sum_fhand,
                               **kwargs):
    '''It runs one STAT_ANALYSIS entry over the sequences in seq_path.

    The distribution is written to out_prefix + '.dist' and the plot to
    out_prefix + '.svg'. Any extra keyword argument is handed to the analysis
    function. All the files opened here are closed on exit, even if the
    analysis raises.
    NOTE(review): this assumes that the analysis function has finished reading
    the sequences when it returns -- confirm against STAT_ANALYSIS.
    '''
    with open(seq_path, 'r') as seq_fhand:
        seqs = seqs_in_file(seq_fhand)
        with open(out_prefix + '.dist', 'w') as dist_fhand:
            with open(out_prefix + '.svg', 'w') as svg_fhand:
                STAT_ANALYSIS[analysis](seqs, distrib_fhand=dist_fhand,
                                        plot_fhand=svg_fhand,
                                        summary_fhand=sum_fhand, **kwargs)


def do_general_analysis(seq_path, dir_out, group_kind):
    '''It runs every analysis in STAT_ANALYSIS over the given sequence file.

    For each analysis a <analysis>_distrib.all.dist and a
    <analysis>_distrib.all.svg file are written in dir_out. If group_kind is
    given the analysis is repeated for every group of that kind found by
    get_groups, writing <analysis>_distrib.<group>.dist/.svg files.
    All the analyses append their summaries to summary.all.txt, a file that is
    removed first if it already exists.
    '''
    groups = get_groups(seq_path)
    summary_fpath = join(dir_out, 'summary.all.txt')
    if exists(summary_fpath):
        os.remove(summary_fpath)

    # the with statement guarantees that the summary is closed, and so
    # flushed, even when one of the analyses fails
    with open(summary_fpath, 'a') as sum_fhand:
        for analysis in STAT_ANALYSIS:
            _run_general_stat_analysis(analysis, seq_path,
                                       join(dir_out, analysis + '_distrib.all'),
                                       sum_fhand)
            if not group_kind:
                continue
            for group in groups[group_kind]:
                out_prefix = join(dir_out, analysis + '_distrib.%s' % group)
                _run_general_stat_analysis(analysis, seq_path, out_prefix,
                                           sum_fhand, group_kind=group_kind,
                                           groups=[group])
コード例 #5
0
    def test_fasq():
        'It checks that seqs can be read from the different fastq flavours'
        # Every case is (format, file content, expected (name, seq, qual)).
        cases = [
            ('fastq',
             '@seq1\n'
             'CCCT\n'
             '+\n'
             ';;3;\n'
             '@SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36\n'
             'GTTGC\n'
             '+\n'
             ';;;;;\n',
             [('seq1', 'CCCT', [26, 26, 18, 26]),
              ('SRR001666.1', 'GTTGC', [26, 26, 26, 26, 26])]),
            ('fastq-illumina',
             '@seq1\n'
             'CCCT\n'
             '+\n'
             'AAAA\n'
             '@seq2\n'
             'GTTGC\n'
             '+\n'
             'AAAAA\n',
             [('seq1', 'CCCT', [1, 1, 1, 1]),
              ('seq2', 'GTTGC', [1, 1, 1, 1, 1])]),
            ('fastq-solexa',
             '@seq1\n'
             'CCCT\n'
             '+\n'
             'BBBB\n'
             '@seq2\n'
             'GTTGC\n'
             '+\n'
             'BBBBB\n',
             [('seq1', 'CCCT', [4, 4, 4, 4]),
              ('seq2', 'GTTGC', [4, 4, 4, 4, 4])]),
        ]
        for file_format, content, expected in cases:
            fhand = StringIO.StringIO(content)
            reads = seqs_in_file(fhand, format=file_format)
            for position, read in enumerate(reads):
                name, residues, quals = expected[position]
                assert read.name == name
                assert str(read.seq) == residues
                assert read.qual == quals
コード例 #6
0
    def test_sanger(self):
        'It tests the basic cleaning of sanger reads'
        first_part = create_random_seqwithquality(500, qual_range=55)
        second_part = create_random_seqwithquality(50, qual_range=15)
        seqs = [first_part + second_part]
        inseq_fhand, inqual_fhand = create_temp_seq_file(seqs, format='qual')
        outseq_fhand = NamedTemporaryFile()
        outqual_fhand = NamedTemporaryFile()
        # arguments shared by the commands, a new list is built for each call
        seq_only = [CLEAN_READS, '-i', inseq_fhand.name,
                    '-o', outseq_fhand.name]
        seq_and_qual = [CLEAN_READS, '-i', inseq_fhand.name,
                        '-q', inqual_fhand.name, '-o', outseq_fhand.name,
                        '-u', outqual_fhand.name]

        # the platform option is mandatory
        assert 'required' in _call_python(seq_only + [])[1]

        # and it has to be one of the known platforms
        assert 'choice' in _call_python(seq_only + ['-p', 'hola'])[1]

        # disabling the quality trimming is incompatible with lucy_splice
        cmd = seq_only + ['-p', 'sanger', '-x', '--lucy_splice',
                          'splice.fasta']
        assert 'incompatible' in _call_python(cmd)[1]

        # a sanger sequence with quality is cleaned
        assert _call_python(seq_and_qual + ['-p', 'sanger'])[2] == 0
        cleaned = list(seqs_in_file(seq_fhand=open(outseq_fhand.name),
                                    qual_fhand=open(outqual_fhand.name)))
        assert cleaned[0].qual[-1] == 55

        # with the quality trimming disabled the sequence is kept intact
        assert _call_python(seq_and_qual + ['-p', 'sanger', '-x'])[2] == 0
        cleaned = list(seqs_in_file(seq_fhand=open(outseq_fhand.name),
                                    qual_fhand=open(outqual_fhand.name)))
        assert seqs[0].seq == cleaned[0].seq

        # a sanger sequence without quality, but with a tail of Ns
        first_part = create_random_seqwithquality(500, qual_range=55)
        seqs = [SeqWithQuality(first_part.seq + Seq('NNNNNNNNNNNNNN'),
                               name='Ns')]
        inseq_fhand = create_temp_seq_file(seqs, format='fasta')[0]
        outseq_fhand = NamedTemporaryFile()
        cmd = [CLEAN_READS, '-i', inseq_fhand.name, '-o', outseq_fhand.name,
               '-p', 'sanger']
        assert _call_python(cmd)[2] == 0
        cleaned = list(seqs_in_file(seq_fhand=open(outseq_fhand.name)))
        assert not str(cleaned[0].seq).lower().endswith('nnnnn')
コード例 #7
0
    def test_csfasta_reader():
        'It tests the csfasta reader, with and without double encoding'
        seq_fhand = open(os.path.join(TEST_DATA_DIR, 'seq.csfasta'))
        qual_fhand = open(os.path.join(TEST_DATA_DIR, 'solid_qual.qual'))

        # plain color space reading
        color_seqs = list(seqs_in_file(seq_fhand, qual_fhand,
                                       format='csfasta'))
        assert '121101332.0133.2221.23.2.21' in str(color_seqs[0].seq)
        assert len(color_seqs) == 3

        # the same file handles are reused for the double encoded reading
        encoded_seqs = list(seqs_in_file(seq_fhand, qual_fhand,
                                         format='csfasta',
                                         double_encoding=True))
        assert 'TTGNACTTNGGGCNGTNGNGCA' in str(encoded_seqs[0].seq)
コード例 #8
0
    def test_adaptors(self):
        'It removes the adaptors'
        # an adaptors file is given explicitly
        head = create_random_seqwithquality(5, qual_range=35)
        adaptor = create_random_seqwithquality(15, qual_range=35)
        insert = create_random_seqwithquality(50, qual_range=35)
        in_fhand = create_temp_seq_file([head + adaptor + insert],
                                        format='fastq')[0]
        out_fhand = NamedTemporaryFile()
        adaptor_fhand = create_temp_seq_file([adaptor], format='fasta')[0]
        cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
               '-p', 'illumina', '-f', 'fastq', '-a', adaptor_fhand.name]
        assert _call_python(cmd)[-1] == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                    format='fastq'))
        assert insert.seq == cleaned[0].seq

        # -a without a file and the 454 platform
        head = create_random_seqwithquality(5, qual_range=35)
        adaptor = create_random_seqwithquality(15, qual_range=35)
        insert = create_random_seqwithquality(50, qual_range=35)
        in_fhand = create_temp_seq_file([head + adaptor + insert],
                                        format='fastq')[0]
        out_fhand = NamedTemporaryFile()
        io_args = ['-i', in_fhand.name, '-o', out_fhand.name]
        cmd = [CLEAN_READS] + io_args + ['-p', '454', '-f', 'fastq', '-a']
        stdout, stderr, retcode = _call_python(cmd)
        assert retcode == 0
        assert  "--adaptors_file: {'454': '" in stdout

        # -a without a file and the illumina platform is an error
        cmd = [CLEAN_READS] + io_args + ['-p', 'illumina', '-f', 'fastq',
                                         '-a']
        stdout, stderr, retcode = _call_python(cmd)
        assert 'clean_reads does not have default adaptors file' in stderr
        assert  retcode == 14
コード例 #9
0
ファイル: gff.py プロジェクト: BioinformaticsArchive/franklin
    def _get_items(self):
        '''It yields (kind, item) tuples for the items found in the GFF file.

        The kinds are METADATA (## lines), COMMENT (# lines), FEATURE (any
        other non empty line) and FASTA. Blank lines and the ##gff-version
        line are skipped. Once a ##FASTA line is found one last FASTA item,
        with the sequences read from the rest of the file, is yielded.
        '''
        gff_fhand = open(self._fpath, self._mode)

        for raw_line in gff_fhand:
            line = raw_line.strip()
            # nothing to report for blank lines, and the version line has
            # already been taken into account
            if not line or line.startswith('##gff-version'):
                continue

            if line.startswith('##FASTA'):
                # we are assuming that the fasta reading is sequential and
                # that there is no seek, so it starts in the next line
                items = seqs_in_file(gff_fhand, format='fasta')
                yield FASTA, items
                break

            # '##' has to be checked before '#'
            if line.startswith('##'):
                item = line[2:]
                kind = METADATA
            elif line.startswith('#'):
                item = line[1:]
                kind = COMMENT
            else:
                item = self._create_feature(line)
                kind = FEATURE

            if item is not None:
                yield kind, item
コード例 #10
0
def draw_sequence_distribution(seq_path, dir_out, group_kind,
                               window_width, window_step,
                               value_kind):
    '''It creates a wig file, given a value kind, with the distribution of that
    value along a given sequence.

    One profile is calculated, and written with write_wig, for every group of
    the given group_kind found by get_groups in seq_path. When both
    window_width and window_step are given, and the profile is not empty, the
    profile is smoothed with apply_window before being written.

    value_kind should be 'het', 'pic' or 'maf', otherwise a ValueError is
    raised. Previously an unknown value_kind ended in an UnboundLocalError or,
    worse, in the profile of the previous group being silently reused.
    '''
    calculators = {'het': calculate_hets_group,
                   'pic': calculate_pics_group,
                   'maf': calculate_mafs_group}
    if value_kind not in calculators:
        msg = 'Unknown value kind: %r, it should be one of: %s'
        raise ValueError(msg % (value_kind, ', '.join(sorted(calculators))))
    calculate_profile = calculators[value_kind]

    groups = get_groups(seq_path)
    for group in groups[group_kind]:
        seq_fhand = open(seq_path, 'r')
        # The file is kept open until the wig is written, just in case the
        # profile is calculated lazily, and it is always closed afterwards.
        try:
            seqs = seqs_in_file(seq_fhand)
            profile = calculate_profile(seqs, groups=[group],
                                        group_kind=group_kind)

            if profile and window_width and window_step:
                new_profile = apply_window(profile,
                                           window_width=window_width,
                                           window_step=window_step)
                write_wig(dir_out, new_profile, group, window_width,
                          window_step)
            else:
                write_wig(dir_out, profile, group)
        finally:
            seq_fhand.close()
コード例 #11
0
    def test_strip_seq_by_quality_lucy():
        """It tests the lucy based quality striper.

        Two in-memory sequences are run through the striper created by
        create_striper_by_quality_lucy and then through the trimmer created by
        create_seq_trim_and_masker. After that the striper is configured with
        vector and splice files and run over a fastq test file.
        """
        # First sequence: qualities are 00 at both ends and 60 in the middle.
        seq = "ATCGATCAGTCAGACTGACAGACTCAGATCAGATCAGCATCAGCATACGATACGCATCAGACT"
        seq += "ACGATCGATCGATCGACAGATCATCGATCATCGACGACTAGACGATCATCGATACGCAGACTC"
        seq += "AGCAGACTACGAGATCAGCAGCATCAGCAGCA"
        qual = "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 "
        qual += "00 00 00 00 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 00 00 00 00"
        # split() leaves the qualities as a list of strings, not of ints
        qual = qual.split()
        seq = Seq(seq)
        seqrec1 = SeqWithQuality(name="seq1", seq=seq, qual=qual, description="desc1")

        # Second sequence: a longer read with mixed quality values.
        qual = "40 40 40 37 40 40 37 37 37 37 37 37 37 37 40 42 42 42 44 44 "
        qual += "56 56 42 40 40 40 40 36 36 28 35 32 35 35 40 42 37 37 35 37 "
        qual += "32 35 35 35 35 35 35 38 33 33 24 33 33 42 33 35 35 35 35 33 "
        qual += "36 30 30 24 29 29 35 35 35 35 29 29 29 35 38 38 38 37 35 33 "
        qual += "29 35 35 34 30 30 30 33 29 31 31 29 29 29 28 28 24 21 16 16 "
        qual += "21 24 29 29 32 40 27 27 25 25 21 30 27 28 28 32 23 23 21 24 "
        qual += "24 17 18 19 21 15 19 11 9 9 11 23 17 15 10 10 10 20 27 25 23 "
        qual += "18 22 23 24 18 10 10 13 13 18 19 10 12 12 18 16 14 10 10 11 "
        qual += "16 13 21 19 31 19 27 27 28 26 29 25 25 20 19 23 28 28 19 20 "
        qual += "13 9 9 9 9 9 17 15 21 17 14 12 21 17 19 24 28 24 23 "
        quality = qual.split()
        seq = "ATCGATCAGTCAGACTGACAGACTCAGATCAGATCAGCATCAGCATACGATACGCATCAGACT"
        seq += "ACGATCGATCGATCGACAGATCATCGATCATCGACGACTAGACGATCATCGATACGCAGACTC"
        seq += "AGCAGACTACGAGATCAGCAGCATCAGCAGCAAGCAGACTACGAGATCAGCAGCATCAGCAGC"
        seq += "ATTACGATGAT"
        seq = Seq(seq)
        seqrec2 = SeqWithQuality(seq=seq, qual=quality, name="seq2", description="desc2")
        seq_iter = iter([seqrec1, seqrec2])
        seq_trimmer = create_seq_trim_and_masker()
        lucy_striper = create_striper_by_quality_lucy()
        # pylint:disable-msg=W0612
        # the striper takes an iterator of sequences and returns an iterable
        # that is trimmed here sequence by sequence
        seq_iter = lucy_striper(seq_iter)
        new_seqs = []
        for seq in seq_iter:
            new_seqs.append(seq_trimmer(seq))
        seq = new_seqs[0].seq
        # the description should be kept and both ends of the first sequence
        # should have been removed
        assert seqrec1.description == new_seqs[0].description
        assert seq.startswith("CAGATCAGATCAGCATCAGCAT")
        assert seq.endswith("CGAGATCAGCAGCATCAGC")
        assert len(new_seqs) == 2
        assert new_seqs[1].description == "desc2"

        # now we test the sequence with adaptors
        vector_fpath = os.path.join(TEST_DATA_DIR, "lucy", "icugi_vector.fasta")
        splice_fpath = os.path.join(TEST_DATA_DIR, "lucy", "icugi_splice.fasta")
        parameters = {"vector": [vector_fpath, splice_fpath], "bracket": [10, 0.02]}
        lucy_striper = create_striper_by_quality_lucy(parameters)
        seq_fhand = open(os.path.join(TEST_DATA_DIR, "lucy", "seq_with_adaptor1.fastq"))
        seq_iter = lucy_striper(seqs_in_file(seq_fhand, format="fastq"))
        # NOTE(review): no assertion follows in the visible code, so this part
        # only checks that the striping and the trimming run without raising.
        new_seqs = []
        for seq in seq_iter:
            new_seqs.append(seq_trimmer(seq))
コード例 #12
0
 def test_seqs_in_file():
     'It checks that seqs without quality are read from a sequence file'
     fasta = '>hola\nACGATCTAGTCATCA\n>caracola\nATCGTAGCTGATGT'
     fasta_fhand = StringIO.StringIO(fasta)
     expected = [('hola', 'ACGATCTAGTCATCA'), ('caracola', 'ATCGTAGCTGATGT')]
     for position, read in enumerate(seqs_in_file(fasta_fhand)):
         name, residues = expected[position]
         assert read.name == name
         assert str(read.seq) == residues
コード例 #13
0
def _process_sequences(in_fhand_seqs, in_fhand_qual, file_format, pipeline,
                       configuration):
    '''It returns the sequences of the input files processed by the pipeline.

    The sequences are read with seqs_in_file and the result of applying
    _pipeline_builder to them, with the given pipeline and configuration, is
    returned.
    '''
    raw_seqs = seqs_in_file(in_fhand_seqs, in_fhand_qual, file_format)
    # the pipeline that processes the sequences is built around them
    return _pipeline_builder(pipeline, raw_seqs, configuration)
コード例 #14
0
def get_orfs(infhand):
    '''It yields the dna of the orfs found in the sequences of the file.

    Only the orf features whose pep and dna qualifiers pass check_pep and
    check_dna are taken into account.
    '''
    for record in seqs_in_file(infhand):
        for orf_feature in record.get_features(kind='orf'):
            qualifiers = orf_feature.qualifiers
            dna = qualifiers['dna']
            pep = qualifiers['pep']

            # the pep is checked first, as in the original order
            if not check_pep(pep):
                continue
            if not check_dna(dna):
                continue
            yield dna
コード例 #15
0
    def run(self):
        '''It runs the analysis.

        For every pickle input an annotation statistics report is written to
        <basename>.txt in the result output directory. Both the input and the
        output files are closed as soon as each report is finished.
        NOTE(review): this assumes that do_annotation_statistics has read all
        the sequences when it returns -- confirm.
        '''
        output_dir = self._create_output_dirs()['result']
        inputs = self._get_input_fpaths()
        pickle_paths = inputs['pickle']

        for seq_path in pickle_paths:
            output_fpath = join(output_dir, seq_path.basename + '.txt')
            with open(seq_path.last_version, 'r') as in_fhand:
                seqs = seqs_in_file(in_fhand)
                # closing the report guarantees that it is flushed to disk
                with open(output_fpath, 'w') as out_fhand:
                    do_annotation_statistics(seqs, out_fhand)
コード例 #16
0
def get_groups(fpath):
    '''It returns the read groups found in the snvs of a sequence file.

    The result is a dict with the group kinds LB, PL and SM as keys and,
    as values, the lists of the distinct groups of each kind, in the order in
    which they were first found in the read_groups qualifier of the snvs.
    A KeyError is raised if a group kind other than those three is found.
    '''
    groups = {'LB':[], 'PL':[], 'SM':[]}

    # the file is read completely before returning, so it can be closed here
    with open(fpath) as fhand:
        for seq in seqs_in_file(fhand):
            for snv in seq.get_features(kind='snv'):
                read_groups = snv.qualifiers['read_groups']
                for tags in read_groups.values():
                    for group_kind, group in tags.items():
                        if group not in groups[group_kind]:
                            groups[group_kind].append(group)
    return groups
コード例 #17
0
ファイル: seq_cleaner.py プロジェクト: JoseBlanca/franklin
def check_sequences_length(fhand, min_length=None, max_length=None):
    """It checks that all sequences in the file have the given lengths.

    fhand is any object with a name attribute with the path to the file, the
    file is opened again to be read. If fhand is None nothing is checked.
    A ValueError is raised for the first sequence shorter than min_length or
    longer than max_length. A limit set to None is not checked.
    """
    if fhand is None:
        return
    # the with statement closes the file also when a ValueError is raised
    with open(fhand.name) as seq_fhand:
        for seq in seqs_in_file(seq_fhand):
            len_seq = len(seq)
            if min_length is not None and len_seq < min_length:
                msg = "Sequence %s is shorter than %i residues" % (seq.name, min_length)
                raise ValueError(msg)
            if max_length is not None and len_seq > max_length:
                msg = "Sequence %s is longer than %i residues" % (seq.name, max_length)
                raise ValueError(msg)
コード例 #18
0
 def test_edge_trim(self):
     'It trims the edges of the sequences'
     read = create_random_seqwithquality(250, qual_range=35)
     in_fhand = create_temp_seq_file([read], format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     # -e 10,10 asks for 10 residues to be removed from each edge
     cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
            '-p', '454', '-f', 'fastq', '-e', '10,10']
     assert _call_python(cmd)[-1] == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     assert len(read.seq) - len(cleaned[0].seq) == 20
コード例 #19
0
def _copy_unigenes(unigenes_fpath, unigenes_qual_fpath, out_fpath, out_qual_fpath):
    "It writes the mira unigenes, renamed to mira_*, as fasta and qual files"
    with open(out_fpath, "w") as out_fhand:
        with open(out_qual_fpath, "w") as out_qual_fhand:
            seq_writer = SequenceWriter(fhand=out_fhand, file_format="fasta", qual_fhand=out_qual_fhand)
            with open(unigenes_fpath) as in_fhand:
                with open(unigenes_qual_fpath) as in_qual_fhand:
                    for seq in seqs_in_file(seq_fhand=in_fhand, format="sfastq", qual_fhand=in_qual_fhand):
                        # everything up to the first underscore of the name
                        # is replaced by mira
                        seq.name = "mira_{0:s}".format(seq.name.split("_", 1)[1])
                        seq.id = seq.name
                        seq_writer.write(seq)


def _write_iassembler_input(mira_path, iassembler_path):
    "It converts the *_in* files next to the mira dir into the iassembler input"
    reads_dir = os.path.join(mira_path, "..")
    with open(os.path.join(iassembler_path, IASSEMBLER_INPUT_NAME), "w") as seq_fhand:
        with open(os.path.join(iassembler_path, IASSEMBLER_INPUT_NAME + ".qual"), "w") as qual_fhand:
            for file_ in os.listdir(reads_dir):
                if "_in" not in file_:
                    continue
                with open(os.path.join(reads_dir, file_)) as in_fhand:
                    seqio(
                        in_seq_fhand=in_fhand,
                        out_seq_fhand=seq_fhand,
                        out_qual_fhand=qual_fhand,
                        in_format="sfastq",
                        out_format="fasta",
                    )


def main():
    """The main part.

    It prepares an iassembler project from a mira assembly: the contig read
    list and the unigenes are copied into the project and the iassembler
    input files are created from the mira input reads. Finally it prints the
    command required to run iassembler.

    Every file opened along the way is closed, and so flushed to disk, before
    the final message is printed.
    """

    mira_path, iassembler_path = set_parameters()

    # guess the mira files that we need
    unigenes_fpath, unigenes_qual_fpath, mira_contig_read_fpath = get_mira_paths(mira_path)

    # create the iassembler project dir and subdirs
    if not os.path.exists(iassembler_path):
        os.makedirs(iassembler_path)
    mira_1_dir = os.path.join(iassembler_path, "{0:s}_Assembly".format(IASSEMBLER_INPUT_NAME), "mira")
    # os.makedirs raises an OSError if this directory already exists
    os.makedirs(mira_1_dir)

    # prepare contig readlist for iassembler
    iassembler_contig_mem_fpath = os.path.join(mira_1_dir, "CMF10")
    process_contig_readlist(mira_contig_read_fpath, iassembler_contig_mem_fpath)

    # copy unigene files into the iassembler project
    iassembler_unigenes = os.path.join(mira_1_dir, "mira2.fa")
    iassembler_unigenes_qual = os.path.join(mira_1_dir, "mira2.fa.qual")
    _copy_unigenes(unigenes_fpath, unigenes_qual_fpath, iassembler_unigenes, iassembler_unigenes_qual)

    # create iassembler input files.
    _write_iassembler_input(mira_path, iassembler_path)

    msg = "To run iassembler you must use this command:\n"
    msg += "iassembler -c -i {0:s}\n".format(IASSEMBLER_INPUT_NAME)
    msg += "From your iassembler dir:{0:s}".format(iassembler_path)
    # with a single argument this form prints the same in python 2 and 3
    print(msg)
コード例 #20
0
    def test_json_reader():
        'It checks that the seqs written in a json file can be read back'
        # first we write some seqs into a temporary json file
        written = [SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1'),
                   SeqWithQuality(seq=Seq('GATACCA', DNAAlphabet()),
                                  name='seq2')]
        json_fhand = tempfile.NamedTemporaryFile(suffix='.json')
        write_seqs_in_file(written, json_fhand, format='json')
        json_fhand.flush()

        # and then we read them from the file, without giving the format
        read = list(seqs_in_file(open(json_fhand.name)))
        assert read[0].seq == written[0].seq
        assert read[1].seq == written[1].seq
        assert str(read[1].seq.alphabet) == str(written[1].seq.alphabet)
コード例 #21
0
 def test_words(self):
     'It trims the words that match the given regular expressions'
     word = 'ACTG'
     prefix = SeqWithQuality(Seq(word), name='vect', qual=[30] * len(word))
     insert = create_random_seqwithquality(250, qual_range=35)
     in_fhand = create_temp_seq_file([prefix + insert], format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
            '-p', '454', '-f', 'fastq', '-r', '"^ACTG","TTTTTTTTTTTTTT"']
     assert _call_python(cmd)[-1] == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     # only the leading word should have been removed
     assert insert.seq == cleaned[0].seq
コード例 #22
0
 def test_vector(self):
     'It removes the vector given in a fasta file'
     head = create_random_seqwithquality(5, qual_range=35)
     vector = create_random_seqwithquality(3000, qual_range=35)
     insert = create_random_seqwithquality(250, qual_range=35)
     # the read only has a 30 residues long fragment of the vector
     read = head + vector[30:60] + insert
     in_fhand = create_temp_seq_file([read], format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     vector_fhand = create_temp_seq_file([vector], format='fasta')[0]
     cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
            '-p', '454', '-f', 'fastq', '-v', vector_fhand.name]
     assert _call_python(cmd)[-1] == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     assert (len(insert.seq) - len(cleaned[0].seq)) < 5
コード例 #23
0
ファイル: snv_statistics.py プロジェクト: JoseBlanca/franklin
def do_snv_stats(seq_path, out_dir):
    """It performs snv statistics.

    Every analysis in STAT_ANALYSIS is run over the sequences found in
    seq_path.last_version. Each one writes an <analysis>_distrib.dist and an
    <analysis>_distrib.svg file in out_dir. The summaries of all of them end
    up in distrib.sum, a file that is overwritten by the first analysis and
    appended to by the following ones.

    All the files, the sequence one included, are closed after each analysis,
    even if the analysis fails.
    NOTE(review): this assumes that the analysis functions have read all the
    sequences when they return -- confirm against STAT_ANALYSIS.
    """
    summary_fpath = join(out_dir, "distrib.sum")
    summary_mode = "w"
    for analysis in STAT_ANALYSIS:
        with open(seq_path.last_version, "r") as seq_fhand:
            seqs = seqs_in_file(seq_fhand)
            with open(join(out_dir, analysis + "_distrib.dist"), "w") as dist_fhand:
                with open(join(out_dir, analysis + "_distrib.svg"), "w") as svg_fhand:
                    with open(summary_fpath, summary_mode) as sum_fhand:
                        STAT_ANALYSIS[analysis](
                            seqs, distrib_fhand=dist_fhand, plot_fhand=svg_fhand, summary_fhand=sum_fhand
                        )
        # from the second analysis on the summary should not be truncated
        summary_mode = "a"
コード例 #24
0
 def test_min_length(self):
     'It filters out the sequences shorter than the minimum length'
     # the second read, 50 residues long, is shorter than the -m 51 threshold
     reads = [create_random_seqwithquality(length, qual_range=35)
              for length in (250, 50, 250)]
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
            '-p', '454', '-f', 'fastq', '-m', '51']
     assert _call_python(cmd)[-1] == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     assert len(cleaned) == 2
     assert len(cleaned[0]) == 250
     assert len(cleaned[1]) == 250
コード例 #25
0
 def test_trim_as_mask(self):
     'It masks the regions that should be trimmed instead of removing them'
     read = create_random_seqwithquality(250, qual_range=35)
     in_fhand = create_temp_seq_file([read], format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
            '-p', '454', '-f', 'fastq', '-e', '10,10', '--mask_no_trim']
     assert _call_python(cmd)[-1] == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     # nothing is removed, the edges are just written in lower case
     assert len(read.seq) == len(cleaned[0].seq)
     masked = str(cleaned[0].seq)
     # NOTE(review): this slice covers only the first 9 residues, so the
     # residue with index 9 is not checked by any of the three assertions --
     # confirm whether [0:10] was intended.
     assert masked[0:9].islower()
     assert masked[10:len(masked) - 10].isupper()
     assert masked[-10:].islower()
コード例 #26
0
 def test_fastq(self):
     'It cleans fastq seqs using several threads'
     # three reads, each of them built from a 500 and a 50 residues part
     # created, in that order, with different quality ranges
     reads = []
     for _ in range(3):
         first_part = create_random_seqwithquality(500, qual_range=55)
         second_part = create_random_seqwithquality(50, qual_range=15)
         reads.append(first_part + second_part)
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     # the sanger cleaning is run with -t 4
     cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
            '-p', 'sanger', '-t', '4', '-f', 'fastq']
     assert _call_python(cmd)[-1] == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     assert cleaned[0].qual[-1] == 55
コード例 #27
0
 def test_filter(self):
     'It filters out the sequences similar to the ones in a blast database'
     random_read1 = create_random_seqwithquality(150, qual_range=35)
     ara_residues = ('CACTATCTCCGACGACGGCGATTTCACCGTTGACCTGATTTCCAGTTGCTACGTCAAGTTCTC'
                     'TACGGCAAGAATATCGCCGGAAAACTCAGTTACGGATCTGTTAAAGACGTCCGTGGAATCCA'
                     'AGCTAAAGAAGCTTTCCTTTGGCTACCAATCACCGCCATGGAATCGGATCCAAGCTCTGCCA')
     ara_read = SeqWithQuality(Seq(ara_residues), name='ara',
                               qual=[30] * len(ara_residues))
     random_read2 = create_random_seqwithquality(150, qual_range=35)
     reads = [random_read1, ara_read, random_read2]
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     ara_db = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
     # the same database is given twice, as a comma separated list
     filter_dbs = ','.join((ara_db, ara_db))
     cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
            '-p', '454', '-f', 'fastq',
            '--filter_dbs', filter_dbs]
     assert _call_python(cmd)[-1] == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     # only two of the three reads should be left
     assert len(cleaned) == 2
コード例 #28
0
    def test_pickle_writer():
        'It checks that the seqs written with the pickle format are recovered'
        alleles = {('G', 3): {}}
        filters = {'a_filter':{('param',):False}}
        plain_seq = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
        snv_location = FeatureLocation(ExactPosition(3), ExactPosition(3))
        snv = SeqFeature(snv_location, type='snv',
                         qualifiers={'alleles':alleles, 'filters':filters})
        seq_with_snv = SeqWithQuality(seq=Seq('GATACCA'), name='seq2',
                                      features=[snv])
        pickle_fhand = StringIO()
        write_seqs_in_file([plain_seq, seq_with_snv], pickle_fhand,
                           format='pickle')

        # the seqs are read back from the start of the in-memory file
        pickle_fhand.seek(0)
        read = list(seqs_in_file(pickle_fhand))
        read_qualifiers = read[1].features[0].qualifiers
        assert read_qualifiers['alleles'] == alleles
        assert read_qualifiers['filters'] == filters
コード例 #29
0
def seqio(in_seq_fhand, out_seq_fhand, out_format, double_encoding=False,
          in_qual_fhand=None, out_qual_fhand=None, in_format=None):
    '''It converts the format of the sequence files.

    If in_format is not given it is guessed from the input file. The
    sequences are read with seqs_in_file and written with write_seqs_in_file
    when a qual file is involved or when the input or the output format is
    repr, json or pickle. Otherwise the conversion is left to SeqIO.convert.
    The output files are flushed before returning.
    '''
    if not in_format:
        in_format = guess_seq_file_format(in_seq_fhand)

    own_formats = ('repr', 'json', 'pickle')
    has_qual_file = in_qual_fhand is not None or out_qual_fhand is not None
    uses_own_format = in_format in own_formats or out_format in own_formats

    if not (has_qual_file or uses_own_format):
        SeqIO.convert(in_seq_fhand, in_format, out_seq_fhand, out_format)
    else:
        seqs = seqs_in_file(seq_fhand=in_seq_fhand,
                            qual_fhand=in_qual_fhand,
                            format=in_format, double_encoding=double_encoding)
        write_seqs_in_file(seqs, seq_fhand=out_seq_fhand,
                           qual_fhand=out_qual_fhand,
                           format=out_format)

    out_seq_fhand.flush()
    if out_qual_fhand:
        out_qual_fhand.flush()
コード例 #30
0
ファイル: seq_analysis.py プロジェクト: JoseBlanca/franklin
def do_transitive_clustering_all(blast_fhand, seqs_fhand, filters=None):
    """It does a transitive clustering given a blast result and the initial
    seq file.

    The pairs of similar sequences are taken from the blast, applying the
    given filters, and the clusters are built using the transitive property
    on those pairs.
    e.g. a is similar to b, and b to c => a, b and c belong to the same
    cluster.

    The seqs file is used to find the sequences that take part in no pair.
    It returns a tuple with the clusters and the set with the names of those
    sequences without a match.
    """
    hit_pairs = list(get_hit_pairs_from_blast(blast_fhand, filters=filters))
    clusters = do_transitive_clustering(hit_pairs)

    # every name that shows up in at least one pair
    names_with_hit = set()
    for pair in hit_pairs:
        names_with_hit.update(pair)

    all_names = set(seq.name for seq in seqs_in_file(seqs_fhand))

    return clusters, all_names.difference(names_with_hit)