Ejemplo n.º 1
0
    def test_seqquals_in_file(self):
        '''It tests that we get seqs with quality from two sequence files.

        First a fasta file and a qual file with matching names are read and
        the name, sequence and quality of every record are checked. Then the
        records in the qual file are given in a different order and a
        ValueError is expected while iterating.
        '''
        fhand = StringIO.StringIO('>hola\nACGA\n>caracola\nATCG')
        fhand_qual = StringIO.StringIO('>hola\n1 2 3 4\n>caracola\n5 6 7 8')
        expected = [('hola', 'ACGA', [1, 2, 3, 4]),
                    ('caracola', 'ATCG', [5, 6, 7, 8])]
        seqs = list(seqs_in_file(fhand, fhand_qual))
        # without this check a reader that yields nothing would make the
        # loop below pass without running a single assertion
        assert len(seqs) == len(expected)
        for seq, (name, residues, qual) in zip(seqs, expected):
            assert seq.name == name
            assert str(seq.seq) == residues
            assert seq.qual == qual

        #when the seq and qual names do not match we get an error
        fhand = StringIO.StringIO('>hola\nACGA\n>caracola\nATCG')
        fhand_qual = StringIO.StringIO('>caracola\n1 2 3 4\n>hola\n5 6 7 8')
        try:
            for seq in seqs_in_file(fhand, fhand_qual):
                #pylint: disable-msg=W0104
                seq.name
        except ValueError:
            pass
        else:
            # the whole file was read and no error was raised
            self.fail()
    def test_illumina(self):
        '''It tests the Illumina cleaning.

        clean_reads is run three times on the same read: with the default
        options on a fastq file, with the -x option on the same file and
        with the default options on a fastq-illumina file.
        '''
        head = create_random_seqwithquality(50, qual_range=35)
        tail = create_random_seqwithquality(10, qual_range=15)
        reads = [head + tail]
        in_fhand = create_temp_seq_file(reads, format='fastq')[0]
        out_fhand = NamedTemporaryFile()
        fastq_cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
                     '-p', 'illumina', '-f', 'fastq']
        exit_code = _call_python(fastq_cmd)[-1]
        assert exit_code == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                    format='fastq'))
        assert cleaned[0].qual[-2] == 35

        # with the quality trimming disabled the read comes back untouched
        exit_code = _call_python(fastq_cmd + ['-x'])[-1]
        assert exit_code == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                    format='fastq'))
        assert reads[0].seq == cleaned[0].seq

        # the same read, but written and cleaned as fastq-illumina
        in_fhand = create_temp_seq_file(reads, format='fastq-illumina')[0]
        out_fhand = NamedTemporaryFile()
        illumina_cmd = [CLEAN_READS, '-i', in_fhand.name,
                        '-o', out_fhand.name,
                        '-p', 'illumina', '-f', 'fastq-illumina']
        exit_code = _call_python(illumina_cmd)[-1]
        assert exit_code == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                    format='fastq-illumina'))
        assert cleaned[0].qual[-2] == 35
    def test_vectordb(self):
        '''It removes the vector from a vector database.

        Two reads are cleaned, each one built as random + vector + random.
        The first run gives a blast database to the -d option, the second
        one uses -d without any value.
        '''
        vector_db = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
        scenarios = (
            ('CACTATCTCCGACGACGGCGATTTCACCGTTGACCTGATTTCCAGTTGCTACGTCAAGTTC',
             ['-d', vector_db]),
            ('GGTGCCTCCGGCGGGCCACTCAATGCTTGAGTATACTCACTAGACTTTGCTTCGCAAAG',
             ['-d']),
        )
        for vector_residues, db_args in scenarios:
            prefix = create_random_seqwithquality(5, qual_range=35)
            vector = SeqWithQuality(Seq(vector_residues), name='vect',
                                    qual=[30] * len(vector_residues))
            insert = create_random_seqwithquality(250, qual_range=35)
            reads = [prefix + vector + insert]
            in_fhand = create_temp_seq_file(reads, format='fastq')[0]
            out_fhand = NamedTemporaryFile()
            cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
                   '-p', '454', '-f', 'fastq'] + db_args
            exit_code = _call_python(cmd)[-1]
            assert exit_code == 0
            cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                        format='fastq'))
            # the cleaned read should be less than 5 residues shorter than
            # the part that follows the vector
            assert (len(insert.seq) - len(cleaned[0].seq)) < 5
def do_general_analysis(seq_path, dir_out, group_kind):
    '''It runs every analysis in STAT_ANALYSIS on the given sequence file.

    For every analysis a distribution file (.dist) and a plot file (.svg)
    are written in dir_out, and all the summaries are appended to
    summary.all.txt, that is created anew on every call. If group_kind is
    given the analyses are also run once for each group of that kind found
    by get_groups.

    seq_path: path to the sequence file.
    dir_out: directory where the result files are written.
    group_kind: a key of the dict returned by get_groups, or a false value
    to do only the analysis of all the sequences.
    '''
    groups = get_groups(seq_path)
    summary_fpath = join(dir_out, 'summary.all.txt')
    if exists(summary_fpath):
        os.remove(summary_fpath)
    sum_fhand = open(summary_fpath, 'a')
    try:
        for analysis in STAT_ANALYSIS:
            _run_stat_analysis(analysis, seq_path, dir_out, 'all', sum_fhand)

            if group_kind:
                for group in groups[group_kind]:
                    _run_stat_analysis(analysis, seq_path, dir_out, group,
                                       sum_fhand, group_kind=group_kind,
                                       groups=[group])
    finally:
        # the summary is closed even if one of the analyses fails
        sum_fhand.close()


def _run_stat_analysis(analysis, seq_path, dir_out, label, sum_fhand,
                       **analysis_kwargs):
    '''It runs one analysis and it writes its .dist and .svg files.

    label is the part of the file names that tells the runs apart ("all" or
    the group name). analysis_kwargs are passed as they are to the analysis
    function. All the files opened here are closed before returning.
    '''
    handles = []
    try:
        seq_fhand = open(seq_path, 'r')
        handles.append(seq_fhand)
        seqs = seqs_in_file(seq_fhand)
        dist_fhand = open(join(dir_out,
                               analysis + '_distrib.%s.dist' % label), 'w')
        handles.append(dist_fhand)
        svg_fhand = open(join(dir_out,
                              analysis + '_distrib.%s.svg' % label), 'w')
        handles.append(svg_fhand)
        STAT_ANALYSIS[analysis](seqs, distrib_fhand=dist_fhand,
                                plot_fhand=svg_fhand, summary_fhand=sum_fhand,
                                **analysis_kwargs)
    finally:
        for handle in handles:
            handle.close()
Ejemplo n.º 5
0
    def test_fasq():
        '''It tests that we can get a seq iter from a fastq file.

        The fastq, fastq-illumina and fastq-solexa formats are read and for
        each of them the number of records and the name, sequence and
        quality of every record are checked.
        '''
        cases = [
            ('fastq',
             ['@seq1', 'CCCT', '+', ';;3;',
              '@SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36',
              'GTTGC', '+', ';;;;;'],
             [('seq1', 'CCCT', [26, 26, 18, 26]),
              ('SRR001666.1', 'GTTGC', [26, 26, 26, 26, 26])]),
            ('fastq-illumina',
             ['@seq1', 'CCCT', '+', 'AAAA',
              '@seq2', 'GTTGC', '+', 'AAAAA'],
             [('seq1', 'CCCT', [1, 1, 1, 1]),
              ('seq2', 'GTTGC', [1, 1, 1, 1, 1])]),
            ('fastq-solexa',
             ['@seq1', 'CCCT', '+', 'BBBB',
              '@seq2', 'GTTGC', '+', 'BBBBB'],
             [('seq1', 'CCCT', [4, 4, 4, 4]),
              ('seq2', 'GTTGC', [4, 4, 4, 4, 4])]),
        ]
        for file_format, lines, expected in cases:
            # every line, the last one included, ends with a newline
            fhand = StringIO.StringIO('\n'.join(lines) + '\n')
            seqs = list(seqs_in_file(fhand, format=file_format))
            # without this check a reader that yields nothing would pass
            assert len(seqs) == len(expected)
            for seq, (name, residues, qual) in zip(seqs, expected):
                assert seq.name == name
                assert str(seq.seq) == residues
                assert seq.qual == qual
    def test_sanger(self):
        '''It tests the basic sanger cleaning.

        It checks the error messages written to stderr for the wrong command
        lines and the cleaning of reads with and without quality file.
        '''
        head = create_random_seqwithquality(500, qual_range=55)
        tail = create_random_seqwithquality(50, qual_range=15)
        reads = [head + tail]
        in_seq, in_qual = create_temp_seq_file(reads, format='qual')
        out_seq = NamedTemporaryFile()
        out_qual = NamedTemporaryFile()
        seq_io_args = ['-i', in_seq.name, '-o', out_seq.name]

        # the platform is required
        stderr = _call_python([CLEAN_READS] + seq_io_args)[1]
        assert 'required' in stderr

        # the platform should be one of the valid choices
        stderr = _call_python([CLEAN_READS] + seq_io_args + ['-p', 'hola'])[1]
        assert 'choice' in stderr

        # -x and --lucy_splice can not be used at the same time
        incompatible_opts = ['-p', 'sanger', '-x',
                             '--lucy_splice', 'splice.fasta']
        stderr = _call_python([CLEAN_READS] + seq_io_args +
                              incompatible_opts)[1]
        assert 'incompatible' in stderr

        # a sanger sequence with quality can be cleaned
        qual_cmd = [CLEAN_READS, '-i', in_seq.name, '-q', in_qual.name,
                    '-o', out_seq.name, '-u', out_qual.name,
                    '-p', 'sanger']
        exit_code = _call_python(qual_cmd)[2]
        assert exit_code == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_seq.name),
                                    qual_fhand=open(out_qual.name)))
        assert cleaned[0].qual[-1] == 55

        # with the quality trimming disabled the read comes back untouched
        exit_code = _call_python(qual_cmd + ['-x'])[2]
        assert exit_code == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_seq.name),
                                    qual_fhand=open(out_qual.name)))
        assert reads[0].seq == cleaned[0].seq

        # a sanger sequence without quality can be cleaned too
        head = create_random_seqwithquality(500, qual_range=55)
        reads = [SeqWithQuality(head.seq + Seq('NNNNNNNNNNNNNN'), name='Ns')]
        in_seq = create_temp_seq_file(reads, format='fasta')[0]
        out_seq = NamedTemporaryFile()
        no_qual_cmd = [CLEAN_READS, '-i', in_seq.name, '-o', out_seq.name,
                       '-p', 'sanger']
        exit_code = _call_python(no_qual_cmd)[2]
        assert exit_code == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_seq.name)))
        assert not str(cleaned[0].seq).lower().endswith('nnnnn')
Ejemplo n.º 7
0
    def test_csfasta_reader():
        '''It tests the csfasta reader.

        The same pair of sequence and quality files is read twice, first
        with the default options and then with double_encoding=True.
        '''
        csfasta_fhand = open(os.path.join(TEST_DATA_DIR, 'seq.csfasta'))
        solid_qual_fhand = open(os.path.join(TEST_DATA_DIR, 'solid_qual.qual'))

        plain_seqs = list(seqs_in_file(csfasta_fhand, solid_qual_fhand,
                                       format='csfasta'))
        first_seq = str(plain_seqs[0].seq)
        assert '121101332.0133.2221.23.2.21' in first_seq
        assert len(plain_seqs) == 3

        # NOTE(review): the same file handles are reused for this second
        # read - confirm that seqs_in_file does not need them rewound
        encoded_seqs = list(seqs_in_file(csfasta_fhand, solid_qual_fhand,
                                         format='csfasta',
                                         double_encoding=True))
        first_seq = str(encoded_seqs[0].seq)
        assert 'TTGNACTTNGGGCNGTNGNGCA' in first_seq
Ejemplo n.º 8
0
    def test_adaptors(self):
        '''It removes adaptors.

        It checks the cleaning with an adaptors file given to the -a option
        and the behaviour of -a without any value for the 454 and illumina
        platforms.
        '''
        prefix = create_random_seqwithquality(5, qual_range=35)
        adaptor = create_random_seqwithquality(15, qual_range=35)
        insert = create_random_seqwithquality(50, qual_range=35)
        reads = [prefix + adaptor + insert]
        in_fhand = create_temp_seq_file(reads, format='fastq')[0]
        out_fhand = NamedTemporaryFile()
        adaptor_fhand = create_temp_seq_file([adaptor], format='fasta')[0]
        cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
               '-p', 'illumina', '-f', 'fastq', '-a', adaptor_fhand.name]
        exit_code = _call_python(cmd)[-1]
        assert exit_code == 0
        cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                    format='fastq'))
        assert insert.seq == cleaned[0].seq

        # -a without an adaptors file
        prefix = create_random_seqwithquality(5, qual_range=35)
        adaptor = create_random_seqwithquality(15, qual_range=35)
        insert = create_random_seqwithquality(50, qual_range=35)
        reads = [prefix + adaptor + insert]
        in_fhand = create_temp_seq_file(reads, format='fastq')[0]
        out_fhand = NamedTemporaryFile()
        io_args = ['-i', in_fhand.name, '-o', out_fhand.name]

        # for 454 the run succeeds and the adaptors file is printed
        cmd = [CLEAN_READS] + io_args + ['-p', '454', '-f', 'fastq', '-a']
        stdout, stderr, exit_code = _call_python(cmd)
        assert exit_code == 0
        assert "--adaptors_file: {'454': '" in stdout

        # for illumina the run fails with an error message
        cmd = [CLEAN_READS] + io_args + ['-p', 'illumina', '-f', 'fastq', '-a']
        stdout, stderr, exit_code = _call_python(cmd)
        assert 'clean_reads does not have default adaptors file' in stderr
        assert exit_code == 14
Ejemplo n.º 9
0
    def _get_items(self):
        '''It yields the items in the GFF file.

        The file found in self._fpath is opened with the mode self._mode and
        it is read line by line. The empty lines and the ##gff-version
        directive are skipped. For the rest of the lines a (kind, item)
        tuple is yielded:
            - (METADATA, text) for the lines that start with ##, the text is
              the line without the ##.
            - (COMMENT, text) for the lines that start with #, the text is
              the line without the #.
            - (FEATURE, feature) for any other line, the feature is the
              result of self._create_feature(line). If that result is None
              nothing is yielded for the line.
            - (FASTA, seqs) when the ##FASTA directive is found, seqs is the
              result of calling seqs_in_file with the open file. Nothing
              else is yielded after this item.
        '''

        # NOTE(review): this file is not explicitly closed in this method
        fhand = open(self._fpath, self._mode)

        for line in fhand:
            line = line.strip()
            if not line:
                continue
            if line.startswith('##gff-version'):
                continue #this has been taken into account
            elif line.startswith('##FASTA'):
                #in the next line we're assuming that the fasta reading is
                #sequential and that there is no seek
                items = seqs_in_file(fhand, format='fasta')
                yield FASTA, items
                #the rest of the file belongs to the fasta section
                break
            elif line.startswith('##'):
                item = line[2:]
                kind = METADATA
            elif line.startswith('#'):
                item = line[1:]
                kind = COMMENT
            else:
                item = self._create_feature(line)
                kind = FEATURE
            if item is not None:
                yield kind, item
def draw_sequence_distribution(seq_path, dir_out, group_kind,
                               window_width, window_step,
                               value_kind):
    '''It creates a wig file, given a value kind, with the distribution of that
    value along a given sequence.

    One wig is written with write_wig for every group of the given
    group_kind found by get_groups in the sequence file. When a profile is
    obtained and both window_width and window_step are true values the
    profile is smoothed with apply_window before writing it.

    seq_path: path to the sequence file.
    dir_out: directory passed to write_wig.
    group_kind: a key of the dict returned by get_groups.
    window_width, window_step: the window parameters, or false values to
    write the raw profile.
    value_kind: 'het', 'pic' or 'maf'.

    It raises a ValueError if the value_kind is not one of the known ones.
    '''
    # an unknown kind used to leave the profile unset (or, even worse, set
    # to the profile of the previous group), so we fail before doing any work
    if value_kind not in ('het', 'pic', 'maf'):
        raise ValueError('Unknown value kind: %r' % (value_kind,))
    calculators = {'het': calculate_hets_group,
                   'pic': calculate_pics_group,
                   'maf': calculate_mafs_group}
    calculate_profile = calculators[value_kind]

    groups = get_groups(seq_path)
    for group in groups[group_kind]:
        seq_fhand = open(seq_path, 'r')
        try:
            # the file is kept open until the wig is written, just in case
            # the profile is calculated lazily from the sequences
            seqs = seqs_in_file(seq_fhand)
            profile = calculate_profile(seqs, groups=[group],
                                        group_kind=group_kind)

            if profile and window_width and window_step:
                new_profile = apply_window(profile,
                                           window_width=window_width,
                                           window_step=window_step)

                write_wig(dir_out, new_profile, group, window_width,
                          window_step)
            else:
                write_wig(dir_out, profile, group)
        finally:
            seq_fhand.close()
Ejemplo n.º 11
0
    def test_strip_seq_by_quality_lucy():
        """It tests the striper created by create_striper_by_quality_lucy.

        Two sequences with quality are passed through the lucy striper and
        through the trimmer created by create_seq_trim_and_masker, and the
        edges and the description of the result are checked. After that a
        striper created with vector and splice files is run on the
        sequences of a fastq file.
        """
        seq = "ATCGATCAGTCAGACTGACAGACTCAGATCAGATCAGCATCAGCATACGATACGCATCAGACT"
        seq += "ACGATCGATCGATCGACAGATCATCGATCATCGACGACTAGACGATCATCGATACGCAGACTC"
        seq += "AGCAGACTACGAGATCAGCAGCATCAGCAGCA"
        # the first sequence has qualities of 00 in both ends and of 60 in
        # the middle
        qual = "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 "
        qual += "00 00 00 00 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 60 "
        qual += "60 60 60 60 60 60 60 60 60 60 60 60 60 60 00 00 00 00"
        # the qualities are kept as a list of strings
        qual = qual.split()
        seq = Seq(seq)
        seqrec1 = SeqWithQuality(name="seq1", seq=seq, qual=qual, description="desc1")

        # the second sequence has mixed qualities
        qual = "40 40 40 37 40 40 37 37 37 37 37 37 37 37 40 42 42 42 44 44 "
        qual += "56 56 42 40 40 40 40 36 36 28 35 32 35 35 40 42 37 37 35 37 "
        qual += "32 35 35 35 35 35 35 38 33 33 24 33 33 42 33 35 35 35 35 33 "
        qual += "36 30 30 24 29 29 35 35 35 35 29 29 29 35 38 38 38 37 35 33 "
        qual += "29 35 35 34 30 30 30 33 29 31 31 29 29 29 28 28 24 21 16 16 "
        qual += "21 24 29 29 32 40 27 27 25 25 21 30 27 28 28 32 23 23 21 24 "
        qual += "24 17 18 19 21 15 19 11 9 9 11 23 17 15 10 10 10 20 27 25 23 "
        qual += "18 22 23 24 18 10 10 13 13 18 19 10 12 12 18 16 14 10 10 11 "
        qual += "16 13 21 19 31 19 27 27 28 26 29 25 25 20 19 23 28 28 19 20 "
        qual += "13 9 9 9 9 9 17 15 21 17 14 12 21 17 19 24 28 24 23 "
        quality = qual.split()
        seq = "ATCGATCAGTCAGACTGACAGACTCAGATCAGATCAGCATCAGCATACGATACGCATCAGACT"
        seq += "ACGATCGATCGATCGACAGATCATCGATCATCGACGACTAGACGATCATCGATACGCAGACTC"
        seq += "AGCAGACTACGAGATCAGCAGCATCAGCAGCAAGCAGACTACGAGATCAGCAGCATCAGCAGC"
        seq += "ATTACGATGAT"
        seq = Seq(seq)
        seqrec2 = SeqWithQuality(seq=seq, qual=quality, name="seq2", description="desc2")
        seq_iter = iter([seqrec1, seqrec2])
        seq_trimmer = create_seq_trim_and_masker()
        lucy_striper = create_striper_by_quality_lucy()
        # pylint:disable-msg=W0612
        # the striper takes an iterator of sequences and its result is
        # iterated to get the sequences that are passed to the trimmer
        seq_iter = lucy_striper(seq_iter)
        new_seqs = []
        for seq in seq_iter:
            new_seqs.append(seq_trimmer(seq))
        seq = new_seqs[0].seq
        # the description should be kept and the edges should be removed
        assert seqrec1.description == new_seqs[0].description
        assert seq.startswith("CAGATCAGATCAGCATCAGCAT")
        assert seq.endswith("CGAGATCAGCAGCATCAGC")
        assert len(new_seqs) == 2
        assert new_seqs[1].description == "desc2"

        # now we test the sequence with adaptors
        vector_fpath = os.path.join(TEST_DATA_DIR, "lucy", "icugi_vector.fasta")
        splice_fpath = os.path.join(TEST_DATA_DIR, "lucy", "icugi_splice.fasta")
        parameters = {"vector": [vector_fpath, splice_fpath], "bracket": [10, 0.02]}
        lucy_striper = create_striper_by_quality_lucy(parameters)
        seq_fhand = open(os.path.join(TEST_DATA_DIR, "lucy", "seq_with_adaptor1.fastq"))
        seq_iter = lucy_striper(seqs_in_file(seq_fhand, format="fastq"))
        # NOTE(review): nothing is asserted about these sequences in the
        # lines visible here, it is only checked that the run does not fail
        new_seqs = []
        for seq in seq_iter:
            new_seqs.append(seq_trimmer(seq))
Ejemplo n.º 12
0
 def test_seqs_in_file():
     '''It tests that we get seqs without quality from a sequence file.

     The number of sequences read and the name and the sequence of each of
     them are checked.
     '''
     fcontent = '>hola\nACGATCTAGTCATCA\n>caracola\nATCGTAGCTGATGT'
     fhand = StringIO.StringIO(fcontent)
     expected = [('hola', 'ACGATCTAGTCATCA'), ('caracola', 'ATCGTAGCTGATGT')]
     seqs = list(seqs_in_file(fhand))
     # without this check a reader that yields nothing would make the loop
     # below pass without running a single assertion
     assert len(seqs) == len(expected)
     for seq, (name, residues) in zip(seqs, expected):
         assert seq.name == name
         assert str(seq.seq) == residues
Ejemplo n.º 13
0
def _process_sequences(in_fhand_seqs, in_fhand_qual, file_format, pipeline,
                       configuration):
    '''It returns the result of running the pipeline on the sequences.

    The sequences are read with seqs_in_file from the sequence and quality
    files using the given file format, and they are passed together with
    the pipeline and the configuration to _pipeline_builder.
    '''
    raw_seqs = seqs_in_file(in_fhand_seqs, in_fhand_qual, file_format)
    # the pipeline builder wraps the sequences with the pipeline steps
    return _pipeline_builder(pipeline, raw_seqs, configuration)
Ejemplo n.º 14
0
def get_orfs(infhand):
    '''It yields the dna of the orfs found in the sequences of the file.

    Only the orf features whose pep qualifier passes check_pep and whose
    dna qualifier passes check_dna are taken into account.
    '''
    for seq in seqs_in_file(infhand):
        for orf in seq.get_features(kind='orf'):
            dna, pep = orf.qualifiers['dna'], orf.qualifiers['pep']
            if not check_pep(pep) or not check_dna(dna):
                continue
            yield dna
Ejemplo n.º 15
0
    def run(self):
        '''It runs the analysis.

        For every path in the pickle inputs the annotation statistics are
        written by do_annotation_statistics into a .txt file, named after
        the basename of the input, in the result output directory.
        '''
        output_dir = self._create_output_dirs()['result']
        pickle_paths = self._get_input_fpaths()['pickle']

        for seq_path in pickle_paths:
            output_fpath = join(output_dir, seq_path.basename + '.txt')
            # both files are closed here, otherwise the statistics written
            # could be left unflushed in an open handle
            with open(seq_path.last_version, 'r') as seq_fhand:
                with open(output_fpath, 'w') as out_fhand:
                    do_annotation_statistics(seqs_in_file(seq_fhand),
                                             out_fhand)
def get_groups(fpath):
    '''It returns the read groups found in the snvs of a sequence file.

    The result is a dict with the keys LB, PL and SM. Every value is the
    list, without repetitions and in order of appearance, of the groups of
    that kind found in the read_groups qualifier of the snv features.

    A KeyError is raised if a read group has a tag that is not LB, PL or SM.
    '''
    groups = {'LB': [], 'PL': [], 'SM': []}

    # the file is closed as soon as all the sequences have been read
    with open(fpath) as fhand:
        for seq in seqs_in_file(fhand):
            for snv in seq.get_features(kind='snv'):
                read_groups = snv.qualifiers['read_groups']
                for tags in read_groups.values():
                    for group_kind, group in tags.items():
                        known_groups = groups[group_kind]
                        if group not in known_groups:
                            known_groups.append(group)
    return groups
Ejemplo n.º 17
0
def check_sequences_length(fhand, min_length=None, max_length=None):
    """It checks that all sequences in the file have the given lengths.

    fhand: an object with a name attribute that holds the path to the
    sequence file, or None. If it is None nothing is checked.
    min_length: the minimum allowed length, None to skip this check.
    max_length: the maximum allowed length, None to skip this check.

    It raises a ValueError for the first sequence shorter than min_length
    or longer than max_length.
    """
    if fhand is None:
        return
    # the file is reopened by name, so this is the handle that we own and
    # that we have to close, even when the check fails
    with open(fhand.name) as seq_fhand:
        for seq in seqs_in_file(seq_fhand):
            len_seq = len(seq)
            if min_length is not None and len_seq < min_length:
                msg = "Sequence %s is shorter than %i residues" % (seq.name,
                                                                   min_length)
                raise ValueError(msg)
            if max_length is not None and len_seq > max_length:
                msg = "Sequence %s is longer than %i residues" % (seq.name,
                                                                  max_length)
                raise ValueError(msg)
 def test_edge_trim(self):
     '''It trims the sequence edges.

     clean_reads is run with the option -e 10,10 and the cleaned read is
     expected to be 20 residues shorter than the original one.
     '''
     read = create_random_seqwithquality(250, qual_range=35)
     reads = [read]
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     io_args = ['-i', in_fhand.name, '-o', out_fhand.name]
     options = ['-p', '454', '-f', 'fastq', '-e', '10,10']
     exit_code = _call_python([CLEAN_READS] + io_args + options)[-1]
     assert exit_code == 0
     trimmed = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     assert len(read.seq) - len(trimmed[0].seq) == 20
Ejemplo n.º 19
0
def main():
    """The main part.

    It prepares an iassembler project with the results of a mira assembly:
    it creates the project directories, it writes the contig read list, it
    copies the renamed unigenes with their qualities and it writes the
    iassembler input files with the sequences of the mira input files.
    Finally it prints the command required to run iassembler.
    """

    mira_path, iassembler_path = set_parameters()

    # guess the mira files that we need
    unigenes_fpath, unigenes_qual_fpath, mira_contig_read_fpath = get_mira_paths(mira_path)

    # create the iassembler project dir and subdirs
    if not os.path.exists(iassembler_path):
        os.makedirs(iassembler_path)
    mira_1_dir = os.path.join(iassembler_path, "{0:s}_Assembly".format(IASSEMBLER_INPUT_NAME), "mira")
    os.makedirs(mira_1_dir)

    # prepare contig readlist for iassembler
    iassembler_contig_mem_fpath = os.path.join(mira_1_dir, "CMF10")
    process_contig_readlist(mira_contig_read_fpath, iassembler_contig_mem_fpath)

    # copy unigene files into the iassembler project
    _copy_unigenes(unigenes_fpath, unigenes_qual_fpath, mira_1_dir)

    # create iassembler input files.
    _write_iassembler_input(mira_path, iassembler_path)

    msg = "To run iassembler you must use this command:\n"
    msg += "iassembler -c -i {0:s}\n".format(IASSEMBLER_INPUT_NAME)
    msg += "From your iassembler dir:{0:s}".format(iassembler_path)
    # with just one argument this form prints the same in python 2 and 3
    print(msg)


def _copy_unigenes(unigenes_fpath, unigenes_qual_fpath, mira_1_dir):
    "It writes the unigenes, renamed to mira_*, as mira2.fa and mira2.fa.qual"
    out_seq_fpath = os.path.join(mira_1_dir, "mira2.fa")
    out_qual_fpath = os.path.join(mira_1_dir, "mira2.fa.qual")
    # all the files are closed when we are done, so the output is flushed
    with open(out_seq_fpath, "w") as out_seq_fhand:
        with open(out_qual_fpath, "w") as out_qual_fhand:
            seq_writer = SequenceWriter(
                fhand=out_seq_fhand, file_format="fasta", qual_fhand=out_qual_fhand
            )
            with open(unigenes_fpath) as in_seq_fhand:
                with open(unigenes_qual_fpath) as in_qual_fhand:
                    seqs = seqs_in_file(seq_fhand=in_seq_fhand, format="sfastq", qual_fhand=in_qual_fhand)
                    for seq in seqs:
                        # the part of the name before the first _ is
                        # replaced by mira
                        seq.name = "mira_{0:s}".format(seq.name.split("_", 1)[1])
                        seq.id = seq.name
                        seq_writer.write(seq)


def _write_iassembler_input(mira_path, iassembler_path):
    "It writes the seqs of the mira *_in* files as fasta and qual files"
    seq_fpath = os.path.join(iassembler_path, IASSEMBLER_INPUT_NAME)
    qual_fpath = os.path.join(iassembler_path, IASSEMBLER_INPUT_NAME + ".qual")
    mira_parent_dir = os.path.join(mira_path, "..")
    with open(seq_fpath, "w") as seq_fhand:
        with open(qual_fpath, "w") as qual_fhand:
            for file_ in os.listdir(mira_parent_dir):
                if "_in" not in file_:
                    continue
                with open(os.path.join(mira_path, "..", file_)) as in_fhand:
                    seqio(
                        in_seq_fhand=in_fhand,
                        out_seq_fhand=seq_fhand,
                        out_qual_fhand=qual_fhand,
                        in_format="sfastq",
                        out_format="fasta",
                    )
Ejemplo n.º 20
0
    def test_json_reader():
        '''It tests that the sequences written as json can be read back.

        Two sequences are written to a temporary .json file and the
        sequences read from that file are compared with the original ones.
        '''
        # first the sequences are written
        written = [SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1'),
                   SeqWithQuality(seq=Seq('GATACCA', DNAAlphabet()),
                                  name='seq2')]
        json_fhand = tempfile.NamedTemporaryFile(suffix='.json')
        write_seqs_in_file(written, json_fhand, format='json')
        json_fhand.flush()

        # and now they are read from the same file
        read = list(seqs_in_file(open(json_fhand.name)))
        assert read[0].seq == written[0].seq
        assert read[1].seq == written[1].seq
        read_alphabet = str(read[1].seq.alphabet)
        assert read_alphabet == str(written[1].seq.alphabet)
 def test_words(self):
     '''It trims re words.

     The read starts with ACGT-like word ACTG, that matches one of the
     regular expressions given to the -r option, and the cleaned read is
     expected to be equal to the part that follows it.
     '''
     word = 'ACTG'
     word_seq = SeqWithQuality(Seq(word), name='vect', qual=[30] * len(word))
     insert = create_random_seqwithquality(250, qual_range=35)
     reads = [word_seq + insert]
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     io_args = ['-i', in_fhand.name, '-o', out_fhand.name]
     options = ['-p', '454', '-f', 'fastq',
                '-r', '"^ACTG","TTTTTTTTTTTTTT"']
     exit_code = _call_python([CLEAN_READS] + io_args + options)[-1]
     assert exit_code == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     assert insert.seq == cleaned[0].seq
 def test_vector(self):
     '''It removes the vector.

     The read is built as random + a slice of the vector + random and the
     whole vector is given in a fasta file to the -v option.
     '''
     prefix = create_random_seqwithquality(5, qual_range=35)
     vector = create_random_seqwithquality(3000, qual_range=35)
     insert = create_random_seqwithquality(250, qual_range=35)
     reads = [prefix + vector[30:60] + insert]
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     vector_fhand = create_temp_seq_file([vector], format='fasta')[0]
     io_args = ['-i', in_fhand.name, '-o', out_fhand.name]
     options = ['-p', '454', '-f', 'fastq', '-v', vector_fhand.name]
     exit_code = _call_python([CLEAN_READS] + io_args + options)[-1]
     assert exit_code == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     # the cleaned read should be less than 5 residues shorter than the
     # part that follows the vector
     assert (len(insert.seq) - len(cleaned[0].seq)) < 5
Ejemplo n.º 23
0
def do_snv_stats(seq_path, out_dir):
    """It performs snv statistics.

    Every analysis in STAT_ANALYSIS is run on the sequences found in
    seq_path.last_version. For each one a .dist and a .svg file are written
    in out_dir. All the summaries go to the distrib.sum file, that is
    created anew by the first analysis and appended to by the rest.
    """
    sum_fpath = join(out_dir, "distrib.sum")
    # the first analysis truncates the summary, the rest append to it
    sum_mode = "w"
    for analysis in STAT_ANALYSIS:
        handles = []
        try:
            seq_fhand = open(seq_path.last_version, "r")
            handles.append(seq_fhand)
            seqs = seqs_in_file(seq_fhand)
            dist_fhand = open(join(out_dir, analysis + "_distrib.dist"), "w")
            handles.append(dist_fhand)
            svg_fhand = open(join(out_dir, analysis + "_distrib.svg"), "w")
            handles.append(svg_fhand)
            sum_fhand = open(sum_fpath, sum_mode)
            handles.append(sum_fhand)
            sum_mode = "a"
            STAT_ANALYSIS[analysis](seqs, distrib_fhand=dist_fhand, plot_fhand=svg_fhand, summary_fhand=sum_fhand)
        finally:
            # every file opened is closed even if the analysis fails
            for handle in handles:
                handle.close()
 def test_min_length(self):
     '''Filtering by length.

     Reads of 250, 50 and 250 residues are cleaned with the option -m 51
     and only the two reads of 250 residues are expected in the output.
     '''
     reads = [create_random_seqwithquality(length, qual_range=35)
              for length in (250, 50, 250)]
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     io_args = ['-i', in_fhand.name, '-o', out_fhand.name]
     options = ['-p', '454', '-f', 'fastq', '-m', '51']
     exit_code = _call_python([CLEAN_READS] + io_args + options)[-1]
     assert exit_code == 0
     kept = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                              format='fastq'))
     assert len(kept) == 2
     for index in (0, 1):
         assert len(kept[index]) == 250
 def test_trim_as_mask(self):
     '''It masks the regions to trim.

     With --mask_no_trim the length of the read should not change and the
     edges given to the -e option should be in lower case.
     '''
     read = create_random_seqwithquality(250, qual_range=35)
     reads = [read]
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     io_args = ['-i', in_fhand.name, '-o', out_fhand.name]
     options = ['-p', '454', '-f', 'fastq', '-e', '10,10', '--mask_no_trim']
     exit_code = _call_python([CLEAN_READS] + io_args + options)[-1]
     assert exit_code == 0
     masked = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                format='fastq'))
     assert len(read.seq) == len(masked[0].seq)
     residues = str(masked[0].seq)
     # NOTE(review): the residue with the index 9 is not covered by any of
     # these checks - confirm whether [0:10] was meant
     assert residues[0:9].islower()
     assert residues[10:len(residues) - 10].isupper()
     assert residues[-10:].islower()
 def test_fastq(self):
     '''Cleaning fastq seqs in parallel.

     Three reads are cleaned with the option -t 4 and the last quality of
     the first cleaned read is checked.
     '''
     reads = []
     for _ in range(3):
         head = create_random_seqwithquality(500, qual_range=55)
         tail = create_random_seqwithquality(50, qual_range=15)
         reads.append(head + tail)
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     # the sanger cleaning is run using 4 threads
     io_args = ['-i', in_fhand.name, '-o', out_fhand.name]
     options = ['-p', 'sanger', '-t', '4', '-f', 'fastq']
     exit_code = _call_python([CLEAN_READS] + io_args + options)[-1]
     assert exit_code == 0
     cleaned = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                                 format='fastq'))
     assert cleaned[0].qual[-1] == 55
 def test_filter(self):
     '''Filtering by blast similarity.

     Three reads are cleaned using the arabidopsis blast database, given
     twice, in the --filter_dbs option and only two reads are expected in
     the output.
     '''
     first_random = create_random_seqwithquality(150, qual_range=35)
     ara_residues = ''.join((
         'CACTATCTCCGACGACGGCGATTTCACCGTTGACCTGATTTCCAGTTGCTACGTCAAGTTCTC',
         'TACGGCAAGAATATCGCCGGAAAACTCAGTTACGGATCTGTTAAAGACGTCCGTGGAATCCA',
         'AGCTAAAGAAGCTTTCCTTTGGCTACCAATCACCGCCATGGAATCGGATCCAAGCTCTGCCA',
     ))
     ara_seq = SeqWithQuality(Seq(ara_residues), name='ara',
                              qual=[30] * len(ara_residues))
     second_random = create_random_seqwithquality(150, qual_range=35)
     reads = [first_random, ara_seq, second_random]
     in_fhand = create_temp_seq_file(reads, format='fastq')[0]
     out_fhand = NamedTemporaryFile()
     ara_db = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
     filter_dbs = ','.join([ara_db, ara_db])
     cmd = [CLEAN_READS, '-i', in_fhand.name, '-o', out_fhand.name,
            '-p', '454', '-f', 'fastq',
            '--filter_dbs', filter_dbs]
     exit_code = _call_python(cmd)[-1]
     assert exit_code == 0
     kept = list(seqs_in_file(seq_fhand=open(out_fhand.name),
                              format='fastq'))
     assert len(kept) == 2
Ejemplo n.º 28
0
    def test_pickle_writer():
        '''It tests the pickle sequence writer.

        Two sequences, the second one with a snv feature, are written in
        the pickle format and the qualifiers of the feature read back are
        compared with the original ones.
        '''
        plain_seq = SeqWithQuality(seq=Seq('ATGATAGATAGATGF'), name='seq1')
        alleles = {('G', 3): {}}
        filters = {'a_filter':{('param',):False}}
        snv_location = FeatureLocation(ExactPosition(3), ExactPosition(3))
        snv_qualifiers = {'alleles':alleles, 'filters':filters}
        snv = SeqFeature(snv_location, type='snv', qualifiers=snv_qualifiers)
        snv_seq = SeqWithQuality(seq=Seq('GATACCA'), name='seq2',
                                 features=[snv])
        pickle_fhand = StringIO()
        write_seqs_in_file([plain_seq, snv_seq], pickle_fhand, format='pickle')

        # the same handle is rewound and read
        pickle_fhand.seek(0)
        read = list(seqs_in_file(pickle_fhand))
        read_qualifiers = read[1].features[0].qualifiers
        assert read_qualifiers['alleles'] == alleles
        assert read[1].features[0].qualifiers['filters'] == filters
Ejemplo n.º 29
0
def seqio(in_seq_fhand, out_seq_fhand, out_format, double_encoding=False,
          in_qual_fhand=None, out_qual_fhand=None, in_format=None):
    '''It converts the format of the sequence files.

    If no in_format is given it is guessed with guess_seq_file_format. The
    conversion is done with seqs_in_file and write_seqs_in_file when a qual
    file is involved or when the input or the output format is repr, json
    or pickle. In any other case SeqIO.convert is used. The output files
    are flushed before returning.
    '''
    own_formats = ('repr', 'json', 'pickle')
    if not in_format:
        in_format = guess_seq_file_format(in_seq_fhand)

    with_qual_files = (in_qual_fhand is not None or
                       out_qual_fhand is not None)
    use_own_io = (with_qual_files or
                  in_format in own_formats or
                  out_format in own_formats)
    if not use_own_io:
        SeqIO.convert(in_seq_fhand, in_format, out_seq_fhand, out_format)
    else:
        seqs = seqs_in_file(seq_fhand=in_seq_fhand,
                            qual_fhand=in_qual_fhand,
                            format=in_format, double_encoding=double_encoding)
        write_seqs_in_file(seqs, seq_fhand=out_seq_fhand,
                           qual_fhand=out_qual_fhand,
                           format=out_format)

    out_seq_fhand.flush()
    if out_qual_fhand:
        out_qual_fhand.flush()
Ejemplo n.º 30
0
def do_transitive_clustering_all(blast_fhand, seqs_fhand, filters=None):
    """It does a transitive clustering given an xml blast result and the
    initial seq file.

    It looks for the pairs of similar sequences in the blast and it creates
    the clusters of sequences using the transitive property in the pairs.
    e.g. a is similar to b. b to c => a, b and c belong to the same cluster.

    The seqs file is used to find the sequences that are not similar to any
    other sequence. It returns a tuple with the clusters and the set of
    names of the sequences without a match.
    """
    similar_pairs = list(get_hit_pairs_from_blast(blast_fhand,
                                                  filters=filters))
    clusters = do_transitive_clustering(similar_pairs)

    # the names that appear in any of the pairs
    seqs_with_hit = set()
    for pair in similar_pairs:
        seqs_with_hit.update(pair)

    all_seqs = set(seq.name for seq in seqs_in_file(seqs_fhand))

    return clusters, all_seqs.difference(seqs_with_hit)