def main():
    """Run the script: compare the distributions of two sequence files.

    The analysis kind, the file handles and the format of each input are
    taken from set_parameters(). Both inputs are read with seqs_in_file()
    and handed over to seq_distrib_diff() together with the 'distrib' and
    'plot' output handles.
    """
    kind, fhands, first_format, second_format = set_parameters()

    first_seqs = seqs_in_file(fhands['seqfile1'], fhands['qualfile1'],
                              first_format)
    second_seqs = seqs_in_file(fhands['seqfile2'], fhands['qualfile2'],
                               second_format)

    output_fhands = {'distrib_fhand': fhands['distrib'],
                     'plot_fhand': fhands['plot']}
    seq_distrib_diff(first_seqs, second_seqs, kind, **output_fhands)
def main():
    """Run the script: calculate a distribution for one sequence file.

    The analysis kind, the file handles and the input format are taken from
    set_parameters(). The result is delegated to seq_distrib(), which is
    asked to work in low memory mode.
    """
    kind, fhands, seq_format = set_parameters()

    # The quality file is optional: it is only handed over when there is one.
    qual_fhand = fhands['qualfile1']
    if qual_fhand:
        sequences = seqs_in_file(fhands['seqfile1'], qual_fhand,
                                 format=seq_format)
    else:
        sequences = seqs_in_file(fhands['seqfile1'], format=seq_format)

    seq_distrib(sequences=sequences, kind=kind,
                distrib_fhand=fhands['distrib'], plot_fhand=fhands['plot'],
                low_memory=True)
    def test_pipeline_run():
        """It tests that the 'sanger_with_qual' pipeline runs ok.

        The pipeline is configured with a vector blast database and with a
        temporary adaptors file, it is fed with the seq.fasta and qual.fasta
        test files and the filtered sequences are checked.
        """
        pipeline = 'sanger_with_qual'
        univec = os.path.join(TEST_DATA_DIR, 'blast', 'arabidopsis_genes+')
        seq_fpath = os.path.join(TEST_DATA_DIR, 'seq.fasta')
        qual_fpath = os.path.join(TEST_DATA_DIR, 'qual.fasta')

        # The context managers close every handle, even when the pipeline
        # or an assertion fails. The assertions stay inside the blocks so
        # the inputs are still open while the result is checked.
        with NamedTemporaryFile() as fhand_adaptors:
            fhand_adaptors.write(ADAPTORS)
            # the pipeline reads the adaptors by file name, so the content
            # has to be on disk before it runs
            fhand_adaptors.flush()

            configuration = {
                'remove_vectors_blastdb': {'vectors': univec},
                'remove_adaptors': {'adaptors': fhand_adaptors.name}}

            with open(seq_fpath, 'r') as seq_fhand:
                with open(qual_fpath, 'r') as qual_fhand:
                    seq_iter = seqs_in_file(seq_fhand, qual_fhand)
                    filtered_seq_iter = _pipeline_builder(pipeline, seq_iter,
                                                          configuration)

                    seq_list = list(filtered_seq_iter)
                    assert 'CGAtcgggggg' in str(seq_list[0].seq)
                    assert len(seq_list) == 6
Beispiel #4
0
 def _get_lengths_quals_from_file(seq_fpath):
     """Given a sequence file it returns the lengths and quals.

     seq_fpath is the path of the file, that is read with seqs_in_file().

     It returns a (lengths, quals) tuple with two IntsStats: the first one
     gets the length of every sequence and the second one every quality
     value of the sequences that have a qual.
     """
     lengths = IntsStats(init_len=1000)
     quals = IntsStats(init_len=100)
     # the file is read completely inside the with block, so it can be
     # closed as soon as the loop is over
     with open(seq_fpath) as seq_fhand:
         for seq in seqs_in_file(seq_fhand):
             lengths.append(len(seq))
             qual = seq.qual
             if qual:
                 quals.extend(qual)
     return lengths, quals
def _change_names_in_files_by_seq(fhand_in, fhand_out, naming, file_format):
    """Copy the seqs from fhand_in to fhand_out giving them a new name.

    The new name of each seq is requested to naming.get_uniquename() with
    the current name of the seq, and it is stored in both the name and the
    id of the seq. The seqs are read and written one by one using
    file_format.
    """
    for record in seqs_in_file(fhand_in, format=file_format):
        unique_name = naming.get_uniquename(get_seq_name(record))
        record.name = record.id = unique_name
        write_seqs_in_file([record], fhand_out, format=file_format)
def main():
    """Run the script: split the seqs by their masked regions.

    The file handles and the minimum length are taken from set_parameters()
    and the resulting seqs are written with write_fasta_file().
    """
    fhands, min_length = set_parameters()

    # Read the input sequences together with their qualities
    input_seqs = seqs_in_file(fhands['in_seq'], fhands['in_qual'])

    # Split them with split_seq_by_masked_regions
    split_seqs = split_seq_by_masked_regions(input_seqs, min_length)

    # Write the resulting seqs to the output sequence and quality files
    write_fasta_file(split_seqs, fhands['out_seq'], fhands['out_qual'])
Beispiel #7
0
    def _get_quals_by_length_from_file(fpath):
        """It returns the qualities along the sequence grouped by position.

        fpath is the path of the sequence file, that is read with
        seqs_in_file().

        The result is a list with one array.array('B') per base position:
        the item i holds the quality found at the position i of every
        sequence long enough to have it. Sequences without qual are ignored.
        As the arrays store unsigned bytes the qualities have to be
        integers between 0 and 255.
        """
        quals_by_position = []
        # the file is read completely inside the with block, so it can be
        # closed as soon as the loop is over
        with open(fpath) as seq_fhand:
            for seq in seqs_in_file(seq_fhand):
                quals = seq.qual
                if not quals:
                    continue
                for base_number, qual in enumerate(quals):
                    # the positions are visited in order, so a position
                    # without array is always the next one to be created
                    if base_number >= len(quals_by_position):
                        quals_by_position.append(array.array('B'))
                    quals_by_position[base_number].append(qual)
        return quals_by_position
def main():
    """Run the script: split a sequence file into one file per tag item.

    The input handle, the output directory, the tag and the file format are
    taken from set_parameters(). For every seq an item is obtained with
    get_item_from_tag() from the seq description, and the seq is written to
    the file of that item. The output files are created inside work_dir and
    they are named <input name without extension>.<item>.<format>.
    """
    infhand, work_dir, tag, file_format = set_parameters()

    # one output file handle for every item found so far
    out_fhands = {}
    try:
        for seq in seqs_in_file(infhand, format=file_format):
            item = get_item_from_tag(seq.description, tag)
            if item not in out_fhands:
                # splitext removes only the extension, so the dots found in
                # the rest of the name (or in its directory) are kept
                base_name = os.path.splitext(infhand.name)[0]
                fname = base_name + '.' + item + '.' + file_format
                out_fhands[item] = open(os.path.join(work_dir, fname), 'w')

            write_seqs_in_file([seq], out_fhands[item], format=file_format)
    finally:
        # the outputs are closed even when reading or writing a seq fails
        for fhand in out_fhands.values():
            fhand.close()
def main():
    """Run the script: print the general statistics of a sequence file.

    The input handles, the result file and the file format are taken from
    set_parameters(). Every statistic that is not None is written as a
    'name : value' line, the value being formatted as an integer.
    """
    fhand_seq, fhand_qual, result_file, file_format = set_parameters()

    stats = general_seq_statistics(seqs_in_file(fhand_seq, fhand_qual,
                                                file_format))

    # Without a result file the statistics go to the standard output
    output = result_file or sys.stdout

    for key, value in stats.items():
        if value is None:
            continue
        output.write('%-19s : %d\n' % (key, value))
Beispiel #10
0
def merge_sam(infiles, outfile, reference):
    """It merges a list of sam files.

    infiles is a list of sam file handles. Each one is rewound and read
    twice, so they have to be seekable.
    outfile is the handle the merged sam is written to. It is flushed, but
    not closed.
    reference is passed to seqs_in_file() to get the reference seqs.

    The header is rebuilt. The @SQ and @PG lines of the inputs and any
    header line with an 'SO:' field are dropped. The other header lines are
    written once, in the order in which they are first found, followed by
    one @SQ line for every seq in reference. After the header come the non
    header lines of every input, in the order given by infiles.
    """
    #first the reference part of the header
    ref_header = [['@SQ', 'SN:%s' % seq.name, 'LN:%d' % len(seq)]
                  for seq in seqs_in_file(reference)]

    #now the read groups and the rest of the header lines that are kept.
    #A list keeps them in the order found, which makes the output
    #reproducible, and a set is used to skip the repeated ones.
    headers = []
    seen_headers = set()
    for input_ in infiles:
        input_.seek(0)
        for line in input_:
            line = line.strip()
            if not line:
                continue
            if not line.startswith('@'):
                #the header of this file is over
                break
            if line.startswith(('@SQ', '@PG')) or 'SO:' in line:
                continue
            header = tuple(line.split())
            if header not in seen_headers:
                seen_headers.add(header)
                headers.append(header)

    #join and write both header parts
    for header in headers + ref_header:
        outfile.write('\t'.join(header))
        outfile.write('\n')

    #the non header parts
    for input_ in infiles:
        input_.seek(0)
        for line in input_:
            if line.startswith('@'):
                continue
            #a last line without newline would be glued to the first line
            #taken from the next file
            if not line.endswith('\n'):
                line += '\n'
            outfile.write(line)

    outfile.flush()
Beispiel #11
0
    def _do_seq_distrib_for_pair(self, pair):
        """It does the distributions for a pair of cleaned and raw seqs.

        pair is looked up with the 'raw' and 'cleaned' keys. Every entry
        found has to provide the last_version (the path of the seq file)
        and the basename attributes.

        For every entry the nucleotide frequency plot, the length and
        quality distributions and the quality boxplot are written into the
        stats directory of its kind, and the statistics are appended to the
        statistics file of that directory. When both entries are found the
        distributions of the raw versus cleaned differences are written too.

        A result is not calculated again when its plot file already exists.
        When the length plot exists the quality distribution and the
        statistics of that entry are skipped as well.
        """

        get_stats_dir = lambda seq_type: os.path.join(self._get_project_path(),
                              BACKBONE_DIRECTORIES['%s_reads_stats' % seq_type])

        #the statistics for both clean and raw sequences
        lengths = {}
        quals = {}
        for seq_type in ('raw', 'cleaned'):
            stats_dir = get_stats_dir(seq_type)

            stats_fpath = os.path.join(stats_dir,
                                       BACKBONE_BASENAMES['statistics_file'])
            stats_fhand = open(stats_fpath, 'a')
            #the finally clause closes the statistics file on every way out
            #of the iteration, the continue statements included
            try:
                if seq_type not in pair:
                    continue

                fpath = pair[seq_type].last_version
                basename = pair[seq_type].basename

                # nucleotide freq per position
                out_fpath = os.path.join(stats_dir,
                                         basename + '.freq_position')
                plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
                #the file that is looked for is the plot, like in the rest
                #of the checks, because out_fpath itself is never written
                if not os.path.exists(plot_fpath):
                    seqs = seqs_in_file(open(fpath))
                    create_nucleotide_freq_histogram(seqs,
                                                    fhand=open(plot_fpath, 'w'))

                #the names for the output files
                out_fpath = os.path.join(stats_dir, basename + '.length')
                plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
                distrib_fpath = out_fpath + '.dat'
                if os.path.exists(plot_fpath):
                    continue

                lengths_, quals_ = self._get_lengths_quals_from_file(fpath)
                lengths[seq_type] = lengths_
                quals[seq_type] = quals_

                #the distributions for the lengths
                distrib  = lengths_.calculate_distribution()
                lengths_.draw_distribution(distrib,
                                           labels=PLOT_LABELS['seq_length'],
                                           distrib_fhand=open(distrib_fpath, 'w'),
                                           plot_fhand=open(plot_fpath, 'w'))

                #the distributions for the quals
                out_fpath = os.path.join(stats_dir, basename + '.qual')
                plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
                distrib_fpath = out_fpath + '.dat'

                if quals_.count != 0:
                    distrib  = quals_.calculate_distribution()
                    quals_.draw_distribution(distrib,
                                             labels=PLOT_LABELS['seq_qual'],
                                             plot_fhand=open(plot_fpath, 'w'),
                                         distrib_fhand=open(distrib_fpath, 'w'))

                #the statistics for the statistics file
                self._write_statistics(stats_fhand, fpath, lengths_, quals_)
            finally:
                stats_fhand.close()

        #the statistics for the differences
        if 'raw' in pair and 'cleaned' in pair:
            fpath = pair['cleaned'].last_version
            basename = pair['cleaned'].basename

            #the names for the output files
            stats_dir = get_stats_dir('cleaned')
            out_fpath = os.path.join(stats_dir, basename + '.length.diff')
            plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
            distrib_fpath = out_fpath + '.dat'

            #the differences need the numbers of both kinds. One of them is
            #missing when its length plot was already there, so it was not
            #calculated by the loop above. quals has the same keys as
            #lengths because both are filled at the same time.
            both_calculated = 'raw' in lengths and 'cleaned' in lengths

            if not os.path.exists(plot_fpath) and both_calculated:
                #the distributions for the lengths
                lengths = lengths['raw'], lengths['cleaned']
                self._do_diff_distrib_for_numbers(lengths,
                                          plot_fhand= open(plot_fpath, 'w'),
                                    distrib_fhand= open(distrib_fpath, 'w'),
                                                  dist_type='seq_length')
                del lengths

                #the distributions for the quals
                out_fpath = os.path.join(stats_dir, basename + '.qual.diff')
                plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
                distrib_fpath = out_fpath + '.dat'

                quals = quals['raw'], quals['cleaned']
                if quals[0].count != 0 and quals[1].count != 0:
                    self._do_diff_distrib_for_numbers(quals,
                                          plot_fhand= open(plot_fpath, 'w'),
                                    distrib_fhand= open(distrib_fpath, 'w'),
                                                      dist_type='seq_qual')
                del quals

        #the quality boxplots
        for seq_type in ('raw', 'cleaned'):
            if seq_type in pair:
                stats_dir = get_stats_dir(seq_type)
                fpath = pair[seq_type].last_version
                basename = pair[seq_type].basename

                #the names for the output files
                out_fpath = os.path.join(stats_dir, basename + '.qual.boxplot')
                plot_fpath = out_fpath + '.' + PLOT_FILE_FORMAT
                distrib_fpath = out_fpath + '.dat'

                if os.path.exists(plot_fpath):
                    continue

                quals_ = self._get_quals_by_length_from_file(fpath)

                #there is nothing to plot when the seqs have no qualities
                if quals_ and quals_[0]:
                    #boxplot
                    draw_boxplot(quals_, fhand=open(plot_fpath, 'w'),
                                 title=PLOT_LABELS['qual_boxplot']['title'],
                                 xlabel=PLOT_LABELS['qual_boxplot']['xlabel'],
                                 ylabel=PLOT_LABELS['qual_boxplot']['ylabel'],
                                 stats_fhand=open(distrib_fpath, 'w'),
                                 max_plotted_boxes=30)
                del quals_
    def _do_stats_for_file(self, path, stats_dir):
        """It calculates the stats for a file of seqs.

        path has to provide the last_version (the path of the seq file) and
        the basename attributes.
        stats_dir is the directory in which all the output files are
        written.

        It writes the nucleotide frequency plot, the length and quality
        distributions and the quality boxplot. A result is not calculated
        again when its plot file already exists. The quality distribution
        is only done when the seqs have qualities.

        The statistics are appended to the statistics file of stats_dir
        whenever the lengths and quals had to be read from the seq file.
        """
        stats_fpath = os.path.join(stats_dir,
                                   BACKBONE_BASENAMES['statistics_file'])
        stats_fhand = open(stats_fpath, 'a')
        #the finally clause closes the statistics file whatever happens
        try:
            fpath = path.last_version
            basename = path.basename
            plot_extension = '.' + PLOT_FILE_FORMAT

            #output files. These are base names: the files written are the
            #plot (base name plus plot extension) and the '.dat' ones, so
            #the plot is the file looked for to know if a result is done
            freq_pos_out_fpath = os.path.join(stats_dir,
                                              basename + '.freq_position')
            length_out_fpath = os.path.join(stats_dir, basename + '.length')
            qual_out_fpath = os.path.join(stats_dir, basename + '.qual')
            qual_boxplot_out_fpath = os.path.join(stats_dir,
                                                  basename + '.qual.boxplot')

            # nucleotide freq per position
            plot_fpath = freq_pos_out_fpath + plot_extension
            if not os.path.exists(plot_fpath):
                seqs = seqs_in_file(open(fpath))
                create_nucleotide_freq_histogram(seqs,
                                                 fhand=open(plot_fpath, 'w'))

            #Extract lengths and quals, only when some result needs them
            length_plot_fpath = length_out_fpath + plot_extension
            qual_plot_fpath = qual_out_fpath + plot_extension
            length_todo = not os.path.exists(length_plot_fpath)
            qual_todo = not os.path.exists(qual_plot_fpath)
            numbers_read = length_todo or qual_todo
            if numbers_read:
                lengths_, quals_ = self._get_lengths_quals_from_file(fpath)

            #the distributions for the lengths
            if length_todo:
                distrib_fpath = length_out_fpath + '.dat'
                distrib = lengths_.calculate_distribution()
                lengths_.draw_distribution(distrib,
                                     labels=PLOT_LABELS['seq_length'],
                                     distrib_fhand=open(distrib_fpath, 'w'),
                                     plot_fhand=open(length_plot_fpath, 'w'))

            #the distributions for the quals. Both conditions are required:
            #the plot has to be missing and there have to be qualities
            if qual_todo and quals_.count != 0:
                distrib_fpath = qual_out_fpath + '.dat'
                distrib = quals_.calculate_distribution()
                quals_.draw_distribution(distrib,
                                         labels=PLOT_LABELS['seq_qual'],
                                         plot_fhand=open(qual_plot_fpath, 'w'),
                                     distrib_fhand=open(distrib_fpath, 'w'))

            # qual boxplot
            plot_fpath = qual_boxplot_out_fpath + plot_extension
            if not os.path.exists(plot_fpath):
                quals = self._get_quals_by_length_from_file(fpath)
                #there is nothing to plot when the seqs have no qualities
                if quals and quals[0]:
                    distrib_fpath = qual_boxplot_out_fpath + '.dat'
                    #boxplot
                    draw_boxplot(quals, fhand=open(plot_fpath, 'w'),
                                 title=PLOT_LABELS['qual_boxplot']['title'],
                                 xlabel=PLOT_LABELS['qual_boxplot']['xlabel'],
                                 ylabel=PLOT_LABELS['qual_boxplot']['ylabel'],
                                 stats_fhand=open(distrib_fpath, 'w'),
                                 max_plotted_boxes=30)

            #the statistics for the statistics file. They need the lengths
            #and quals, which are not available when both distributions
            #were already done
            if numbers_read:
                self._write_statistics(stats_fhand, fpath, lengths_, quals_)
        finally:
            stats_fhand.close()