Example 1
    def get_fancy_results_dict(self,
                               max_per_query=10,
                               defline_white_space_mask=None):
        b6 = b6lib.B6Source(self.output)

        input_fasta = u.SequenceSource(self.input)
        target_db = u.SequenceSource(self.target)

        query_counts = {}
        fancy_results_dict = {}

        while b6.next():
            if not query_counts.has_key(b6.entry.query_id):
                query_counts[b6.entry.query_id] = 1

            if query_counts[b6.entry.query_id] - 1 == max_per_query:
                continue
            else:
                query_counts[b6.entry.query_id] += 1

            if not fancy_results_dict.has_key(b6.entry.query_id):
                fancy_results_dict[b6.entry.query_id] = []

            query_seq = input_fasta.get_seq_by_read_id(
                b6.entry.query_id).replace('-', '')
            target_seq = target_db.get_seq_by_read_id(b6.entry.subject_id)

            if defline_white_space_mask:
                b6.entry = remove_white_space_mask_from_B6_entry(
                    b6.entry, defline_white_space_mask)

            # the parts that were aligned during the search are re-aligned to
            # each other here to generate the hsp_match data for the results
            query_aligned, target_aligned = nw_align(
                query_seq[int(b6.entry.q_start) - 1:int(b6.entry.q_end)],
                target_seq[int(b6.entry.s_start) - 1:int(b6.entry.s_end)])

            query_aligned = query_aligned.upper()
            target_aligned = target_aligned.upper()

            coverage = (b6.entry.q_end -
                        (b6.entry.q_start - 1)) * 100.0 / b6.entry.q_len
            hsp_match = ''.join([
                '|' if query_aligned[i] == target_aligned[i] else ' '
                for i in range(0, len(query_aligned))
            ])

            entry = copy.deepcopy(b6.entry)
            entry.coverage = coverage
            entry.hsp_query = query_aligned
            entry.hsp_subject = target_aligned
            entry.hsp_match = hsp_match

            entry = remove_white_space_mask_from_B6_entry(entry)

            fancy_results_dict[entry.query_id].append(entry)

        return fancy_results_dict
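
# A minimal standalone sketch (not part of the source) of the hsp_match idea
# used above: given two equal-length aligned strings, a pipe marks every
# column where query and target agree. The sequences below are made up.
query_aligned = 'ATG-CGTA'
target_aligned = 'ATGACGTT'
hsp_match = ''.join(['|' if query_aligned[i] == target_aligned[i] else ' '
                     for i in range(0, len(query_aligned))])
print hsp_match  # -> '||| ||| '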
Example 2
def get_oligo_reps_dict(html_dict, html_output_directory):
    oligos, rep_dir = html_dict['oligos'], html_dict[
        'output_directory_for_reps']

    oligo_reps_dict = {}
    oligo_reps_dict['imgs'] = {}
    oligo_reps_dict['fancy_seqs'] = {}
    oligo_reps_dict['clear_seqs'] = {}
    oligo_reps_dict['frequency'] = {}
    oligo_reps_dict['component_references'] = {}
    oligo_reps_dict['blast_results'] = {}

    for i in range(0, len(oligos)):
        oligo = oligos[i]

        alignment_base_path = os.path.join(rep_dir, '%.5d_' % i + oligo)

        diversity_image_path = alignment_base_path + '_unique.png'
        diversity_image_dest = os.path.join(
            html_output_directory, os.path.basename(diversity_image_path))
        shutil.copy2(diversity_image_path, diversity_image_dest)
        oligo_reps_dict['imgs'][oligo] = os.path.basename(diversity_image_dest)

        unique_sequences_path = alignment_base_path + '_unique'
        uniques = u.SequenceSource(unique_sequences_path)
        oligo_reps_dict['fancy_seqs'][oligo] = []
        oligo_reps_dict['clear_seqs'][oligo] = []
        oligo_reps_dict['frequency'][oligo] = []
        while uniques.next() and uniques.pos <= 20:
            oligo_reps_dict['clear_seqs'][oligo].append(uniques.seq)
            oligo_reps_dict['fancy_seqs'][oligo].append(
                get_decorated_sequence(uniques.seq,
                                       html_dict['entropy_components']))
            oligo_reps_dict['frequency'][oligo].append(
                pretty_print(uniques.id.split('|')[1].split(':')[1]))

        entropy_file_path = alignment_base_path + '_unique_entropy'
        entropy_values_per_column = [0] * html_dict['alignment_length']
        for column, entropy in [
                x.strip().split('\t') for x in open(entropy_file_path)
        ]:
            entropy_values_per_column[int(column)] = float(entropy)

        color_per_column = cPickle.load(
            open(alignment_base_path + '_unique_color_per_column.cPickle'))
        oligo_reps_dict['component_references'][oligo] = ''.join([
            '<span style="background-color: %s;"><a onmouseover="popup(\'column: %d<br />entropy: %.4f\', 100)" href="">|</a></span>'
            % (color_per_column[i], i, entropy_values_per_column[i])
            for i in range(0, html_dict['alignment_length'])
        ])

        blast_results_dict = alignment_base_path + '_unique_BLAST.cPickle'
        if os.path.exists(blast_results_dict):
            html_dict['blast_results_found'] = True
            oligo_reps_dict['blast_results'][oligo] = cPickle.load(
                open(blast_results_dict))
        else:
            oligo_reps_dict['blast_results'][oligo] = None

    return oligo_reps_dict
Example 3
def split_fasta_file(input_file_path,
                     dest_dir,
                     prefix='part',
                     num_reads_per_file=5000):
    input_fasta = u.SequenceSource(input_file_path)

    parts = []
    next_part = 1
    part_obj = None

    while input_fasta.next():
        if (input_fasta.pos - 1) % num_reads_per_file == 0:
            if part_obj:
                part_obj.close()

            file_path = os.path.join(dest_dir, '%s-%d' % (prefix, next_part))
            parts.append(file_path)
            next_part += 1
            part_obj = u.FastaOutput(file_path)

        part_obj.store(input_fasta, split=False)

    if part_obj:
        part_obj.close()

    return parts
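
# A hypothetical invocation of the function above; 'reads.fa' and the 'parts'
# directory are made-up names, and the directory is assumed to exist already.
parts = split_fasta_file('reads.fa', 'parts', prefix='part',
                         num_reads_per_file=5000)
print '%d part files written, first one: %s' % (len(parts), parts[0])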
Example 4
def gen_tmpl(taxon,
             otu_id_to_greengenes,
             greengenes_alignment,
             output_file_path=None):
    ids = []

    for id, tax in [
            line.strip().split('\t')
            for line in open(otu_id_to_greengenes).readlines()
    ]:
        if tax.find(taxon) > -1:  # match anywhere, including position 0
            ids.append(id)

    ids = list(set(ids))
    print '%d ids found for %s.' % (len(ids), taxon)

    template = u.FastaOutput('%s.tmpl' % taxon)
    fasta = u.SequenceSource(greengenes_alignment)
    while fasta.next():
        if fasta.id in ids:
            template.store(fasta, split=False)
            ids.remove(fasta.id)

    fasta.close()
    template.close()
Example 5
def main(input_fasta_path, output_fasta_path=None, reverse=False):
    if not output_fasta_path:
        output_fasta_path = input_fasta_path + '-PADDED-WITH-GAPS'

    fasta = u.SequenceSource(input_fasta_path)
    output = u.FastaOutput(output_fasta_path)

    longest_read = 0
    while next(fasta):
        if len(fasta.seq) > longest_read:
            longest_read = len(fasta.seq)

    fasta.reset()

    while next(fasta):
        if fasta.pos % 10000 == 0:
            sys.stderr.write('\rreads processed so far: %d' % (fasta.pos))
            sys.stderr.flush()

        gaps = longest_read - len(fasta.seq)

        output.write_id(fasta.id)
        if reverse:
            output.write_seq('-' * gaps + fasta.seq, split=False)
        else:
            output.write_seq(fasta.seq + '-' * gaps, split=False)

    fasta.close()
    output.close()
    sys.stderr.write('\n')
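
# A hypothetical invocation of the padding script above: every read in the
# made-up file 'reads.fa' gets trailing gaps (or leading gaps with
# reverse=True) so all records end up as long as the longest read.
main('reads.fa')  # writes 'reads.fa-PADDED-WITH-GAPS'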
Example 6
def split_fasta_file(input_file_path,
                     dest_dir,
                     prefix='part',
                     num_reads_per_file=5000):
    input_fasta = u.SequenceSource(input_file_path)

    parts = []
    next_part = 1
    part_obj = None

    while input_fasta.next():
        if (input_fasta.pos - 1) % num_reads_per_file == 0:
            if part_obj:
                part_obj.close()

            rand_bit = ''.join([
                random.choice(string.ascii_letters + string.digits)
                for n in xrange(8)
            ])
            file_path = os.path.join(
                dest_dir, '%s-%d-%s.fa' % (prefix, next_part, rand_bit))
            parts.append(file_path)
            next_part += 1
            part_obj = u.FastaOutput(file_path)

        part_obj.store(input_fasta, split=False)

    if part_obj:
        part_obj.close()

    return parts
Example 7
def trim_uninformative_columns_from_alignment(input_file_path):
    input_fasta = u.SequenceSource(input_file_path, lazy_init=False)
    input_fasta.next()
    fasta_read_len = len(input_fasta.seq)
    invalid_columns = range(0, fasta_read_len)
    input_fasta.reset()

    while input_fasta.next():
        # a column remains invalid only if it is a gap in this read, too;
        # removing items from a list while iterating over it skips columns
        invalid_columns = [i for i in invalid_columns
                           if input_fasta.seq[i] == '-']

    columns_to_keep = [
        x for x in range(0, fasta_read_len) if x not in invalid_columns
    ]

    input_fasta.reset()

    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file_path = temp_file.name
    temp_file.close()

    temp_file = u.FastaOutput(temp_file_path)

    while input_fasta.next():
        new_seq = ''.join([input_fasta.seq[i] for i in columns_to_keep])
        temp_file.write_id(input_fasta.id)
        temp_file.write_seq(new_seq, split=False)

    temp_file.close()

    # overwrite the original file with trimmed content
    shutil.move(temp_file_path, input_file_path)
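
# A toy illustration (made-up alignment) of the column-trimming idea above:
# a column survives if at least one read has a non-gap character in it.
reads = ['A-T-G', 'A-C-G', 'A---G']
columns_to_keep = [i for i in range(0, len(reads[0]))
                   if any([r[i] != '-' for r in reads])]
trimmed = [''.join([r[i] for i in columns_to_keep]) for r in reads]
print trimmed  # ['ATG', 'ACG', 'A-G']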
Example 8
def get_read_objects_from_file(input_file_path):
    input_fasta = u.SequenceSource(input_file_path, unique=True)
    read_objects = []

    while input_fasta.next():
        read_objects.append(UniqueFASTAEntry(input_fasta.seq, input_fasta.ids))

    input_fasta.close()
    return read_objects
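
# A hypothetical usage of the function above; 'reads.fa' is a made-up path.
# With unique=True each returned object represents one distinct sequence
# together with the ids of all reads that share it.
read_objects = get_read_objects_from_file('reads.fa')
print '%d unique sequences recovered' % len(read_objects)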
Example 9
def get_unique_sequences_from_FASTA(alignment, limit=10):
    unique_sequences = []

    fasta = u.SequenceSource(alignment, unique=True, lazy_init=False)

    while fasta.next() and fasta.pos < limit:
        unique_sequences.append((fasta.seq, len(fasta.ids),
                                 len(fasta.ids) / float(fasta.total_seq)))

    return unique_sequences
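
# A hypothetical call to the function above ('aln.fa' is a made-up path):
# each tuple holds the unique sequence, the number of reads sharing it, and
# that count as a fraction of all reads.
for seq, count, fraction in get_unique_sequences_from_FASTA('aln.fa', limit=10):
    print '%d reads (%.2f%%)' % (count, fraction * 100)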
Example 10
def mask_defline_whitespaces_in_FASTA(fasta_file_path,
                                      defline_white_space_mask='<$!$>'):
    temp_file_path = fasta_file_path + '.tmp'
    fasta = u.SequenceSource(fasta_file_path)
    output = u.FastaOutput(temp_file_path)

    while fasta.next():
        output.write_id(fasta.id.replace(' ', defline_white_space_mask))
        output.write_seq(fasta.seq, split=False)

    fasta.close()
    output.close()  # make sure the temp file is flushed before the move

    shutil.move(temp_file_path, fasta_file_path)
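
# A hypothetical before/after for the function above, using its default mask:
# a defline such as 'read_001 sample A' is rewritten in place as
# 'read_001<$!$>sample<$!$>A', which keeps defline parsing whitespace-safe.
mask_defline_whitespaces_in_FASTA('reads.fa')  # 'reads.fa' is a made-up path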
Example 11
def main(input_fasta, subsample_to, output_fasta):
    fasta = u.SequenceSource(input_fasta)

    fasta_content = {}

    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write(
                '\r[Reading FASTA into memory] reads processed so far: %d' %
                (fasta.pos))
            sys.stderr.flush()

        sample_name = get_sample_name_from_defline(fasta.id)

        if not fasta_content.has_key(sample_name):
            fasta_content[sample_name] = []

        fasta_content[sample_name].append((fasta.id, fasta.seq), )

    samples = sorted(fasta_content.keys())
    sys.stderr.write(
        '\n%d samples found in the FASTA file: %s%s\n' %
        (len(samples),
         ', '.join(samples[0:3] if len(samples) > 3 else samples),
         ' (...)' if len(samples) > 3 else '.'))

    sample_counter = 0
    for sample in samples:
        sample_counter += 1
        sys.stderr.write('\r[Shuffling] Sample %d of %d' %
                         (sample_counter, len(samples)))
        sys.stderr.flush()

        random.shuffle(fasta_content[sample])

    output = u.FastaOutput(output_fasta)

    sample_counter = 0
    for sample in samples:
        sample_counter += 1
        sys.stderr.write('\r[Writing Output] Sample %d of %d' %
                         (sample_counter, len(samples)))
        sys.stderr.flush()

        for e in fasta_content[sample][0:subsample_to]:
            output.write_id(e[0])
            output.write_seq(e[1], split=False)

    output.close()

    sys.stderr.write('\n')
    sys.stderr.flush()
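
# A hypothetical invocation of the subsampler above: keep at most 100 randomly
# chosen reads per sample. File names are made up, and deflines are assumed to
# be in a format get_sample_name_from_defline understands.
main('all_samples.fa', 100, 'all_samples-SUBSAMPLED.fa')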
Example 12
def get_unique_sequences_dict(html_dict):
    oligos, rep_dir = html_dict['oligos'], html_dict['output_directory_for_reps']

    rep_oligo_seqs_clean_dict = {}
    rep_oligo_seqs_fancy_dict = {}
    
    for i in range(0, len(oligos)):
        unique_file_path = os.path.join(rep_dir, '%.5d_' % i + oligos[i] + '_unique')
        f = u.SequenceSource(unique_file_path)
        f.next()
        rep_oligo_seqs_clean_dict[oligos[i]] = f.seq
        rep_oligo_seqs_fancy_dict[oligos[i]] = get_decorated_sequence(f.seq, html_dict['entropy_components'])
        f.close()
    return (rep_oligo_seqs_clean_dict, rep_oligo_seqs_fancy_dict)
Example 13
def get_quals_dict(quals_file,
                   alignment_file,
                   output_file_path=None,
                   verbose=True):
    """This function takes qual scores file in FASTA format, expands each
       entry to match base calls in the corresponding aligned read in the
       FASTA file (which requires deflines to be identical), and finally
       returns a dictionary that contains qual scores as a list of integer
       values that are bound to deflines as key/value pairs"""

    quals_dict = {}
    quals_aligned_dict = {}

    progress = Progress()
    progress.verbose = verbose
    progress.new('Quality scores dictionary is being generated')

    alignment = u.SequenceSource(alignment_file)
    qual = u.QualSource(quals_file)

    while qual.next():
        if qual.pos % 1000 == 0:
            progress.update('Step 1 of 2 :: Quality scores read: %s' %
                            (pretty_print(qual.pos)))
        quals_dict[qual.id] = qual.quals_int

    while alignment.next():
        if alignment.pos % 1000 == 0:
            progress.update('Step 2 of 2 :: Alignments matched: %s' %
                            (pretty_print(alignment.pos)))
            sys.stderr.flush()

        matching_qual = quals_dict[alignment.id]

        qual_aligned = []
        for i in range(0, len(alignment.seq)):
            if alignment.seq[i] != '-':
                qual_aligned.append(matching_qual.pop(0))
            else:
                qual_aligned.append(None)

        quals_aligned_dict[alignment.id] = qual_aligned
    progress.end()

    if output_file_path:
        cPickle.dump(quals_aligned_dict, open(output_file_path, 'w'))

    return quals_aligned_dict
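
# A toy illustration (made-up data) of the expansion step above: bases consume
# the next quality score while gap characters receive None.
aligned_seq = 'AC--GT'
matching_qual = [38, 40, 35, 37]
qual_aligned = []
for c in aligned_seq:
    qual_aligned.append(matching_qual.pop(0) if c != '-' else None)
print qual_aligned  # [38, 40, None, None, 35, 37]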
Example 14
def main():
    filelist = os.listdir(os.getcwd())
    filelist.remove("create_test_file.py")
    fieldnames = ["ID", "Anticodon"]

    with open("../extractor_test_file", "w") as writefile:
        writer = csv.DictWriter(writefile,
                                fieldnames=fieldnames,
                                delimiter="\t")
        writer.writeheader()

        for file in filelist:
            cur_fasta = u.SequenceSource(file)
            while cur_fasta.next():
                cur_dict = {}
                cur_list = cur_fasta.id.split(" ")
                cur_dict["ID"] = cur_list[0]
                cur_dict["Anticodon"] = cur_list[5].strip("(").strip(")")
                writer.writerow(cur_dict)
Example 15
def unique_and_store_alignment(alignment_path, output_path):
    output = u.FastaOutput(output_path)
    alignment = u.SequenceSource(alignment_path, unique=True)

    alignment.next()
    most_abundant_unique_read = alignment.seq
    alignment.reset()

    read_ids = []
    unique_read_counts = []
    while alignment.next():
        read_ids += alignment.ids
        unique_read_counts.append(len(alignment.ids))
        output.store(alignment, split=False)

    output.close()
    alignment.close()

    return (read_ids, unique_read_counts, most_abundant_unique_read)
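
# A hypothetical usage of the function above (made-up paths): collapse an
# alignment into unique sequences and keep the per-unique read counts around
# for downstream frequency plots.
read_ids, unique_read_counts, top_seq = unique_and_store_alignment(
    'aln.fa', 'aln_unique.fa')
print '%d reads collapsed into %d unique sequences' % (len(read_ids),
                                                       len(unique_read_counts))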
Example 16
def get_oligos_list(oligos_file_path):
    oligos_list = []
    fasta = u.SequenceSource(oligos_file_path)
    while fasta.next():
        oligos_list.append(fasta.seq)
    return oligos_list
Example 17
# don't have them? quals_dict and qual_stats_dict are being generated by gen_dicts_for_qual_stats.py
#

import sys
import cPickle
from scipy import log2 as log

import matplotlib.pyplot as plt
import Oligotyping.lib.fastalib as u

from Oligotyping.utils.utils import get_qual_stats_dict
from Oligotyping.utils.random_colors import get_list_of_colors

COLORS = {'A': 'red', 'T': 'blue', 'C': 'green', 'G': 'purple', 'N': 'white'}

alignment = u.SequenceSource(sys.argv[1])
quals_dict = cPickle.load(open(sys.argv[2]))

quals_dict_filtered = {}

ids_in_alignment_file = []
while alignment.next():
    ids_in_alignment_file.append(alignment.id)
ids_in_alignment_file = set(ids_in_alignment_file)

for read_id in quals_dict:
    if read_id in ids_in_alignment_file:
        quals_dict_filtered[read_id] = quals_dict[read_id]
        ids_in_alignment_file.remove(read_id)

qual_stats_dict = get_qual_stats_dict(quals_dict_filtered)
Example 18
# -*- coding: utf-8 -*-

import sys

import Oligotyping.lib.fastalib as u

# eh..
fasta = u.SequenceSource(sys.argv[1], lazy_init=False)
fasta.next()

len_fasta_entry = len(fasta.seq)
fasta.reset()

invalid_columns = range(0, len_fasta_entry)

while fasta.next():
    if fasta.pos % 100 == 0:
        sys.stderr.write('\rSTEP 1: %.2d%% -- pos: %d' %
                         (fasta.pos * 100 / fasta.total_seq, fasta.pos))
        sys.stderr.flush()

    # a column remains invalid only if it is a gap in this read, too;
    # removing items from a list while iterating over it skips columns
    invalid_columns = [i for i in invalid_columns if fasta.seq[i] == '-']

print
fasta.reset()

columns_to_keep = [
    x for x in range(0, len_fasta_entry) if x not in invalid_columns
]
Example 19
# -*- coding: utf-8 -*-

import sys

import Oligotyping.lib.fastalib as u

alignment = u.SequenceSource(sys.argv[1])
quals = u.SequenceSource(sys.argv[2])

alignment.next()
quals.next()

qual = [int(q) for q in quals.seq.split()]
qual_aligned = []
for i in range(0, len(alignment.seq)):
    if alignment.seq[i] != '-':
        qual_aligned.append(qual.pop(0))
    else:
        qual_aligned.append(None)
print alignment.seq
print qual_aligned
Example 20
def check_input_alignment(alignment_path,
                          sample_name_separator,
                          progress_func=None):
    alignment = u.SequenceSource(alignment_path)
    samples = set([])
    previous_alignment_length = None

    while alignment.next():
        if progress_func and alignment.pos % 5000 == 0:
            progress_func.update('Reading input; %s, %s samples found'\
                                        % (pretty_print(alignment.pos),
                                           pretty_print(len(samples))))

        sample = get_sample_name_from_defline(alignment.id,
                                              sample_name_separator)
        if sample not in samples:
            samples.add(sample)

        # check the alignment lengths along the way:
        if previous_alignment_length:
            if previous_alignment_length != len(alignment.seq):
                raise ConfigError, "Not all reads have the same length."

        previous_alignment_length = len(alignment.seq)

    # if the number of samples we find in the alignment is more than half of the number of
    # reads in the alignment, we might be in trouble.
    if len(samples) * 2 > alignment.pos:
        sys.stderr.write("\n\n")
        sys.stderr.write(
            "Number of samples in the alignment is more than half of the number of reads.\n"
        )
        sys.stderr.write(
            "This usually indicates that the sample name recovery from the defline is not\n"
        )
        sys.stderr.write(
            "working properly. If you believe this is normal, and your sample names\n"
        )
        sys.stderr.write(
            "expected to look like these, you can bypass this check with --skip-check-input\n"
        )
        sys.stderr.write("parameter:\n\n")

        counter = 0
        for sample in samples:
            if counter == 10:
                break
            sys.stderr.write('\t- %s\n' % sample)
            counter += 1
        if len(samples) > 10:
            sys.stderr.write('\t- (%s more)\n' %
                             pretty_print(len(samples) - 10))
        sys.stderr.write("\n\n")
        sys.stderr.write(
            "If there is a problem with the recovery of sample names, please refer\n"
        )
        sys.stderr.write(
            "to the tutorial for the proper formatting of FASTA deflines.")
        sys.stderr.write("\n\n")

        alignment.close()
        return None
    if len(samples) == 1:
        sys.stderr.write("\n\n")
        sys.stderr.write(
            "There is only one sample found in the alignment file during the initial check.\n"
        )
        sys.stderr.write(
            "If this is expected, and the following sample is the only sample in the file,\n"
        )
        sys.stderr.write(
            "please bypass this check by declaring --skip-check-input parameter:\n\n"
        )

        for sample in samples:
            sys.stderr.write('\t- %s\n' % sample)

        sys.stderr.write("\n\n")
        sys.stderr.write(
            "If there is a problem with the recovery of sample names, please refer\n"
        )
        sys.stderr.write(
            "to the tutorial for the proper formatting of FASTA deflines.")
        sys.stderr.write("\n\n")

        alignment.close()
        return None
    else:
        alignment.close()
        return samples
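
# A hypothetical call to the check above, assuming deflines such as
# 'sample-01_000042' where everything before the separator is the sample name:
samples = check_input_alignment('aln.fa', '_')
if samples is None:
    sys.exit(1)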
Example 21
    parser.add_argument('-i',
                        '--input-alignment',
                        required=True,
                        metavar='FASTA_ALIGNMENT',
                        help='align2first output (.paf file)')
    parser.add_argument(
        '-o',
        '--output-fasta',
        required=True,
        help='Output FASTA file to store homopolymer-treated sequences')
    parser.add_argument('-l',
                        '--log',
                        help='Log file. Default, STDOUT.',
                        default=None)

    args = parser.parse_args()

    input_alignment = u.SequenceSource(args.input_alignment)

    if os.path.exists(args.output_fasta):
        sys.stderr.write('Output file ("%s") exists. Overwrite? [Y|n] ' %
                         args.output_fasta)
        response = raw_input()
        if response == '' or response.lower() == 'y':
            output_fasta = open(args.output_fasta, 'w')
        else:
            print 'Exiting.'
            sys.exit(1)
    else:
        output_fasta = open(args.output_fasta, 'w')

    if args.log:
        if os.path.exists(args.log):
Example 22
def main(fasta_file_path, min_percent=95.0, output_file_path=None):
    fasta = u.SequenceSource(fasta_file_path)

    fasta.next()
    alignment_length = len(fasta.seq)
    fasta.reset()

    positions = {}

    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write('\rAnalyzing all reads; pos: %d' % fasta.pos)
            sys.stderr.flush()
        for i in range(0, alignment_length):
            if fasta.seq[i] != '-':
                for j in range(i, alignment_length):
                    try:
                        positions[j] += 1
                    except:
                        positions[j] = 1
                break

    fasta.reset()
    sys.stderr.write('\n')

    num_reads = positions[alignment_length - 1]
    trim_location = 0

    for i in range(0, alignment_length):
        pct_reads_will_survive = positions[i] * 100.0 / num_reads
        if pct_reads_will_survive >= min_percent and not trim_location:
            trim_location = i
            trim_location_pct_reads_survive = pct_reads_will_survive
        if pct_reads_will_survive == 100:
            print
            print 'All reads are going to be trimmed from the %dth position.' % (
                trim_location)

            if 100 - trim_location_pct_reads_survive:
                print
                print '%d reads that do not reach this location will be eliminated.' % (
                    (100 - trim_location_pct_reads_survive) / 100.0 *
                    num_reads)

            if min_percent < 100:
                print
                print 'If all reads were to be retained, alignments should have been trimmed from'
                print 'the %dth location, however, this would have required all reads to lose %d' % (
                    i, i - trim_location)
                print 'bases'
            print
            break

    output = u.FastaOutput(
        output_file_path if output_file_path else fasta_file_path + '-TRIMMED')

    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write('\rStoring trimmed reads; pos: %d' % fasta.pos)
            sys.stderr.flush()

        if fasta.seq[trim_location:].startswith('-'):
            continue
        else:
            output.write_id(fasta.id)
            output.write_seq(fasta.seq[trim_location:], split=False)

    sys.stderr.write('\n')
    sys.stderr.write('\n')
    print 'Trimmed reads stored: "%s"\n' % (
        output_file_path if output_file_path else fasta_file_path + '-TRIMMED')
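
# A hypothetical invocation of the trimming script above ('aln.fa' is a
# made-up path): trim every alignment at the first position that at least 95%
# of reads reach, writing survivors to 'aln.fa-TRIMMED'.
main('aln.fa', min_percent=95.0)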
Example 23
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Please read the COPYING file.

import sys
import os
import operator

import Oligotyping.lib.fastalib as u

consensus_sequence = ''

fasta = u.SequenceSource(sys.argv[1])
fasta.next()
alignment_length = len(fasta.seq)
consensus_dict = {}
for i in range(0, alignment_length):
    consensus_dict[i] = {'A': 0, 'T': 0, 'C': 0, 'G': 0, '-': 0}

fasta.reset()

while fasta.next():
    for pos in range(0, alignment_length):
        consensus_dict[pos][fasta.seq[pos]] += 1

for pos in range(0, alignment_length):
    consensus_sequence += sorted(consensus_dict[pos].iteritems(),
                                 key=operator.itemgetter(1),
                                 reverse=True)[0][0]
Example 24
def vis_freq_curve(fasta_file_path,
                   output_file=None,
                   x_limit=20,
                   display=False,
                   freq_from_defline=None,
                   entropy_output_file=None,
                   verbose=False,
                   mini=False,
                   title=None):
    if freq_from_defline == None:
        freq_from_defline = lambda x: int(
            [t.split(':')[1] for t in x.split('|') if t.startswith('freq')][0])

    fasta = u.SequenceSource(fasta_file_path)

    frequency_list = []
    while next(fasta):
        try:
            frequency_list.append(freq_from_defline(fasta.id))
        except:
            print('frequency info can not be read from defline.')
            sys.exit()

    frequency_list_to_plot = frequency_list[0:x_limit] + [0] * (x_limit - len(frequency_list) \
                                            if len(frequency_list) < x_limit else 0)

    entropy_values = entropy_analysis(fasta_file_path,
                                      output_file=entropy_output_file,
                                      verbose=verbose,
                                      uniqued=True)

    if mini:
        plt.figure(figsize=(2, 2))
        plt.subplots_adjust(left=0.01, bottom=0, top=1, right=1)
        plt.subplot(1, 1, 1)
        plt.grid(False)
        plt.xticks([])
        plt.yticks([])

        ax = plt.gca()
        plt.setp(ax, frame_on=False)

        y_maximum = 1.1
        x_maximum = len(entropy_values)
        ind = np.arange(len(entropy_values))

        text_x, text_y = x_maximum / 2, y_maximum / 2

        plt.text(text_x,
                 text_y,
                 title if title else 'title',
                 horizontalalignment='center',
                 verticalalignment='center',
                 backgroundcolor='white',
                 fontsize=40,
                 color='red')

        plt.ylim(ymax=y_maximum)
        plt.xlim(xmax=x_maximum)

        plt.bar(ind, entropy_values, color='black', lw=0.5)

    else:
        plt.figure(figsize=(24, 10))
        plt.subplots_adjust(left=0.05, bottom=0.15, top=0.95, right=0.99)
        plt.subplot(2, 1, 1)
        plt.grid(True)
        plt.rcParams.update({'axes.linewidth': 0.9})
        plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)
        plt.xticks(list(range(0, len(entropy_values), 5)),
                   rotation=90,
                   size='x-small')

        plt.plot(frequency_list_to_plot, lw=3, c='black')

        plt.xlabel('Order in the File', size='x-large')
        plt.ylabel('Frequency of the Unique Sequence', size='x-large')
        if title:
            plt.title(title)
        else:
            plt.title('Frequency Distribution of Unique Sequences in %s' %
                      os.path.basename(fasta_file_path))
        plt.ylim(ymin=-max(frequency_list_to_plot) * 0.05,
                 ymax=max(frequency_list_to_plot) * 1.05)
        plt.xlim(xmin=-0.05, xmax=x_limit - 1)
        plt.xticks(list(range(0, x_limit)),
                   [str(i) for i in range(1, x_limit + 1)],
                   rotation=90,
                   size='small')

        plt.subplot(2, 1, 2)
        plt.subplots_adjust(left=0.05, bottom=0.1, top=0.95, right=0.99)

        try:
            plt.grid(axis='y')
        except:
            plt.grid(True)
        plt.rcParams.update({'axes.linewidth': 0.9})
        plt.rc('grid', color='0.40', linestyle='-', linewidth=0.1)

        y_maximum = max(entropy_values) * 1.1
        y_maximum = 1.1 if y_maximum < 1 else y_maximum
        ind = np.arange(len(entropy_values))
        plt.bar(ind, entropy_values, color='black', lw=0.5)
        plt.xlim([0, len(entropy_values)])
        plt.ylim([0, y_maximum])
        plt.xticks(list(range(0, len(entropy_values), 5)),
                   rotation=90,
                   size='x-small')

        plt.xlabel('Position in the Alignment', size='x-large')
        plt.ylabel('Shannon Entropy', size='x-large')

    if output_file:
        plt.savefig(output_file)
    if display:
        plt.show()

    plt.clf()
    plt.close('all')
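
# A hypothetical call to the plotting function above; the made-up input
# 'aln_unique.fa' must carry '|freq:1234|'-style tags in its deflines, as the
# default freq_from_defline expects.
vis_freq_curve('aln_unique.fa', output_file='aln_unique.png',
               title='frequency curve for aln_unique.fa')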
Example 25
def length_distribution(fasta, output=None, title=None):
    fasta = u.SequenceSource(fasta)

    sequence_lengths = []

    fasta.reset()

    while fasta.next():
        if fasta.pos % 1000 == 0 or fasta.pos == 1:
            sys.stderr.write('\r[fastalib] Reading: %s' % (fasta.pos))
            sys.stderr.flush()
        sequence_lengths.append(len(fasta.seq.replace('-', '')))

    fasta.reset()

    sys.stderr.write('\n')

    max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0)
                                           or 10)

    seq_len_distribution = [0] * (max_seq_len + 1)

    for l in sequence_lengths:
        seq_len_distribution[l] += 1

    fig = plt.figure(figsize=(12, 8))
    plt.rcParams.update({'axes.linewidth': 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(20, 1)

    #############################################################################################################

    ax1 = plt.subplot(gs[1:3])
    plt.subplots_adjust(left=0.05, bottom=0.03, top=0.95, right=0.98)
    plt.grid(False)
    plt.yticks([])
    plt.xticks([])
    total_seqs = len(sequence_lengths)
    plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\
        % (pretty_print(total_seqs),
           numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\
           min(sequence_lengths),\
           max(sequence_lengths)),\
        va = 'center', alpha = 0.8, size = 12)

    #############################################################################################################

    ax1 = plt.subplot(gs[4:11])
    plt.grid(True)
    plt.subplots_adjust(left=0.05, bottom=0.01, top=0.95, right=0.98)

    plt.plot(seq_len_distribution, color='black', alpha=0.3)
    plt.fill_between(range(0, max_seq_len + 1),
                     seq_len_distribution,
                     y2=0,
                     color='black',
                     alpha=0.30)
    plt.ylabel('number of sequences')

    xtickstep = (max_seq_len / 50) or 1
    ytickstep = max(seq_len_distribution) / 20 or 1

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep),
               rotation=90,
               size='xx-small')
    plt.yticks(range(0,
                     max(seq_len_distribution) + 1, ytickstep),
               [y for y in range(0,
                                 max(seq_len_distribution) + 1, ytickstep)],
               size='xx-small')
    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.ylim(ymin=0,
             ymax=max(seq_len_distribution) +
             (max(seq_len_distribution) / 20.0))

    plt.figtext(0.5,
                0.96,
                '%s' % (title or fasta.fasta_file_path),
                weight='black',
                size='xx-large',
                ha='center')

    #############################################################################################################

    ax2 = plt.subplot(gs[12:19])
    plt.subplots_adjust(left=0.05, bottom=0.01, top=0.95, right=0.98)
    plt.grid(True)

    length_abundance = {}
    for l in sequence_lengths:
        if length_abundance.has_key(l):
            length_abundance[l] += 1
        else:
            length_abundance[l] = 1

    percentages = []
    total_percentage = 0
    for i in range(0, max_seq_len):
        if length_abundance.has_key(i):
            total_percentage += length_abundance[i] * 100.0 / total_seqs
            percentages.append(total_percentage)
        else:
            percentages.append(total_percentage)

    xtickstep = (max_seq_len / 50) or 1
    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep),
               rotation=90,
               size='xx-small')
    plt.yticks(range(0, 101, 5), ['%d%%' % y for y in range(0, 101, 5)],
               size='xx-small')
    plt.ylabel('percent of reads')

    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.ylim(ymin=0, ymax=100)
    plt.plot(percentages)
    plt.fill_between(range(0, max_seq_len + 1),
                     percentages + [100],
                     y2=0,
                     color='blue',
                     alpha=0.30)

    #############################################################################################################

    if output == None:
        output = fasta.fasta_file_path

    try:
        plt.savefig(output + '.pdf')
    except:
        plt.savefig(output + '.png')

    try:
        plt.show()
    except:
        pass

    fasta.close()

    return
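
# A hypothetical usage of the function above ('reads.fa' is a made-up path);
# the figure lands next to the input as 'reads.fa.pdf', or '.png' as a
# fallback.
length_distribution('reads.fa', title='read length distribution')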
Example 26
def get_alignment_length(alignment_path):
    alignment = u.SequenceSource(alignment_path)
    alignment.next()
    return len(alignment.seq)
Example 27
def entropy_analysis(alignment_path,
                     output_file=None,
                     verbose=True,
                     uniqued=False,
                     freq_from_defline=None,
                     weighted=False,
                     qual_stats_dict=None,
                     amino_acid_sequences=False):
    if freq_from_defline == None:
        freq_from_defline = lambda x: int(
            [t.split(':')[1] for t in x.split('|') if t.startswith('freq')][0])

    lines = []
    previous_alignment_length = None

    progress = Progress()
    progress.verbose = verbose

    alignment = u.SequenceSource(alignment_path)

    progress.new('Processing the Alignment')

    # processing the alignment file..
    while alignment.next():
        # check the alignment lengths along the way:
        if previous_alignment_length:
            if previous_alignment_length != len(alignment.seq):
                raise EntropyError, "Not all reads have the same length."

        # print out process info
        if alignment.pos % 10000 == 0:
            progress.update('Reads processed: %s' %
                            (pretty_print(alignment.pos)))

        # fill 'lines' variable
        if not uniqued:
            lines.append(alignment.seq)
        else:
            try:
                frequency = freq_from_defline(alignment.id)
            except IndexError:
                raise EntropyError, "Reads declared as unique, but they do not have proper deflines. See help for --uniqued."

            for i in range(0, frequency):
                lines.append(alignment.seq)

        previous_alignment_length = len(alignment.seq)

    progress.end()
    if verbose:
        run.info('Number of reads', pretty_print(alignment.pos))

    alignment.close()

    # entropy analysis
    progress.new('Entropy Analysis')
    entropy_tpls = []

    for position in range(0, len(lines[0])):
        progress.update(P(int(position + 1), len(lines[0])))

        if len(set([x[position] for x in lines])) == 1:
            entropy_tpls.append((position, 0.0), )
        else:
            column = "".join([x[position] for x in lines])

            if weighted:
                if not qual_stats_dict:
                    raise EntropyError, "Weighted entropy is selected, but no qual stats are provided"
                e = entropy(column,
                            l_qual=qual_stats_dict[position],
                            amino_acid_sequences=amino_acid_sequences)
            else:
                e = entropy(column, amino_acid_sequences=amino_acid_sequences)

            if e < 0.00001:
                entropy_tpls.append((position, 0.0), )
            else:
                entropy_tpls.append((position, e), )

    sorted_entropy_tpls = sorted(entropy_tpls,
                                 key=operator.itemgetter(1),
                                 reverse=True)

    progress.end()

    if verbose:
        entropy_components_larger_than_0 = [
            e[1] for e in entropy_tpls if e[1] > 0
        ]
        if entropy_components_larger_than_0:
            run.info('Entropy analysis', 'Done (total of %d components greater than 0, mean: %.2f, max: %.2f, min: %.2f).' \
                                                        % (len(entropy_components_larger_than_0),
                                                           numpy.mean(entropy_components_larger_than_0),
                                                           numpy.max(entropy_components_larger_than_0),
                                                           numpy.min(entropy_components_larger_than_0)))
        else:
            run.info('Entropy analysis',
                     'None of the nucleotide positions possessed any entropy!')

    if output_file:
        entropy_output = open(output_file, 'w')
        for _component, _entropy in sorted_entropy_tpls:
            entropy_output.write('%d\t%.4f\n' % (_component, _entropy))
        if verbose:
            run.info('Entropy analysis output file path', output_file)
        entropy_output.close()

    return [x[1] for x in entropy_tpls]
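
# A hypothetical call to the function above (made-up paths): per-position
# Shannon entropy for a uniqued alignment whose deflines carry
# '|freq:1234|'-style tags.
entropy_values = entropy_analysis('aln_unique.fa',
                                  output_file='aln_unique-ENTROPY.txt',
                                  uniqued=True)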
Example 28
#000005530
#000003465
#000003684
#000005301
#000001012

import os
import cPickle

import Oligotyping.lib.fastalib as u

# these two dictionaries (and NODE_IDS_OF_INTEREST) were presumably set up in
# the truncated part of this script; they are initialized here so the loop
# below can run
node_to_read_ids = {}
read_id_to_node = {}

nodes = [x.strip() for x in open(NODE_IDS_OF_INTEREST).readlines()]

for node in nodes:
    print node

    # So this is the other thing that needs to be replaced. You must replace
    # the word 'PATH_TO_MED_ANALYSIS_RESULTS' with the MED analysis
    # output path --the path in which you see all the output files like
    # MATRIX-COUNT.txt or directories like HTML-OUTPUT..
    fasta = u.SequenceSource(
        os.path.join(PATH_TO_MED_ANALYSIS_RESULTS, 'NODES/%s.fa' % node))

    node_to_read_ids[node] = []

    while fasta.next():
        read_id = fasta.id.split('|')[0]

        node_to_read_ids[node].append(read_id)
        read_id_to_node[read_id] = node

    fasta.close()

cPickle.dump(read_id_to_node, open('read_id_to_node.cPickle', 'w'))
cPickle.dump(node_to_read_ids, open('node_to_read_ids.cPickle', 'w'))