Example #1
0
def _offline_counter(args):
    """ Offline counting from SAM/BAM file.

    Counts reads per reference and builds a results table, optionally
    augmented with per-reference sequence features (length, GC content,
    word frequencies) when a reference fasta (``args.z``) is given.
    Saves the table as TSV (``args.t``) and/or pickle (``args.p``).

    :param args: Parsed command line arguments.
    """
    # Count reads per reference in the SAM/BAM file:
    counts = read_counter.count_reads(args.bam.name,
                                      in_format=args.f,
                                      min_aln_qual=args.a,
                                      verbose=not args.Q)
    counts = OrderedDict(six.iteritems(counts))

    # Parse requested word sizes. Guard against args.k being None: the
    # original called args.k.split() unconditionally, which raised
    # AttributeError when no word sizes were specified even though the
    # later branches all check `args.k is not None`.
    calc_words = [] if args.k is None else [int(k) for k in args.k.split(",")]

    data = OrderedDict()

    # Calculate sequence properties when a reference fasta was given:
    if args.z is not None:
        lengths, gc_contents, word_freqs = {}, {}, defaultdict(
            lambda: defaultdict(dict))
        ref_iter = seq_util.read_seq_records(args.z)
        if not args.Q:
            sys.stderr.write("Calculating sequence features:\n")
            ref_iter = tqdm.tqdm(ref_iter)

        for ref in ref_iter:
            # Augment counts dictionary with missing reference entries:
            if ref.id not in counts:
                counts[ref.id] = 0
            lengths[ref.id] = len(ref)
            gc_contents[ref.id] = seq_util.gc_content(str(ref.seq))
            # Word frequencies normalised by reference length (empty
            # calc_words makes this loop a no-op, so no extra guard needed):
            for word_size in calc_words:
                bf = seq_util.word_composition(ref.seq, word_size)
                for word, count in six.iteritems(bf):
                    word_freqs[word_size][
                        ref.id][word] = float(count) / len(ref)

        data['Length'] = [lengths[tr] for tr in six.iterkeys(counts)]
        data['GC_content'] = [gc_contents[tr] for tr in six.iterkeys(counts)]

    data['Reference'] = list(counts.keys())
    data['Count'] = list(counts.values())

    # Flatten word frequencies into one column per word. Use
    # `args.z is not None` for consistency with the feature block above
    # (the original tested bare truthiness here):
    if args.k is not None and args.z is not None:
        for ks in calc_words:
            # Word set is taken from an arbitrary reference; assumes every
            # reference has the same word keys for this word size.
            for word in next(iter((word_freqs[ks].values()))).keys():
                tmp = []
                for ref in counts.keys():
                    tmp.append(word_freqs[ks][ref][word])
                data[word] = tmp

    data_frame = pd.DataFrame(data)
    data_frame = data_frame.sort_values(['Count', 'Reference'],
                                        ascending=False)

    # Save as tab separated file:
    if args.t is not None:
        data_frame.to_csv(args.t, sep='\t', index=False)

    # Save pickled results:
    if args.p is not None:
        misc.pickle_dump(data, args.p)
Example #2
0
def _get_lengths(in_file, in_format, min_length, max_length, do_log):
    """ Iterate over input and accumulate sequence lengths.

    :param in_file: Input file (passed to the sequence reader).
    :param in_format: Input sequence format.
    :param min_length: Minimum read length, or None for no lower bound.
    :param max_length: Maximum read length, or None for no upper bound.
    :param do_log: If true, accumulate natural-log-transformed lengths.
    :returns: List of (possibly log-transformed) sequence lengths.
    :rtype: list
    """
    input_iterator = seq_util.read_seq_records(in_file, format=in_format)
    lengths = []
    try:
        for record in input_iterator:
            length = len(record)
            # Filter for minimum read length:
            if (min_length is not None) and (length < min_length):
                continue
            # Filter for maximum read length:
            if (max_length is not None) and (length > max_length):
                continue
            if do_log:
                length = np.log(length)
            lengths.append(length)
    finally:
        # Close the input even if iteration raises — the original only
        # closed on the success path, leaking the handle on error:
        input_iterator.close()
    return lengths
Example #3
0
# Optional pickle output (parser is defined earlier in the file):
parser.add_argument('-p',
                    metavar='results_pickle',
                    type=str,
                    help="Save pickled results in this file.",
                    default=None)
# Positional input; defaults to stdin so the tool works in pipes:
parser.add_argument('input_fastx',
                    nargs='?',
                    help='Input (default: stdin).',
                    type=argparse.FileType('r'),
                    default=sys.stdin)

if __name__ == '__main__':
    args = parser.parse_args()

    in_format = args.f
    input_iterator = seq_util.read_seq_records(args.input_fastx,
                                               format=in_format)

    # Sum the lengths of all input records:
    total_bases = 0
    for record in input_iterator:
        total_bases += len(record)
    results = {'total_bases': total_bases}
    print("Total bases\t{}".format(total_bases))

    # If a genome size was given (-s), also report estimated coverage:
    if args.s is not None:
        results['genome_size'] = args.s
        results['coverage'] = float(total_bases) / args.s
        print("Genome size\t{}".format(results['genome_size']))
        print("Coverage\t{}".format(results['coverage']))

    # Optionally save pickled results:
    if args.p is not None:
        misc.pickle_dump(results, args.p)
Example #4
0
from wub.util import seq as seq_util

# Parse command line arguments:
parser = argparse.ArgumentParser(
    description='Reverse (but not complement!) sequences and qualities in fastq file.')
# Input/output default to the standard streams so the tool can be piped:
parser.add_argument('input_fastq', nargs='?', help='Input fastq (default: stdin).',
                    type=argparse.FileType('r'), default=sys.stdin)
parser.add_argument('output_fastq', nargs='?', help='Output fastq (default: stdout)',
                    type=argparse.FileType('w'), default=sys.stdout)


def reverse_seq_records(input_iterator):
    """Lazily produce each record from *input_iterator* reversed
    (sequence and, per the tool description, qualities — not complemented).

    :param input_iterator: Iterator of SeqRecord objects.
    :returns: Generator of reversed SeqRecord objects.
    :rtype: generator
    """
    return (rec[::-1] for rec in input_iterator)


if __name__ == '__main__':
    args = parser.parse_args()

    # Stream records from the input fastq:
    input_iterator = seq_util.read_seq_records(
        args.input_fastq, format='fastq')
    # Lazily reverse each record:
    output_iterator = reverse_seq_records(input_iterator)
    # Write the reversed records out as fastq:
    seq_util.write_seq_records(
        output_iterator, args.output_fastq, format='fastq')
Example #5
0
        read_seq = mutated_record.seq
        if direction == '-':
            read_seq = seq_util.reverse_complement(mutated_record.seq)

        yield seq_util.new_dna_record(read_seq, read_name, mock_qualities), sam


if __name__ == '__main__':
    args = parser.parse_args()

    # Set random seed (for reproducible simulations):
    if args.z is not None:
        np.random.seed(args.z)

    # Read in chromosomes of the input genome:
    chromosomes = list(seq_util.read_seq_records(args.input_fasta))

    # Process error weights:
    error_weights = np.array(parse_util.separated_list_to_floats(args.w))
    # Normalise error weights to probabilities:
    error_weights = parse_util.normalise_array(error_weights)
    # Map the normalised weights onto the three error types, in order:
    error_weights = dict(
        zip(['substitution', 'insertion', 'deletion'], error_weights))

    # Open a SAM writer only when an output path (-s) was given:
    sw = None
    if args.s is not None:
        sw = sam_writer.SamWriter(args.s, build_sam_header(chromosomes))
    simulation_iterator = simulate_sequencing(chromosomes, args.m, args.a,
                                              args.l, args.u, args.e,
                                              error_weights, args.b, args.q,
Example #6
0
                    metavar='output_file',
                    type=str,
                    help="Output SAM file.")

if __name__ == '__main__':
    args = parser.parse_args()

    # Iterate over all records in the input SAM/BAM, including unmapped
    # ones (until_eof=True):
    input_iter = bam_common.pysam_open(args.infile,
                                       args.f).fetch(until_eof=True)

    # Get SAM record names:
    # NOTE(review): a list makes each `in sam_names` test below O(n);
    # a set would be faster — confirm sam_names is not used as a
    # sequence later before changing.
    sam_names = [record.query_name for record in input_iter]

    writer = sam_writer.SamWriter(args.outfile)

    # Build an unmapped SAM record (flag=4, "*" placeholders) for every
    # fastq read that is absent from the input alignment:
    for read in seq_util.read_seq_records(args.q, 'fastq'):
        if read.id not in sam_names:
            qual = seq_util.quality_array_to_string(
                read.letter_annotations["phred_quality"])
            sam_record = writer.new_sam_record(qname=read.id,
                                               flag=4,
                                               rname="*",
                                               pos=0,
                                               mapq=0,
                                               cigar="*",
                                               rnext="*",
                                               pnext=0,
                                               tlen=0,
                                               seq=str(read.seq),
                                               qual=qual,
                                               tags="AS:i:0")
Example #7
0
# Positional output; defaults to stdout so the tool can be piped
# (parser is defined earlier in the file):
parser.add_argument('output_fastx',
                    nargs='?',
                    help='Output file (default: stdout).',
                    type=argparse.FileType('w'),
                    default=sys.stdout)


def _record_filter(input_iter_bait, input_iter_target):
    """ Filter out SeqRecord objects present in the first iterator. """
    bait_ids = [read.id for read in input_iter_bait]
    for record in input_iter_target:
        if record.id not in bait_ids:
            yield record


if __name__ == '__main__':
    args = parser.parse_args()

    # Records whose ids will be excluded from the output:
    input_iterator_bait = seq_util.read_seq_records(args.input_fastx_bait,
                                                    format=args.i)

    # Records to be filtered:
    input_iterator_target = seq_util.read_seq_records(args.input_fastx_target,
                                                      format=args.i)

    # Lazily drop target records whose id appears in the bait file:
    output_iterator = _record_filter(input_iterator_bait,
                                     input_iterator_target)

    seq_util.write_seq_records(output_iterator,
                               args.output_fastx,
                               format=args.o)