Exemple #1
0
 def test_file_reader_fasta(self):
     '''file_reader should yield the records of fastn_unittest.fa as Fasta objects named 1, 2, ... each with sequence ACGTA'''
     records = fastn.file_reader('fastn_unittest.fa')
     # Record names are expected to count up from 1 in file order.
     for position, record in enumerate(records, start=1):
         expected = fastn.Fasta(str(position), 'ACGTA')
         self.assertEqual(record, expected)
def get_gaps_and_lengths(infile):
    '''Read every sequence in infile and record its length and its gaps.

    infile is passed unchanged to fastn.file_reader.

    Returns a tuple (lengths, gaps) of two dicts, both keyed by sequence id:
    lengths maps each id to len(seq) and gaps maps each id to the value
    returned by seq.gaps().

    Raises AssertionError if the same sequence id is seen more than once.
    '''
    lengths = {}
    gaps = {}

    for seq in fastn.file_reader(infile):
        # Raised explicitly instead of via `assert` so the check is not
        # compiled away under `python -O`, where a repeated id would
        # silently overwrite the earlier entry.
        if seq.id in lengths:
            raise AssertionError(
                'Duplicate sequence id "' + str(seq.id) + '" in ' + str(infile))
        lengths[seq.id] = len(seq)
        gaps[seq.id] = seq.gaps()

    return lengths, gaps
Exemple #3
0
    def test_print_line_length(self):
        '''__str__ should be formatted correctly with the right number of chars per line of sequence'''
        tmp_out = 'tmp.line_length_test.fa'
        # (value assigned to Fasta.line_length, file holding the expected output)
        cases = [
            (0, 'fastn_unittest_one-per-line.fa'),
            (3, 'fastn_unittest_3-per-line.fa'),
        ]

        try:
            for line_length, correct_file in cases:
                # The same input is re-read for every case; only the
                # line_length setting changes between iterations.
                seq_reader = fastn.file_reader('fastn_unittest_one-per-line.fa')
                fastn.Fasta.line_length = line_length
                f = utils.open_file_write(tmp_out)
                for s in seq_reader:
                    print(s, file=f)
                utils.close(f)
                self.assertTrue(filecmp.cmp(correct_file, tmp_out))
                os.unlink(tmp_out)
        finally:
            # line_length is set on the Fasta class itself, so it is shared
            # by every test.  Put it back even when an assertion above fails,
            # otherwise one failure here changes the formatting later tests see.
            # 60 is the value this test has always reset it to (presumably
            # the class default - TODO confirm in fastn.Fasta).
            fastn.Fasta.line_length = 60
Exemple #4
0
#!/usr/bin/env python3.3

import argparse
import fastn
import utils

# Command-line interface: two positional arguments, input file then output file.
parser = argparse.ArgumentParser(
    description='Gets all IDs from a fasta or fastq file',
    usage='%(prog)s <infile> <outfile>')
parser.add_argument('infile', help='Name of fasta/q file to be read')
parser.add_argument('outfile', help='Name of output file')
options = parser.parse_args()

seq_reader = fastn.file_reader(options.infile)
f_out = utils.open_file_write(options.outfile)

# Write one sequence id per line.  The try/finally makes sure the output
# handle is still passed to utils.close() if reading the input fails part-way.
try:
    for seq in seq_reader:
        print(seq.id, file=f_out)
finally:
    utils.close(f_out)
Exemple #5
0
    # A leading 'S' CIGAR operation replaces hit_start with that operation's
    # number; a trailing 'S' replaces hit_end with the sequence length minus
    # that operation's number.  Otherwise the values set earlier are kept.
    cigar_ops = sam_record.cigar.operations
    leading_op = cigar_ops[0]
    trailing_op = cigar_ops[-1]

    if leading_op.operator == 'S':
        hit_start = leading_op.number

    if trailing_op.operator == 'S':
        hit_end = len(sam_record.seq) - trailing_op.number

    # File the interval under this record's id, creating the list on first use.
    hits_for_read = read_hit_coords.setdefault(sam_record.id, [])
    hits_for_read.append(genome_intervals.Interval(hit_start - 1, hit_end - 1))

# Clean up after the mapping step: bwa_index_clean() presumably removes the
# files belonging to bwa_index (confirm in external_progs), then the SAM file
# is deleted.
external_progs.bwa_index_clean(bwa_index)
os.unlink(bwa_sam)


# Second pass over the input reads.  Sequences are written to
# <outprefix>.fq and a tab-separated log to <outprefix>.log.
# NOTE(review): the handle is named f_fa but the file gets a '.fq' suffix.
seq_reader = fastn.file_reader(options.reads_in)
f_fa = utils.open_file_write(options.outprefix + '.fq')
f_log = utils.open_file_write(options.outprefix + '.log')

for seq in seq_reader:
    if seq.id not in read_hit_coords:
        # No hit recorded for this read: write it out as-is and log 'no hit'.
        print(seq, file=f_fa)
        print(seq.id, 'no hit', sep='\t', file=f_log)
    else:
        hits = read_hit_coords[seq.id]
        # The return value is ignored, so this presumably merges overlapping
        # intervals in place - confirm in genome_intervals.
        genome_intervals.merge_overlapping_in_list(hits)
        i = 0

        # Compare each hit with its right-hand neighbour.  When the distance
        # from the end of one to the start of the next is at most
        # options.join_distance, the left hit is replaced by
        # union_fill_gap() of the pair.
        # NOTE(review): the rest of this loop (including how i advances) is
        # not visible in this excerpt.
        while i < len(hits) - 1:
            if hits[i+1].start - hits[i].end <= options.join_distance:
                hits[i] = hits[i].union_fill_gap(hits[i+1])
    # Write every member of cluster i to f.  Members whose strand is '-' are
    # written reverse-complemented; revcomp() is applied to a copy rather
    # than to the object held in all_seqs.
    # NOTE(review): `id` shadows the builtin, and the first assignment to
    # `seq` below is redundant because both branches assign it again.
    for id in clusters[i]:
        seq = all_seqs[id]
        if strands[id] == '-':
            seq = copy.copy(all_seqs[id])
            seq.revcomp()
        else:
            seq = all_seqs[id]

        print(seq, file=f)
    utils.close(f)

    # Run cap3 on reads_file (presumably the file f was writing to - f is
    # opened outside this excerpt), then count the sequences in its
    # '.cap.singlets' and '.cap.contigs' output files.
    utils.syscall('cap3 ' + reads_file)
    singlet_count = fastn.count_sequences(reads_file + '.cap.singlets')
    contig_count = fastn.count_sequences(reads_file + '.cap.contigs')
    # Only the "no singlets, exactly one contig" outcome is accepted here.
    if singlet_count == 0 and contig_count == 1:
        seq_reader = fastn.file_reader(reads_file + '.cap.contigs')
        for seq in seq_reader:
            # Rename the contig after its cluster (1-based) and keep a copy.
            seq.id = 'cluster.' + str(i + 1) + '.contig'
            assembled_seqs.append(copy.copy(seq))

        # Delete the '.cap.*' files listed below, then the reads file itself.
        for e in [
                'ace', 'contigs.links', 'contigs.qual', 'info', 'singlets',
                'contigs'
        ]:
            os.unlink(reads_file + '.cap.' + e)
        os.unlink(reads_file)
    else:
        print('Got',
              singlet_count,
              'singlets and',
              contig_count,
Exemple #7
0
 def test_file_reader_fastq(self):
     '''file_reader should yield every record of fastn_unittest_good_file.fq as the expected Fastq object'''
     for record in fastn.file_reader('fastn_unittest_good_file.fq'):
         expected = fastn.Fastq('ID', 'ACGTA', 'IIIII')
         self.assertEqual(record, expected)
# Command-line interface: three positional arguments.
# The apostrophe in "Doesn't" is escaped: the previous pair of adjacent
# literals ('...Doesn' 't report...') concatenated to "Doesnt" in the help text.
parser = argparse.ArgumentParser(
    description=
    'Given a nucmer coords file, reports the regions of the query that have no nucmer hit. '
    'Doesn\'t report gaps - i.e. assumes that all gaps had a hit.',
    usage='%(prog)s [options] <nucmer.coords> <query.fasta/q> <outfile>')
parser.add_argument('nucmer_coords',
                    help='Name of nucmer coords file',
                    metavar='nucmer.coords')
parser.add_argument('query_file',
                    help='Name of query fasta or fastq file',
                    metavar='query.fasta/q')
parser.add_argument('outfile', help='Name of output file')
options = parser.parse_args()

seq_reader = fastn.file_reader(options.query_file)
seq_lengths = {}  # id -> sequence length
covered_regions = {}  # id -> list of covered regions

# get query sequence lengths and gap positions - add each gap coord to the
# list of covered positions for each sequence
for seq in seq_reader:
    # Raised explicitly instead of via `assert` so the duplicate check still
    # runs under `python -O` and names the offending sequence.
    if seq.id in seq_lengths:
        raise AssertionError(
            'Duplicate sequence id "' + str(seq.id) + '" in ' + options.query_file)
    seq_lengths[seq.id] = len(seq)
    covered_regions[seq.id] = seq.gaps()

# Iterate over the hits in the nucmer coords file.  Every hit's query name
# must already be a key of seq_lengths.
# NOTE(review): the rest of this loop body is not visible in this excerpt.
nucmer_reader = nucmer.file_reader(options.nucmer_coords)

for hit in nucmer_reader:
    assert hit.qry_name in seq_lengths
#!/usr/bin/env python3.3

import argparse
import fastn
import utils

# Command-line interface: input fastq, then the name of the fasta to write.
parser = argparse.ArgumentParser(
    description='Converts a fastq file to fasta + qual file',
    usage='%(prog)s [options] <fastq_in> <fasta_out>')
parser.add_argument('fastq_in', help='Name of input fastq file')
parser.add_argument('fasta_out', help='Name of output fasta (fasta_out.qual will also be created)')
options = parser.parse_args()

seq_reader = fastn.file_reader(options.fastq_in)
fasta_out = utils.open_file_write(options.fasta_out)
qual_out = utils.open_file_write(options.fasta_out + '.qual')
# Presumably 0 turns off line wrapping of the printed sequence - confirm in
# fastn.Fasta.
fastn.Fasta.line_length = 0

# The try/finally makes sure both output handles are still passed to
# utils.close() if the conversion fails part-way through the input.
try:
    for seq in seq_reader:
        fa, qual = seq.to_Fasta_and_qual()
        print(fa, file=fasta_out)
        # Qual record: '>' + id on one line, then the space-separated values.
        print('>' + fa.id, ' '.join(str(x) for x in qual), sep='\n', file=qual_out)
finally:
    utils.close(fasta_out)
    utils.close(qual_out)

Exemple #10
0
# Command-line interface: optional --mate_file plus three positionals.
# read_percent is parsed as an int; the usage string documents the range [0,100].
parser = argparse.ArgumentParser(
    description=
    'Takes a random subset of reads from a fasta/q file and optionally the corresponding read '
    + 'from a mates file.  Output is interleaved if mates file given',
    usage=
    '%(prog)s [options] <fasta/q in> <outfile> <percent reads wanted in [0,100]>'
)
parser.add_argument('--mate_file', help='Name of fasta/q mates file')
parser.add_argument('infile', help='Name of fasta/q file to be read')
parser.add_argument('outfile', help='Name of fasta/q output file')
parser.add_argument('read_percent',
                    type=int,
                    help='percent of reads to take from input file')
options = parser.parse_args()

# Open the input reads and the output file.  The two counters start at zero;
# counter_in is incremented once per input read below.
seq_reader = fastn.file_reader(options.infile)
fout = utils.open_file_write(options.outfile)
counter_in = 0
counter_out = 0

# The mates reader is only created when --mate_file was given.
if options.mate_file:
    mate_seq_reader = fastn.file_reader(options.mate_file)

for seq in seq_reader:
    counter_in += 1
    if options.mate_file:
        # Advance the mates file in step with the main file, one record per
        # read.  If the mates file runs out first, report the read id on
        # stderr and exit with status 1.
        # NOTE(review): `sys` is used here but its import is not visible in
        # this excerpt, and the rest of the loop body is also out of view.
        try:
            mate_seq = next(mate_seq_reader)
        except StopIteration:
            print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
            sys.exit(1)