Ejemplo n.º 1
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file'''
        # Files that are expected to make the FASTQ parser raise sequences.Error.
        broken_names = ('sequences_test_fail_no_AT.fq',
                        'sequences_test_fail_no_seq.fq',
                        'sequences_test_fail_no_plus.fq',
                        'sequences_test_fail_no_qual.fq')

        for broken_path in [os.path.join(data_dir, name) for name in broken_names]:
            handle = utils.open_file_read(broken_path)
            record = sequences.Fastq()
            # Keep reading until the parser hits the malformed record.
            with self.assertRaises(sequences.Error):
                while record.get_next_from_file(handle):
                    pass

            utils.close(handle)

        good_path = os.path.join(data_dir, 'sequences_test_good_file.fq')
        try:
            good_handle = open(good_path)
        except IOError:
            print("Error opening '" + good_path + "'", file=sys.stderr)
            sys.exit(1)

        # Every record read from the good file must equal the same expected record.
        record = sequences.Fastq()
        while True:
            if not record.get_next_from_file(good_handle):
                break
            self.assertEqual(record, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
        utils.close(good_handle)
Ejemplo n.º 2
0
def fix_blast_coords(blast_file, coords_file, outfile):
    '''Rewrite a tabulated blast file, renaming and shifting hits listed in coords_file.

    For every line whose first field is a key of the dict built by
    offset_coords_file_to_dict(coords_file), the offset is added to fields 7
    and 8 (indexes 6 and 7) and the first field is replaced by the mapped
    name.  Lines without a tab are dropped; all other lines are written to
    outfile re-joined with tabs.
    '''
    offsets = offset_coords_file_to_dict(coords_file)
    blast_in = utils.open_file_read(blast_file)
    fixed_out = utils.open_file_write(outfile)

    for raw_line in blast_in:
        # blastn header lines contain no tab character; only tabulated
        # lines are processed.
        if '\t' in raw_line:
            # Fields are meant to be tab separated, but a stray space can
            # follow a tab, so split on any whitespace.  Sequence names
            # have already had whitespace removed by the pipeline.
            fields = raw_line.rstrip().split()
            name = fields[0]
            if name in offsets:
                entry = offsets[name]
                fields[6] = str(int(fields[6]) + entry[1])
                fields[7] = str(int(fields[7]) + entry[1])
                fields[0] = entry[0]

            # The line is always rebuilt so that the output is cleanly
            # tab delimited.
            print('\t'.join(fields).rstrip(), file=fixed_out)

    utils.close(blast_in)
    utils.close(fixed_out)
Ejemplo n.º 3
0
def trim_contigs(infile, outfile, trim):
    '''Trim both ends of each contig and widen every gap by `trim` bases.

    Sequences shorter than 2 * trim are skipped.  For each gap reported by
    seq.gaps(), the `trim` bases before the gap and the bases from the gap
    end up to `trim` bases after it are overwritten with N.  The sequence is
    then trimmed by `trim` at each end, Ns at the ends are removed with
    seq.trim_Ns(), and it is written to outfile only if a non-N base is left.

    The output handle is closed even if processing a sequence raises.
    '''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    # Compiled once: the pattern does not depend on the sequence.
    non_n_base = re.compile('[^nN]')

    try:
        for seq in seq_reader:
            if len(seq) < 2 * trim:
                continue

            gaps = seq.gaps()
            bases = list(seq.seq)

            # extend the length of each gap
            for gap in gaps:
                left_start = max(gap.start - trim, 0)
                right_end = min(gap.end + trim + 1, len(seq))

                for i in range(left_start, gap.start):
                    bases[i] = 'N'

                for i in range(gap.end, right_end):
                    bases[i] = 'N'

            seq.seq = ''.join(bases)

            # trim start/end bases and tidy up any resulting Ns at either
            # end of the trimmed seq
            seq.trim(trim, trim)
            seq.trim_Ns()

            # only keep sequences with some non-N sequence left over
            if non_n_base.search(seq.seq) is not None:
                print(seq, file=fout)
    finally:
        utils.close(fout)
Ejemplo n.º 4
0
def make_random_contigs(contigs,
                        length,
                        outfile,
                        name_by_letters=False,
                        prefix='',
                        seed=None,
                        first_number=1):
    '''Write `contigs` random ACGT sequences, each `length` bases long, to outfile.

    Names are `prefix` followed by either a letter A-Z (cycling back to A
    after Z) when name_by_letters is true, or by a number counting up from
    first_number.  The random module is seeded with `seed` first.
    '''
    random.seed(a=seed)
    out_handle = utils.open_file_write(outfile)
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    for index in range(contigs):
        if name_by_letters:
            # wrap around once the alphabet is exhausted
            suffix = alphabet[index % len(alphabet)]
        else:
            suffix = str(index + first_number)

        contig_name = prefix + suffix
        bases = ''.join(random.choice('ACGT') for _ in range(length))
        print(sequences.Fasta(contig_name, bases), file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 5
0
def split_by_fixed_size_onefile(infile,
                                outfile,
                                chunk_size,
                                tolerance,
                                skip_if_all_Ns=False):
    '''Cut every sequence of infile into chunk_size pieces and write them to outfile.

    The final piece of a sequence absorbs the remainder when that keeps it
    within (chunk_size + tolerance) bases.  Each piece is named
    <id>.<start>_<end> using 1-based start.  With skip_if_all_Ns, pieces for
    which is_all_Ns() is true are not written.
    '''
    seq_reader = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)
    for seq in seq_reader:
        seq_length = len(seq)
        for start in range(0, seq_length, chunk_size):
            # Stretch this piece to the end of the sequence if what is left
            # over is within the tolerance.
            reaches_end = start + chunk_size + tolerance >= seq_length
            end = seq_length if reaches_end else start + chunk_size

            piece = seq.subseq(start, end)
            if not skip_if_all_Ns or not piece.is_all_Ns():
                piece.id += '.' + str(start + 1) + '_' + str(end)
                print(piece, file=out_handle)

            if end == seq_length:
                break

    utils.close(out_handle)
Ejemplo n.º 6
0
def run(description):
    '''Command-line entry point for "fastaq to_random_subset".

    Parses the command line, then keeps each read of infile (together with
    the read at the same position of --mate_file, if given) with the
    requested per cent probability.  Exits with status 1 if the mates file
    runs out of reads.  The `description` argument is not used here.
    '''
    parser = argparse.ArgumentParser(
        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
                      'from a mates file.  Output is interleaved if mates file given',
        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
    parser.add_argument('--mate_file', help='Name of mates file')
    parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('percent', type=float, help='Per cent probability of keeping any given read (pair) in [0,100]', metavar='FLOAT')
    options = parser.parse_args()

    random.seed(a=options.seed)
    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)

    have_mates = bool(options.mate_file)
    if have_mates:
        mate_seq_reader = sequences.file_reader(options.mate_file)

    for seq in seq_reader:
        # Always advance the mates file in step with the reads file, even
        # for reads that end up being discarded.
        if have_mates:
            try:
                mate_seq = next(mate_seq_reader)
            except StopIteration:
                print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
                sys.exit(1)

        keep = 100 * random.random() <= options.percent
        if keep:
            print(seq, file=fout)
            if have_mates:
                print(mate_seq, file=fout)

    utils.close(fout)
Ejemplo n.º 7
0
def get_seqs_flanking_gaps(infile, outfile, left, right):
    '''Write a tab-delimited table of the bases either side of every gap.

    One header line is written, then one line per gap returned by
    seq.gaps(): sequence id, 1-based gap start and end, up to `left` bases
    before the gap and up to `right` bases after it.
    '''
    seq_reader = sequences.file_reader(infile)
    table_out = utils.open_file_write(outfile)

    header = ('#id', 'gap_start', 'gap_end', 'left_bases', 'right_bases')
    print(*header, sep='\t', file=table_out)

    for seq in seq_reader:
        for gap in seq.gaps():
            # clamp the flanks to the ends of the sequence
            flank_begin = max(gap.start - left, 0)
            flank_finish = min(gap.end + right + 1, len(seq))
            row = (
                seq.id,
                gap.start + 1,
                gap.end + 1,
                seq.seq[flank_begin:gap.start],
                seq.seq[gap.end + 1:flank_finish],
            )
            print(*row, sep='\t', file=table_out)

    utils.close(table_out)
Ejemplo n.º 8
0
def acgtn_only(infile, outfile):
    '''Copy infile to outfile after calling replace_non_acgt() on every sequence

    (intended to turn every character that is not a/c/g/t/n, in either case, into an N).
    '''
    out_handle = utils.open_file_write(outfile)
    records = sequences.file_reader(infile)
    for record in records:
        record.replace_non_acgt()
        print(record, file=out_handle)
    utils.close(out_handle)
Ejemplo n.º 9
0
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
    '''Write the sequences of infile to outfile in FASTA format.

    sequences.Fasta.line_length is set to `line_length` while writing and is
    always put back to its previous value afterwards, even if reading or
    writing fails.  Fastq records are converted to Fasta (id and sequence
    only); any other record is printed as it is.

    With strip_after_first_whitespace, strip_after_first_whitespace() is
    called on each record before it is written.  With check_unique, every
    name seen more than once is reported on stderr after writing and Error
    is raised.
    '''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length
    used_names = {}

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if check_unique:
                used_names[seq.id] = used_names.get(seq.id, 0) + 1

            if isinstance(seq, sequences.Fastq):
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # line_length is shared class-level state: restore it first so that
        # it cannot stay modified, whatever happens while closing.
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)

    if check_unique:
        all_unique = True

        for name, count in used_names.items():
            if count > 1:
                print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
                all_unique = False

        if not all_unique:
            raise Error('Not all sequence names unique. Cannot continue')
Ejemplo n.º 10
0
def stats_from_fai(infile):
    '''Returns dictionary of length stats from an fai file. Keys are: longest, shortest, mean, total_length, N50, number

    Lengths are read from the second tab-separated column of each line.  If
    the file has no lines, every value is 0.  Raises Error if the lengths
    cannot be read; the file is closed in every case.
    '''
    f = utils.open_file_read(infile)
    try:
        lengths = sorted((int(line.split('\t')[1]) for line in f), reverse=True)
    except Exception:
        # Not a bare except: KeyboardInterrupt and SystemExit must not be
        # turned into a parsing error.
        raise Error('Error getting lengths from fai file ' + infile)
    finally:
        utils.close(f)

    if not lengths:
        return {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}

    total_length = sum(lengths)
    # lengths is sorted in descending order, so the extremes are at the ends
    stats = {
        'longest': lengths[0],
        'shortest': lengths[-1],
        'total_length': total_length,
        'mean': total_length / len(lengths),
        'number': len(lengths),
    }

    # N50: length at which the running total, longest first, reaches half
    # of the total length.
    cumulative_length = 0
    for length in lengths:
        cumulative_length += length
        if cumulative_length >= 0.5 * total_length:
            stats['N50'] = length
            break

    return stats
Ejemplo n.º 11
0
def acgtn_only(infile, outfile):
    '''Copy infile to outfile after calling replace_non_acgt() on every sequence

    (intended to turn every character that is not a/c/g/t/n, in either case, into an N).
    '''
    out_handle = utils.open_file_write(outfile)
    records = sequences.file_reader(infile)
    for record in records:
        record.replace_non_acgt()
        print(record, file=out_handle)
    utils.close(out_handle)
Ejemplo n.º 12
0
	def run(self):
		'''Look for overlaps for each contig, optionally trim, and write contigs plus summary.

		Runs inside self.working_directory and always changes back to the
		previous directory afterwards.  nucmer is run (fasta file against
		itself) only when no alignments are loaded yet and the set of contig
		ids differs from self.ids_to_skip.  Every contig is written to
		self.output_file; contigs in self.ids_to_skip are written untouched
		and summarised with no overlap and no trim status.  The alignments
		file is deleted unless self.debug is set.
		'''
		original_dir = os.getcwd()
		os.chdir(self.working_directory)
		try:
			contigs_in_file = set(self.contigs.keys())
			if contigs_in_file != self.ids_to_skip and not self.alignments:
				self.alignments = utils.run_nucmer(self.fasta_file, self.fasta_file, self._build_alignments_filename(), min_percent_id=self.overlap_percent_identity)

			output_fw = fastaqutils.open_file_write(self.output_file)
			try:
				for contig_id in sorted(self.contigs.keys()):
					# Reset per contig: a skipped contig must not reuse the
					# values of the previous one (or hit unbound names when
					# the very first contig is skipped).
					# NOTE(review): assumes _write_summary accepts None as
					# "no overlap" - confirm.
					best_overlap = None
					trim_status = None
					# Look for overlaps, trim if applicable
					if contig_id not in self.ids_to_skip:
						best_overlap = self._find_best_overlap(contig_id)
						if best_overlap and self.trim:
							trim_status = self._trim(contig_id, best_overlap)
					self._write_summary(contig_id, best_overlap, trim_status)
					print(sequences.Fasta(contig_id, self.contigs[contig_id].seq), file=output_fw)
			finally:
				fastaqutils.close(output_fw)

			if not self.debug:
				utils.delete(self._build_alignments_filename())
		finally:
			# never leave the process in the working directory
			os.chdir(original_dir)
Ejemplo n.º 13
0
Archivo: tasks.py Proyecto: nds/Fastaq
def trim_contigs(infile, outfile, trim):
    '''Trim both ends of each contig and widen every gap by `trim` bases.

    Sequences shorter than 2 * trim are skipped.  For each gap reported by
    seq.gaps(), the `trim` bases before the gap and the bases from the gap
    end up to `trim` bases after it are overwritten with N.  The sequence is
    then trimmed by `trim` at each end, Ns at the ends are removed with
    seq.trim_Ns(), and it is written to outfile only if a non-N base is left.

    The output handle is closed even if processing a sequence raises.
    '''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    # Compiled once: the pattern does not depend on the sequence.
    non_n_base = re.compile('[^nN]')

    try:
        for seq in seq_reader:
            if len(seq) < 2 * trim:
                continue

            gaps = seq.gaps()
            bases = list(seq.seq)

            # extend the length of each gap
            for gap in gaps:
                left_start = max(gap.start - trim, 0)
                right_end = min(gap.end + trim + 1, len(seq))

                for i in range(left_start, gap.start):
                    bases[i] = 'N'

                for i in range(gap.end, right_end):
                    bases[i] = 'N'

            seq.seq = ''.join(bases)

            # trim start/end bases and tidy up any resulting Ns at either
            # end of the trimmed seq
            seq.trim(trim, trim)
            seq.trim_Ns()

            # only keep sequences with some non-N sequence left over
            if non_n_base.search(seq.seq) is not None:
                print(seq, file=fout)
    finally:
        utils.close(fout)
Ejemplo n.º 14
0
def fix_blast_coords(blast_file, coords_file, outfile):
    '''Rewrite a tabulated blast file, renaming and shifting hits listed in coords_file.

    For every line whose first field is a key of the dict built by
    offset_coords_file_to_dict(coords_file), the offset is added to fields 7
    and 8 (indexes 6 and 7) and the first field is replaced by the mapped
    name.  Lines without a tab are dropped; all other lines are written to
    outfile re-joined with tabs.
    '''
    offsets = offset_coords_file_to_dict(coords_file)
    blast_in = utils.open_file_read(blast_file)
    fixed_out = utils.open_file_write(outfile)

    for raw_line in blast_in:
        # blastn header lines contain no tab character; only tabulated
        # lines are processed.
        if '\t' in raw_line:
            # Fields are meant to be tab separated, but a stray space can
            # follow a tab, so split on any whitespace.  Sequence names
            # have already had whitespace removed by the pipeline.
            fields = raw_line.rstrip().split()
            name = fields[0]
            if name in offsets:
                entry = offsets[name]
                fields[6] = str(int(fields[6]) + entry[1])
                fields[7] = str(int(fields[7]) + entry[1])
                fields[0] = entry[0]

            # The line is always rebuilt so that the output is cleanly
            # tab delimited.
            print('\t'.join(fields).rstrip(), file=fixed_out)

    utils.close(blast_in)
    utils.close(fixed_out)
Ejemplo n.º 15
0
def translate(infile, outfile, frame=0):
    '''Write the result of translate(frame=frame) for every sequence of infile to outfile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        translated = record.translate(frame=frame)
        print(translated, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 16
0
Archivo: tasks.py Proyecto: nds/Fastaq
def translate(infile, outfile, frame=0):
    '''Write the result of translate(frame=frame) for every sequence of infile to outfile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        translated = record.translate(frame=frame)
        print(translated, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 17
0
def file_reader(fname):
    '''Generator over the records of the file `fname`.

    The same Caf object is yielded every time; it is refilled by
    get_next_from_file() before each yield, so callers must copy anything
    they want to keep.  The file is closed when the records run out, when
    parsing raises, or when the generator is closed before the end.
    '''
    f = utils.open_file_read(fname)
    c = Caf()

    try:
        while c.get_next_from_file(f):
            yield c
    finally:
        # also runs on generator close, so the handle cannot leak
        utils.close(f)
Ejemplo n.º 18
0
def reverse_complement(infile, outfile):
    '''Call revcomp() on every sequence of infile and write the result to outfile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # revcomp() changes the record in place
        record.revcomp()
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 19
0
def replace_bases(infile, outfile, old, new):
    '''Call replace_bases(old, new) on every sequence of infile and write the result to outfile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # the record is modified in place, then printed
        record.replace_bases(old, new)
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 20
0
Archivo: tasks.py Proyecto: nds/Fastaq
def replace_bases(infile, outfile, old, new):
    '''Call replace_bases(old, new) on every sequence of infile and write the result to outfile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # the record is modified in place, then printed
        record.replace_bases(old, new)
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 21
0
Archivo: tasks.py Proyecto: nds/Fastaq
def reverse_complement(infile, outfile):
    '''Call revcomp() on every sequence of infile and write the result to outfile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # revcomp() changes the record in place
        record.revcomp()
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 22
0
Archivo: tasks.py Proyecto: nds/Fastaq
def strip_illumina_suffix(infile, outfile):
    '''Call strip_illumina_suffix() on every sequence of infile and write the result to outfile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # the record is modified in place, then printed
        record.strip_illumina_suffix()
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 23
0
def strip_illumina_suffix(infile, outfile):
    '''Call strip_illumina_suffix() on every sequence of infile and write the result to outfile.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        # the record is modified in place, then printed
        record.strip_illumina_suffix()
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 24
0
def trim(infile, outfile, start, end):
    '''Call trim(start, end) on every sequence of infile; write those that are not left empty.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        record.trim(start, end)
        # sequences trimmed down to nothing are dropped
        if len(record) == 0:
            continue
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 25
0
def sort_by_name(infile, outfile):
    '''Write the sequences of infile to outfile in sorted key order.

    The sequences are loaded with file_to_dict() and written in the order
    given by Python's sorted() on the dictionary keys (presumably the
    sequence names - see file_to_dict).
    '''
    records = {}
    file_to_dict(infile, records)
    out_handle = utils.open_file_write(outfile)
    ordered_names = sorted(records.keys())
    for key in ordered_names:
        print(records[key], file=out_handle)
    utils.close(out_handle)
Ejemplo n.º 26
0
def trim_Ns_at_end(infile, outfile):
    '''Call trim_Ns() on every sequence of infile; write those that are not left empty.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        record.trim_Ns()
        # sequences with nothing left are dropped
        if len(record) == 0:
            continue
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 27
0
def search_for_seq(infile, outfile, search_string):
    '''Write one tab-delimited line per hit of search_string in the sequences of infile.

    Each line holds the sequence id, the first element of the hit plus one,
    and the second element of the hit, as returned by seq.search().
    '''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        for match in record.search(search_string):
            fields = (record.id, match[0]+1, match[1])
            print(*fields, sep='\t', file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 28
0
Archivo: tasks.py Proyecto: nds/Fastaq
def trim_Ns_at_end(infile, outfile):
    '''Call trim_Ns() on every sequence of infile; write those that are not left empty.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        record.trim_Ns()
        # sequences with nothing left are dropped
        if len(record) == 0:
            continue
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 29
0
def sort_by_name(infile, outfile):
    '''Write the sequences of infile to outfile in sorted key order.

    The sequences are loaded with file_to_dict() and written in the order
    given by Python's sorted() on the dictionary keys (presumably the
    sequence names - see file_to_dict).
    '''
    records = {}
    file_to_dict(infile, records)
    out_handle = utils.open_file_write(outfile)
    ordered_names = sorted(records.keys())
    for key in ordered_names:
        print(records[key], file=out_handle)
    utils.close(out_handle)
Ejemplo n.º 30
0
Archivo: tasks.py Proyecto: nds/Fastaq
def trim(infile, outfile, start, end):
    '''Call trim(start, end) on every sequence of infile; write those that are not left empty.'''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        record.trim(start, end)
        # sequences trimmed down to nothing are dropped
        if len(record) == 0:
            continue
        print(record, file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 31
0
Archivo: tasks.py Proyecto: nds/Fastaq
def search_for_seq(infile, outfile, search_string):
    '''Write one tab-delimited line per hit of search_string in the sequences of infile.

    Each line holds the sequence id, the first element of the hit plus one,
    and the second element of the hit, as returned by seq.search().
    '''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        for match in record.search(search_string):
            fields = (record.id, match[0]+1, match[1])
            print(*fields, sep='\t', file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 32
0
Archivo: tasks.py Proyecto: nds/Fastaq
def to_fasta_union(infile, outfile, seqname='union'):
    '''Join all sequences of infile, in file order, into one FASTA record called seqname.'''
    # The whole input is read before the output file is opened.
    pieces = [record.seq for record in sequences.file_reader(infile)]

    out_handle = utils.open_file_write(outfile)
    merged = sequences.Fasta(seqname, ''.join(pieces))
    print(merged, file=out_handle)
    utils.close(out_handle)
Ejemplo n.º 33
0
def sort_by_size(infile, outfile, smallest_first=False):
    '''Write the sequences of infile to outfile ordered by length.

    Longest first by default; smallest_first=True reverses that.  The
    sequences are loaded with file_to_dict() before any output is written.
    '''
    loaded = {}
    file_to_dict(infile, loaded)
    ordered = sorted(loaded.values(), key=len, reverse=not smallest_first)
    out_handle = utils.open_file_write(outfile)
    for record in ordered:
        print(record, file=out_handle)
    utils.close(out_handle)
Ejemplo n.º 34
0
    def test_get_next_from_embl_file(self):
        '''Records read from sequences_test.embl should equal Fasta seq1, seq2, ... built from expected_embl'''
        handle = utils.open_file_read(os.path.join(data_dir, 'sequences_test.embl'))
        record = sequences.Embl()
        seen = 0

        while record.get_next_from_file(handle):
            seen += 1
            # names are 1-based, expected_embl is 0-based
            wanted = sequences.Fasta('seq' + str(seen), expected_embl[seen - 1])
            self.assertEqual(record, wanted)

        utils.close(handle)
Ejemplo n.º 35
0
Archivo: tasks.py Proyecto: nds/Fastaq
def sort_by_size(infile, outfile, smallest_first=False):
    '''Write the sequences of infile to outfile ordered by length.

    Longest first by default; smallest_first=True reverses that.  The
    sequences are loaded with file_to_dict() before any output is written.
    '''
    loaded = {}
    file_to_dict(infile, loaded)
    ordered = sorted(loaded.values(), key=len, reverse=not smallest_first)
    out_handle = utils.open_file_write(outfile)
    for record in ordered:
        print(record, file=out_handle)
    utils.close(out_handle)
Ejemplo n.º 36
0
def offset_coords_file_to_dict(filename):
    '''Load a three-column tab-delimited file into a dict.

    Each line must hold exactly three tab-separated fields.  Returns
    {first field: (second field, int(third field))}.  A first field that
    occurs twice trips an assertion.
    '''
    handle = utils.open_file_read(filename)
    by_name = {}

    for row in handle:
        name, reference, shift = row.rstrip().split('\t')
        assert name not in by_name
        by_name[name] = (reference, int(shift))

    utils.close(handle)
    return by_name
Ejemplo n.º 37
0
def file_reader(fname):
    '''Generator yielding a GFF_record for each feature line of the file `fname`.

    Lines starting with '#' are skipped.  Reading stops at the first line
    that starts with '##FASTA' or '>'.  The file is closed when reading
    ends, when building a record raises, or when the generator is closed
    before the end.
    '''
    f = utils.open_file_read(fname)
    try:
        for line in f:
            # checked before the generic '#' test, because '##FASTA' also
            # starts with '#'
            if line.startswith(('##FASTA', '>')):
                break
            if line.startswith('#'):
                continue
            yield GFF_record(line)
    finally:
        # also runs on generator close, so the handle cannot leak
        utils.close(f)
Ejemplo n.º 38
0
Archivo: tasks.py Proyecto: nds/Fastaq
def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, tile_step=10000, gamma_shape=1.2,  gamma_scale=6000, coverage=10, gamma_min_length=20000, seed=None, ins_skip=None, ins_window=None,):
    '''Write reads sampled from each sequence of infile to outfile as Fasta records.

    Reads are named <seq id>_<start>_<end> with 1-based, inclusive coordinates.

    method is one of:
      'tiling'  - reads of fixed_read_length starting every tile_step bases;
                  the last read of a sequence is cut off at the sequence end.
      'gamma'   - read lengths drawn from numpy.random.gamma(gamma_shape,
                  scale=gamma_scale), redrawn until the length lies between
                  gamma_min_length and the sequence length; start positions
                  come from random.randint.
      'uniform' - reads of fixed_read_length at start positions from
                  random.randint.
    For 'gamma' and 'uniform', reads are generated until their total length
    reaches coverage * len(seq) minus half of the minimum/fixed read length.

    Sequences shorter than fixed_read_length (gamma_min_length for 'gamma')
    are skipped with a message on stderr.

    ins_skip and ins_window must be given together or not at all; when
    ins_skip is set, add_insertions(skip=ins_skip, window=ins_window) is
    called on each read before it is written.

    seed, if not None, seeds the `random` module.
    '''
    assert method in ['tiling', 'gamma', 'uniform']
    # either both insertion options are None, or neither is
    assert ins_skip == ins_window == None or None not in [ins_skip, ins_window]
    if seed is not None:
        # NOTE(review): only `random` is seeded here; numpy.random, which the
        # 'gamma' method draws read lengths from, is not seeded in this
        # function - confirm whether 'gamma' is meant to be reproducible.
        random.seed(a=seed)
    seq_reader = sequences.file_reader(infile)
    f = utils.open_file_write(outfile)

    for seq in seq_reader:
        if method == 'tiling':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            for i in range(0, len(seq), tile_step):
                # the final tile may be shorter than fixed_read_length
                end = min(len(seq), i + fixed_read_length)
                fa = sequences.Fasta('_'.join([seq.id, str(i + 1), str(end)]), seq[i:end])
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
                # stop once a tile has reached the end of the sequence
                if end >= len(seq):
                    break
        elif method == 'gamma':
            if len(seq) < gamma_min_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * gamma_min_length:
                read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
                # reject lengths outside [gamma_min_length, len(seq)] and draw again
                while read_length < gamma_min_length or read_length > len(seq):
                    read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))

                start = random.randint(0, len(seq) - read_length)
                # end is the 0-based index of the last base of the read
                end = start + read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                # counted before insertions are added to the read
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
        elif method == 'uniform':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * fixed_read_length:
                start = random.randint(0, len(seq) - fixed_read_length)
                # end is the 0-based index of the last base of the read
                end = start + fixed_read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                # counted before insertions are added to the read
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)


    utils.close(f)
Ejemplo n.º 39
0
def to_boulderio(infile, outfile):
    '''Write every sequence of infile as a "Boulder-IO format" record, as used by primer3.

    Each record is three lines: SEQUENCE_ID=<id>, SEQUENCE_TEMPLATE=<seq>
    and a line holding only "=".
    '''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        id_line = "SEQUENCE_ID=" + record.id
        print(id_line, file=out_handle)
        template_line = "SEQUENCE_TEMPLATE=" + record.seq
        print(template_line, file=out_handle)
        # a lone "=" ends the record
        print("=", file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 40
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from OK, including weirdness in file'''
        handle = utils.open_file_read(os.path.join(data_dir, 'sequences_test.fa'))
        record = sequences.Fasta()
        seen = 0

        # every record is expected to be named 1, 2, 3, ... with sequence ACGTA
        while record.get_next_from_file(handle):
            seen += 1
            self.assertEqual(record, sequences.Fasta(str(seen), 'ACGTA'))

        utils.close(handle)
Ejemplo n.º 41
0
def offset_coords_file_to_dict(filename):
    '''Load a three-column tab-delimited file into a dict.

    Each line must hold exactly three tab-separated fields.  Returns
    {first field: (second field, int(third field))}.  A first field that
    occurs twice trips an assertion.
    '''
    handle = utils.open_file_read(filename)
    by_name = {}

    for row in handle:
        name, reference, shift = row.rstrip().split('\t')
        assert name not in by_name
        by_name[name] = (reference, int(shift))

    utils.close(handle)
    return by_name
def nucmer_file_reader(fname):
    '''Generator yielding a NucmerHit for each line after the header of the file `fname`.

    The header is everything up to and including the first line that
    starts with "[".  The file is closed when reading ends, when building a
    hit raises, or when the generator is closed before the end.
    '''
    f = utils.open_file_read(fname)
    in_header = True

    try:
        for line in f:
            if in_header:
                # the line starting with "[" is the last header line
                if line.startswith("["):
                    in_header = False
                continue
            yield NucmerHit(line)
    finally:
        # also runs on generator close, so the handle cannot leak
        utils.close(f)
Ejemplo n.º 43
0
    def test_get_next_from_gbk_file(self):
        '''Records read from sequences_test.gbk should equal Fasta NAME1, NAME2, ... with the expected sequences'''
        handle = utils.open_file_read(os.path.join(data_dir, 'sequences_test.gbk'))
        record = sequences.Embl()
        seen = 0
        wanted_seqs = [
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa']

        while record.get_next_from_file(handle):
            seen += 1
            # names are 1-based, the list of expected sequences is 0-based
            wanted = sequences.Fasta('NAME' + str(seen), wanted_seqs[seen - 1])
            self.assertEqual(record, wanted)

        utils.close(handle)
Ejemplo n.º 44
0
Archivo: tasks.py Proyecto: nds/Fastaq
def fastaq_to_fake_qual(infile, outfile, q=40):
    '''Write a quality file giving every base of every sequence of infile the score q.

    Each record is a '>' + id line followed by space-separated scores,
    wrapped at sequences.Fasta.line_length scores per line (a single line
    when line_length is 0).
    '''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)

    for record in records:
        print('>' + record.id, file=out_handle)
        width = sequences.Fasta.line_length
        total = len(record)
        if width == 0:
            # no wrapping: all scores on one line
            print(' '.join([str(q)] * total), file=out_handle)
            continue

        for offset in range(0, total, width):
            # the last line holds whatever is left over
            on_this_line = min(width, total - offset)
            print(' '.join([str(q)] * on_this_line), file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 45
0
Archivo: tasks.py Proyecto: nds/Fastaq
def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
    '''Write a tab-delimited CDS line for every ORF of every sequence of infile.

    ORFs come from seq.all_orfs(min_length=min_length) as (coords, revcomp)
    pairs.  Columns: sequence id, tool_name, 'CDS', start + 1, end + 1, '.',
    strand ('-' when revcomp is true, else '+'), '.'.
    '''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)
    for record in records:
        for coords, revcomp in record.all_orfs(min_length=min_length):
            strand = '-' if revcomp else '+'
            columns = (record.id, tool_name, 'CDS', coords.start+1, coords.end+1, '.', strand, '.')
            print(*columns, sep='\t', file=out_handle)

    utils.close(out_handle)
Ejemplo n.º 46
0
Archivo: tasks.py Proyecto: nds/Fastaq
def fasta_to_fastq(fasta_in, qual_in, outfile):
    '''Combine a fasta file and a quality file, read in step, into a fastq file.

    The n-th record of qual_in must have the same id as the n-th record of
    fasta_in.  Its space-separated scores are converted to ints and passed
    to seq.to_Fastq().

    Raises Error if the names differ or if qual_in has fewer records than
    fasta_in.  The output file is closed in every case.
    '''
    fa_reader = sequences.file_reader(fasta_in)
    qual_reader = sequences.file_reader(qual_in, read_quals=True)
    f_out = utils.open_file_write(outfile)

    try:
        for seq in fa_reader:
            try:
                qual = next(qual_reader)
            except StopIteration:
                # Report a short quality file as Error, like the name
                # mismatch below, instead of leaking StopIteration.
                raise Error('Error getting quality scores for sequence', seq.id, ' ... cannot continue')

            if seq.id != qual.id:
                raise Error('Mismatch in names from fasta and qual file', seq.id, qual.id)

            qual.seq = [int(x) for x in qual.seq.split()]
            print(seq.to_Fastq(qual.seq), file=f_out)
    finally:
        utils.close(f_out)
Ejemplo n.º 47
0
Archivo: tasks.py Proyecto: nds/Fastaq
def fastaq_to_mira_xml(infile, outfile):
    '''Write an XML <trace_volume> with one <trace> element per sequence of infile.

    Each trace holds the sequence id as trace_name, the sequence length as
    clip_quality_right and a fixed clip_vector_left of 1.
    '''
    records = sequences.file_reader(infile)
    out_handle = utils.open_file_write(outfile)
    print('\n'.join(['<?xml version="1.0"?>', '<trace_volume>']), file=out_handle)

    for record in records:
        trace_lines = [
            '    <trace>',
            '        <trace_name>' + record.id + '</trace_name>',
            '        <clip_quality_right>' + str(len(record)) + '</clip_quality_right>',
            '        <clip_vector_left>1</clip_vector_left>',
            '    </trace>',
        ]
        print('\n'.join(trace_lines), file=out_handle)

    print('</trace_volume>', file=out_handle)
    utils.close(out_handle)
Ejemplo n.º 48
0
Archivo: tasks.py Proyecto: nds/Fastaq
def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
    ):
    '''Write the sequences of infile that pass all the given filters to outfile.

    A sequence passes if its length is within [minlength, maxlength], its
    id matches `regex` (when given) and its id is a line of `ids_file`
    (when given).  `invert` writes the sequences that would otherwise be
    dropped.

    With `mate_in`, the two input files are read in step and `mate_out` is
    required.  A pair is wanted if both mates pass, or, when
    both_mates_pass is False, if at least one of them passes.  Wanted reads
    go to outfile and their mates to mate_out.

    Raises Error if mate_in is given without mate_out, or if no mate can be
    read for a sequence.  The output files are closed in every case.
    '''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            for line in f:
                ids_from_file.add(line.rstrip())
        finally:
            utils.close(f)

    if mate_in:
        if mate_out is None:
            raise Error('Error in filter! mate_in provided. Must also provide mate_out')

        seq_reader_mate = sequences.file_reader(mate_in)
        f_out_mate = utils.open_file_write(mate_out)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    if regex is not None:
        r = re.compile(regex)

    def passes(seq):
        return minlength <= len(seq) <= maxlength \
              and (regex is None or r.search(seq.id) is not None) \
              and (ids_file is None or seq.id in ids_from_file)

    try:
        for seq in seq_reader:
            seq_passes = passes(seq)
            if mate_in:
                try:
                    seq_mate = next(seq_reader_mate)
                except Exception:
                    # Not a bare except: KeyboardInterrupt and SystemExit
                    # must not be reported as a missing mate.
                    raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

                mate_passes = passes(seq_mate)
                want_the_pair = (seq_passes and mate_passes) \
                                or (( seq_passes or mate_passes) and not both_mates_pass)
                if want_the_pair != invert:
                    print(seq, file=f_out)
                    print(seq_mate, file=f_out_mate)
            elif seq_passes != invert:
                print(seq, file=f_out)
    finally:
        # close both outputs on success and on every error path
        utils.close(f_out)
        if mate_in:
            utils.close(f_out_mate)