Ejemplo n.º 1
0
def fix_blast_coords(blast_file, coords_file, outfile):
    '''Copy tabulated blast hits to outfile, shifting coordinates of renamed sequences.

    coords_file is loaded with offset_coords_file_to_dict(). For each hit whose
    first column is a key of that dictionary, the stored offset is added to
    columns 7 and 8 (presumably the query start/end of blast tabular output -
    confirm against the blast format in use) and the first column is replaced
    by the stored reference name. Hits for any other name are copied through.
    Every written line is re-joined with tabs.

    Lines without a tab (blastn header lines) and lines holding nothing but
    whitespace are skipped. Both file handles are closed even when a line
    cannot be parsed.
    '''
    coords_offset = offset_coords_file_to_dict(coords_file)
    fin = utils.open_file_read(blast_file)
    try:
        fout = utils.open_file_write(outfile)
        try:
            for line in fin:
                # blastn sticks a bunch of header lines in the tabulated
                # output file. Need to ignore them
                if '\t' not in line:
                    continue

                # Lines are supposed to be tab delimited. Sometimes they
                # have a space character following a tab character, so
                # split on whitespace. This is OK because the pipeline has
                # already removed whitespace from sequence names
                data = line.split()
                if not data:
                    # only whitespace on this line: there is no hit to fix
                    continue

                offset_info = coords_offset.get(data[0])
                if offset_info is not None:
                    ref_name, offset = offset_info
                    data[6] = str(int(data[6]) + offset)
                    data[7] = str(int(data[7]) + offset)
                    data[0] = ref_name

                # always reconstruct the line, because of spaces bug mentioned above
                print('\t'.join(data), file=fout)
        finally:
            utils.close(fout)
    finally:
        utils.close(fin)
Ejemplo n.º 2
0
    def test_get_next_from_file(self):
        '''Test get_next_from_file() against the two reads expected in caf_test.caf'''

        def expected_read(name, bases, quals, ligation, clone, clip_start, clip_end):
            # Build the Caf object that the parser should produce for one read
            wanted = caf.Caf()
            wanted.id = name
            wanted.seq = sequences.Fasta(name, bases).to_Fastq(quals)
            wanted.insert_min = 2000
            wanted.insert_max = 4000
            wanted.ligation = ligation
            wanted.clone = clone
            wanted.clip_start = clip_start
            wanted.clip_end = clip_end
            return wanted

        f_in = utils.open_file_read(os.path.join(data_dir, 'caf_test.caf'))
        c = caf.Caf()

        # first read has clipping coordinates
        c.get_next_from_file(f_in)
        self.assertEqual(
            c,
            expected_read('read1.p1k', 'NACGTAN', [4, 24, 42, 43, 40, 30, 8],
                          '12345', 'clone1', 1, 5))

        # second read has no clipping information
        c.get_next_from_file(f_in)
        self.assertEqual(
            c,
            expected_read('read2.p1k', 'CGACGTT', [9, 9, 40, 41, 42, 42, 4],
                          '23456', 'clone2', None, None))

        utils.close(f_in)
	def __init__(self,
			fasta_file,
			working_directory=None,
			cutoff_contig_length=2000,
			percent_match=95,
			skip=None,
			summary_file="contig_cleanup_summary.txt",
			summary_prefix="[contig cleanup]",
			debug=False):
		'''Constructor

		fasta_file: file of contigs; loaded into self.contigs with tasks.file_to_dict()
		working_directory: defaults to the current working directory when not given
		cutoff_contig_length, percent_match, summary_file, summary_prefix, debug:
			stored unchanged as attributes of the same name
		skip: ids to skip, given either as a set (or frozenset) of ids or as the
			name of a file with one id per line; stored in self.ids_to_skip
		'''
		self.fasta_file = fasta_file
		self.working_directory = working_directory if working_directory else os.getcwd()
		self.cutoff_contig_length = cutoff_contig_length
		self.percent_match = percent_match
		self.summary_file = summary_file
		self.summary_prefix = summary_prefix
		self.debug = debug
		self.contigs = {}
		tasks.file_to_dict(self.fasta_file, self.contigs) #Read contig ids and sequences into dict

		self.ids_to_skip = set()
		if skip:
			if isinstance(skip, (set, frozenset)):
				# copy, so later changes to the caller's set do not leak in
				self.ids_to_skip = set(skip)
			else:
				# anything else is taken to be a filename, one id per line
				fh = fastaqutils.open_file_read(skip)
				try:
					self.ids_to_skip.update(line.rstrip() for line in fh)
				finally:
					fastaqutils.close(fh)
		self.output_file = self._build_final_filename()
Ejemplo n.º 4
0
def stats_from_fai(infile):
    '''Returns dictionary of length stats from an fai file. Keys are: longest, shortest, mean, total_length, N50, number

    The length of each sequence is taken from the second tab-separated column
    of each line. If the file has no lines, every key maps to 0.

    Raises Error (chained to the underlying exception) if the lengths cannot
    be read. The file handle is closed whether or not reading succeeds.
    '''
    f = utils.open_file_read(infile)
    try:
        lengths = sorted((int(line.split('\t')[1]) for line in f), reverse=True)
    except Exception as err:
        raise Error('Error getting lengths from fai file ' + infile) from err
    finally:
        utils.close(f)

    if not lengths:
        return {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}

    total_length = sum(lengths)
    stats = {
        # lengths is sorted from longest to shortest
        'longest': lengths[0],
        'shortest': lengths[-1],
        'total_length': total_length,
        'mean': total_length / len(lengths),
        'number': len(lengths),
    }

    # N50: walking from the longest sequence down, the length at which the
    # running total first reaches half of the total length
    cumulative_length = 0
    for length in lengths:
        cumulative_length += length
        if cumulative_length >= 0.5 * total_length:
            stats['N50'] = length
            break

    return stats
	def _run_prodigal_and_store_gene_starts(self):
		'''Run prodigal and find the start of genes around the middle of each contig

		Returns a dictionary of contig id -> ProdigalHit, or None for a contig
		where no hit qualified. A hit qualifies when its distance attribute is
		no more than half the contig length; the hit with the smallest distance
		wins, and on a tie the hit listed later in the prodigal output wins.
		Returns an empty dictionary if utils.run_prodigal() returns a false value.

		Each non-comment line of the prodigal output is split on tabs: column 1
		is the contig id, columns 4 and 5 the start and end, column 7 the
		strand (looks like GFF - confirm against the prodigal output format used).
		'''
		gene_starts = {}
		# run prodigal
		prodigal_output = utils.run_prodigal(self.fasta_file, self._build_prodigal_filename(), self._get_length_of_fasta_file())
		if not prodigal_output:
			return gene_starts

		prodigal_genes = {}
		fh = fastaqutils.open_file_read(self._build_prodigal_filename())
		try:
			for line in fh:
				if line.startswith("#"):
					continue
				columns = line.split('\t')
				start_location = int(columns[3])
				end_location = int(columns[4])
				contig_id = columns[0]
				strand = columns[6]
				middle = len(self.contigs[contig_id]) / 2
				p = prodigal_hit.ProdigalHit(start_location, end_location, strand, middle)
				prodigal_genes.setdefault(contig_id, []).append(p)
		finally:
			fastaqutils.close(fh)

		# look for best distance
		for contig_name in self.contigs:
			best_gene = None
			# Half the length of THIS contig. The threshold used to be taken
			# from whichever contig appeared last in the prodigal file.
			min_distance = len(self.contigs[contig_name]) / 2
			for p in prodigal_genes.get(contig_name, []):
				if p.distance <= min_distance:
					best_gene = p
					min_distance = p.distance
			# None means: could not find a gene
			gene_starts[contig_name] = best_gene if best_gene else None
		return gene_starts
Ejemplo n.º 6
0
Archivo: tasks.py Proyecto: nds/Fastaq
def filter(
    infile,
    outfile,
    minlength=0,
    maxlength=float('inf'),
    regex=None,
    ids_file=None,
    invert=False,
    mate_in=None,
    mate_out=None,
    both_mates_pass=True,
):
    '''Write to outfile the sequences of infile that pass every filter.

    A sequence passes when its length is between minlength and maxlength
    (inclusive), its id is found by regex (if given, using re.search) and its
    id is listed in ids_file (if given, one id per line). With invert=True the
    sequences that would otherwise be dropped are written instead.

    If mate_in is given, mate_out must be given too. Sequences are then read
    in pairs, one from infile and one from mate_in, and each pair is written
    (to outfile and mate_out) or dropped as a unit. With both_mates_pass=True
    both mates must pass; otherwise one passing mate is enough.

    Raises Error if mate_in is given without mate_out, or if no mate can be
    read for a sequence of infile. Output files are closed on every exit path.
    '''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            ids_from_file.update(line.rstrip() for line in f)
        finally:
            utils.close(f)

    if mate_in and mate_out is None:
        raise Error(
            'Error in filter! mate_in provided. Must also provide mate_out'
        )

    r = re.compile(regex) if regex is not None else None

    def passes(seq):
        return minlength <= len(seq) <= maxlength \
              and (regex is None or r.search(seq.id) is not None) \
              and (ids_file is None or seq.id in ids_from_file)

    seq_reader = sequences.file_reader(infile)
    seq_reader_mate = sequences.file_reader(mate_in) if mate_in else None
    f_out = utils.open_file_write(outfile)
    f_out_mate = None
    try:
        if mate_in:
            f_out_mate = utils.open_file_write(mate_out)

        for seq in seq_reader:
            seq_passes = passes(seq)
            if not mate_in:
                if seq_passes != invert:
                    print(seq, file=f_out)
                continue

            try:
                seq_mate = next(seq_reader_mate)
            except Exception as err:
                # includes running out of mates (StopIteration)
                raise Error('Error getting mate for sequence', seq.id,
                            ' ... cannot continue') from err

            mate_passes = passes(seq_mate)
            want_the_pair = (seq_passes and mate_passes) \
                            or ((seq_passes or mate_passes) and not both_mates_pass)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
    finally:
        utils.close(f_out)
        if f_out_mate is not None:
            utils.close(f_out_mate)
Ejemplo n.º 7
0
def fix_blast_coords(blast_file, coords_file, outfile):
    '''Copy tabulated blast hits to outfile, shifting coordinates of renamed sequences.

    coords_file is loaded with offset_coords_file_to_dict(). For each hit whose
    first column is a key of that dictionary, the stored offset is added to
    columns 7 and 8 (presumably the query start/end of blast tabular output -
    confirm against the blast format in use) and the first column is replaced
    by the stored reference name. Hits for any other name are copied through.
    Every written line is re-joined with tabs.

    Lines without a tab (blastn header lines) and lines holding nothing but
    whitespace are skipped. Both file handles are closed even when a line
    cannot be parsed.
    '''
    coords_offset = offset_coords_file_to_dict(coords_file)
    fin = utils.open_file_read(blast_file)
    try:
        fout = utils.open_file_write(outfile)
        try:
            for line in fin:
                # blastn sticks a bunch of header lines in the tabulated
                # output file. Need to ignore them
                if '\t' not in line:
                    continue

                # Lines are supposed to be tab delimited. Sometimes they
                # have a space character following a tab character, so
                # split on whitespace. This is OK because the pipeline has
                # already removed whitespace from sequence names
                data = line.split()
                if not data:
                    # only whitespace on this line: there is no hit to fix
                    continue

                offset_info = coords_offset.get(data[0])
                if offset_info is not None:
                    ref_name, offset = offset_info
                    data[6] = str(int(data[6]) + offset)
                    data[7] = str(int(data[7]) + offset)
                    data[0] = ref_name

                # always reconstruct the line, because of spaces bug mentioned above
                print('\t'.join(data), file=fout)
        finally:
            utils.close(fout)
    finally:
        utils.close(fin)
Ejemplo n.º 8
0
def stats_from_fai(infile):
    '''Returns dictionary of length stats from an fai file. Keys are: longest, shortest, mean, total_length, N50, number

    The length of each sequence is taken from the second tab-separated column
    of each line. If the file has no lines, every key maps to 0.

    Raises Error (chained to the underlying exception) if the lengths cannot
    be read. The file handle is closed whether or not reading succeeds.
    '''
    f = utils.open_file_read(infile)
    try:
        lengths = sorted((int(line.split('\t')[1]) for line in f), reverse=True)
    except Exception as err:
        raise Error('Error getting lengths from fai file ' + infile) from err
    finally:
        utils.close(f)

    if not lengths:
        return {x: 0 for x in ('longest', 'shortest', 'mean', 'N50', 'total_length', 'number')}

    total_length = sum(lengths)
    stats = {
        # lengths is sorted from longest to shortest
        'longest': lengths[0],
        'shortest': lengths[-1],
        'total_length': total_length,
        'mean': total_length / len(lengths),
        'number': len(lengths),
    }

    # N50: walking from the longest sequence down, the length at which the
    # running total first reaches half of the total length
    cumulative_length = 0
    for length in lengths:
        cumulative_length += length
        if cumulative_length >= 0.5 * total_length:
            stats['N50'] = length
            break

    return stats
Ejemplo n.º 9
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should raise sequences.Error on each badly formatted file, and every record read from the good file should equal the expected Fastq'''
        for bad_name in ('sequences_test_fail_no_AT.fq',
                         'sequences_test_fail_no_seq.fq',
                         'sequences_test_fail_no_plus.fq',
                         'sequences_test_fail_no_qual.fq'):
            f_in = utils.open_file_read(os.path.join(data_dir, bad_name))
            fq = sequences.Fastq()
            # keep reading until the parser hits the broken record
            with self.assertRaises(sequences.Error):
                while fq.get_next_from_file(f_in):
                    pass

            utils.close(f_in)

        good_file = os.path.join(data_dir, 'sequences_test_good_file.fq')
        try:
            f_in = open(good_file)
        except IOError:
            print("Error opening '" + good_file + "'", file=sys.stderr)
            sys.exit(1)

        fq = sequences.Fastq()
        while True:
            if not fq.get_next_from_file(f_in):
                break
            self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
        utils.close(f_in)
Ejemplo n.º 10
0
def file_reader(fname):
    '''Generator over the records that Caf.get_next_from_file() reads from fname.

    One Caf object is created and refilled for every record, so the same
    object is yielded each time: copy it before advancing if it must be kept.

    The file is closed when the generator finishes, is closed early, or an
    error is raised while reading.
    '''
    f = utils.open_file_read(fname)
    try:
        c = Caf()
        while c.get_next_from_file(f):
            yield c
    finally:
        utils.close(f)
Ejemplo n.º 11
0
Archivo: tasks.py Proyecto: nds/Fastaq
def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
    ):
    '''Write to outfile the sequences of infile that pass every filter.

    A sequence passes when its length is between minlength and maxlength
    (inclusive), its id is found by regex (if given, using re.search) and its
    id is listed in ids_file (if given, one id per line). With invert=True the
    sequences that would otherwise be dropped are written instead.

    If mate_in is given, mate_out must be given too. Sequences are then read
    in pairs, one from infile and one from mate_in, and each pair is written
    (to outfile and mate_out) or dropped as a unit. With both_mates_pass=True
    both mates must pass; otherwise one passing mate is enough.

    Raises Error if mate_in is given without mate_out, or if no mate can be
    read for a sequence of infile. Output files are closed on every exit path.
    '''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            ids_from_file.update(line.rstrip() for line in f)
        finally:
            utils.close(f)

    if mate_in and mate_out is None:
        raise Error('Error in filter! mate_in provided. Must also provide mate_out')

    r = re.compile(regex) if regex is not None else None

    def passes(seq):
        return minlength <= len(seq) <= maxlength \
              and (regex is None or r.search(seq.id) is not None) \
              and (ids_file is None or seq.id in ids_from_file)

    seq_reader = sequences.file_reader(infile)
    seq_reader_mate = sequences.file_reader(mate_in) if mate_in else None
    f_out = utils.open_file_write(outfile)
    f_out_mate = None
    try:
        if mate_in:
            f_out_mate = utils.open_file_write(mate_out)

        for seq in seq_reader:
            seq_passes = passes(seq)
            if not mate_in:
                if seq_passes != invert:
                    print(seq, file=f_out)
                continue

            try:
                seq_mate = next(seq_reader_mate)
            except Exception as err:
                # includes running out of mates (StopIteration)
                raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue') from err

            mate_passes = passes(seq_mate)
            want_the_pair = (seq_passes and mate_passes) \
                            or ((seq_passes or mate_passes) and not both_mates_pass)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
    finally:
        utils.close(f_out)
        if f_out_mate is not None:
            utils.close(f_out_mate)
Ejemplo n.º 12
0
    def test_get_next_from_embl_file(self):
        '''Each record read from sequences_test.embl should equal Fasta("seqN", expected_embl[N-1])'''
        embl = sequences.Embl()
        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.embl'))

        index = 0
        while embl.get_next_from_file(f_in):
            # names in the file are numbered from 1, the expected list from 0
            wanted = sequences.Fasta('seq' + str(index + 1), expected_embl[index])
            self.assertEqual(embl, wanted)
            index += 1

        utils.close(f_in)
Ejemplo n.º 13
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from OK, including weirdness in file'''
        fa = sequences.Fasta()
        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.fa'))

        seen = 0
        while fa.get_next_from_file(f_in):
            # every record has the same bases; names count up from 1
            seen += 1
            wanted = sequences.Fasta(str(seen), 'ACGTA')
            self.assertEqual(fa, wanted)

        utils.close(f_in)
Ejemplo n.º 14
0
def offset_coords_file_to_dict(filename):
    '''Load a tab-separated offsets file into a dictionary.

    Every line must have exactly three tab-separated fields: sequence name,
    reference name and an integer offset. Returns
    {sequence name: (reference name, offset as int)}.

    A sequence name appearing twice trips an assertion. A line with the wrong
    number of fields or a non-integer offset raises ValueError. The file is
    closed in every case.
    '''
    offsets = {}
    f = utils.open_file_read(filename)
    try:
        for line in f:
            seq, ref, offset = line.rstrip().split('\t')
            assert seq not in offsets, 'Sequence name ' + seq + ' found more than once in ' + str(filename)
            offsets[seq] = (ref, int(offset))
    finally:
        utils.close(f)

    return offsets
Ejemplo n.º 15
0
def file_reader(fname):
    '''Generator yielding a GFF_record for each record line of fname.

    Lines starting with '#' are skipped. Reading stops at the first line that
    starts with '##FASTA' or '>', i.e. where embedded sequence data begins.

    The file is closed when the generator finishes, is closed early, or
    GFF_record() raises.
    '''
    f = utils.open_file_read(fname)
    try:
        for line in f:
            # must be tested before the generic '#' check, as '##FASTA' is
            # itself a '#' line
            if line.startswith(('##FASTA', '>')):
                break
            if not line.startswith('#'):
                yield GFF_record(line)
    finally:
        utils.close(f)
Ejemplo n.º 16
0
def offset_coords_file_to_dict(filename):
    '''Load a tab-separated offsets file into a dictionary.

    Every line must have exactly three tab-separated fields: sequence name,
    reference name and an integer offset. Returns
    {sequence name: (reference name, offset as int)}.

    A sequence name appearing twice trips an assertion. A line with the wrong
    number of fields or a non-integer offset raises ValueError. The file is
    closed in every case.
    '''
    offsets = {}
    f = utils.open_file_read(filename)
    try:
        for line in f:
            seq, ref, offset = line.rstrip().split('\t')
            assert seq not in offsets, 'Sequence name ' + seq + ' found more than once in ' + str(filename)
            offsets[seq] = (ref, int(offset))
    finally:
        utils.close(f)

    return offsets
def nucmer_file_reader(fname):
    '''Generator yielding a NucmerHit for each line after the header of fname.

    The header is everything up to and including the first line that starts
    with '['. If no such line exists, nothing is yielded.

    The file is closed when the generator finishes, is closed early, or
    NucmerHit() raises.
    '''
    f = utils.open_file_read(fname)
    try:
        lines = iter(f)

        # consume the header, including the '[' line that ends it
        for line in lines:
            if line.startswith('['):
                break

        # whatever is left is one hit per line
        for line in lines:
            yield NucmerHit(line)
    finally:
        utils.close(f)
def nucmer_file_reader(fname):
    '''Generator yielding a NucmerHit for each line after the header of fname.

    The header is everything up to and including the first line that starts
    with "[". If no such line exists, nothing is yielded.

    The file is closed when the generator finishes, is closed early, or
    NucmerHit() raises.
    '''
    f = utils.open_file_read(fname)
    try:
        lines = iter(f)

        # consume the header, including the "[" line that ends it
        for line in lines:
            if line.startswith("["):
                break

        # whatever is left is one hit per line
        for line in lines:
            yield NucmerHit(line)
    finally:
        utils.close(f)
Ejemplo n.º 19
0
def parse_file_or_set(s):
    '''Parse a file or set and return set of items in it

    s may be:
      - a false value (None, empty string, empty set): an empty set is returned
      - a set or frozenset (or a subclass): returned as is, not copied
      - anything else: taken to be a filename; each line, with trailing
        whitespace removed, becomes one item

    A missing file is not handled here: whatever fastaqutils.open_file_read()
    raises is passed on to the caller. The file is always closed once opened.
    '''
    if not s:
        return set()

    if isinstance(s, (set, frozenset)):
        return s

    fh = fastaqutils.open_file_read(s)
    try:
        return {line.rstrip() for line in fh}
    finally:
        fastaqutils.close(fh)
Ejemplo n.º 20
0
    def test_get_next_from_gbk_file(self):
        '''Each record read from sequences_test.gbk should equal Fasta("NAMEn", expected sequence n)'''
        expected = [
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa']

        embl = sequences.Embl()
        f_in = utils.open_file_read(os.path.join(data_dir, 'sequences_test.gbk'))

        index = 0
        while embl.get_next_from_file(f_in):
            # names in the file are numbered from 1, the expected list from 0
            wanted = sequences.Fasta('NAME' + str(index + 1), expected[index])
            self.assertEqual(embl, wanted)
            index += 1

        utils.close(f_in)
Ejemplo n.º 21
0
    def test_write_and_read(self):
        '''open_file_write() and open_file_read() should do the right thing depending gzipped or not'''
        for filename in ('utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz'):
            # write the numbers 0, 1, 2, one per line
            handle = utils.open_file_write(filename)
            for number in range(3):
                print(number, file=handle)
            utils.close(handle)

            # line n (from 0) of what is read back must hold the number n
            handle = utils.open_file_read(filename)
            for expected, line in enumerate(handle):
                self.assertEqual(expected, int(line.strip()))
            utils.close(handle)

            os.unlink(filename)

        # '-' stands for the standard streams
        self.assertEqual(sys.stdin, utils.open_file_read('-'))
        self.assertEqual(sys.stdout, utils.open_file_write('-'))
Ejemplo n.º 22
0
    def test_write_and_read(self):
        '''open_file_write() and open_file_read() should do the right thing depending gzipped or not'''
        for filename in ('utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz'):
            # write the numbers 0, 1, 2, one per line
            handle = utils.open_file_write(filename)
            for number in range(3):
                print(number, file=handle)
            utils.close(handle)

            # line n (from 0) of what is read back must hold the number n
            handle = utils.open_file_read(filename)
            for expected, line in enumerate(handle):
                self.assertEqual(expected, int(line.strip()))
            utils.close(handle)

            os.unlink(filename)

        # '-' stands for the standard streams
        self.assertEqual(sys.stdin, utils.open_file_read('-'))
        self.assertEqual(sys.stdout, utils.open_file_write('-'))
Ejemplo n.º 23
0
    def test_raise_exception(self):
        '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
        # two missing files, then a file named .gz that is not really gzipped
        unreadable = [
            'this_file_is_not_here_so_throw_error',
            'this_file_is_not_here_so_throw_error.gz',
            os.path.join(data_dir, 'utils_test_not_really_zipped.gz'),
        ]
        for path in unreadable:
            with self.assertRaises(utils.Error):
                utils.open_file_read(path)

        # files inside a directory that does not exist
        unwritable = [
            os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
            os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'),
        ]
        for path in unwritable:
            with self.assertRaises(utils.Error):
                utils.open_file_write(path)
Ejemplo n.º 24
0
    def __init__(
            self,
            fasta_file,
            gene_file,
            skip=None,  #Avoid circularising contigs with these ids
            hit_percent_id=80,
            match_length_percent=100,
            choose_random_gene=True,
            rename=True,
            working_directory=None,
            summary_file="contig_breaks_summary.txt",
            summary_prefix="[contig break finder]",
            debug=False):
        ''' Attributes

        fasta_file: file of contigs; loaded into self.contigs with tasks.file_to_dict()
        skip: ids to skip, given either as a set (or frozenset) of ids or as
            the name of a file with one id per line; stored in self.ids_to_skip
        working_directory: defaults to the current working directory when not given
        gene_file, hit_percent_id, match_length_percent, choose_random_gene,
        rename, summary_file, summary_prefix, debug: stored unchanged as
            attributes of the same name
        '''
        self.fasta_file = fasta_file
        self.gene_file = gene_file
        self.hit_percent_id = hit_percent_id
        self.match_length_percent = match_length_percent
        self.choose_random_gene = choose_random_gene
        self.rename = rename
        self.working_directory = working_directory if working_directory else os.getcwd()
        self.summary_file = summary_file
        self.summary_prefix = summary_prefix
        self.output_file = self._build_final_filename()
        self.debug = debug
        self.contigs = {}
        tasks.file_to_dict(
            self.fasta_file,
            self.contigs)  #Read contig ids and sequences into dict
        self.random_gene_starts = {}

        self.ids_to_skip = set()
        if skip:
            if isinstance(skip, (set, frozenset)):
                # copy, so later changes to the caller's set do not leak in
                self.ids_to_skip = set(skip)
            else:
                # anything else is taken to be a filename, one id per line
                fh = fastaqutils.open_file_read(skip)
                try:
                    self.ids_to_skip.update(line.rstrip() for line in fh)
                finally:
                    fastaqutils.close(fh)
Ejemplo n.º 25
0
def length_offsets_from_fai(fai_file):
    '''Returns a dictionary of positions of the start of each sequence, as
       if all the sequences were catted into one sequence.
       eg if file has three sequences, seq1 10bp, seq2 30bp, seq3 20bp, then
       the output would be: {'seq1': 0, 'seq2': 10, 'seq3': 40}

       The name and length are the first two whitespace-separated fields of
       each line. Raises Error (chained to the underlying ValueError) for a
       line with fewer than two fields or a non-integer length. The file is
       closed in every case.'''
    positions = {}
    total_length = 0
    f = utils.open_file_read(fai_file)

    try:
        for line in f:
            try:
                name, length = line.rstrip().split()[:2]
                length = int(length)
            except ValueError as err:
                raise Error('Error reading the following line of fai file ' + fai_file + '\n' + line) from err

            # this sequence starts where all the previous ones ended
            positions[name] = total_length
            total_length += length
    finally:
        utils.close(f)

    return positions
Ejemplo n.º 26
0
def length_offsets_from_fai(fai_file):
    '''Returns a dictionary of positions of the start of each sequence, as
       if all the sequences were catted into one sequence.
       eg if file has three sequences, seq1 10bp, seq2 30bp, seq3 20bp, then
       the output would be: {'seq1': 0, 'seq2': 10, 'seq3': 40}

       The name and length are the first two whitespace-separated fields of
       each line. Raises Error (chained to the underlying ValueError) for a
       line with fewer than two fields or a non-integer length. The file is
       closed in every case.'''
    positions = {}
    total_length = 0
    f = utils.open_file_read(fai_file)

    try:
        for line in f:
            try:
                name, length = line.rstrip().split()[:2]
                length = int(length)
            except ValueError as err:
                raise Error('Error reading the following line of fai file ' + fai_file + '\n' + line) from err

            # this sequence starts where all the previous ones ended
            positions[name] = total_length
            total_length += length
    finally:
        utils.close(f)

    return positions
Ejemplo n.º 27
0
def to_fastg(infile, outfile, circular=None):
    '''Writes a FASTG file in SPAdes format from input file. Currently only whether or not a sequence is circular is supported. Put circular=set of ids, or circular=filename to make those sequences circular in the output. Puts coverage=1 on all contigs

    Each input sequence is written twice: as read, then reverse complemented
    with a ' (prime) appended to its node name. Node names have the form
    NODE_<n>_length_<len>_cov_1_ID_<original id>, with n counting from 1.
    A circular sequence is written with an edge to itself: "name:name;".

    A circular filename is read one id per line. The output file is closed
    even if writing fails part way through.
    '''
    if circular is None:
        to_circularise = set()
    elif isinstance(circular, (set, frozenset)):
        to_circularise = circular
    else:
        f = utils.open_file_read(circular)
        try:
            to_circularise = {x.rstrip() for x in f}
        finally:
            utils.close(f)

    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    try:
        for nodes, seq in enumerate(seq_reader, start=1):
            new_id = '_'.join([
                'NODE', str(nodes),
                'length', str(len(seq)),
                'cov', '1',
                'ID', seq.id
            ])

            # decide on both names before seq.id is overwritten
            if seq.id in to_circularise:
                forward_id = new_id + ':' + new_id + ';'
                reverse_id = new_id + "':" + new_id + "';"
            else:
                forward_id = new_id + ';'
                reverse_id = new_id + "';"

            seq.id = forward_id
            print(seq, file=fout)
            seq.revcomp()
            seq.id = reverse_id
            print(seq, file=fout)
    finally:
        utils.close(fout)
Ejemplo n.º 28
0
def to_fastg(infile, outfile, circular=None):
    '''Writes a FASTG file in SPAdes format from input file. Currently only whether or not a sequence is circular is supported. Put circular=set of ids, or circular=filename to make those sequences circular in the output. Puts coverage=1 on all contigs

    Each input sequence is written twice: as read, then reverse complemented
    with a ' (prime) appended to its node name. Node names have the form
    NODE_<n>_length_<len>_cov_1_ID_<original id>, with n counting from 1.
    A circular sequence is written with an edge to itself: "name:name;".

    A circular filename is read one id per line. The output file is closed
    even if writing fails part way through.
    '''
    if circular is None:
        to_circularise = set()
    elif isinstance(circular, (set, frozenset)):
        to_circularise = circular
    else:
        f = utils.open_file_read(circular)
        try:
            to_circularise = {x.rstrip() for x in f}
        finally:
            utils.close(f)

    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    try:
        for nodes, seq in enumerate(seq_reader, start=1):
            new_id = '_'.join([
                'NODE', str(nodes),
                'length', str(len(seq)),
                'cov', '1',
                'ID', seq.id
            ])

            # decide on both names before seq.id is overwritten
            if seq.id in to_circularise:
                forward_id = new_id + ':' + new_id + ';'
                reverse_id = new_id + "':" + new_id + "';"
            else:
                forward_id = new_id + ';'
                reverse_id = new_id + "';"

            seq.id = forward_id
            print(seq, file=fout)
            seq.revcomp()
            seq.id = reverse_id
            print(seq, file=fout)
    finally:
        utils.close(fout)
Ejemplo n.º 29
0
    def test_raise_exception(self):
        '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
        # two missing files, then a file named .gz that is not really gzipped
        unreadable = [
            'this_file_is_not_here_so_throw_error',
            'this_file_is_not_here_so_throw_error.gz',
            os.path.join(data_dir, 'utils_test_not_really_zipped.gz'),
        ]
        for path in unreadable:
            with self.assertRaises(utils.Error):
                utils.open_file_read(path)

        # files inside a directory that does not exist
        unwritable = [
            os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
            os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'),
        ]
        for path in unwritable:
            with self.assertRaises(utils.Error):
                utils.open_file_write(path)
Ejemplo n.º 30
0
def lengths_from_fai(fai_file, d):
    '''Fill dictionary d with sequence name -> length (int) from an fai file.

    The name and length are the first two whitespace-separated fields of each
    line. d is updated in place and nothing is returned. A line with fewer
    than two fields or a non-integer length raises ValueError; entries added
    before the bad line stay in d. The file is closed in every case.
    '''
    f = utils.open_file_read(fai_file)
    try:
        for line in f:
            name, length = line.rstrip().split()[:2]
            d[name] = int(length)
    finally:
        utils.close(f)
Ejemplo n.º 31
0
Archivo: tasks.py Proyecto: nds/Fastaq
def lengths_from_fai(fai_file, d):
    '''Fill dictionary d with sequence name -> length (int) from an fai file.

    The name and length are the first two whitespace-separated fields of each
    line. d is updated in place and nothing is returned. A line with fewer
    than two fields or a non-integer length raises ValueError; entries added
    before the bad line stay in d. The file is closed in every case.
    '''
    f = utils.open_file_read(fai_file)
    try:
        for line in f:
            name, length = line.rstrip().split()[:2]
            d[name] = int(length)
    finally:
        utils.close(f)
Ejemplo n.º 32
0
def filter(infile,
           outfile,
           minlength=0,
           maxlength=float('inf'),
           regex=None,
           ids_file=None,
           invert=False,
           mate_in=None,
           mate_out=None,
           both_mates_pass=True,
           check_comments=False):
    '''Write to outfile the sequences of infile that pass every filter.

    A sequence passes when its length is between minlength and maxlength
    (inclusive), its name is found by regex (if given, using re.search) and
    its name is listed in ids_file (if given, one id per line). With
    invert=True the sequences that would otherwise be dropped are written.

    By default the name used for the regex and ids_file checks is the id up to
    the first whitespace, so trailing comments are ignored. With
    check_comments=True the whole id is used; this needs regex to be given.

    If mate_in is given, mate_out must be given too. Sequences are then read
    in pairs, one from infile and one from mate_in, and each pair is written
    (to outfile and mate_out) or dropped as a unit. With both_mates_pass=True
    both mates must pass; otherwise one passing mate is enough.

    Raises IncompatibleParametersError if check_comments is used without
    regex. Raises Error if mate_in is given without mate_out, or if no mate
    can be read for a sequence of infile. Output files are closed on every
    exit path.
    '''
    if check_comments and not regex:
        raise IncompatibleParametersError(
            "--check_comments can only be passed with --regex")

    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            ids_from_file.update(line.rstrip() for line in f)
        finally:
            utils.close(f)

    if mate_in and mate_out is None:
        raise Error(
            'Error in filter! mate_in provided. Must also provide mate_out'
        )

    r = re.compile(regex) if regex is not None else None
    # group 1 is the id up to the first whitespace character
    name_regex = re.compile(r'^([^\s]+).*?$')

    def passes(seq):
        # remove trailing comments from FASTQ readname lines
        matches = name_regex.match(seq.id)
        if matches is not None and not check_comments:
            clean_seq_id = matches.group(1)
        else:
            clean_seq_id = seq.id

        return minlength <= len(seq) <= maxlength \
              and (regex is None or r.search(clean_seq_id) is not None) \
              and (ids_file is None or clean_seq_id in ids_from_file)

    seq_reader = sequences.file_reader(infile)
    seq_reader_mate = sequences.file_reader(mate_in) if mate_in else None
    f_out = utils.open_file_write(outfile)
    f_out_mate = None
    try:
        if mate_in:
            f_out_mate = utils.open_file_write(mate_out)

        for seq in seq_reader:
            seq_passes = passes(seq)
            if not mate_in:
                if seq_passes != invert:
                    print(seq, file=f_out)
                continue

            try:
                seq_mate = next(seq_reader_mate)
            except Exception as err:
                # includes running out of mates (StopIteration)
                raise Error('Error getting mate for sequence', seq.id,
                            ' ... cannot continue') from err

            mate_passes = passes(seq_mate)
            want_the_pair = (seq_passes and mate_passes) \
                            or ((seq_passes or mate_passes) and not both_mates_pass)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
    finally:
        utils.close(f_out)
        if f_out_mate is not None:
            utils.close(f_out_mate)
Ejemplo n.º 33
0
def file_reader(fname, read_quals=False):
    '''Iterates over a FASTA or FASTQ file, yielding the next sequence in the file until there are no more sequences

    Other formats are recognised as well. The format is chosen from the first
    line of the file:
      - starts with '>': read with Fasta()
      - starts with '##gff-version 3': lines are skipped up to the first one
        starting with '>', and the rest is read with Fasta()
      - starts with 'ID   ' followed by a non-space character, or matches
        'LOCUS' followed by whitespace and a non-space: read with Embl()
      - starts with '@': read with Fastq()
      - two whitespace-separated integers: phylip, sequential or interleaved.
        The whole file is loaded into memory and each sequence is yielded as
        a new Fasta, with spaces and '-' characters removed
      - empty file: nothing is yielded

    For all formats except phylip, the same sequence object is refilled and
    yielded each time, and read_quals is passed on to its get_next_from_file().

    Raises Error if the format cannot be determined from the first line, or
    if a GFF file has no line starting with '>'.
    '''
    f = utils.open_file_read(fname)
    line = f.readline()
    # raw strings, so that \s and \S reach the regex engine as written
    phylip_regex = re.compile(r'^\s*[0-9]+\s+[0-9]+$')
    gbk_regex = re.compile(r'^LOCUS\s+\S')

    if line.startswith('>'):
        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('##gff-version 3'):
        # if a GFF file, need to skip past all the annotation
        # and get to the fasta sequences at the end of the file
        while not line.startswith('>'):
            line = f.readline()
            if not line:
                utils.close(f)
                raise Error('No sequences found in GFF file "' + fname + '"')

        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('ID   ') and line[5] != ' ':
        seq = Embl()
        previous_lines[f] = line
    elif gbk_regex.search(line):
        seq = Embl()
        previous_lines[f] = line
    elif line.startswith('@'):
        seq = Fastq()
        previous_lines[f] = line
    elif phylip_regex.search(line):
        # phylip format could be interleaved or not, need to look at next
        # couple of lines to figure that out. Don't expect these files to
        # be too huge, so just store all the sequences in memory
        number_of_seqs, bases_per_seq = line.strip().split()
        number_of_seqs = int(number_of_seqs)
        bases_per_seq = int(bases_per_seq)

        # keep every line that is not blank, with trailing whitespace removed
        seq_lines = []
        while True:
            line = f.readline()
            if line == '':
                break
            if line != '\n':
                seq_lines.append(line.rstrip())
        utils.close(f)

        if len(seq_lines) == 1 or len(seq_lines) == number_of_seqs:
            sequential = True
        elif seq_lines[0][10] != ' ' and seq_lines[1][10] == ' ':
            sequential = True
        else:
            sequential = False

        # if the 11th char of second sequence line is a space,  then the file is sequential, e.g.:
        # GAGCCCGGGC AATACAGGGT AT
        # as opposed to:
        # Salmo gairAAGCCTTGGC AGTGCAGGGT
        if sequential:
            current_id = None
            current_seq = ''
            for line in seq_lines:
                # a new sequence starts when the previous one has all its
                # bases, or when nothing has been read yet
                if len(current_seq) == bases_per_seq or len(current_seq) == 0:
                    if current_id is not None:
                        yield Fasta(current_id, current_seq.replace('-', ''))
                    current_seq = ''
                    # the name is in the first 10 characters of the line
                    current_id, new_bases = line[0:10].rstrip(), line.rstrip()[10:]
                else:
                    new_bases = line.rstrip()

                current_seq += new_bases.replace(' ','')

            yield Fasta(current_id, current_seq.replace('-', ''))
        else:
            # seaview files start all seqs at pos >=12. Other files start
            # their sequence at the start of the line
            if seq_lines[number_of_seqs + 1][0] == ' ':
                first_gap_pos = seq_lines[0].find(' ')
                end_of_gap = first_gap_pos
                while seq_lines[0][end_of_gap] == ' ':
                    end_of_gap += 1
                first_seq_base = end_of_gap
            else:
                first_seq_base = 10

            # the first number_of_seqs lines carry the names
            seqs = []
            for i in range(number_of_seqs):
                name, bases = seq_lines[i][0:first_seq_base].rstrip(), seq_lines[i][first_seq_base:]
                seqs.append(Fasta(name, bases))

            # the remaining lines cycle through the sequences in the same order
            for i in range(number_of_seqs, len(seq_lines)):
                seqs[i%number_of_seqs].seq += seq_lines[i]

            for fa in seqs:
                fa.seq = fa.seq.replace(' ','').replace('-','')
                yield fa

        return
    elif line == '':
        utils.close(f)
        return
    else:
        utils.close(f)
        raise Error('Error determining file type from file "' + fname + '". First line is:\n' + line.rstrip())

    try:
        while seq.get_next_from_file(f, read_quals):
            yield seq
    finally:
        utils.close(f)
import os
from pyfastaq import utils
import pysam

# Command line: two positional arguments, the prefix of the input files and
# the prefix to use for the output files.
parser = argparse.ArgumentParser(
    description="Works out the layout of the contigs within scaffolds, using the file *.tags_and_sam.gz file made by the script scaffold_test_check_using_tags.py",
    usage="%(prog)s [options] <inprefix> <outprefix>",
)
parser.add_argument(
    "inprefix", help="Prefix of input files. Use the outprefix when scaff_test_check_using_tags.py was run"
)
parser.add_argument("outprefix", help="Prefix of output files")
options = parser.parse_args()

# load flags into memory: one integer per line of <inprefix>.tags.gz.
# The handle is closed in a finally block so that a read error or a line
# that is not an integer cannot leave it open.
f = utils.open_file_read(options.inprefix + ".tags.gz")
try:
    flags = [int(x) for x in f.readlines()]
finally:
    utils.close(f)

# load sam records into memory
# NOTE(review): sam_reader is left open on purpose; the rest of the script is
# not visible here and may still need it - confirm before closing it.
sam_reader = pysam.Samfile(options.inprefix + ".tag_pairs.bam", "rb")
lines = list(sam_reader)

nodes = {}

# loop over flag pairs, making graph nodes and adjacency lists.
# Records are visited two at a time (i = 0, 2, 4, ...) and each pair shares
# one flag, so the flag index is i // 2. Floor division keeps the index an
# exact integer instead of going through a float.
for i in range(0, len(lines), 2):
    flag = flags[i // 2]
Ejemplo n.º 35
0
import pysam

# Command line: two positional arguments, the prefix of the input files and
# the prefix to use for the output files.
parser = argparse.ArgumentParser(
    description=
    'Works out the layout of the contigs within scaffolds, using the file *.tags_and_sam.gz file made by the script scaffold_test_check_using_tags.py',
    usage='%(prog)s [options] <inprefix> <outprefix>')
parser.add_argument(
    'inprefix',
    help=
    'Prefix of input files. Use the outprefix when scaff_test_check_using_tags.py was run'
)
parser.add_argument('outprefix', help='Prefix of output files')
options = parser.parse_args()

# load flags into memory: one integer per line of <inprefix>.tags.gz.
# Closing in a finally block means a read error or a non-integer line
# cannot leave the handle open.
f = utils.open_file_read(options.inprefix + '.tags.gz')
try:
    flags = [int(x) for x in f.readlines()]
finally:
    utils.close(f)

# load sam records into memory
# NOTE(review): sam_reader stays open deliberately, because the remainder of
# the script is outside this view and may still use it - confirm.
sam_reader = pysam.Samfile(options.inprefix + '.tag_pairs.bam', 'rb')
lines = list(sam_reader)

nodes = {}

# loop over flag pairs, making graph nodes and adjacency lists.
# Each step covers two consecutive records that share a single flag, hence
# the flag index i // 2 (integer floor division, no float round trip).
for i in range(0, len(lines), 2):
    flag = flags[i // 2]
Ejemplo n.º 36
0
def _phylip_records(seq_lines, number_of_seqs, bases_per_seq):
    '''Yields one Fasta per sequence from the body of a Phylip file.

    seq_lines: the lines that follow the Phylip header, each right-stripped
        and with lines consisting only of a newline already dropped.
    number_of_seqs, bases_per_seq: the two integers from the header line.

    Both layouts are handled: sequential (each sequence is written out in
    full before the next one starts) and interleaved (one chunk of every
    sequence per block). Spaces and '-' characters are removed from the
    sequences that are yielded.'''
    if len(seq_lines) in (1, number_of_seqs):
        sequential = True
    else:
        # if the 11th char of second sequence line is a space, then the file
        # is sequential, e.g.:
        # GAGCCCGGGC AATACAGGGT AT
        # as opposed to:
        # Salmo gairAAGCCTTGGC AGTGCAGGGT
        sequential = seq_lines[0][10] != ' ' and seq_lines[1][10] == ' '

    if sequential:
        current_id = None
        current_seq = ''
        for line in seq_lines:
            # A new record starts once the previous one has reached the
            # advertised length (gap characters are still counted at this
            # point), or when nothing has been collected yet.
            if len(current_seq) == bases_per_seq or len(current_seq) == 0:
                if current_id is not None:
                    yield Fasta(current_id, current_seq.replace('-', ''))
                current_seq = ''
                # first 10 columns are the name, the rest is sequence
                current_id, new_bases = line[0:10].rstrip(), line.rstrip()[10:]
            else:
                new_bases = line.rstrip()

            current_seq += new_bases.replace(' ', '')

        yield Fasta(current_id, current_seq.replace('-', ''))
        return

    # seaview files start all seqs at pos >=12. Other files start
    # their sequence at the start of the line
    if seq_lines[number_of_seqs + 1][0] == ' ':
        # sequence starts at the first non-space character after the first
        # run of spaces in the first line
        end_of_gap = seq_lines[0].find(' ')
        while seq_lines[0][end_of_gap] == ' ':
            end_of_gap += 1
        first_seq_base = end_of_gap
    else:
        first_seq_base = 10

    # The first block carries the names as well as the first chunk of bases.
    seqs = [
        Fasta(x[0:first_seq_base].rstrip(), x[first_seq_base:])
        for x in seq_lines[:number_of_seqs]
    ]

    # Every later line continues the sequence at the same position within
    # its block.
    for i in range(number_of_seqs, len(seq_lines)):
        seqs[i % number_of_seqs].seq += seq_lines[i]

    for fa in seqs:
        fa.seq = fa.seq.replace(' ', '').replace('-', '')
        yield fa


def file_reader(fname, read_quals=False):
    '''Iterates over a sequence file, yielding the next sequence in the file until there are no more sequences.

    The format is decided from the first line of the file:
      - starts with '>'               -> Fasta records
      - starts with '##gff-version 3' -> Fasta records, read from the first
                                         line starting with '>' onwards
      - starts with 'ID   ' followed by a non-space, or matches 'LOCUS' plus
        whitespace plus a non-space   -> Embl records
      - starts with '@'               -> Fastq records
      - two integers                  -> Phylip (sequential or interleaved),
                                         yielded as Fasta records
      - empty file                    -> nothing is yielded

    fname: name of the file, passed to utils.open_file_read().
    read_quals: passed unchanged to get_next_from_file() of the record object.

    For every format except Phylip the same record object is yielded on each
    iteration, so callers that want to keep a record need to copy it.

    Raises Error if the file type cannot be determined from the first line,
    if a GFF file has no line starting with '>', or if a Phylip header is not
    followed by any sequence lines.'''
    f = utils.open_file_read(fname)
    line = f.readline()
    # raw strings, so that \s and \S reach the regex engine untouched
    phylip_regex = re.compile(r'^\s*[0-9]+\s+[0-9]+$')
    gbk_regex = re.compile(r'^LOCUS\s+\S')

    # NOTE(review): previous_lines is defined elsewhere in this file. The line
    # already consumed is stored there under the file handle, presumably so
    # that get_next_from_file() can pick it up - confirm.
    if line.startswith('>'):
        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('##gff-version 3'):
        # if a GFF file, need to skip past all the annotation
        # and get to the fasta sequences at the end of the file
        while not line.startswith('>'):
            line = f.readline()
            if not line:
                utils.close(f)
                raise Error('No sequences found in GFF file "' + fname + '"')

        seq = Fasta()
        previous_lines[f] = line
    elif (line.startswith('ID   ') and len(line) > 5 and line[5] != ' ') or gbk_regex.search(line):
        # The length check stops a first line that is exactly 'ID   ' (no
        # newline) from raising IndexError; it falls through to the
        # "cannot determine file type" error below instead.
        seq = Embl()
        previous_lines[f] = line
    elif line.startswith('@'):
        seq = Fastq()
        previous_lines[f] = line
    elif phylip_regex.search(line):
        # phylip format could be interleaved or not, need to look at next
        # couple of lines to figure that out. Don't expect these files to
        # be too huge, so just store all the sequences in memory
        number_of_seqs, bases_per_seq = (int(x) for x in line.split())
        try:
            # read to end of file, dropping lines that are just a newline
            seq_lines = [x.rstrip() for x in iter(f.readline, '') if x != '\n']
        finally:
            utils.close(f)

        if not seq_lines:
            raise Error('No sequences found in Phylip file "' + fname + '"')

        yield from _phylip_records(seq_lines, number_of_seqs, bases_per_seq)
        return
    elif line == '':
        # empty file: nothing to yield
        utils.close(f)
        return
    else:
        utils.close(f)
        raise Error('Error determining file type from file "' + fname + '". First line is:\n' + line.rstrip())

    # Streaming formats: keep refilling the one record object until
    # get_next_from_file() returns something false. The finally block closes
    # the file even when the consumer abandons the generator early.
    try:
        while seq.get_next_from_file(f, read_quals):
            yield seq
    finally:
        utils.close(f)