def get_assembly_stats(options):
    '''Reads the tab-delimited file options.infile and returns its columns.

    The first two tab-separated columns of every line are ignored. The first
    line that has any columns left after that is used as the header line;
    every later line is a data line.

    Returns a dictionary of header name -> list of values, one value per data
    line. Values are converted with float() for the headers listed in
    float_headers, and with int() for every other header.

    Raises AssertionError if a data line does not have the same number of
    columns as the header, and ValueError if a value cannot be converted.
    '''
    float_headers = {
        'Avg Contig Length',
        'Average Quality',
        'Insert Size Average',
        'Insert Size Std Dev',
    }
    csv_headers = []
    stats = {}

    f = utils.open_file_read(options.infile)
    try:
        for line in f:
            fields = line.rstrip().split('\t')[2:]

            if not csv_headers:
                csv_headers = fields
                stats = {k: [] for k in csv_headers}
                continue

            # Kept as an assert so that callers see the same exception type
            # as before. len(stats) differs from len(csv_headers) when the
            # header line contains duplicated names.
            assert len(fields) == len(csv_headers) == len(stats)

            for header, value in zip(csv_headers, fields):
                convert = float if header in float_headers else int
                stats[header].append(convert(value))
    finally:
        # close the file even when a line fails to parse
        utils.close(f)

    return stats
def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from an OK file, and raise an error at a badly formatted file'''
    bad_basenames = (
        'sequences_test_fail_no_AT.fq',
        'sequences_test_fail_no_seq.fq',
        'sequences_test_fail_no_plus.fq',
        'sequences_test_fail_no_qual.fq',
    )

    # every malformed file must raise sequences.Error while being read
    for bad_path in [os.path.join(data_dir, name) for name in bad_basenames]:
        handle = utils.open_file_read(bad_path)
        record = sequences.Fastq()
        with self.assertRaises(sequences.Error):
            while record.get_next_from_file(handle):
                pass

        utils.close(handle)

    good_path = os.path.join(data_dir, 'sequences_test_good_file.fq')
    try:
        handle = open(good_path)
    except IOError:
        print("Error opening '" + good_path + "'", file=sys.stderr)
        sys.exit(1)

    # every record of the good file is expected to be the same sequence
    record = sequences.Fastq()
    while record.get_next_from_file(handle):
        self.assertEqual(record, sequences.Fastq('ID', 'ACGTA', 'IIIII'))

    utils.close(handle)
def nucmer_file_reader(fname):
    '''Iterates over the file fname, yielding a NucmerHit for each line that
    comes after the header.

    The header is every line up to and including the first line that starts
    with '['. Those lines are skipped.

    The file is closed when the iteration finishes, and also when the
    generator is closed or garbage collected before reaching the end of the
    file, or when constructing a NucmerHit raises.
    '''
    f = utils.open_file_read(fname)
    try:
        in_header = True
        for line in f:
            if in_header:
                # the line starting with '[' is the last line of the header
                if line.startswith('['):
                    in_header = False
                continue
            yield NucmerHit(line)
    finally:
        utils.close(f)
def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from an OK file, including weirdness in the file'''
    fasta_path = os.path.join(data_dir, 'sequences_test.fa')
    handle = utils.open_file_read(fasta_path)
    record = sequences.Fasta()

    # the sequences in the test file are expected to be named 1, 2, 3, ...
    expected_name = 1
    while record.get_next_from_file(handle):
        expected_record = sequences.Fasta(str(expected_name), 'ACGTA')
        self.assertEqual(record, expected_record)
        expected_name += 1

    utils.close(handle)
def test_write_and_read(self):
    '''open_file_write() and open_file_read() should do the right thing depending on gzipped or not'''
    for filename in ('utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz'):
        # write the numbers 0, 1, 2, one per line ...
        out_handle = utils.open_file_write(filename)
        for number in range(3):
            print(number, file=out_handle)
        utils.close(out_handle)

        # ... and check they are read back in the same order
        in_handle = utils.open_file_read(filename)
        for expected_number, text in enumerate(in_handle):
            self.assertEqual(expected_number, int(text.strip()))
        utils.close(in_handle)

        os.unlink(filename)

    # the filename '-' means stdin when reading and stdout when writing
    stdin_handle = utils.open_file_read('-')
    self.assertEqual(sys.stdin, stdin_handle)
    stdout_handle = utils.open_file_write('-')
    self.assertEqual(sys.stdout, stdout_handle)
def test_get_next_from_embl_file(self):
    '''get_next_from_file() should read every sequence from an EMBL file'''
    embl_path = os.path.join(data_dir, 'sequences_test.embl')
    handle = utils.open_file_read(embl_path)
    record = sequences.Embl()

    # records are expected to be named seq1, seq2, ... and to match
    # expected_embl in the same order
    index = 0
    while record.get_next_from_file(handle):
        expected_record = sequences.Fasta('seq' + str(index + 1),
                                          expected_embl[index])
        self.assertEqual(record, expected_record)
        index += 1

    utils.close(handle)
def test_get_next_from_gbk_file(self):
    '''get_next_from_file() should read every sequence from a GenBank file'''
    expected_seqs = [
        'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
        'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa',
    ]

    gbk_path = os.path.join(data_dir, 'sequences_test.gbk')
    handle = utils.open_file_read(gbk_path)
    record = sequences.Embl()

    # records are expected to be named NAME1, NAME2, ... and to match
    # expected_seqs in the same order
    index = 0
    while record.get_next_from_file(handle):
        expected_record = sequences.Fasta('NAME' + str(index + 1),
                                          expected_seqs[index])
        self.assertEqual(record, expected_record)
        index += 1

    utils.close(handle)
def test_raise_exception(self):
    '''open_file_write() and open_file_read() should raise an exception when they can't do the opening'''
    # files that do not exist, or that have a .gz name but are not gzipped
    unreadable_paths = (
        'this_file_is_not_here_so_throw_error',
        'this_file_is_not_here_so_throw_error.gz',
        os.path.join(data_dir, 'utils_test_not_really_zipped.gz'),
    )
    for path in unreadable_paths:
        with self.assertRaises(utils.Error):
            utils.open_file_read(path)

    # files inside a directory that does not exist
    unwritable_paths = (
        os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
        os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'),
    )
    for path in unwritable_paths:
        with self.assertRaises(utils.Error):
            utils.open_file_write(path)
def filter(infile, outfile, minlength=0, maxlength=float('inf'), regex=None, ids_file=None, invert=False):
    '''Writes to outfile the sequences of infile that pass all the filters.

    A sequence passes when all of the following are true:
      - minlength <= len(seq) <= maxlength
      - regex is None, or the regex is found in the sequence id
      - ids_file is None, or the sequence id is one of the lines of ids_file
        (trailing whitespace is stripped from each line of ids_file)

    If invert is True then the sequences that do NOT pass are written
    instead. Both the ids file and the output file are closed even if an
    error happens part way through.
    '''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            ids_from_file.update(line.rstrip() for line in f)
        finally:
            utils.close(f)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    try:
        # compiled after opening the output file, as before, but now the
        # output file still gets closed if the regex is invalid
        r = None if regex is None else re.compile(regex)

        for seq in seq_reader:
            hit = minlength <= len(seq) <= maxlength \
                and (r is None or r.search(seq.id) is not None) \
                and (ids_file is None or seq.id in ids_from_file)

            # hit != invert: keep hits normally, keep non-hits when inverting
            if hit != invert:
                print(seq, file=f_out)
    finally:
        utils.close(f_out)
def test_raise_exception(self):
    '''open_file_write() and open_file_read() should raise an exception when they can't do the opening'''
    # (function that opens the file, filename that cannot be opened)
    attempts = [
        (utils.open_file_read,
         'this_file_is_not_here_so_throw_error'),
        (utils.open_file_read,
         'this_file_is_not_here_so_throw_error.gz'),
        (utils.open_file_read,
         os.path.join(data_dir, 'utils_test_not_really_zipped.gz')),
        (utils.open_file_write,
         os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error')),
        (utils.open_file_write,
         os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz')),
    ]

    for opener, filename in attempts:
        with self.assertRaises(utils.Error):
            opener(filename)
def lengths_from_fai(fai_file, d):
    '''Fills the dictionary d with sequence name -> length, read from fai_file.

    The first two whitespace-separated columns of each line are used: the
    first is the key and the second is converted to an int. Any other columns
    are ignored. Existing entries of d with the same name are overwritten.

    Raises ValueError if a line has fewer than two columns, or if the second
    column is not an integer. The file is closed in either case.
    '''
    f = utils.open_file_read(fai_file)
    try:
        for line in f:
            seq_id, length = line.rstrip().split()[:2]
            d[seq_id] = int(length)
    finally:
        utils.close(f)
def file_reader(fname, read_quals=False):
    '''Iterates over a sequence file, yielding the next sequence in the file
    until there are no more sequences.

    The file type is decided from the first line of the file:
      - starts with '>'                 -> FASTA
      - starts with '##gff-version 3'   -> GFF3, the sequences are read from
                                           the FASTA part at the end
      - starts with 'ID '               -> EMBL
      - starts with 'LOCUS'             -> GenBank (read using Embl)
      - starts with '@'                 -> FASTQ
      - two integers                    -> Phylip, sequential or interleaved
      - empty file                      -> nothing is yielded

    fname is passed to utils.open_file_read(). read_quals is passed through
    to get_next_from_file() of the sequence object.

    Phylip files are read completely into memory and each sequence is yielded
    as a new Fasta with spaces and gaps ('-') removed. For all other types the
    same sequence object is yielded every time, so a caller that wants to keep
    a sequence presumably needs to copy it.

    Raises Error if the file type cannot be determined, or if a GFF file has
    no sequences.
    '''
    f = utils.open_file_read(fname)
    line = f.readline()
    # raw strings, because \s and \S are regex escapes, not string escapes
    phylip_regex = re.compile(r'^\s*[0-9]+\s+[0-9]+$')
    gbk_regex = re.compile(r'^LOCUS\s+\S')

    if line.startswith('>'):
        seq = Fasta()
        previous_lines[f] = line
    elif line.startswith('##gff-version 3'):
        # if a GFF file, need to skip past all the annotation
        # and get to the fasta sequences at the end of the file
        while not line.startswith('>'):
            line = f.readline()
            if not line:
                utils.close(f)
                raise Error('No sequences found in GFF file "' + fname + '"')

        seq = Fasta()
        previous_lines[f] = line
    # NOTE(review): line[5] raises IndexError if the first line is shorter
    # than six characters - confirm that cannot happen for real input
    elif line.startswith('ID ') and line[5] != ' ':
        seq = Embl()
        previous_lines[f] = line
    elif gbk_regex.search(line):
        seq = Embl()
        previous_lines[f] = line
    elif line.startswith('@'):
        seq = Fastq()
        previous_lines[f] = line
    elif phylip_regex.search(line):
        # phylip format could be interleaved or not, need to look at next
        # couple of lines to figure that out. Don't expect these files to
        # be too huge, so just store all the sequences in memory
        number_of_seqs, bases_per_seq = line.strip().split()
        number_of_seqs = int(number_of_seqs)
        bases_per_seq = int(bases_per_seq)

        # blank lines are skipped, everything else is kept without its
        # trailing whitespace
        seq_lines = []
        while True:
            line = f.readline()
            if line == '':
                break
            if line != '\n':
                seq_lines.append(line.rstrip())
        utils.close(f)

        # if the 11th char of second sequence line is a space, then the file
        # is sequential, e.g.:
        # GAGCCCGGGC AATACAGGGT AT
        # as opposed to:
        # Salmo gairAAGCCTTGGC AGTGCAGGGT
        if len(seq_lines) == 1 or len(seq_lines) == number_of_seqs:
            sequential = True
        elif seq_lines[0][10] != ' ' and seq_lines[1][10] == ' ':
            sequential = True
        else:
            sequential = False

        if sequential:
            current_id = None
            current_seq = ''
            for line in seq_lines:
                # a new sequence starts when the previous one has reached
                # its full length (or at the very first line)
                if len(current_seq) == bases_per_seq or len(current_seq) == 0:
                    if current_id is not None:
                        yield Fasta(current_id, current_seq.replace('-', ''))
                    current_seq = ''
                    current_id, new_bases = line[0:10].rstrip(), line.rstrip()[10:]
                else:
                    new_bases = line.rstrip()

                current_seq += new_bases.replace(' ', '')

            yield Fasta(current_id, current_seq.replace('-', ''))
        else:
            # seaview files start all seqs at pos >=12. Other files start
            # their sequence at the start of the line
            if seq_lines[number_of_seqs + 1][0] == ' ':
                first_gap_pos = seq_lines[0].find(' ')
                end_of_gap = first_gap_pos
                while seq_lines[0][end_of_gap] == ' ':
                    end_of_gap += 1
                first_seq_base = end_of_gap
            else:
                first_seq_base = 10

            # the first block of lines has the names, the rest of the
            # lines cycle through the sequences in the same order
            seqs = []
            for i in range(number_of_seqs):
                name, bases = seq_lines[i][0:first_seq_base].rstrip(), seq_lines[i][first_seq_base:]
                seqs.append(Fasta(name, bases))

            for i in range(number_of_seqs, len(seq_lines)):
                seqs[i % number_of_seqs].seq += seq_lines[i]

            for fa in seqs:
                fa.seq = fa.seq.replace(' ', '').replace('-', '')
                yield fa

        return
    elif line == '':
        utils.close(f)
        return
    else:
        utils.close(f)
        raise Error('Error determining file type from file "' + fname + '". First line is:\n' + line.rstrip())

    try:
        while seq.get_next_from_file(f, read_quals):
            yield seq
    finally:
        utils.close(f)