def test_raise_exception(self):
    '''Opening a missing file for reading, or a file inside a missing directory for writing, should raise utils.Error'''
    # Plain and gzip-suffixed names are both exercised, reads first, then writes.
    unreadable = (
        'this_file_is_not_here_so_throw_error',
        'this_file_is_not_here_so_throw_error.gz',
    )
    for missing in unreadable:
        with self.assertRaises(utils.Error):
            utils.open_file_read(missing)

    unwritable = (
        os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
        os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'),
    )
    for target in unwritable:
        with self.assertRaises(utils.Error):
            utils.open_file_write(target)
def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file'''
    bad_files = [
        'fastn_unittest_fail_no_AT.fq',
        'fastn_unittest_fail_no_seq.fq',
        'fastn_unittest_fail_no_plus.fq',
        'fastn_unittest_fail_no_qual.fq',
    ]

    # Each malformed file must make the reader raise fastn.Error at some point
    # while it is being consumed.
    for fname in bad_files:
        f_in = utils.open_file_read(fname)
        try:
            fq = fastn.Fastq()
            with self.assertRaises(fastn.Error):
                while fq.get_next_from_file(f_in):
                    pass
        finally:
            # close the handle even when the expected error was not raised
            utils.close(f_in)

    fname = 'fastn_unittest_good_file.fq'
    try:
        f_in = open(fname)
    except IOError as err:
        # Report a missing fixture as a test failure rather than exiting the
        # whole test run with sys.exit().
        self.fail("Error opening '" + fname + "': " + str(err))

    try:
        fq = fastn.Fastq()
        # every record in the good file is expected to be the same sequence
        while fq.get_next_from_file(f_in):
            self.assertEqual(fq, fastn.Fastq('ID', 'ACGTA', 'IIIII'))
    finally:
        utils.close(f_in)
def file_reader(fname):
    '''Generator that yields one BlastHit for every line of the file fname.

    The file is opened with utils.open_file_read() when iteration starts and
    is closed when iteration finishes. The close also happens if the caller
    stops iterating early or if building a BlastHit raises.
    '''
    f = utils.open_file_read(fname)
    try:
        for line in f:
            yield BlastHit(line)
    finally:
        utils.close(f)
def get_scaff_results(dir):
    '''Return a dict of flag counts read from <dir>/check_scaffolds.log.

    The dict has one key per entry of possible_flags, plus 'skipped', 'lost'
    and 'bad_joins', all starting at zero. Log lines whose first field is a
    number set the count for that integer flag; lines whose first field is
    'lost' or 'skipped' set that count. Blank lines and any other lines are
    ignored. If the log file does not exist a warning is printed to stderr
    and all counts stay at zero.

    'bad_joins' is the sum of every count except those for flags 0 and 16
    and for 'skipped' (so 'lost' is included).
    '''
    flag_counts = {k: 0 for k in possible_flags}
    flag_counts['skipped'] = 0
    flag_counts['lost'] = 0
    log_file = dir + '/check_scaffolds.log'

    if os.path.exists(log_file):
        f = utils.open_file_read(log_file)
        try:
            for line in f:
                a = line.split()
                if not a:
                    # blank line: nothing to parse
                    continue

                if a[0].isdigit():
                    flag_counts[int(a[0])] = int(a[1])
                elif a[0] in ['lost', 'skipped']:
                    flag_counts[a[0]] = int(a[1])
        finally:
            utils.close(f)
    else:
        print('Warning: no log file', log_file, file=sys.stderr)

    # 'bad_joins' is not yet a key while the sum is evaluated, so it does not
    # contribute to its own total.
    flag_counts['bad_joins'] = sum(
        count for key, count in flag_counts.items()
        if key not in [0, 16, 'skipped']
    )
    return flag_counts
def __init__(self, filename):
    '''Load a genome diff file.

    Sets self.version from the last whitespace-separated field of the first
    line, which must start with '#=GENOME_DIFF', and fills self.mutations
    from every later non-comment line whose first tab-separated field is in
    mutation_types.

    Raises Error if the first line does not start with '#=GENOME_DIFF'.
    The file is closed whether or not parsing succeeds.
    '''
    f = utils.open_file_read(filename)
    self.version = None
    # (seq name, pos) -> Mutation. Only one Mutation is kept per key: a later
    # line with the same sequence and position replaces the earlier one.
    self.mutations = {}

    try:
        for line in f:
            # first line should define that this is a genome diff file
            if self.version is None:
                if not line.startswith('#=GENOME_DIFF'):
                    raise Error("Error. first line of file '" + filename + "' should start with: #=GENOME_DIFF")
                self.version = line.rstrip().split()[-1]
                continue

            # for now, ignore the rest of the metadata
            if line.startswith('#'):
                continue

            fields = line.rstrip().split('\t')

            if fields[0] in mutation_types:
                # Mutation is given the raw line, including its line ending
                mutation = Mutation(line)
                self.mutations[mutation.seq_id, mutation.position] = mutation
    finally:
        utils.close(f)
def file_reader(fname):
    '''Generator that yields one MpileupLine for every line of the file fname.

    The file is opened with utils.open_file_read() when iteration starts and
    is closed when iteration finishes. The close also happens if the caller
    stops iterating early or if building an MpileupLine raises.
    '''
    f = utils.open_file_read(fname)
    try:
        for line in f:
            yield MpileupLine(line)
    finally:
        utils.close(f)
def file_reader(fname):
    '''Generator that yields the records of the caf file fname.

    A single Caf object is created and refilled by get_next_from_file() for
    each record, so the same object is yielded every time. Callers that need
    to keep a record beyond the next iteration must copy it.

    The file is closed when iteration finishes, when the caller stops
    iterating early, or when reading a record raises.
    '''
    f = utils.open_file_read(fname)
    try:
        c = Caf()
        while c.get_next_from_file(f):
            yield c
    finally:
        utils.close(f)
def file_reader(fname):
    '''Generator that yields a SamRecord for every non-header line of fname.

    Lines starting with '@' are skipped. The file is closed when iteration
    finishes, when the caller stops iterating early, or when building a
    SamRecord raises.
    '''
    f = utils.open_file_read(fname)
    try:
        for line in f:
            if line.startswith('@'):
                continue
            yield SamRecord(line)
    finally:
        utils.close(f)
def test_get_next_from_file(self):
    '''Records read by get_next_from_file() should be named 1, 2, 3, ... and all be ACGTA, including weirdness in file'''
    handle = utils.open_file_read('fastn_unittest.fa')
    record = fastn.Fasta()
    expected_name = 1

    while True:
        if not record.get_next_from_file(handle):
            break
        expected = fastn.Fasta(str(expected_name), 'ACGTA')
        self.assertEqual(record, expected)
        expected_name += 1

    utils.close(handle)
def file_reader(fname):
    '''Generator that yields a NucmerHit for every line after the header of fname.

    Every line up to and including the first one that starts with '[' is
    treated as header and skipped. The file is closed when iteration
    finishes, when the caller stops iterating early, or when building a
    NucmerHit raises.
    '''
    f = utils.open_file_read(fname)
    try:
        in_header = True

        for line in f:
            if in_header:
                # the line starting with '[' is the last header line
                if line.startswith('['):
                    in_header = False
                continue
            yield NucmerHit(line)
    finally:
        utils.close(f)
def __init__(self, bsub_o, log_file, max_joins, extra_cpu=0, extra_mem=0):
    '''Collect the results of one scaffolding run.

    bsub_o     -- bsub stdout file, passed to 'bsub-out2stats.py -s' to get
                  the CPU and memory usage
    log_file   -- log file of flag counts; if it does not exist a warning is
                  printed to stderr and the counts stay at zero
    max_joins  -- number of potential joins, used as the denominator of
                  percent_correct_joins_made
    extra_cpu  -- added to the measured CPU to give stats['Total CPU']
    extra_mem  -- max_mem is the larger of this and the measured memory

    Raises AssertionError if bsub-out2stats.py does not return exactly one
    line or reports a non-zero exit code, and ZeroDivisionError if max_joins
    is zero.
    '''
    # get flag counts etc from the log file
    self.flag_counts = {k: 0 for k in ScaffResults.possible_flags}
    self.stats = {k: 0 for k in ScaffResults.evaluation_score_keys}

    if os.path.exists(log_file):
        f = utils.open_file_read(log_file)
        try:
            for line in f:
                a = line.split()
                if not a:
                    # blank line: nothing to parse
                    continue

                if a[0].isdigit():
                    self.flag_counts[int(a[0])] = int(a[1])
                elif a[0] == 'lost':
                    self.stats['Lost tags'] = int(a[1])
                elif a[0] == 'skipped':
                    self.stats['Skipped tags'] = int(a[1])
        finally:
            utils.close(f)

        # every flag other than 0 and 16 counts as a bad join, as do lost tags
        self.stats['Bad joins'] = sum([
            self.flag_counts[x] for x in self.flag_counts.keys()
            if x not in [0, 16]
        ]) + self.stats['Lost tags']
    else:
        print('Warning: no log file', log_file, file=sys.stderr)

    # get cpu and mem from bsub file
    bsub_out = utils.syscall_get_stdout('bsub-out2stats.py -s ' + bsub_o)
    assert len(bsub_out) == 1
    (attempt_no, exit_code, wall_hrs, cpu_secs, cpu_hrs, mem, swap,
     filename) = bsub_out[0].split('\t')
    assert exit_code == '0'

    self.stats['Correct joins'] = self.flag_counts[0]
    self.cpu = int(round(float(cpu_secs), 0))
    self.mem = int(mem)
    self.extra_cpu = extra_cpu
    self.extra_mem = extra_mem
    self.stats['Total CPU'] = self.cpu + extra_cpu
    self.max_mem = max(self.mem, extra_mem)

    # -1 marks a score / worksheet row that has not been set yet
    self.scores = {k: -1 for k in ScaffResults.evaluation_score_keys}
    self.worksheet_row = -1
    self.potential_joins = max_joins
    self.total_joins = self.stats['Correct joins'] + self.stats['Bad joins']

    if self.total_joins > 0:
        self.percent_joins_correct = self.stats[
            'Correct joins'] / self.total_joins
    else:
        self.percent_joins_correct = 0

    # NOTE: unlike total_joins above, potential_joins is not guarded against
    # zero here.
    self.percent_correct_joins_made = self.stats[
        'Correct joins'] / self.potential_joins
def get_sequence_lengths(fname):
    '''Return a dict of sequence name -> length taken from the @SQ header lines of fname.

    Reading stops at the first line that does not start with '@'. For each
    '@SQ' line the tab-separated fields after the first are read as
    two-letter tags followed by a separator and a value; the SN tag gives
    the name and the LN tag the length.

    Raises Error if an @SQ line lacks an SN or LN tag, or if its LN value is
    not an integer. The file is closed in every case.
    '''
    lengths = {}
    f = utils.open_file_read(fname)

    try:
        for line in f:
            if not line.startswith('@'):
                # end of the header
                break

            if line.startswith('@SQ'):
                try:
                    tags = line.rstrip().split('\t')[1:]
                    d = {x[:2]: x[3:] for x in tags}
                    lengths[d['SN']] = int(d['LN'])
                except (KeyError, ValueError):
                    # KeyError: SN or LN tag missing; ValueError: LN not an int
                    raise Error('Error getting length from line of BAM header\n' + line)
    finally:
        utils.close(f)

    return lengths
def file2regions(fname):
    '''Return a dict of sequence name -> list of genome_intervals.Interval read from fname.

    Lines starting with '#' are skipped. Every other line must have exactly
    three whitespace-separated fields: name, start and end. Intervals are
    stored in file order. A line with a different number of fields raises
    ValueError. The file is closed in every case.
    '''
    regions = {}
    f = utils.open_file_read(fname)

    try:
        for line in f:
            if line.startswith('#'):
                continue

            (name, start, end) = line.rstrip().split()
            # NOTE(review): start and end are passed on as strings, exactly
            # as read. Presumably Interval converts them itself; confirm.
            regions.setdefault(name, []).append(
                genome_intervals.Interval(start, end))
    finally:
        utils.close(f)

    return regions
def test_write_and_read(self):
    '''A file written by open_file_write() should read back unchanged via open_file_read(), whether gzipped or not'''
    for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
        # write the numbers 0, 1, 2, one per line
        out_handle = utils.open_file_write(filename)
        for number in range(3):
            print(number, file=out_handle)
        utils.close(out_handle)

        # the n-th line read back (counting from zero) must hold the number n
        in_handle = utils.open_file_read(filename)
        for expected, text in enumerate(in_handle):
            self.assertEqual(expected, int(text.strip()))
        utils.close(in_handle)

        os.unlink(filename)
def file_reader(fname, read_quals=False):
    '''Generator that yields the sequences of a fasta or fastq file.

    The file type is taken from the first line: '>' means fasta and '@'
    means fastq. That first line is stored in previous_lines, keyed by the
    file handle, before records are read. An empty file yields nothing.
    read_quals is passed through to get_next_from_file().

    One Fasta or Fastq object is created and refilled for each record, so
    the same object is yielded every time.

    Raises Error if the first line starts with neither '>' nor '@'. The file
    is closed when iteration finishes, when the caller stops iterating
    early, or when an error is raised.
    '''
    f = utils.open_file_read(fname)

    try:
        line = f.readline()

        if line.startswith('>'):
            seq = Fasta()
            previous_lines[f] = line
        elif line.startswith('@'):
            seq = Fastq()
            previous_lines[f] = line
        elif line == '':
            # empty file: nothing to yield
            return
        else:
            raise Error('Error determining file type from file "' + fname + '". First line is:\n' + line.rstrip())

        while seq.get_next_from_file(f, read_quals):
            yield seq
    finally:
        utils.close(f)
# Command line: two positional arguments (fai file, output file) and an
# optional file of gaps.
parser = argparse.ArgumentParser(
    description=
    'Makes a random genome with sequence lengths and names determined by an fai file. IMPORTANT: not really random, at the moment every base will be an A (or an N if --gaps_file used)',
    usage='%(prog)s [options] <fai file> <outfile>')
parser.add_argument(
    '--gaps_file',
    help='File of gaps, each line in the form: "chr start end" (tab separated)'
)
parser.add_argument('fai_file', help='Name of fai file')
parser.add_argument('outfile', help='Name of output fasta file')
options = parser.parse_args()

# sequence name -> list of gap intervals, in file order
gaps = {}

if options.gaps_file:
    f = utils.open_file_read(options.gaps_file)
    for line in f:
        (id, start, end) = line.rstrip().split('\t')
        # both coordinates from the file are reduced by one before being stored
        gap = genome_intervals.Interval(int(start) - 1, int(end) - 1)
        if id not in gaps:
            gaps[id] = []
        gaps[id].append(gap)
    utils.close(f)

f_in = utils.open_file_read(options.fai_file)
f_out = utils.open_file_write(options.outfile)

# For each fai line, the first field is used as the sequence name and the
# second as its length; the sequence starts out as that many 'A's.
for line in f_in:
    a = line.rstrip().split()
    fa = fastn.Fasta(a[0], 'A' * int(a[1]))
# filled in by lengths_from_fai() from the reference fai file
reference_lengths = {}
fastn.lengths_from_fai(options.reference_fai, reference_lengths)
tags = {} # id -> tag
# tag.chr -> list of tags, in file order
tags_by_chr = {}
tags_tsv_file = options.tags_files_prefix + '.tags.tsv'
tags_fa_file = options.tags_files_prefix + '.uniquely-tagged.tags.fa'

# set of the names given by the circular option; empty if it was not used
if options.circular:
    circular_seqs = set(options.circular)
else:
    circular_seqs = set()

# load tags from file
f = utils.open_file_read(tags_tsv_file)
for line in f:
    a = line.rstrip().split('\t')
    # the last column must hold 'chr:::pos:::strand' with no spaces, and the
    # strand must be '+'
    assert ' ' not in a[-1]
    (chr, pos, strand) = a[-1].split(':::')
    assert strand == '+'
    # first column is passed as the first Tag argument; the length of the
    # fourth column is passed as the last
    tag = Tag(a[0], chr, pos, len(a[3]))
    # tag ids must be unique within the file
    assert tag.id not in tags
    tags[tag.id] = tag
    if tag.chr not in tags_by_chr:
        tags_by_chr[tag.chr] = []
    tags_by_chr[tag.chr].append(tag)
utils.close(f)
def test_get_next_from_file(self):
    '''get_next_from_file() should read caf records from file correctly'''
    f_in = utils.open_file_read('caf_unittest.caf')

    # First record: c is what the parser read, e is the expected record built
    # by hand below.
    c = caf.Caf()
    c.get_next_from_file(f_in)
    e = caf.Caf()
    e.id = 'pknbac5b2Aa01.p1k'
    # NOTE: the first two literals have no comma between them, so they are
    # concatenated into one string; the joined result is the same either way.
    seq = ''.join(['NGGAGAGACTCGGACTAGTTCTACACCCTCACACCTTTGTCCTAAACCTTGAATCTAAGT' 'CCTAACACCCTGACACCTTTGTCCTAAGCCCGGAATCTAACTTCTAGCACCCCTACGACC',
                   'CTTATTCCTAAACCCAGAATCTGACTATTGACACCCCTACAACCCTAATTCCAACACCCT',
                   'TACAACCTTCATTCCAACACCGCAACAACCTTCATTCCAGCACCCCAACAACCTTCATTC',
                   'CAACACCCCAAACAACATCATTCCAACACCCCAAACAACATCATTCCAACACCCCAAACA',
                   'ACATCATTCCAACACCCCAAACAACATCATTCCAACACGGCAACAACATCATTCGAACAC',
                   'CCCTACAACATCATTCCAGCACCCCAACAACCTCCCTGCGAAACCCCGAATCCGAATTTT',
                   'GACACCCCTACAACCTTATTCTGACACCCCCAACAAACTTTCTCTAACACCCCAACAACG',
                   'TGACTACTAATACACCTAAAACCTTACTCCTAAACCCGGAATCCGACTTCTAATACCGCA',
                   'ACAACCTTCATTCCTAAACCCGGAATCTGAACCCTGAACCATTAAAACATAAAACGTGGA',
                   'AAATGAACCCCTGAACCATGAAAACCGTGAAAACCTATAACTTGGACCATGAACCTCTCA',
                   'ACCCCGAAATATGAGAACTTTGGAAACCCTAAATTTTGGGAAAACTCCTTTTTTTTTTTT',
                   'TTATTGTACATCCTGTGCGATGGTATACATTTTGGCGAATGCAAAAGAATTAGCATATAT',
                   'ATATGTGTAGGTCTTTGTGATGGTCAGGGGGGAGATCGACTAGGGTGTAGGTCTTTGTGA',
                   'TGGTCAAGGGAGATGGGCCAAAGGGAAGTCGGACAAGGTGAGATGGGCCAAGGAGATGGG',
                   'CCTAGGGTGGATGGGACAAGGGTGGATGGTCAGAGGTGGATGGTCAAGGGTGGATGGTCA',
                   'AGGATGAATGGGCAAGGGAGATGGGCAAAGTAGATGGGCAAGGGTGGATGGACAAGGTGG',
                   'ATGGCCAAAGTGGATGGCAAGGAGGATGGCCCAGGTAATAGGCAAGGAAATGGCCAGGTG',
                   'GATGGACCAGGTGGTGCCCTAATGGAGGCAGGGTGAAGTCCAGGAGGAGGCCCAGGAAAA',
                   'GGCCCAGAGAAACCCAAGGAAAGGCCCAGGGGGTGGGACAGGGGAAGCGCCAAGGGATGC',
                   'CAAGGTGGGGGCCAGAAAATAGCCCAGAAAAGGCCAAAATAAGCCAAGAAAAGCCCCAGA',
                   'AAACCCAAGAAA'])
    # expected quality values for the sequence above
    quals = [4, 4, 4, 4, 6, 6, 8, 6, 6, 6, 6, 10, 12, 11, 13, 13, 20, 19, 9, 10, 9, 9, 9, 19, 19, 34, 34, 39, 35, 35, 35, 37, 35, 34, 26, 26, 16, 17, 11, 21, 21, 32, 35, 37, 37, 32, 45, 23, 17, 17, 18, 27, 29, 32, 35, 32, 32, 32, 32, 39, 35, 35, 35, 35, 35, 37, 42, 31, 31, 14, 13, 13, 25, 25, 35, 40, 33, 29, 23, 23, 15, 25, 24, 35, 35, 35, 35, 23, 36, 18, 18, 23, 28, 33, 29, 29, 32, 32,
             32, 32, 35, 35, 32, 35, 35, 32, 35, 35, 44, 44, 37, 35, 28, 26, 24, 19, 23, 30, 33, 40, 32, 32, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 50, 50, 50, 37, 30, 30, 27, 27, 21, 21, 21, 29, 26, 29, 29, 23, 23, 28, 37, 37, 50, 50, 40, 35, 35, 32, 32, 32, 35, 44, 37, 35, 35, 35, 35, 35, 35, 32, 32, 35, 35, 35, 35, 44, 42, 42, 41, 41, 41, 41, 41, 42, 41, 41, 41, 41, 41, 41, 44, 44, 42, 42, 42, 42, 42, 35, 37, 35, 35, 33, 37, 37, 44, 44, 44, 41, 42, 50, 42, 42, 42, 44, 44, 50, 50, 44, 44, 44, 44, 44, 44, 50, 50, 44, 44, 44, 44, 44, 41, 42, 44, 42, 42, 42, 44, 44, 42, 42, 41, 41, 41, 42, 44, 50, 50, 50, 44, 44, 44, 44, 44, 37, 37, 37, 37, 39, 41, 41, 44, 44, 44, 44, 47, 47, 44, 44, 44, 43, 43, 42, 42, 37, 37, 37, 41, 41, 42, 44, 44, 44, 44, 44, 42, 42, 42, 41, 41, 41, 44, 44, 44, 46, 42, 41, 37, 37, 37, 37, 37, 41, 42, 35, 35, 35, 35, 35, 35, 35, 42, 41, 42, 44, 50, 42, 42, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 37, 37, 41, 44, 44, 47, 37, 37, 33, 33, 33, 27, 27, 37, 37, 47, 47, 47, 47, 47, 50, 44, 44, 42, 50, 35, 35, 35, 42, 42, 44, 50, 50, 50, 42, 42, 42, 42, 35, 35, 37, 42, 50, 44, 44, 44, 44, 44, 44, 47, 47, 47, 47, 44, 50, 44, 44, 44, 44, 47, 47, 44, 47, 50, 50, 50, 48, 37, 17, 17, 13, 22, 22, 35, 36, 42, 42, 35, 35, 35, 37, 37, 42, 50, 35, 35, 35, 35, 37, 37, 35, 35, 33, 33, 33, 33, 42, 42, 42, 41, 41, 41, 41, 41, 41, 50, 37, 44, 44, 44, 42, 37, 37, 21, 21, 21, 33, 33, 42, 50, 50, 44, 44, 44, 44, 44, 44, 44, 42, 37, 44, 44, 44, 44, 42, 42, 42, 42, 42, 44, 44, 44, 50, 50, 44, 44, 44, 37, 37, 35, 33, 33, 21, 21, 33, 33, 33, 41, 42, 42, 41, 44, 44, 44, 44, 42, 42, 42, 42, 44, 41, 44, 37, 42, 37, 41, 41, 42, 42, 50, 50, 44, 44, 44, 44, 42, 42, 27, 33, 27, 33, 33, 37, 37, 50, 35, 35, 35, 37, 37, 44, 44, 50, 44, 44, 44, 37, 37, 35, 31, 31, 37, 37, 44, 44, 44, 44, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 37, 37, 28, 28, 23, 28, 26, 33, 33, 33, 29, 29, 29, 33, 35, 46, 33, 23, 23, 26, 33, 33, 50, 44, 37, 37, 30, 37, 37, 42, 50, 30, 30, 30, 37, 37, 23, 28, 15, 15, 11,
             15, 27, 37, 33, 37, 26, 26, 28, 37, 42, 48, 48, 37, 23, 23, 23, 31, 31, 33, 23, 23, 24, 31, 31, 31, 33, 24, 25, 21, 21, 21, 28, 31, 33, 42, 42, 42, 42, 44, 44, 44, 44, 30, 23, 16, 10, 10, 16, 24, 33, 24, 24, 24, 30, 33, 36, 42, 42, 44, 44, 42, 39, 39, 33, 46, 27, 28, 28, 33, 33, 37, 37, 37, 22, 22, 17, 19, 19, 33, 31, 33, 27, 27, 18, 18, 24, 29, 32, 33, 35, 33, 40, 40, 37, 34, 27, 27, 14, 14, 13, 13, 18, 12, 20, 25, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 47, 56, 56, 56, 47, 42, 42, 42, 42, 27, 23, 15, 11, 11, 27, 33, 42, 42, 33, 24, 10, 10, 10, 13, 15, 18, 14, 14, 14, 14, 14, 25, 30, 33, 30, 21, 21, 27, 27, 22, 15, 13, 22, 22, 19, 19, 15, 15, 11, 10, 17, 27, 27, 21, 15, 18, 13, 13, 16, 22, 24, 37, 31, 40, 40, 37, 47, 40, 37, 27, 27, 24, 24, 17, 20, 13, 10, 10, 11, 11, 14, 12, 19, 10, 10, 12, 14, 11, 10, 10, 10, 10, 15, 11, 15, 15, 25, 12, 12, 8, 8, 10, 17, 10, 21, 21, 8, 8, 8, 10, 10, 19, 25, 21, 19, 10, 10, 8, 9, 10, 12, 14, 17, 24, 22, 16, 16, 10, 9, 8, 14, 12, 12, 9, 9, 9, 9, 9, 9, 13, 19, 15, 18, 22, 22, 15, 15, 15, 15, 9, 10, 9, 8, 8, 9, 10, 14, 10, 10, 19, 15, 12, 9, 15, 4, 4, 4, 8, 8, 10, 12, 9, 8, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 16, 10, 10, 10, 8, 7, 7, 7, 7, 7, 13, 20, 19, 15, 15, 10, 10, 8, 8, 10, 10, 10, 15, 10, 8, 8, 9, 8, 9, 10, 11, 10, 8, 8, 8, 8, 8, 4, 8, 4, 7, 7, 9, 13, 16, 11, 10, 12, 11, 13, 8, 8, 8, 8, 8, 9, 10, 9, 9, 9, 8, 8, 8, 12, 8, 9, 9, 11, 10, 10, 7, 7, 9, 7, 8, 9, 11, 10, 9, 10, 9, 10, 7, 7, 7, 9, 8, 8, 10, 8, 8, 4, 7, 4, 4, 4, 4, 4, 8, 7, 7, 8, 9, 9, 7, 7, 9, 9, 9, 9, 8, 7, 7, 7, 7, 10, 10, 7, 8, 8, 9, 10, 10, 10, 14, 13, 9, 8, 7, 7, 7, 6, 6, 7, 7, 6, 6, 6, 6, 6, 8, 15, 10, 8, 8, 8, 8, 6, 7, 6, 6, 6, 6, 7, 7, 7, 8, 7, 4, 4, 4, 6, 6, 6, 6, 7, 13, 7, 7, 8, 8, 8, 7, 7, 7, 7, 7, 7, 8, 9, 7, 7, 7, 6, 6, 6, 6, 6, 7, 9, 7, 7, 7, 8, 10, 8, 8, 8, 8, 9, 6, 6, 6, 6, 6, 6, 6, 7, 7, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 7, 6, 6, 7, 9, 7, 7, 11, 6, 6, 7, 6, 6, 8, 7, 7, 8, 8, 10, 8, 8, 8, 6, 6, 7, 6, 6, 6, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7,
             7, 7, 7, 7, 12, 9, 14, 10, 10, 10, 10, 8, 9, 8, 8, 8, 7, 7, 7, 7, 7, 13, 7, 7, 6, 6, 6, 6, 6, 6, 8, 7, 7, 7, 6, 6, 6, 6, 6, 8, 8, 8, 9, 7, 7, 7, 8, 9, 7, 6, 6, 6, 6, 6, 6, 8, 8, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9, 8, 9, 8, 8, 8, 8, 8, 8, 8, 12, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 6, 9, 6, 6, 6, 7, 7, 7, 7, 7, 8, 10, 10, 14, 9, 12, 7, 7, 7, 4, 4]
    # the expected sequence is built as a Fasta and then converted to a Fastq
    # carrying the quality values
    e.seq = fastn.Fasta(e.id, seq)
    e.seq = e.seq.to_Fastq(quals)
    e.insert_min = 2000
    e.insert_max = 4000
    e.ligation = '96781'
    e.clone = 'pknbac5b2'
    e.clip_start = 23
    e.clip_end = 789
    self.assertEqual(c, e)

    # Second record: the same Caf object c is refilled from the file and
    # compared with a fresh expected record.
    c.get_next_from_file(f_in)
    e = caf.Caf()
    e.id = 'pknbac5b2Aa02.p1k'
    seq = ''.join(['AAAGACATACGACCTTTTTTTTTTTCGATAACAAAGGGTATCCTTTCACCAGAAAAAAAA',
                   'AAAGAACATTCTTCTTTTTTCTTGAAGAACATACATTCTTTTTTTTATTTTATTTTTTTT',
                   'TTTCGACCCCTCAGTGTTGTGGTAGCATGATGTGTTGGACTTGAATGGTATATGTATTGA',
                   'TTGTTTCGTTCGTTATGTAATTTCCGGTTTTTCCCCGTGGCATCCGGATAGTGTATAGTA',
                   'TCCGGTCCCTGTGTTCAAAAAGTTTTTCCTTTTCCCCTTAAAGCAACTGAAGTTAAACCC',
                   'TGAACCTTACTACTGAACCCGGAATTTGACTTCTAAAACCCTGAAGAATGATTCCTATAA',
                   'CCCTAAAAAATCCAACCTAAAACATCCAAACTGAACCATAGAACCTTCCTCCTAAACCCG',
                   'GAATCTATGTTCTAACACCCTGACATCTTTGTCCTAAACCCTGAATCTAAGTTCTAACAT',
                   'CCTGACAACTCTCCCTCCTAAACCCGGAATCTAAATTCGTACACCCTGACACCTCCCCCC',
                   'TAAACCCGGAATCCGCATTCTAACACCCTGACAATTTCCTCCTGAAAAGCGGAATCTGAC',
                   'TTCTAACACCCTGACACCTTTGTCCTGAACCCGGAATCTAAGTTCTTACACCCGGACACC',
                   'TCCCTCCTAAATCCGGAATCTAAGTTCTAACACCCTCACACCTTTGTCCTAAACCTTGAA',
                   'TCTAAGTCCTAACACCCTGACACCTTTGTCCTAAGCCCGGAATCTAACTTCTAGCACCCC',
                   'TACGACCCTTATTCCTAAACCCAGAATCTGACTATTGACACCCCTACAACCCTAATTCCA',
                   'ACACCCTTACAACCTTCATTCCAACACCGCAACAACCTTCATTCCAGCACCCCTACAACT',
                   'TCATTCCTACACCCCAAACAACATCATCCCTACACCCCAAACAACATCATTCCTACACCC',
                   'CAAACACATCATCCAACACCCCATAACACATCATTCCAACACGGCAACAACATCATTCGA',
                   'AACACCCCTACAAATCATTGCAGCACCCCCACTACCTCCCTGCGTATACCCGTATTCGAA',
                   'ATTTTGACACCCCTACTACCTTTATCTGACACCCCCAAAAAACTCCTCTTAAACCCAACA',
                   'AGGGGACTATAATACCCCTAAAACTTTATCTTAACCGGAATCCGAATTCTATACCGAAAA',
                   'AACTTCTTTCCTAACCGGGATCTGTACCCCGAACTTTTAAAATTAAAGGGGAAATGAACC',
                   'CCTGACCAGATAACGGGAAACCTTTATTGTGACAGGAACTCCTACCGCAATATGAAAATT',
                   'GGACCCCAAATTTGGGAAACCCCTTTT'])
    # expected quality values for the sequence above
    quals = [9, 9, 6, 4, 4, 4, 4, 7, 6, 6, 8, 6, 6, 6, 7, 7, 14, 8, 8, 8, 10, 17, 21, 12, 9, 10, 10, 9, 11, 8, 9, 11, 11, 21, 12, 15, 15, 21, 24, 33, 32, 35, 29, 29, 22, 22, 15, 29, 25, 26, 18, 18, 18, 31, 31, 47, 56, 56, 56, 42, 36, 44, 28, 28, 28, 39, 33, 35, 30, 36, 33, 35, 35, 36, 35, 37, 42, 35, 35, 31, 29, 26, 26, 20, 33, 15, 22, 22, 29, 29, 32, 35, 35, 36, 35, 35, 42, 42, 37, 37, 42, 47, 47, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 44, 47, 47, 47, 47, 47, 35, 30, 30, 23, 24, 30, 45, 37, 37, 37, 35, 23, 23, 11, 11, 13, 23, 31, 21, 21, 19, 20, 23, 29, 23, 20, 16, 16, 30, 29, 29, 28, 28, 28, 24, 24, 17, 29, 29, 33, 33, 35, 31, 37, 18, 15, 12, 16, 16, 23, 27, 24, 32, 29, 32, 32, 24, 26, 29, 37, 29, 30, 35, 35, 33, 35, 35, 31, 33, 31, 31, 35, 31, 31, 31, 31, 27, 33, 33, 42, 35, 37, 37, 21, 21, 21, 21, 37, 37, 50, 50, 50, 50, 50, 33, 33, 18, 16, 15, 25, 19, 20, 33, 33, 33, 35, 35, 33, 33, 33, 18, 18, 18, 33, 24, 33, 33, 33, 27, 33, 33, 33, 33, 33, 22, 33, 33, 33, 24, 24, 21, 24, 24, 31, 31, 11, 11, 11, 31, 33, 44, 44, 37, 42, 42, 47, 44, 44, 44, 44, 44, 44, 44, 47, 50, 50, 42, 42, 42, 41, 42, 42, 47, 47, 37, 37, 27, 33, 33, 33, 33, 35, 35, 42, 41, 37, 37, 44, 50, 50, 33, 33, 27, 33, 37, 42, 42, 42, 41, 41, 33, 33, 27, 27, 33, 33, 37, 50, 35, 35, 35, 35, 35, 35, 35, 42, 35, 37, 35, 37, 35, 41, 37, 42, 42, 42, 42, 50, 50, 50, 42, 35, 33, 33, 21, 21, 16, 23, 19, 27, 27, 33, 35, 41, 50, 37, 35, 35, 42, 50, 50, 50, 44, 44, 44, 50, 42, 42, 37, 37, 35, 35, 35, 44, 44, 50, 50, 41, 37, 37, 37, 37, 35, 35, 35, 37, 37, 37, 44, 37, 37, 33, 33, 22, 33, 37, 35, 33, 33, 21, 21, 21, 33, 33, 41, 41, 44, 44, 44, 44, 44, 50, 50, 44, 44, 37, 50, 33, 33, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 50, 50, 50, 50, 50, 44, 47, 44, 44, 48, 33, 21, 24, 21, 33, 33, 35, 37, 50, 50, 37, 35, 35, 50, 50, 56, 50, 50, 50, 50, 48, 33, 27, 33, 27, 33, 33, 44, 50, 50, 42, 37, 35, 42, 42, 50, 50, 50, 44, 44,
             44, 33, 33, 18, 19, 18, 33, 33, 42, 35, 35, 44, 44, 44, 50, 44, 44, 44, 50, 50, 44, 44, 37, 37, 33, 33, 35, 35, 35, 35, 35, 37, 50, 37, 27, 27, 24, 37, 33, 35, 35, 37, 35, 37, 37, 46, 33, 24, 24, 21, 33, 33, 39, 42, 42, 44, 50, 50, 56, 50, 50, 37, 35, 35, 33, 37, 33, 33, 35, 35, 35, 35, 33, 33, 33, 33, 27, 27, 27, 37, 37, 44, 37, 41, 41, 41, 50, 46, 33, 24, 24, 16, 31, 19, 27, 31, 37, 37, 44, 44, 44, 37, 50, 23, 23, 22, 29, 31, 33, 23, 23, 23, 23, 23, 28, 25, 33, 26, 26, 22, 28, 37, 42, 44, 42, 42, 44, 44, 44, 44, 46, 33, 16, 19, 14, 27, 31, 42, 50, 50, 50, 44, 44, 44, 50, 50, 26, 26, 21, 28, 31, 29, 29, 26, 26, 26, 30, 30, 39, 27, 37, 26, 30, 30, 42, 42, 42, 36, 33, 29, 33, 33, 33, 20, 21, 23, 17, 23, 31, 36, 42, 43, 56, 56, 47, 47, 42, 42, 33, 33, 29, 29, 23, 31, 25, 26, 26, 26, 30, 30, 36, 27, 33, 28, 31, 33, 35, 44, 33, 33, 28, 33, 35, 44, 48, 48, 48, 42, 47, 42, 42, 42, 48, 44, 44, 37, 34, 34, 44, 48, 42, 37, 34, 42, 48, 33, 33, 34, 30, 30, 33, 33, 40, 30, 37, 28, 28, 26, 27, 27, 25, 19, 16, 25, 29, 40, 31, 27, 15, 18, 13, 25, 27, 40, 40, 33, 40, 33, 33, 33, 40, 37, 23, 12, 12, 17, 11, 10, 15, 15, 13, 13, 13, 18, 27, 23, 28, 28, 28, 28, 37, 28, 32, 26, 23, 26, 26, 19, 29, 25, 24, 25, 24, 15, 15, 15, 12, 17, 24, 24, 21, 21, 21, 25, 22, 29, 25, 22, 21, 24, 25, 17, 17, 14, 14, 12, 14, 19, 24, 18, 18, 14, 21, 11, 15, 10, 15, 18, 22, 27, 25, 25, 29, 29, 29, 25, 26, 25, 21, 22, 25, 22, 22, 18, 15, 15, 15, 25, 19, 25, 25, 16, 24, 24, 20, 20, 22, 20, 15, 10, 10, 10, 12, 13, 20, 20, 12, 14, 14, 12, 12, 12, 15, 15, 15, 18, 18, 11, 10, 11, 11, 10, 10, 14, 15, 18, 18, 19, 17, 12, 11, 10, 10, 20, 15, 19, 24, 24, 24, 23, 15, 13, 7, 6, 6, 6, 6, 6, 12, 13, 12, 9, 8, 10, 10, 9, 6, 6, 6, 6, 10, 10, 13, 15, 15, 15, 15, 17, 9, 9, 9, 9, 9, 11, 11, 9, 7, 7, 7, 6, 4, 4, 6, 9, 9, 8, 8, 8, 10, 9, 8, 7, 7, 7, 7, 7, 9, 13, 10, 10, 10, 15, 12, 9, 9, 9, 15, 19, 15, 15, 11, 7, 7, 7, 7, 7, 7, 8, 8, 19, 10, 10, 10, 12, 12, 19, 11, 15, 18, 11, 14, 9, 9, 6, 6, 6, 6, 6, 6, 6, 8, 11, 20, 13, 17,
             14, 14, 9, 9, 10, 17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 11, 10, 12, 11, 10, 12, 9, 12, 8, 8, 8, 9, 12, 12, 8, 11, 7, 8, 8, 8, 8, 11, 9, 8, 6, 4, 4, 4, 6, 6, 7, 10, 10, 12, 9, 7, 7, 6, 6, 6, 6, 8, 6, 9, 10, 13, 8, 11, 8, 7, 7, 8, 7, 7, 7, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 14, 10, 8, 12, 8, 8, 6, 7, 9, 8, 7, 6, 8, 7, 4, 4, 7, 7, 6, 7, 6, 6, 6, 6, 8, 11, 8, 8, 8, 8, 12, 10, 12, 11, 11, 11, 10, 12, 10, 7, 7, 9, 4, 4, 8, 6, 6, 6, 6, 6, 6, 7, 10, 7, 7, 7, 7, 7, 9, 9, 9, 7, 7, 7, 6, 6, 6, 7, 7, 7, 10, 11, 9, 7, 6, 6, 8, 6, 6, 8, 8, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 9, 12, 10, 15, 15, 16, 7, 7, 6, 6, 6, 7, 6, 6, 6, 6, 6, 8, 7, 7, 8, 7, 8, 7, 7, 9, 8, 7, 7, 8, 8, 9, 7, 6, 7, 6, 9, 6, 7, 11, 7, 7, 11, 8, 8, 7, 10, 8, 9, 8, 6, 6, 6, 6, 7, 7, 7, 6, 6, 6, 8, 8, 7, 7, 6, 9, 7, 6, 6, 6, 6, 6, 6, 6, 8, 7, 7, 7, 7, 7, 7, 7, 10, 12, 19, 13, 13, 10, 9, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 6, 6, 6, 6, 6, 4, 6, 4, 4, 4, 4, 4, 4, 6, 6, 6, 7, 7, 7, 7, 8, 7, 6, 6, 6, 6, 6, 6, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 11, 12, 9]
    e.seq = fastn.Fasta(e.id, seq)
    e.seq = e.seq.to_Fastq(quals)
    e.insert_min = 2000
    e.insert_max = 4000
    e.ligation = '96781'
    e.clone = 'pknbac5b2'
    e.clip_start = 33
    e.clip_end = 848
    self.assertEqual(c, e)

    utils.close(f_in)
# Command line: a single positional argument, the file to read.
parser = argparse.ArgumentParser(
    description='Gets GAGE stats from bsub stdout file',
    usage='%(prog)s <gage.o>')
parser.add_argument('infile', help='Name of input gage.o bsub stout file')
options = parser.parse_args()

# Each stat keeps the value -1 unless it is found in the input file.
contigs = -1
scaffs = -1
contig_N50 = -1
scaff_N50 = -1
contig_corr_N50 = -1
scaff_corr_N50 = -1
contig_errs = -1
scaff_errs = -1

# The whole file is read into memory because values are looked up at fixed
# offsets from a section's header line.
f = utils.open_file_read(options.infile)
lines = f.readlines()
utils.close(f)
i = 0

while i < len(lines):
    line = lines[i].rstrip()

    if line == 'Contig Stats':
        # the count is the last field of the line after the section header
        contigs = int(lines[i + 1].split()[-1])
        # the N50 is read from the eighth line after the header, but only if
        # that line starts with 'N50'
        if lines[i + 8].startswith('N50'):
            contig_N50 = int(lines[i + 8].split()[1])
    elif line == 'Scaffold Stats':
        # same layout as the 'Contig Stats' section
        scaffs = int(lines[i + 1].split()[-1])
        if lines[i + 8].startswith('N50'):
            scaff_N50 = int(lines[i + 8].split()[1])
# Write the header row (row 0) of the worksheet and record the column index
# of every '<key> score' header whose key is in evaluation_score_keys.
for i in range(len(ScaffResults.headers)):
    h = ScaffResults.headers[i]
    worksheet_all.cell(row=0, column=i).value = h
    # h[:-6] is the header with its trailing ' score' removed
    if h.endswith(' score') and h[:-6] in ScaffResults.evaluation_score_keys:
        eval_keys_columns[h] = i

current_row = 1

# get the extra cpu and mem usage
# both dicts are keyed by (dataset, scaffolder)
extra_cpu = {}
extra_mem = {}
f = utils.open_file_read(options.extra_cpu_file)
for line in f:
    if line.startswith('#'):
        continue

    # exactly four tab-separated columns are expected on each line
    (dataset, scaffolder, cpu, mem) = line.split('\t')
    mem = int(mem)
    cpu = int(cpu)
    extra_cpu[(dataset, scaffolder)] = cpu
    extra_mem[(dataset, scaffolder)] = mem
utils.close(f)

# gather all the counts for each scaffolding run
results = {k:{} for k in datasets}
def lengths_from_fai(fai_file, d):
    '''Fill the dict d with sequence name -> length read from fai_file.

    The first two whitespace-separated fields of each line are used as the
    name and the length; any further fields are ignored. Existing entries of
    d with the same name are overwritten. Nothing is returned: d is updated
    in place.

    A line with fewer than two fields, or with a length that is not an
    integer, raises ValueError. The file is closed in every case.
    '''
    f = utils.open_file_read(fai_file)
    try:
        for line in f:
            (name, length) = line.rstrip().split()[:2]
            d[name] = int(length)
    finally:
        utils.close(f)