def checkBIN(a, b):
    """Run shell command *a*, capturing stdout and stderr into tmp files,
    and scan both captures for pattern *b*.

    Returns 0 if the pattern was found (shell-style success status),
    1 otherwise.
    """
    global _dir
    # The command's own exit status is not used, only its captured output
    # (the original stored it in an unused local).
    os.system("{} 1>{}/tmp/binaries 2>{}/tmp/binaries2".format(a, _dir, _dir))

    def _stream_has_match(path):
        # True if any line of *path* matches pattern *b*.
        fh = open_or_die(path, 'rb', 'can not open {}'.format(path))
        try:
            for line in fh:
                if re.search(b, line):
                    return True
            return False
        finally:
            fh.close()

    found = 1
    if _stream_has_match('{}/tmp/binaries'.format(_dir)):
        found = 0
    # stderr capture is only consulted when stdout had no match
    if found and _stream_has_match('{}/tmp/binaries2'.format(_dir)):
        found = 0
    return found
def cat_to(file_1, file_2):
    """Append every line of *file_1* onto the end of *file_2*."""
    dst = open_or_die(file_2, 'a', 'cannot print to {}'.format(file_2))
    # NOTE(review): source is opened binary while the destination is
    # text-mode append; presumably open_or_die reconciles this — confirm
    # behaviour under Python 3.
    src = open_or_die(file_1, 'rb', 'cannot read from {}'.format(file_1))
    for chunk in src:
        dst.write(chunk)
    src.close()
    dst.close()
def make_files(file_output, coord_file, thres):
    """Filter '<file_output>_all' and '<coord_file>_all' down to the entries
    whose threshold field equals *thres*, renumbering survivors.

    Filtered fasta-style records go to *file_output*; the matching
    coordinate lines go to *coord_file*.
    """
    # Fixed: the first error message received the literal string
    # 'file_output' and the second lacked its '{}' placeholder.
    TMP1 = open_or_die('{}_all'.format(file_output), 'rb',
                       'can not open {}_all'.format(file_output))
    PRES = open_or_die(file_output, 'w+',
                       'can not create {}'.format(file_output))
    counter = 0
    while True:
        l = TMP1.readline()
        if not l:
            break
        l = l.strip()
        if re.match('^>', l):
            line = l.split('_xyz123_')
            if int(line[3]) == thres:  # last field is the threshold
                counter += 1
                seq = TMP1.readline()
                PRES.write('{}_{}\n{}'.format(line[0], counter, seq))
    TMP1.close()
    PRES.close()

    COORD = open_or_die(coord_file, 'w+',
                        'can not create {}'.format(coord_file))
    TMP2 = open_or_die('{}_all'.format(coord_file), 'rb',
                       'can not open {}_all'.format(coord_file))
    counter = 0
    while True:
        l = TMP2.readline()
        if not l:
            break
        l = l.strip()
        if re.match('^>', l):
            # esplit() splits on whitespace (space, tab, newline)
            line = esplit(l)
            line2 = line[0].split('_xyz123_')
            if int(line2[3]) == thres:
                counter += 1
                COORD.write('{}_{}\t{}\t{}\t{}\n'.format(
                    line2[0], counter, line[1], line[2], line[3]))
    TMP2.close()
    COORD.close()
def parse_file_solexa(options, file_solexa):
    """Convert a Solexa seq/qseq text file to fasta on stdout.

    Column 8 holds the read when option -a was given (flag value ''),
    column 4 otherwise; '.' characters become 'N'. The global *running*
    counter is incremented once per emitted read.
    """
    global running
    FILE = open_or_die(file_solexa, 'rb',
                       'can not open file: {}'.format(file_solexa))
    for l in FILE:
        if re.search(r'^\s*$', l):
            continue  # skip blank lines
        fields = re.split(r'\s+', l)
        raw = fields[8] if options.get('-a') == '' else fields[4]
        seq = re.sub(r'\.', 'N', raw.strip())
        # NOTE(review): the header embeds the sequence itself rather than a
        # fixed 'seq' prefix — looks intentional in this port, but confirm.
        print('>{}_{}'.format(seq, running))
        print(seq)
        running += 1
    FILE.close()
def parse_file_mrd(file_out, _hash):
    """Walk a .mrd output file, collecting each entry's total score and the
    reference ids known to the global hash_sig, and hand every completed
    entry to resolve_entry_file_mrd()."""
    global hash_sig
    score = None
    refs = []
    FILE = open_or_die(file_out, 'rb', 'can not open {}\n'.format(file_out))
    for line in FILE:
        score_m = re.match(r'^score\s+total\s+(\S+)', line)
        if score_m:
            score = float(score_m.group(1))
            continue
        id_m = re.match(r'^(\S+)', line)
        if id_m and id_m.group(1) in hash_sig:
            refs.append(id_m.group(1))
        elif re.search(r'^>', line) and score is not None:
            # entry boundary: resolve what was collected so far
            # (refs is emptied inside the resolver, per the original port)
            resolve_entry_file_mrd(score, refs, _hash)
            score = None
    # flush the final entry of the file
    resolve_entry_file_mrd(score, refs, _hash)
def printUsedParameters(options):
    """Write the parameters of the current run to run_<ltime>_parameters."""
    global _dir, ltime, command_line, file_reads, file_genome, file_reads_vs_genome
    fname = '{}/run_{}_parameters'.format(_dir, ltime)
    out = open_or_die(fname, 'w+', 'can not open {}\n'.format(fname))
    out.write("Start: {}\n".format(ltime))
    out.write("Script\t{}\n".format(sys.argv[0]))
    out.write("args {}\n".format(command_line))
    out.write("dir_with_tmp_files\tdir_moR_{}\n".format(ltime))
    out.write("dir\t{}\n".format(cwd()))
    out.write("file_reads\t{}\n".format(file_reads))
    out.write("file_genome\t{}\n".format(file_genome))
    out.write("file_reads_vs_genome\t{}\n".format(file_reads_vs_genome))
    out.write("file_mature_ref_this_species\t{}\n".format(
        file_mature_ref_this_species))
    out.write("file_mature_ref_other_species\t{}\n".format(
        file_mature_ref_other_species))
    # valued options are written when present; -c and -v are bare flags
    # whose getopt value is the empty string
    if options.get('-a'):
        out.write("option -a =\t{}\n".format(options.get('-a')))
    if options.get('-b'):
        out.write("option -b =\t{}\n".format(options.get('-b')))
    if options.get('-c') == '':
        out.write("option -c =\t{}\n".format(options.get('-c')))
    if options.get('-t'):
        out.write("option -t =\t{}\n".format(options.get('-t')))
    if options.get('-v') == '':
        out.write("option -v =\tused\n")
    out.close()
def parse_genome_and_excise(TMP1, TMP2, file_fasta):
    """Stream a multi-fasta genome file and hand every entry's id and
    concatenated sequence to excise()."""
    FASTA = open_or_die(file_fasta, 'rb',
                        'can not open {}'.format(file_fasta))
    header = re.compile(r'^>(\S+)(.*)')
    _id = None
    sequence = ''
    for raw in FASTA:
        stripped = raw.strip()
        hit = header.match(stripped)
        if hit:
            # a new header closes the previous entry, if any
            if _id is not None:
                excise(TMP1, TMP2, _id, sequence)
            _id = hit.group(1)
            sequence = ''
        elif _id is not None:
            sequence += stripped
    # close the final entry at EOF
    if _id is not None:
        excise(TMP1, TMP2, _id, sequence)
    FASTA.close()
def parse_fasta(file_fasta, _hash):
    """Print fasta entries filtered against *_hash*.

    An entry is printed when its id is in *_hash* and option -a is unset,
    or when its id is NOT in *_hash* and flag -a was given (getopt flag
    value ''). Reads the module-level `options` dict.
    """
    def _wanted(entry_id):
        # selection predicate shared by the per-entry and the EOF flush
        # (the original duplicated this logic inline, twice)
        known = entry_id in _hash
        return (known and options.get('-a') is None) or \
               (not known and options.get('-a') == '')

    # Fixed: the error message previously lacked its '{}' placeholder.
    FASTA = open_or_die(file_fasta, 'rb',
                        'can not open file {}'.format(file_fasta))
    while True:
        l = FASTA.readline()
        if not l:
            break
        l = l.strip()
        m = re.match(r'^>(\S+)(.*)', l)
        if m:
            _id, desc = m.group(1), m.group(2)
            sequence = ''
            while True:
                ll = FASTA.readline()
                if not ll:
                    break
                ll = ll.strip()
                mm = re.match(r'^>(\S+)(.*)', ll)
                if mm:
                    if _wanted(_id):
                        print('>{}{}\n{}'.format(_id, desc, sequence))
                    _id, desc = mm.group(1), mm.group(2)
                    sequence = ''
                    continue
                sequence += ll
            # flush the last entry at EOF
            if _wanted(_id):
                print('>{}{}\n{}'.format(_id, desc, sequence))
    FASTA.close()
def cat_files(file_1, file_2, file_out):
    """Concatenate *file_1* followed by *file_2* into *file_out*."""
    sink = open_or_die(file_out, 'w+',
                       'cannot print to {}\n'.format(file_out))
    for src_name in (file_1, file_2):
        src = open_or_die(src_name, 'rb',
                          'cannot read from {}\n'.format(src_name))
        for line in src:
            sink.write(line)
        src.close()
    sink.close()
def presort(_file):
    """Prefix every line of an arf file with a numeric index per column-6
    key, writing '<index>\\t<original line>' to '<_file>.tmp'.

    Keys are numbered in order of first appearance.
    """
    src = open_or_die(_file, 'rb', 'no arf file given\n')
    dst = open_or_die('{}.tmp'.format(_file), 'w+',
                      'tmp file could not be created\n')
    index = {}
    count = 0
    for line in src:
        key = esplit(line)[5]
        # Fixed: direct dict membership instead of 'not in index.keys()'
        # (O(1) lookup; on Python 2 .keys() built a full list per line).
        # Also dropped an unused 'l = []' initialiser.
        if key not in index:
            count += 1
            index[key] = count
        dst.write('{}\t{}'.format(index[key], line))
    src.close()
    dst.close()
def get_longest_id(f):
    """Return the length of the longest fasta id ('>id' token) in file *f*."""
    longest = 0
    IN = open_or_die(f, 'rb', 'No file given for checking\n')
    for line in IN:
        hit = re.search(r'>(\S+)', line)
        if hit:
            longest = max(longest, len(hit.group(1)))
    IN.close()
    return longest
def parse_file_ids(file_ids, _hash):
    """Read the leading identifier of every line of *file_ids* into *_hash*
    (value 1), mutating the dict in place."""
    handle = open_or_die(file_ids, 'rb',
                         'can not open {}'.format(file_ids))
    for line in handle:
        hit = re.match(r'^(\S+)', line)
        if hit:
            _hash[hit.group(1)] = 1
    handle.close()
def parse_file_ids(_file, _hash):
    """Read the id file *_file* into *_hash* (id -> 1), announcing progress
    on stderr when flag -k was given (getopt flag value '')."""
    # read id file into hash
    if options.get('-k') == '':
        print_stderr('reading id file into memory\n')
    FILE = open_or_die(_file, 'rb', 'can not open {}\n'.format(_file))
    while True:
        line = FILE.readline()
        if not line:
            break
        m = re.match(r'^(\S+)', line)
        if m:
            _hash[m.group(1)] = 1
    # Fixed: the handle was previously left open (resource leak).
    FILE.close()
def parse_fasta(file_fasta):
    """Echo a fasta file via pprint: header lines are reduced to their
    '>id' token, sequence lines have 'U' replaced by 'T' and are then
    uppercased (so lowercase 'u' becomes 'U', matching the original)."""
    FASTA = open_or_die(file_fasta, 'rb',
                        'can not open {}\n'.format(file_fasta))
    for line in FASTA:
        head = re.match(r'^(>\S+)', line)
        if head:
            pprint('{}\n'.format(head.group(1)))
        else:
            pprint('{}'.format(re.sub('U', 'T', line).upper()))
    FASTA.close()
def parse_file_command_line(file_command_line, file_structure, _dir):
    """Return the first non-empty line of *file_command_line* with the
    structure-file path replaced by '<_dir>/precursors_permuted.str' and any
    trailing '>...' shell redirection removed.

    Dies if the file holds no line containing a non-space character.
    """
    handle = open_or_die(file_command_line, 'rb',
                         'can not open {}'.format(file_command_line))
    for raw in handle:
        if not re.search(r'(\S+)', raw):
            continue
        cmd = raw.strip()
        cmd = re.sub(file_structure,
                     '{}/precursors_permuted.str'.format(_dir), cmd, count=1)
        return re.sub(r'>.+', '', cmd, count=1)
    die('{} is empty\n'.format(file_command_line))
def parse_file_arf(file_arf):
    """Read an arf mapping file into the global position hash.

    For every mapping line the read frequency is looked up via find_freq()
    and inserted as a feature at the genomic (db) position; the global
    count_lines tracks progress.
    """
    global count_lines, hash_pos
    lines = int(os.popen('cat {} | wc -l'.format(file_arf)).read().strip())
    if options.get('-b') == '':
        # Fixed: the message contained the Perl leftover '$lines' with no
        # '{}' placeholder, so the line count was never interpolated.
        print_stderr(
            'reading the mapping file into memory, total lines={}\n'.format(
                lines))
    FILENAME = open_or_die(file_arf, 'rb',
                           'Could not open file {}'.format(file_arf))
    while True:
        line = FILENAME.readline()
        if not line:
            break
        m = re.match(
            r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
            line)
        if m:
            m = m.groups()
            # only the columns actually used are unpacked (the original
            # converted and named all 13, most of them unused)
            query = m[0]
            db = m[5]
            db_beg = int(m[7])
            db_end = int(m[8])
            strand = m[10]
            # resolve the number of reads the deep sequence represents
            freq = find_freq(query)
            # record the feature in the position hash
            insertfeature(db, strand, db_beg, db_end, freq)
            count_lines += 1
    FILENAME.close()
def handle_config_file(file_reads, MAP, options):
    """Process a mapper config file: each line names a reads file and a
    prefix; every pair is validated against the format options and handed
    to handle_one_file()."""
    cfg = open_or_die(file_reads, 'rb',
                      'can not open {}\n'.format(file_reads))
    for l in cfg:
        m = re.match(r'(^\S+)\s+(\S+)\s*.*$', l)
        if not m:
            continue
        reads_file, prefix = m.group(1), m.group(2)
        # NOTE(review): when the first token is shorter than the second the
        # two are swapped — presumably a guard against reversed columns;
        # confirm against the documented config format.
        if len(reads_file) < len(prefix):
            reads_file, prefix = prefix, reads_file
        test_prefix(prefix)
        MAP.write("\nhandling file '{}' with prefix '{}'\n".format(
            reads_file, prefix))
        # check that files in the config file agree with the format option
        # specified (-a/-b/-c/-e are bare flags with getopt value '')
        for flag, fmt in (('-a', 'a'), ('-b', 'b'), ('-c', 'c'), ('-e', 'e')):
            if options.get(flag) == '':
                check_file_format_and_option(reads_file, fmt)
        if options.get('-v') == '':
            print_stderr("\nhandling file '{}' with prefix '{}'\n".format(
                reads_file, prefix))
        handle_one_file(reads_file, prefix, MAP, options)
    cfg.close()
def parse_file_sig(file_sig, hash_sig):
    """Register every leading id of *file_sig* that is also present in the
    global hash_ref into *hash_sig* with a dummy score of -50."""
    global hash_ref
    FILE = open_or_die(file_sig, 'rb',
                       'can not open {}\n'.format(file_sig))
    while True:
        line = FILE.readline()
        if not line:
            break
        m = re.match(r'^(\S+)', line)
        # plain membership test replaces the try/except KeyError probe
        if m and m.group(1) in hash_ref:
            # dummy value
            hash_sig[m.group(1)] = -50
    # Fixed: the handle was previously never closed (resource leak).
    FILE.close()
def parse_file_mrd_permuted(infile, _hash, options):
    """Parse a permuted .mrd file: track the current permutation number,
    total score, and reference ids known to the global hash_sig, resolving
    each completed entry via resolve_entry_file_mrd_permuted()."""
    global hash_sig
    score = None
    permutation = None
    refs = []
    FILE = open_or_die(infile, 'rb', 'can not open {}\n'.format(infile))
    for line in FILE:
        perm_m = re.match(r'^permutation\s+(\d+)', line)
        if perm_m:
            permutation = perm_m.group(1)
            continue
        score_m = re.match(r'^score\s+total\s+(\S+)', line)
        if score_m:
            score = float(score_m.group(1))
            continue
        id_m = re.match(r'^(\S+)', line)
        if id_m and id_m.group(1) in hash_sig:
            refs.append(id_m.group(1))
        elif re.search(r'^>', line) and score is not None:
            # entry boundary; refs is emptied inside the resolver
            resolve_entry_file_mrd_permuted(
                score, permutation, refs, _hash, options)
            score = None
    # flush the final entry
    resolve_entry_file_mrd_permuted(score, permutation, refs, _hash, options)
    score = None
def parse_file_ref(filename, hash_ref):
    """Load a fasta reference file into *hash_ref* as id -> sequence."""
    FASTA = open_or_die(filename, 'rb',
                        'can not open {}\n'.format(filename))
    header = re.compile(r'^>(\S+)(.*)')
    _id = None
    seq = ''
    for raw in FASTA:
        stripped = raw.strip()
        hit = header.match(stripped)
        if hit:
            # a new header closes the previous entry, if any
            if _id is not None:
                hash_ref[_id] = seq
            _id = hit.group(1)
            seq = ''
        elif _id is not None:
            seq += stripped
    # store the last entry at EOF
    if _id is not None:
        hash_ref[_id] = seq
    FASTA.close()
def parse_fasta(options, file_fasta):
    """Stream fasta entries from *file_fasta*, passing each (id, sequence)
    pair to resolve() together with *options*."""
    FASTA = open_or_die(file_fasta, 'rb',
                        'can not open file {}\n'.format(file_fasta))
    header = re.compile(r'^>(\S+)')
    _id = None
    seq = ''
    for raw in FASTA:
        stripped = raw.strip()
        hit = header.match(stripped)
        if hit:
            # a new header closes the previous entry, if any
            if _id is not None:
                resolve(options, _id, seq)
            _id = hit.group(1)
            seq = ''
        elif _id is not None:
            seq += stripped
    # resolve the final entry at EOF
    if _id is not None:
        resolve(options, _id, seq)
    FASTA.close()
def parse_file_struct(file_struct):
    """Parse RNAfold-style structure output into the module-level hashes
    hash_desc / hash_seq / hash_struct / hash_mfe, keyed by entry id.

    Each entry starts with '>id desc'; subsequent lines contribute sequence
    characters (u/U transliterated to t/T), dot-bracket structure tokens,
    and the minimum free energy taken from a trailing '(-NN.NN)' token.
    """
    global db_old  # declared but not referenced below; kept as in the source
    FILE_STRUCT = open_or_die(file_struct, 'rb',
                              'can not open file {}\n'.format(file_struct))
    while True:
        line = FILE_STRUCT.readline()
        if not line:
            break
        line = line.strip()
        m = re.match(r'^>(\S+)\s*(.*)', line)
        if m:
            m = m.groups()
            _id = m[0]
            desc = m[1]
            seq = ""
            struct = ""
            mfe = ""
            while True:
                line2 = FILE_STRUCT.readline()
                if not line2:
                    break
                line2 = line2.strip()
                mm = re.match(r'^>(\S+)\s*(.*)', line2)
                if mm:
                    # next entry begins: store the finished one
                    hash_desc[_id] = desc
                    hash_seq[_id] = seq
                    hash_struct[_id] = struct
                    hash_mfe[_id] = mfe
                    _id = mm.groups()[0]
                    desc = mm.groups()[1]
                    seq = ""
                    struct = ""
                    mfe = ""
                    continue
                m3 = re.match(r'^\w', line2)
                if m3:
                    # sequence line: RNA -> DNA alphabet via the tr() helper
                    line2 = tr(line2, 'uU', 'tT')
                    seq += line2
                # dot-bracket run anywhere on the line extends the structure
                m3 = re.search(r'((\.|\(|\))+)', line2)
                if m3:
                    struct += m3.groups()[0]
                # '( -NN.NN)' token (last one seen wins) is the MFE
                m3 = re.search(r'\((\s*-\d+\.\d+)\)', line2)
                if m3:
                    mfe = m3.groups()[0]
            # EOF inside an entry: store the last entry
            hash_desc[_id] = desc
            hash_seq[_id] = seq
            hash_struct[_id] = struct
            hash_mfe[_id] = mfe
    # print('\n'.join(sorted(hash_struct.values())))
    # print('\n'.join(sorted(hash_desc.keys())))
    FILE_STRUCT.close()
#!/usr/bin/env python from __future__ import print_function import re import sys from port import die, esplit, open_or_die if __name__ == '__main__': if len(sys.argv) < 2: die('No csv file given for bed conversion\n') known, novel, _not, line, thres, score, line, strand, label, end = ( None, None, None, None, None, None, None, None, None, None) IN = open_or_die(sys.argv[1], 'r', 'cannot open {}\n'.format(sys.argv[1])) while True: line = IN.readline() if not line: break if re.search(r'novel miRNAs predicted by moRNA Finder', line): novel = 1 known = 0 _not = 0 elif re.search(r'mature miRBase miRNAs detected', line): novel = 0 known = 1 _not = 0 else: l = esplit(line)
def read_stats(options):
    """Print mapping statistics to stderr: total and per-3-letter-prefix
    read counts of the fasta reads file (-s) versus the mapping file (-t),
    with mapped/unmapped fractions."""
    _hash = {}
    count = 0
    k2 = {}
    IN = open_or_die(options.get('-s'), 'rb',
                     'No reads file in fasta format given\n')
    while True:
        line = IN.readline()
        if not line:
            break
        m = re.match(r'^>*((\S\S\S)\S+_x(\d+))', line)
        if m:
            m = m.groups()
            # count each read id only once; .get() replaces the original's
            # try/except KeyError probe with one O(1) lookup
            if _hash.get(m[0]):
                continue
            _hash[m[0]] = 1
            count += int(m[2])
            if m[1] not in k2:
                k2[m[1]] = 0
            k2[m[1]] += int(m[2])
    IN.close()

    _hash2 = {}
    count2 = 0
    k22 = {}
    print_stderr('Mapping statistics\n')
    IN = open_or_die(options.get('-t'), 'rb', 'No mapping file given\n')
    while True:
        line = IN.readline()
        if not line:
            break
        m = re.match(r'^>*((\S\S\S)\S+_x(\d+))', line)
        if m:
            m = m.groups()
            # Fixed: this loop used the 'in _hash2.keys()' scan that the
            # original's own comment in the first loop warned against.
            if _hash2.get(m[0]):
                continue
            _hash2[m[0]] = 1
            count2 += int(m[2])
            if m[1] not in k22:
                k22[m[1]] = 0
            k22[m[1]] += int(m[2])
    IN.close()

    print_stderr('\n#desc\ttotal\tmapped\tunmapped\t%mapped\t%unmapped\n')
    print_stderr("total: {}\t{}\t{}\t".format(count, count2, count - count2))
    print_stderr("{0:.3f}\t{1:.3f}\n".format(
        count2 / float(count), 1 - (count2 / float(count))))
    for k in k2:
        # Fixed: a prefix present in the reads file but absent from the
        # mapping file previously raised KeyError here; report 0 mapped.
        mapped = k22.get(k, 0)
        print_stderr('{}: {}\t{}\t{}\t'.format(
            k, k2[k], mapped, k2[k] - mapped))
        print_stderr('{0:.3f}\t{1:.3f}\n'.format(
            float(mapped) / k2[k], 1 - (float(mapped) / k2[k])))
def check_file_format_and_option(file_reads, aFormat):
    """Validate the first four lines of *file_reads* against the format that
    option *aFormat* claims: 'a' seq.txt, 'b' qseq.txt, 'c' fasta,
    'e' fastq. Dies with a descriptive message on any mismatch.
    """
    print_stderr('\n')
    # Fixed: the Perl placeholders '$format'/'$file' were never interpolated.
    warning = '''\n\n***** Please check if the option you used (option {}) designates the correct format of the supplied reads file {} *****\n\n
[options]

-a              input file is seq.txt format
-b              input file is qseq.txt format
-c              input file is fasta format
-e              input file is fastq format
-d              input file is a config file (see moRNA Finder documentation).
                options -a, -b, -c or -e must be given with option -d.
'''.format(aFormat, file_reads)
    line = None
    if aFormat == 'a':
        i = 0
        IN = open_or_die(
            file_reads, 'rb',
            'Cannot open file {} supplied by option -a\n'.format(file_reads))
        while True:
            l = IN.readline().strip()
            if not l:
                break
            i += 1
            line = esplit(l)
            # a seq.txt record has exactly 5 whitespace-separated columns
            if len(line) != 5:
                die('The seq.txt file does not contain 5 columns. Please make sure to follow the _seq.txt file format conventions\n{}'.format(warning))
            if i == 4:
                break
        IN.close()
    elif aFormat == 'b':
        IN = open_or_die(
            file_reads, 'rb',
            'Cannot open qseq.txt file {} supplied by option -b\n'.format(file_reads))
        i = 0
        while True:
            l = IN.readline().strip()
            if not l:
                break
            i += 1
            line = esplit(l)
            if len(line) != 11:
                die('The qseq.txt file does not contain 11 columns but {}. Please make sure to follow the qseq.txt file format conventions\n{}'.format(
                    len(line), warning))
            if not re.search(r'^\S+', line[9]):
                die('The sequence field in the qseq.txt file is invalid. \nPlease make sure to follow the qseq.txt file format conventions\n{}'.format(warning))
            if i == 4:
                break
        IN.close()
    # Fixed: these two branches compared against '-c'/'-e' although callers
    # pass the bare letters (see the 'a'/'b' branches above), so the fasta
    # and fastq checks never actually ran.
    elif aFormat == 'c':
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTA file supplied by option -c\n')
        i = 0
        mes = 'Please make sure your file is in accordance with the fasta format specifications and does not contain whitespace in IDs or sequences'
        while True:
            l = IN.readline().strip()
            if not l:
                break
            i += 1
            if i == 1 and not re.search(r'^>\S+$', l):
                die("First line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                    mes, warning))
            if i == 2 and not re.search(r'^\S+$', l):
                die("Second line of FASTA reads file contains whitespace in sequence\n{}\n".format(mes))
            if i == 3 and not re.search(r'^>\S+$', l):
                die("Second ID line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                    mes, warning))
            if i == 4:
                # (also fixes the 'Secdond' typo in the original message)
                if not re.search(r'^\S+$', l):
                    die("Second sequence line of FASTA reads file contains whitespace in sequence\n{}\n{}".format(
                        mes, warning))
                break
        IN.close()
    elif aFormat == 'e':
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTQ file supplied by option -e\n')
        i = 0
        mes = 'Please make sure your file is in accordance with the FASTQ format specifications'
        while True:
            l = IN.readline().strip()
            if not l:
                break
            i += 1
            if i == 1 and not re.search(r'^@\S+', l):
                die("First line of FASTQ reads file is not in accordance with the fastq format specifications\n{}\n{}".format(
                    mes, warning))
            # Fixed: the three checks below were inverted in the original —
            # they died on well-formed lines and accepted malformed ones.
            if i == 2 and not re.search(r'^\S+$', l):
                die("Second line of FASTQ reads file contains whitespace in sequence\n{}\n{}".format(
                    mes, warning))
            if i == 3 and not re.search(r'^\+', l):
                die("Third line of FASTQ reads file does not start with a '+' character.\n{}\n{}".format(mes, warning))
            if i == 4:
                if not re.search(r'^\S+$', l):
                    die("Fourth line of FASTQ reads file contains whitespace\n{}\n{}".format(
                        mes, warning))
                break
        # Fixed: this branch never closed its handle.
        IN.close()
def parse_file_arf(file_arf, options):
    """Filter an arf alignment file according to *options*, printing the
    surviving lines (or tallying edit counts per query when the global
    gscan flag is set).

    Options honoured: -j trim trailing mismatching nts; -a max edits;
    -b min read length; -c max read length; -d/-e include/exclude query
    lists; -f/-g include/exclude db lists; -h/-i select output mode.
    """
    global running, gscan, hash_edits
    FILE_ARF = open_or_die(file_arf, 'rb',
                           'can not open {}\n'.format(file_arf))
    while True:
        line = FILE_ARF.readline()
        if not line:
            break
        m = re.match(
            r'^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
            line)
        if m:
            m = m.groups()
            query = m[0]
            query_map_lng = int(m[1])
            query_beg = m[2]    # kept as string; only echoed back out
            query_end = int(m[3])
            query_seq = m[4]
            db = m[5]
            db_map_lng = int(m[6])
            db_beg = m[7]       # kept as string; only echoed back out
            db_end = int(m[8])
            db_seq = m[9]
            strand = m[10]
            edits = int(m[11])
            edit_string = m[12]
            running += 1
            if options.get('-j') == '':
                # trim trailing nucleotides that do not match (-j flag)
                (query_map_lng, query_end, query_seq, db_map_lng, db_end,
                 db_seq, edits, edit_string) = remove_trailing_nts(
                    query_map_lng, query_end, query_seq, db_map_lng, db_end,
                    db_seq, edits, edit_string)
            # filter cascade: any failed test skips this alignment line
            if '-a' in options.keys() and int(options.get('-a')) < edits:
                continue
            if options.get('-b') and query_map_lng < int(options.get('-b')):
                continue
            if options.get('-c') and int(options.get('-c')) < query_map_lng:
                continue
            if options.get('-d') and query not in hash_queries_incl.keys():
                continue
            if options.get('-e') and query in hash_queries_excl.keys():
                continue
            if options.get('-f') and db not in hash_dbs_incl.keys():
                continue
            if options.get('-g') and db in hash_dbs_excl.keys():
                continue
            # NOTE(review): '-i' is tested for truthiness while the flag-like
            # options above use == '' — if -i is a bare flag its getopt value
            # '' is falsy and can never satisfy this test; confirm intent.
            if not (options.get('-h') == '' or options.get('-i')):
                # passthrough mode: print the (possibly trimmed) line
                pprint('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.
                       format(query, query_map_lng, query_beg, query_end,
                              query_seq, db, db_map_lng, db_beg, db_end,
                              db_seq, strand, edits, edit_string))
                continue
            if gscan:
                # genome-scan mode: only tally edit-distance counts per query
                create_hash_key_chain(hash_edits, 0, query, edits)
                hash_edits[query][edits] += 1
            else:
                evaluation = evaluate_query(query, edits, options)
                if evaluation:
                    pprint(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".
                        format(query, query_map_lng, query_beg, query_end,
                               query_seq, db, db_map_lng, db_beg, db_end,
                               db_seq, strand, edits, edit_string))
# --- script body: set up output files and excise potential precursors ---
# positional arguments were parsed earlier in this script
file_output = args.file_output
coord_file = args.coord_file
pres_max = args.pres_max

# only the bare -b flag is recognised past the positional arguments
opts, argss = getopt.getopt(sys.argv[6:], 'b')
options = dict(opts)

# pres_max must look like an (optionally signed) integer
if not re.search(r'^[-]*\d+', pres_max):
    print_stderr('{} is not an integer number\n'.format(pres_max))
    sys.exit(-1)

# initialise per-threshold counters (dblimit / thres_counts / upper_bound
# are defined elsewhere in this script)
for z in range(1, upper_bound):
    dblimit[z] = 0
    thres_counts[z] = 0

TMP1 = open_or_die('{}_all'.format(file_output), 'w+',
                   'cannot create file {}'.format(file_output))
TMP2 = open_or_die('{}_all'.format(coord_file), 'w+',
                   'cannot create file {}'.format(coord_file))

# -b enables progress messages on stderr
if options.get('-b') == '':
    print_stderr('finding lengths of genome contigs\n')
parse_file_arf(file_arf)

if options.get('-b') == '':
    print_stderr(
        'reading the genome into memory and excising potential precursors\n'
    )
parse_genome_and_excise(TMP1, TMP2, file_fasta)
if __name__ == '__main__':
    # create a log file for mapper.py; the latest run of mapper will be on
    # top of the log file
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(-1)

    # rotate the previous log out of the way (or seed the backup file)
    if os.path.exists('mapper.log'):
        os.system('mv mapper.log mapper.log_bak')
    else:
        os.system('touch mapper.log_bak')

    MAP = open_or_die('mapper.log_tmp', 'w+',
                      'could not create mapper.log_tmp\n')

    # cdir = os.path.dirname(os.path.realpath(__file__))
    cdir = os.path.abspath('.')
    MAP.write('current dir:\t{}\n'.format(cdir))
    MAP.write('mapper command:\t{} {}\n'.format(
        sys.argv[0], ' '.join(sys.argv[1:])))

    parser = argparse.ArgumentParser(usage=usage)
    parser.add_argument('input_file_reads', help='input file')
    # only argv[1] is parsed here; the remaining options are handled later
    args = parser.parse_args(sys.argv[1:2])
    file_reads = args.input_file_reads

    if not os.path.exists(file_reads):
        print('No config or reads file could be found')
print i except KeyError: pass while defined and rhash[rn]: rn = (2 * int(random.randrange(lines / 2))) defined = False try: rhash[rn] defined = True except KeyError: pass rhash[rn] = 1 IN = open_or_die(in_file, 'rb', 'can not open {}'.format(in_file)) while True: l = IN.readline().strip() if not l: break m = re.match(r'^\>(.+)$', l) if m: m = m.groups() counter += 1 _id = m[0] if re.search(r'\s+', _id): die('Error in line {}: The identifier\n {}\n contains white spaces\n\nPlease make sure that none of the identifiers contain whitepaces.\nYou could run remove_white_space_in_id.py {} > newfile\nThis will remove everything from the id line after the first whitespace'.format( Nicenumber(counter), l, in_file,
def parse_file_arf(file_arf):
    '''
    read through the signature blastparsed file, fills up a hash with
    information on queries (deep sequences) mapping to the current db
    (potential precursor) and resolve each potential precursor in turn
    '''
    global db_old, hash_query
    FILENAME = open_or_die(file_arf, 'rb',
                           'could not open file {}\n'.format(file_arf))
    while True:
        line = FILENAME.readline()
        if not line:
            break
        m = re.match(
            r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
            line)
        if m:
            m = m.groups()
            query = m[0]
            query_lng = int(m[1])     # unpacked but unused below
            query_beg = int(m[2])     # unpacked but unused below
            query_end = int(m[3])     # unpacked but unused below
            query_seq = m[4]          # unpacked but unused below
            db = m[5]
            db_lng = int(m[6])        # unpacked but unused below
            db_beg = int(m[7])
            db_end = int(m[8])
            db_seq = m[9]             # unpacked but unused below
            strand = m[10]
            edits = int(m[11])        # unpacked but unused below
            edit_string = m[12]       # unpacked but unused below
            # only reads that map sense to the potential precursor are
            # considered
            if strand == "-":
                continue
            # if the new line concerns a new db (potential precursor) then the
            # old db must be resolved
            if db_old and db_old != db:
                # print(db, db_old)
                resolve_potential_precursor()
            # resolve the number of reads that the deep sequence represents
            freq = find_freq(query)
            # read information of the query (deep sequence) into hash
            create_hash_key_chain(hash_query, db_beg, query, 'db_beg')
            create_hash_key_chain(hash_query, db_end, query, 'db_end')
            create_hash_key_chain(hash_query, strand, query, 'strand')
            create_hash_key_chain(hash_query, freq, query, 'freq')
            # direct assignments overwrite the chain-created defaults above
            hash_query[query]["db_beg"] = db_beg
            hash_query[query]["db_end"] = db_end
            hash_query[query]["strand"] = strand
            hash_query[query]["freq"] = freq
            db_old = db
    # NOTE(review): FILENAME is never closed here — relies on interpreter
    # cleanup; resolve the final precursor collected at EOF.
    resolve_potential_precursor()