def run_mapper(reads_file, clip_seq, fa_prefix, collapsed_file, mapping_file):
    '''
    Invoke mapper.py through the shell and abort when it reports failure.

    The command line is echoed to stderr before it is executed.  Any
    non-zero status returned by os.system is handed to die().
    '''
    template = 'mapper.py {} -c -j -k {} -l 18 -m -p {} -s {} -t {} -v'
    command = template.format(
        reads_file, clip_seq, fa_prefix, collapsed_file, mapping_file)
    print_stderr(command, '\n')

    status = os.system(command)
    if status:
        die('Run mapper.py failed.\n')
def compute_randfold():
    '''
    Compute randfold p-values for the precursors selected for randfold.

    Does nothing when option -c is present as a bare flag.  Otherwise three
    shell commands are run in sequence (select ids, extract fasta, run
    randfold); each is echoed to stderr and bracketed by start()/end().
    '''
    global options, dir_tmp

    if options.get('-c') == '':
        return

    # only the subset of precursors which are plausible Dicer substrates
    # is passed on to randfold
    banner = "#computing randfold p-values\n"
    pprint(banner)
    print_stderr(banner)

    select_cmd = "select_for_randfold.py {}/signature.arf {}/precursors.str > {}/precursors_for_randfold.ids\n\n".format(
        dir_tmp, dir_tmp, dir_tmp)
    print_stderr(select_cmd)
    start()
    os.system(select_cmd)
    end()

    start()
    fasta_cmd = "fastaselect.py {}/precursors.fa {}/precursors_for_randfold.ids > {}/precursors_for_randfold.fa\n\n".format(
        dir_tmp, dir_tmp, dir_tmp)
    print_stderr(fasta_cmd)
    os.system(fasta_cmd)
    end()

    start()
    randfold_cmd = "randfold -s {}/precursors_for_randfold.fa 99 > {}/precursors_for_randfold.rand\n\n".format(
        dir_tmp, dir_tmp)
    print_stderr(randfold_cmd)
    os.system(randfold_cmd)
    end()
def make_dir_tmp():
    '''
    Create the per-run output directory and its tmp sub-directory.

    Sets the module globals _dir ("moR_runs/run_<ltime>") and dir_tmp
    ("<_dir>/tmp").  The parent "moR_runs" directory is created on demand.
    '''
    global _dir, ltime, dir_tmp

    runs_root = 'moR_runs'
    if not os.path.isdir(runs_root):
        os.mkdir(runs_root)

    _dir = "moR_runs/run_{}".format(ltime)
    print_stderr("mkdir {}\n\n".format(_dir))
    os.mkdir(_dir)

    dir_tmp = "{}/tmp".format(_dir)
    os.mkdir(dir_tmp)
def parse_file_arf(file_arf):
    '''
    Read an ARF mapping file and register every mapping via insertfeature().

    For each line with at least 13 whitespace-separated columns the read
    frequency is looked up from the query id and the database interval
    (db, strand, begin, end) is passed to insertfeature().  The module
    global count_lines is incremented once per accepted line.

    Exits with status -1 when the file cannot be opened.
    '''
    global count_lines

    lines = int(os.popen('cat {} | wc -l'.format(file_arf)).read().strip())
    if options.get('-b') == '':
        print_stderr(
            'reading the mapping file into memory, total lines={}\n'.format(
                lines))

    try:
        FILENAME = open(file_arf, 'rb')
    except IOError:
        print('Could not open file {}'.format(file_arf))
        sys.exit(-1)

    # 13 whitespace-separated columns: query, query_map_lng, query_beg,
    # query_end, query_seq, db, db_map_lng, db_beg, db_end, db_seq, strand,
    # edits, edit_string
    arf_record = re.compile(r'^(\S+)' + r'\s+(\S+)' * 12)

    try:
        # The file must be consumed line by line: the earlier read() call
        # returned the whole file at once, so only the first record matched.
        for line in FILENAME:
            m = arf_record.match(line)
            if not m:
                continue

            fields = m.groups()
            query = fields[0]
            db = fields[5]
            db_beg = int(fields[7])
            db_end = int(fields[8])
            strand = fields[10]

            # NOTE(review): the previous code called find_req(), which looks
            # like a typo for find_freq() used by the sibling ARF parser;
            # confirm the helper name defined in this module.
            freq = find_freq(query)

            insertfeature(db, strand, db_beg, db_end, freq)
            count_lines += 1
    finally:
        FILENAME.close()
def map_reads(file_reads_latest, MAP, options):
    '''
    Map reads to the genome index with bowtie and post-process the result.

    Steps (each command is logged to MAP before it runs):
      1. bowtie against the index given by option -p,
      2. convert_bowtie_output.py -> <_dir>/mappings.arf,
      3. parse_mappings.py -j     -> <_dir>/mappings_trim.arf.

    When option -t names an output file the trimmed mappings are copied
    there with cat_to().

    Returns the path of the trimmed ARF file.
    '''
    global mismatches_seed, threads, orig_file_reads

    verbose = options.get('-v') == ''

    # map reads to genome
    MAP.write('mapping reads to genome index\n')
    if verbose:
        print_stderr('mapping reads to genome index\n')

    file_genome_latest = options.get('-p')

    # -r overrides the default limit on the number of mapping locations
    mapping_loc = options.get('-r', 5)

    cmd = 'bowtie -p {} -f -n {} -e 80 -l 18 -a -m {} --best --strata {} --al {}/{}_mapped --un {}/{}_not_mapped {} {}/mappings.bwt 2>bowtie.log\n\n'.format(
        threads, mismatches_seed, mapping_loc, file_genome_latest, _dir,
        orig_file_reads, _dir, orig_file_reads, file_reads_latest, _dir)
    MAP.write(cmd)
    os.system(cmd.strip())
    file_mapping_latest = '{}/mappings.bwt'.format(_dir)

    cmd = 'convert_bowtie_output.py {} > {}/mappings.arf\n'.format(
        file_mapping_latest, _dir)
    MAP.write(cmd)
    os.system(cmd.strip())
    file_mapping_latest = '{}/mappings.arf'.format(_dir)

    # trim unmapped nts in the 3' end
    MAP.write("trimming unmapped nts in the 3' ends\n")
    if verbose:
        print_stderr("trimming unmapped nts in the 3' ends\n")

    cmd = 'parse_mappings.py {} -j > {}/mappings_trim.arf\n\n'.format(
        file_mapping_latest, _dir)
    MAP.write(cmd)
    os.system(cmd.strip())
    file_mapping_latest = '{}/mappings_trim.arf'.format(_dir)

    # Writing the mappings must depend on -t (the output file), not on the
    # verbose flag: previously a run with -t but without -v produced no
    # output file, and -v without -t passed None as the destination.
    mapping_output = options.get('-t')
    if mapping_output:
        cat_to(file_mapping_latest, mapping_output)

    return file_mapping_latest
def parse_file_ids(_file, _hash):
    '''
    Read an id file into a dictionary.

    The first whitespace-delimited token of every line that starts with a
    non-blank character becomes a key of _hash with value 1.  _hash is
    modified in place; nothing is returned.
    '''
    # read id file into hash
    if options.get('-k') == '':
        print_stderr('reading id file into memory\n')

    FILE = open_or_die(_file, 'rb', 'can not open {}\n'.format(_file))
    try:
        while True:
            line = FILE.readline()
            if not line:
                break

            m = re.match(r'^(\S+)', line)
            if m:
                _hash[m.group(1)] = 1
    finally:
        # the handle was previously left open
        FILE.close()
def scan(file_arf, options):
    '''
    First pass over the mappings: parse the ARF file in scan mode and then
    resolve the best mappings per read with fill_hash().

    The module global gscan is set to 1 while parse_file_arf() runs and back
    to 0 afterwards; running is reset to 0 at the end.
    '''
    global gscan, running

    def verbose():
        return options.get('-k') == ''

    if verbose():
        total = os.popen('cat {} | wc -l'.format(file_arf)).read().strip()
        print_stderr('scanning mappings, total={}\n'.format(total))

    gscan = 1
    parse_file_arf(file_arf, options)
    gscan = 0

    if verbose():
        print_stderr('resolving best mappings for each read\n')

    fill_hash()
    running = 0
def core_algorithm():
    '''
    Run core_algorithm.py on the signature and precursor structures.

    Output goes to <_dir>/output.mrd.  When that file turns out to be empty
    the command is re-run with -t into error.output.mrd_<ltime> and the
    program terminates through die().
    '''
    global _dir, dir_tmp, file_mature_ref_other_species, ltime

    banner = "#running moRNA Finder core algorithm\n"
    pprint(banner)
    print_stderr(banner)

    has_this_species = not re.search(
        'none', file_mature_ref_this_species, re.IGNORECASE)
    if has_this_species:
        longest_id = get_longest_id("{}/{}".format(
            dir_tmp, file_mature_ref_this_species))
    else:
        longest_id = 40

    start()

    if re.search('none', file_mature_ref_other_species, re.IGNORECASE):
        line = "core_algorithm.py {}/signature.arf {}/precursors.str -v -50 -l {}".format(
            dir_tmp, dir_tmp, longest_id)
    else:
        line = "core_algorithm.py {}/signature.arf {}/precursors.str -s {}/{} -v -50 -l {}".format(
            dir_tmp, dir_tmp, dir_tmp, file_mature_ref_other_species,
            longest_id)

    # randfold results are only passed on when -c is not set as a flag
    if options.get('-c') != '':
        line += " -y {}/precursors_for_randfold.rand".format(dir_tmp)

    cmd = "{} > {}/output.mrd\n".format(line, _dir)
    print_stderr(cmd)
    os.system(cmd)

    if options.get('-E'):
        os.system('{} -t > {}/error.output.mrd'.format(line, _dir))

    end()

    # check if file is empty
    fname = "{}/output.mrd".format(_dir)
    if file_s(fname):
        return

    print_stderr("Error:\n\tFile {} is empty\n\n".format(fname))
    print_stderr(
        "Now running core_algorithm.py with option -t to see why all precursors were discarded\n"
    )
    os.system('{} -t > error.output.mrd_{}'.format(line, ltime))
    print_stderr(
        "The debug file is called error.output.mrd_{}\n".format(ltime))
    die("\nExiting now\n\n")
def parse_mappings():
    '''
    Filter the genome mappings with parse_mappings.py.

    Retains only perfect mappings of reads 18 nt <= length <= 25 nt that
    map perfectly to five loci or less; the result is written to
    <dir_tmp>/<parsed_arf>_parsed.arf.  Always returns 0.
    '''
    global file_reads_vs_genome, parsed_arf, dir_tmp

    banner = "#parsing genome mappings\n"
    pprint(banner)
    print_stderr(banner)

    command = "parse_mappings.py {} -a 0 -b 18 -c 25 -i 5 > {}/{}_parsed.arf\n\n".format(
        file_reads_vs_genome, dir_tmp, parsed_arf)
    print_stderr(command)

    start()
    os.popen(command).read()
    end()

    return 0
def parse_file_arf(file_arf):
    '''
    Read an ARF mapping file and register every mapping via insertfeature().

    For each line with at least 13 whitespace-separated columns the read
    frequency is looked up with find_freq() and the database interval
    (db, strand, begin, end) is inserted into the position hash.  The
    module global count_lines is incremented once per accepted line.
    '''
    global count_lines, hash_pos

    lines = int(os.popen('cat {} | wc -l'.format(file_arf)).read().strip())
    if options.get('-b') == '':
        # The message used to contain a literal Perl-style "$lines", so the
        # line count was never shown.
        print_stderr(
            'reading the mapping file into memory, total lines={}\n'.format(
                lines))

    FILENAME = open_or_die(file_arf, 'rb',
                           'Could not open file {}'.format(file_arf))

    # 13 whitespace-separated columns: query, query_map_lng, query_beg,
    # query_end, query_seq, db, db_map_lng, db_beg, db_end, db_seq, strand,
    # edits, edit_string
    arf_record = re.compile(r'^(\S+)' + r'\s+(\S+)' * 12)

    try:
        while True:
            line = FILENAME.readline()
            if not line:
                break

            m = arf_record.match(line)
            if not m:
                continue

            fields = m.groups()
            query = fields[0]
            db = fields[5]
            db_beg = int(fields[7])
            db_end = int(fields[8])
            strand = fields[10]

            freq = find_freq(query)

            # read into position hash
            insertfeature(db, strand, db_beg, db_end, freq)
            count_lines += 1
    finally:
        FILENAME.close()
def excise_struct(struct, beg, end, strand):
    '''
    Return the part of struct between the 1-based positions beg and end.

    Returns 0 when beg lies beyond the end of struct.  Reports to stderr and
    exits the program when beg is greater than end.  The strand argument is
    accepted but not used.
    '''
    global db_old

    struct_length = len(struct)

    # begin can be equal to end if only one nucleotide is excised
    if beg > end:
        print_stderr(
            'begin can not be greater than end for {}\n'.format(db_old))
        sys.exit(0)

    # rarely, permuted combinations of signature and structure cause out of
    # bound excision errors (appr. once every two thousand combinations)
    if beg > struct_length:
        return 0

    # the blast parsed format is 1-indexed, substr is 0-indexed
    offset = beg - 1
    span = end - beg + 1
    return substr(struct, offset, span)
def resolve_entry_file_mrd_permuted(score, permutation, refs, _hash, options):
    '''
    Count one permuted-control entry in _hash.

    The entry is binned by floor(score) under _hash['total'][permutation]
    and additionally under 'known' (refs non-empty) or 'novel' (refs empty).
    refs is emptied in place afterwards.  Exits the program when permutation
    is None, i.e. the control file carried no permutation line.
    '''
    if permutation is None:
        print_stderr('The {} file is not properly formatted.\nMaybe it does not contain the lines with \"permutation int\"?\n'.format(
            options.get('-a')
        ))
        sys.exit(0)

    floor = int(math.floor(score))

    create_hash_key_chain(_hash, 0, 'total', permutation, floor)
    _hash['total'][permutation][floor] += 1

    category = 'known' if refs else 'novel'
    create_hash_key_chain(_hash, 0, category, permutation, floor)
    _hash[category][permutation][floor] += 1

    # empty the caller's list in place
    while refs:
        refs.pop()
def handle_config_file(file_reads, MAP, options):
    '''
    Process every reads file listed in a config file.

    Each usable config line holds two whitespace-separated tokens, a reads
    file and a prefix; the shorter token is taken to be the prefix.  The
    entry is format-checked against the format options given (-a/-b/-c/-e)
    and then handed to handle_one_file().
    '''
    FILE = open_or_die(
        file_reads, 'rb', 'can not open {}\n'.format(file_reads))

    entry = FILE.readline()
    while entry:
        parsed = re.match(r'(^\S+)\s+(\S+)\s*.*$', entry)
        if parsed:
            reads_path, prefix = parsed.groups()
            if len(reads_path) < len(prefix):
                reads_path, prefix = prefix, reads_path

            test_prefix(prefix)

            notice = "\nhandling file '{}' with prefix '{}'\n".format(
                reads_path, prefix)
            MAP.write(notice)

            # check if files in config file are in accordance with option
            # specified
            for flag in ('a', 'b', 'c', 'e'):
                if options.get('-' + flag) == '':
                    check_file_format_and_option(reads_path, flag)

            if options.get('-v') == '':
                print_stderr(notice)

            handle_one_file(reads_path, prefix, MAP, options)

        entry = FILE.readline()

    FILE.close()
def perform_controls():
    '''
    Run the permuted controls.

    The core_algorithm.py command line is written to <dir_tmp>/command_line
    and perform_controls.py is then run with it for 100 rounds, producing
    <dir_tmp>/output_permuted.mrd.
    '''
    global dir_tmp, _dir, file_mature_ref_other_species, ltime

    banner = "#running permuted controls\n"
    pprint(banner)
    print_stderr(banner)

    start()

    if re.search('none', file_mature_ref_other_species, re.IGNORECASE):
        line = "core_algorithm.py {}/signature.arf {}/precursors.str -v -50".format(
            dir_tmp, dir_tmp)
    else:
        line = "core_algorithm.py {}/signature.arf {}/precursors.str -s {}/{} -v -50".format(
            dir_tmp, dir_tmp, dir_tmp, file_mature_ref_other_species)

    if options.get('-c') != '':
        line += " -y {}/precursors_for_randfold.rand".format(dir_tmp)

    write_cmd = "echo '{} > {}/output.mrd' > {}/command_line\n\n".format(
        line, _dir, dir_tmp)
    print_stderr(write_cmd)
    os.system(write_cmd)

    controls_cmd = "perform_controls.py {}/command_line {}/precursors.str 100 -a > {}/output_permuted.mrd 2>>error_{}.log\n\n".format(
        dir_tmp, dir_tmp, dir_tmp, ltime)
    print_stderr(controls_cmd)
    os.system(controls_cmd)

    end()
def excise_precursors():
    '''
    Excise potential precursors from the genome.

    With option -a the fixed-threshold excise_precursors.py is used.
    Otherwise the iterative script runs and stack_height_min is read back
    from the first line of <dir_tmp>/precursors.fa_stack.  Terminates via
    die() when no precursor fasta was produced.  Returns 0.
    '''
    global file_genome, parsed_arf, dir_tmp, stack_height_min, max_pres

    banner = "#excising precursors\n"
    pprint(banner)
    print_stderr(banner)

    start()

    if options.get('-a'):
        command = "excise_precursors.py {} {}/{}_parsed.arf {}/precursors.coords -a {} > {}/precursors.fa\n\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, stack_height_min,
            dir_tmp)
        print_stderr(command)
        os.popen(command).read()
    else:
        command = "excise_precursors_iterative_final.py {} {}/{}_parsed.arf {}/precursors.fa {}/precursors.coords {}\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, dir_tmp, max_pres)
        print_stderr(command)
        os.popen(command).read()

        stack_file = open_or_die2(
            '{}/precursors.fa_stack'.format(dir_tmp), 'rb')
        stack_height_min = stack_file.readline().strip()
        stack_file.close()

    end()

    precursors_fa = '{}/precursors.fa'.format(dir_tmp)
    # empty or not a regular plain file
    if not (file_s(precursors_fa) and os.path.isfile(precursors_fa)):
        die("No precursors excised\n")

    return 0
def fold_precursors():
    '''
    Predict RNA secondary structures of the precursors with RNAfold.

    RNAfold is first called with "-noPS"; when that fails it is retried
    with "--noPS".  A failure of the retry ends the program via die().
    '''
    global dir_tmp, ltime

    banner = "#folding precursors\n"
    pprint(banner)
    print_stderr(banner)
    print_stderr(
        "RNAfold < {}/precursors.fa -noPS > {}/precursors.str\n\n".format(
            dir_tmp, dir_tmp))

    start()

    status = os.system(
        "RNAfold < {}/precursors.fa -noPS > {}/precursors.str 2>>error_{}.log".
        format(dir_tmp, dir_tmp, ltime))

    if status:
        # retry with the double-dash spelling of the option
        status = os.system(
            "RNAfold < {}/precursors.fa --noPS > {}/precursors.str".format(
                dir_tmp, dir_tmp))
        if status:
            die("Some RNAfold error occurred. Error {}\n".format(status))

    end()
def parse_file_fasta_seqkey(file_fasta, hsh, options):
    '''
    Collapse a fasta file into hsh, keyed by sequence.

    For every record the count returned by find_cnt() for the record id is
    added to hsh[<sequence>].  Sequences are passed through tr() with the
    character lists '[acgtun.]' -> '[ACGTTNN]' before being used as keys
    (presumably a Perl-style transliteration; confirm against tr()).

    hsh is modified in place; nothing is returned.  With option -a set as a
    flag, progress is written to stderr.
    '''
    if options.get('-a') == '':
        print_stderr('reading file into hash\n')
    _id = ''
    seq = ''
    running_1 = 0
    FASTA = open_or_die2(file_fasta, 'rb')
    # Outer loop: advance to a header line.  A blank line or EOF ends it.
    while True:
        l = FASTA.readline().strip()
        if not l:
            break
        m = re.match(r'^>(\S+)', l)
        if m:
            # group() is the whole match, i.e. it includes the leading '>'
            _id = m.group()
            seq = ''
            # Inner loop: accumulate sequence lines; every further header
            # flushes the record collected so far.
            while True:
                ll = FASTA.readline().strip()
                if not ll:
                    break
                mm = re.match(r'^>(\S+)', ll)
                if mm:
                    cnt = find_cnt(_id)
                    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
                    # ATTR: Performance issue below:
                    # create_hash_key_chain(hsh, 0, seq)
                    try:
                        hsh[seq] = (hsh[seq]) + cnt
                    except KeyError:
                        hsh[seq] = cnt
                    running_1 += 1
                    if options.get('-a') == '':
                        print_stderr('{}\r'.format(running_1))
                    _id = mm.group()
                    seq = ''
                    continue
                seq += ll
    # Flush the last record, which no following header has closed.
    # NOTE(review): for an empty input this still runs with _id == '' and
    # seq == ''; behaviour then depends on find_cnt('') - confirm.
    cnt = find_cnt(_id)
    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
    create_hash_key_chain(hsh, 0, seq)
    hsh[seq] += cnt
    running_1 += 1
    if options.get('-a') == '':
        print_stderr('{}\r'.format(running_1))
    FASTA.close()
def prepare_signature():
    '''
    Build <dir_tmp>/signature.arf with prepare_signature.py.

    When a mature reference for this species is available it is passed with
    -a and the command is echoed to stderr; otherwise the command runs
    without -a and without being echoed.  Always returns 0.
    '''
    global file_reads, dir_tmp, read_align_mismatches, file_mature_ref_this_species, ltime

    banner = "#preparing signature\n"
    pprint(banner)
    print_stderr(banner)

    without_reference = re.search(
        'none', file_mature_ref_this_species, re.IGNORECASE)

    if without_reference:
        command = "prepare_signature.py {} {}/precursors.fa {} -o {}/signature.arf 2>>error_{}.log\n\n".format(
            file_reads, dir_tmp, read_align_mismatches, dir_tmp, ltime)
    else:
        command = "prepare_signature.py {} {}/precursors.fa {} -a {}/{} -o {}/signature.arf 2>>error_{}.log\n\n".format(
            file_reads, dir_tmp, read_align_mismatches, dir_tmp,
            file_mature_ref_this_species, dir_tmp, ltime)
        print_stderr(command)

    start()
    os.popen(command).read()
    end()

    return 0
def excise_seq(seq, beg, end, strand):
    '''
    Excise the sub sequence between the 1-based positions beg and end.

    For strand "-" the excised piece is passed through revcom() before it
    is returned.  Returns 0 when beg lies beyond the end of seq; reports to
    stderr and exits the program when beg is greater than end.
    '''
    global db_old

    # begin can be equal to end if only one nucleotide is excised
    if beg > end:
        print_stderr('begin can not greater than end for {}\n'.format(db_old))
        sys.exit(0)

    # rarely, permuted combinations of signature and structure cause out of
    # bound excision errors (appr. once every two thousand combinations)
    if beg > len(seq):
        return 0

    # the blast parsed format is 1-indexed, substr is 0-indexed
    offset = beg - 1
    span = end - beg + 1
    piece = substr(seq, offset, span)

    # if on the minus strand, the reverse complement should be returned
    return revcom(piece) if strand == "-" else piece
def _rna2dna_into_tmp(source_file):
    '''
    Run rna2dna.py on source_file, writing the result into dir_tmp.

    Returns the bare file name (stem + extension, no directory) under which
    the converted copy was stored.
    '''
    # Raw string: '\.' in a normal literal is an invalid escape sequence.
    (stem, _path, extension) = fileparse(source_file, r'\..*')

    cmd = "rna2dna.py {} > {}/{}{}\n\n".format(
        source_file, dir_tmp, stem, extension)
    print_stderr(cmd)
    os.popen(cmd).read()

    return '{}{}'.format(stem, extension)


def rna2dna():
    '''
    Convert the optional miRNA input files from RNA to DNA alphabet.

    Each of the three inputs (mature reference of this species, mature
    reference of other species, precursors) is skipped when its name
    contains "none" (case-insensitive).  Converted copies are stored in
    dir_tmp and the corresponding module global is replaced by the bare
    name of that copy.  Always returns 0.
    '''
    global dir_tmp, file_mature_ref_other_species, file_mature_ref_this_species, file_precursors

    # process_input mirna files
    if not re.search('none', file_mature_ref_this_species, re.IGNORECASE):
        start()
        file_mature_ref_this_species = _rna2dna_into_tmp(
            file_mature_ref_this_species)

    # NOTE(review): start() is only called in the branch above while end()
    # is called in the two branches below; the pairing is kept exactly as
    # it was - confirm it is intended.
    if not re.search('none', file_mature_ref_other_species, re.IGNORECASE):
        file_mature_ref_other_species = _rna2dna_into_tmp(
            file_mature_ref_other_species)
        end()

    if not re.search('none', file_precursors, re.IGNORECASE):
        file_precursors = _rna2dna_into_tmp(file_precursors)
        end()

    return 0
def print_hash_seqkey(hsh):
    '''
    Print the collapsed reads in fasta format to stdout.

    Keys are ordered through hash_sort_key() by descending count, then by
    key.  Each header reads ">{prefix}_{running}_x{count}", where the
    running number grows by the count of every record printed.
    '''
    verbose = options.get('-a') == ''

    if verbose:
        print_stderr('sorting hash\n')

    running_2 = 0

    if verbose:
        print_stderr('printing hash\n')

    ordered = hash_sort_key(hsh, lambda item: (-item[1], item[0]))

    for sequence in ordered:
        copies = hsh[sequence]
        print('>{}_{}_x{}\n{}'.format(prefix, running_2, copies, sequence))
        running_2 += copies

        if verbose:
            print_stderr('{}\r'.format(running_2))
def resolve(options, _id, seq):
    '''
    Route one fasta record to stdout (kept) or stderr (discarded).

    A record is discarded when it is shorter than the -a minimum length, or
    when -b is set as a flag and the sequence contains letters other than
    a, c, g, t, u, n (case-insensitive).  The module global running is
    incremented for every call.
    '''
    global running

    running += 1
    if options.get('-s') == '':
        print_stderr('{}\r'.format(running))

    min_length = options.get('-a')
    too_short = bool(min_length) and len(seq) < int(min_length)

    if too_short or (
            options.get('-b') == ''
            and not re.match(r'^(a|c|g|t|u|n)+$', seq, re.IGNORECASE)):
        print_stderr('>{}\n{}\n'.format(_id, seq))
        return

    print('>{}'.format(_id))
    print(seq)
def make_survey():
    '''
    Run survey.py to produce <_dir>/survey.csv.

    The mature reference (-b) and signature (-c) are only passed to
    survey.py when a mature reference for this species is available.
    '''
    # get overview of the output:
    global _dir, dir_tmp, file_mature_ref_this_species, stack_height_min

    banner = "#doing survey of accuracy\n"
    pprint(banner)
    print_stderr(banner)

    if re.search('none', file_mature_ref_this_species, re.IGNORECASE):
        command = "survey.py {}/output.mrd -a {}/output_permuted.mrd -d {} > {}/survey.csv\n\n".format(
            _dir, dir_tmp, stack_height_min, _dir)
    else:
        command = "survey.py {}/output.mrd -a {}/output_permuted.mrd -b {}/{} -c {}/signature.arf -d {} > {}/survey.csv\n\n".format(
            _dir, dir_tmp, dir_tmp, file_mature_ref_this_species, dir_tmp,
            stack_height_min, _dir)

    print_stderr(command)
    start()
    os.system(command)
    end()
# Remaining positional arguments of the script.
parser.add_argument('file_output', help=usage)
parser.add_argument('coord_file', help=usage)
parser.add_argument('pres_max', help=usage)
# Only argv[1:6] is given to argparse; everything after that is parsed by
# getopt below.
args = parser.parse_args(sys.argv[1:6])
file_fasta = args.file_fasta
file_arf = args.file_arf
file_output = args.file_output
coord_file = args.coord_file
pres_max = args.pres_max
# -b is a bare flag, so it shows up in `options` with the value ''.
opts, argss = getopt.getopt(sys.argv[6:], 'b')
options = dict(opts)
# NOTE(review): the pattern is not anchored at the end, so a value such as
# "12abc" passes this check - confirm whether that is acceptable.
if not re.search(r'^[-]*\d+', pres_max):
    print_stderr('{} is not an integer number\n'.format(pres_max))
    sys.exit(-1)
# Zero both tables for every index from 1 up to (excluding) upper_bound.
for z in range(1, upper_bound):
    dblimit[z] = 0
    thres_counts[z] = 0
# Both files are opened under the "<name>_all" path; the error messages
# mention the name without that suffix.
TMP1 = open_or_die('{}_all'.format(file_output), 'w+',
                   'cannot create file {}'.format(file_output))
TMP2 = open_or_die('{}_all'.format(coord_file), 'w+',
                   'cannot create file {}'.format(coord_file))
if options.get('-b') == '':
    print_stderr('finding lengths of genome contigs\n')
coord_file = args.coord_file

# -a takes a value (minimum read frequency), -b is a bare verbose flag and
# therefore appears in `options` with the value ''.
opts, argss = getopt.getopt(sys.argv[4:], 'a:b')
options = dict(opts)

try:
    PF = open(coord_file, 'w+')
except (IOError, OSError):
    # only file-system failures are expected here; a bare except would also
    # hide programming errors and KeyboardInterrupt
    print('cannot create file {}'.format(coord_file))
    sys.exit(-1)

if options.get('-a'):
    freq_min = int(options.get('-a'))

if options.get('-b') == '':
    print_stderr('finding lengths of genome contigs\n')

parse_file_arf(file_arf)

if options.get('-b') == '':
    print_stderr(
        'reading the genome into memory and excising potential precursors\n'
    )

parse_genome_and_excise(PF, file_fasta)

if options.get('-b') == '':
    print_stderr('potential precursors excised\n')

# file objects are closed through their own method; close(PF) is a Perl-ism
PF.close()
def _read_leading_lines(handle, limit=4):
    '''
    Return up to `limit` stripped lines from handle and close it.

    Reading stops early at end of file or at the first line that is empty
    after stripping.
    '''
    collected = []
    try:
        while len(collected) < limit:
            text = handle.readline().strip()
            if not text:
                break
            collected.append(text)
    finally:
        handle.close()
    return collected


def check_file_format_and_option(file_reads, aFormat):
    '''
    Check that the first lines of file_reads fit the declared format.

    aFormat selects the format: 'a' (seq.txt), 'b' (qseq.txt), 'c' (fasta)
    or 'e' (fastq).  A leading dash is tolerated, so both 'c' and '-c' work.
    At most the first four lines are inspected; the first violation ends the
    program through die().  Unknown formats are ignored.
    '''
    print_stderr('\n')

    # The message used to contain literal Perl variables ($format, $file).
    warning = (
        '\n\n***** Please check if the option you used (options {}) '
        'designates the correct format of the supplied reads file {} '
        '*****\n\n'.format(aFormat, file_reads)
    ) + '''
[options]
-a              input file is seq.txt format
-b              input file is qseq.txt format
-c              input file is fasta format
-e              input file is fastq format
-d              input file is a config file (see moRNA Finder documentation).
                options -a, -b, -c or -e must be given with option -d.
'''

    # Callers pass the bare letter; the fasta/fastq branches used to test
    # for '-c' / '-e' and were therefore never reached.
    fmt = aFormat.lstrip('-')

    if fmt == 'a':
        IN = open_or_die(
            file_reads, 'rb',
            'Cannot open file {} supplied by option -a\n'.format(file_reads))
        for l in _read_leading_lines(IN):
            if len(esplit(l)) != 5:
                die('The seq.txt file does not contain 5 columns. Please make sure to follow the _seq.txt file format conventions\n{}'.format(warning))

    elif fmt == 'b':
        IN = open_or_die(
            file_reads, 'rb',
            'Cannot open qseq.txt file {} supplied by option -b\n'.format(file_reads))
        for l in _read_leading_lines(IN):
            line = esplit(l)
            if len(line) != 11:
                die('The qseq.txt file does not contain 11 columns but {}. Please make sure to follow the qseq.txt file format conventions\n{}'.format(
                    len(line), warning))
            if not re.search(r'^\S+', line[9]):
                die('The sequence field in the qseq.txt file is invalid. Please make sure to follow the qseq.txt file format conventions\n{}'.format(warning))

    elif fmt == 'c':
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTA file supplied by option -c\n')
        mes = 'Please make sure your file is in accordance with the fasta format specifications and does not contain whitespace in IDs or sequences'
        # (pattern the line must match, message used when it does not)
        rules = (
            (r'^>\S+$',
             "First line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                 mes, warning)),
            (r'^\S+$',
             "Second line of FASTA reads file contains whitespace in sequence\n{}\n".format(
                 mes)),
            (r'^>\S+$',
             "Second ID line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                 mes, warning)),
            (r'^\S+$',
             "Second sequence line of FASTA reads file contains whitespace in sequence\n{}\n{}".format(
                 mes, warning)),
        )
        for l, (pattern, complaint) in zip(_read_leading_lines(IN), rules):
            if not re.search(pattern, l):
                die(complaint)

    elif fmt == 'e':
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTQ file supplied by option -e\n')
        mes = 'Please make sure your file is in accordance with the FASTQ format specifications'
        # Lines 2-4 used to be rejected when they DID match, i.e. every
        # well-formed FASTQ file failed; all four checks now die on a
        # mismatch.
        rules = (
            (r'^@\S+',
             "First line of FASTQ reads file is not in accordance with the fastq format specifications\n{}\n{}".format(
                 mes, warning)),
            (r'^\S+$',
             "Second line of FASTQ reads file contains whitespace in sequence\n{}\n{}".format(
                 mes, warning)),
            (r'^\+',
             "Third line of FASTQ reads file does not start with a '+' character.\n{}\n{}".format(
                 mes, warning)),
            (r'^\S+$',
             "Fourth line of FASTQ reads file contains whitespace\n{}\n{}".format(
                 mes, warning)),
        )
        for l, (pattern, complaint) in zip(_read_leading_lines(IN), rules):
            if not re.search(pattern, l):
                die(complaint)
def _tally_read_counts(path, missing_message):
    '''
    Sum the "_x<count>" suffixes of the distinct read ids found in path.

    Returns (total, per_prefix) where per_prefix maps the first three
    characters of the read id to the summed counts.  Every read id is
    counted only once, however often it occurs.
    '''
    seen = set()
    total = 0
    per_prefix = {}

    IN = open_or_die(path, 'rb', missing_message)
    try:
        while True:
            line = IN.readline()
            if not line:
                break

            m = re.match(r'^>*((\S\S\S)\S+_x(\d+))', line)
            if not m:
                continue

            read_id, prefix, copies = m.groups()
            if read_id in seen:
                continue
            seen.add(read_id)

            total += int(copies)
            per_prefix[prefix] = per_prefix.get(prefix, 0) + int(copies)
    finally:
        IN.close()

    return total, per_prefix


def _mapped_fractions(mapped, total):
    '''
    Return (mapped fraction, unmapped fraction); both 0.0 when total is 0.
    '''
    if not total:
        return 0.0, 0.0
    fraction = mapped / float(total)
    return fraction, 1 - fraction


def read_stats(options):
    '''
    Print mapping statistics to stderr.

    Read counts are taken from the fasta reads file (option -s) and from
    the mapping file (option -t).  One overall row and one row per
    three-letter read prefix are written: total, mapped, unmapped and the
    mapped / unmapped fractions.
    '''
    count, k2 = _tally_read_counts(
        options.get('-s'), 'No reads file in fasta format given\n')

    print_stderr('Mapping statistics\n')

    count2, k22 = _tally_read_counts(
        options.get('-t'), 'No mapping file given\n')

    print_stderr('\n#desc\ttotal\tmapped\tunmapped\t%mapped\t%unmapped\n')
    print_stderr("total: {}\t{}\t{}\t".format(count, count2, count - count2))
    # an empty reads file used to raise ZeroDivisionError here
    print_stderr("{0:.3f}\t{1:.3f}\n".format(
        *_mapped_fractions(count2, count)))

    for k in k2.keys():
        # a prefix without any mapped read used to raise KeyError
        mapped = k22.get(k, 0)
        print_stderr('{}: {}\t{}\t{}\t'.format(
            k, k2[k], mapped, k2[k] - mapped))
        print_stderr('{0:.3f}\t{1:.3f}\n'.format(
            *_mapped_fractions(mapped, k2[k])))
def process_reads(file_reads_latest, prefix, MAP):
    '''
    Run the read pre-processing steps selected by the module options.

    Possible steps, in order: conversion to fasta (-h, with -e for fastq),
    RNA to DNA (-i), removal of non-canonical letters (-j), 3' adapter
    clipping (-k <adapter>), removal of short reads (-l <length>),
    collapsing (-m) and copying of the result (-s <file>).  Every step
    writes into the run directory and is logged to MAP.

    Returns the path of the most recently produced reads file.
    '''
    global _dir, orig_file_reads

    def flag(name):
        # bare getopt flags are stored with an empty string as value
        return options.get(name) == ''

    def announce(message):
        MAP.write(message)
        if flag('-v'):
            print_stderr(message)

    orig_file_reads = file_reads_latest
    tail = re.search(r'([_\-.a-zA-Z0-9]+)$', file_reads_latest)
    if tail:
        orig_file_reads = tail.groups()[0]

    _dir = make_dir_tmp("_{}_{}".format(prefix, orig_file_reads), MAP)

    # parse solexa to fasta
    if flag('-h'):
        if flag('-e'):
            announce('parsing fastq to fasta format\n')
            cmd = 'fastq2fasta.py {} > {}/reads.fa\n'.format(
                file_reads_latest, _dir)
        else:
            announce('parsing Solexa / Illumina output to fasta format\n')
            line = 'illumina_to_fasta.py {}'.format(file_reads_latest)
            if flag('-b'):
                line += ' -a'
            cmd = '{} > {}/reads.fa\n'.format(line, _dir)
        MAP.write(cmd)
        os.system(cmd)
        file_reads_latest = '{}/reads.fa'.format(_dir)

    # RNA to DNA
    if flag('-i'):
        announce('converting rna to dna alphabet\n')
        os.system(
            'rna2dna.py {} > {}/reads_dna.fa'.format(file_reads_latest, _dir))
        file_reads_latest = '{}/reads_dna.fa'.format(_dir)

    # discard entries that contain non-canonical letters
    if flag('-j'):
        announce('discarding sequences with non-canonical letters\n')
        cmd = 'fastaparse.py {} -b > {}/reads_letters.fa 2>{}/reads_discarded.fa\n'.format(
            file_reads_latest, _dir, _dir)
        MAP.write(cmd)
        os.system(cmd.strip())
        file_reads_latest = '{}/reads_letters.fa'.format(_dir)

    # clip 3' adapters
    if options.get('-k'):
        announce("clipping 3' adapters\n")
        cmd = 'clip_adapters.py {} {} > {}/reads_clip.fa\n'.format(
            file_reads_latest, options.get('-k'), _dir)
        MAP.write(cmd)
        os.system(cmd.strip())
        file_reads_latest = '{}/reads_clip.fa'.format(_dir)

    if options.get('-l'):
        announce('discarding short reads\n')
        cmd = 'fastaparse.py {} -a {} > {}/reads_no_short.fa 2>{}/reads_too_short.fa\n'.format(
            file_reads_latest, options.get('-l'), _dir, _dir)
        MAP.write(cmd)
        os.system(cmd.strip())
        file_reads_latest = '{}/reads_no_short.fa'.format(_dir)

    # collapse reads
    if flag('-m'):
        announce('collapsing reads\n')
        cmd = 'collapse_reads_md.py {} {} > {}/reads_nr.fa\n'.format(
            file_reads_latest, prefix, _dir)
        MAP.write(cmd)
        os.system(cmd)
        file_reads_latest = '{}/reads_nr.fa'.format(_dir)

    # printing reads
    if options.get('-s'):
        cat_to(file_reads_latest, options.get('-s'))

    return file_reads_latest
check_options(options, file_reads)

if '-o' in options.keys():
    threads = options.get('-o')

# Determine the number of cores: /proc/cpuinfo first, then the two sysctl
# keys, falling back to 1 when no probe yields a positive integer.
cores = 1
for probe in ('grep -ic ^processor /proc/cpuinfo',
              'sysctl -n hw.physicalcpu',
              'sysctl -n hw.logicalcpu'):
    reported = os.popen(probe).read().strip()
    if re.search(r'^\d+$', reported) and int(reported) > 0:
        cores = int(reported)
        break

# Compare numerically.  The previous code compared the raw strings (so
# '10' sorted below '8') and could put a newline-terminated core count
# into the bowtie command line.
try:
    requested_threads = int(threads)
except (TypeError, ValueError):
    # leave a non-numeric value untouched, as before
    requested_threads = None

if requested_threads is not None and requested_threads > cores:
    print_stderr(
        'More threads specified than cores on the system. Reducing the number of threads to {}\n'.format(cores))
    threads = cores

if options.get('-q') == '':
    mismatches_seed = 1

prefix_global = 'seq'
if options.get('-g'):
    prefix_global = options.get('-g')

# Test for presence: a bare -d flag is stored as '' and is therefore falsy.
# handle_config_file needs the log handle and the options as well.
if '-d' in options:
    handle_config_file(file_reads, MAP, options)
else:
    handle_one_file(file_reads, prefix_global, MAP, options)
file_precursors = args.file_precursors
read_align_edit_distance = args.read_align_edit_distance

# -a and -o take a value, -b is a bare verbose flag (stored as '').
opts, argss = getopt.getopt(sys.argv[4:], "a:bo:")
options = dict(opts)

# int() works on Python 2 and 3 alike; long() only exists on Python 2.
ltime = int(time.time())
_dir = 'dir_prepare_signature{}'.format(ltime)

if '-o' not in options.keys() or options.get('-o') == '':
    die('no outfile specified with option -o\n')

outfile = options.get('-o')

if options.get('-b') == '':
    print_stderr('preparing signature file\n')

# working directory for the bowtie index and the raw mappings
os.mkdir(_dir)
shutil.copy(file_precursors, _dir)

if options.get('-b') == '':
    print_stderr('constructing index of precursors\n')

os.system('bowtie-build {} {}/precursors.ebwt > /dev/null'.format(
    file_precursors, _dir))

if options.get('-b') == '':
    print_stderr('mapping reads to precursors\n')

cmd = 'bowtie -f -v {} -a --best --strata --norc {}/precursors.ebwt {} {}/reads_vs_precursors.bwt 2> /dev/null\n'.format(
    read_align_edit_distance, _dir, file_reads, _dir)
print_stderr(cmd)
os.system(cmd)