def excise_precursors():
    '''
    Excise precursor candidates from the genome into
    <dir_tmp>/precursors.fa.

    With option -a the single-pass script excise_precursors.py is run with
    the current stack_height_min. Otherwise the iterative script is run and
    stack_height_min is re-read from the first line of
    <dir_tmp>/precursors.fa_stack.

    Dies when precursors.fa is empty or not a regular file; returns 0
    otherwise.
    '''
    global file_genome, parsed_arf, dir_tmp, stack_height_min, max_pres

    pprint("#excising precursors\n")
    print_stderr("#excising precursors\n")
    start()

    fixed_stack_height = options.get('-a')
    if fixed_stack_height:
        cmd = "excise_precursors.py {} {}/{}_parsed.arf {}/precursors.coords -a {} > {}/precursors.fa\n\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, stack_height_min,
            dir_tmp)
    else:
        cmd = "excise_precursors_iterative_final.py {} {}/{}_parsed.arf {}/precursors.fa {}/precursors.coords {}\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, dir_tmp, max_pres)

    print_stderr(cmd)
    # Reading the pipe to EOF blocks until the command has finished; the
    # captured output itself is not used.
    os.popen(cmd).read()

    if not fixed_stack_height:
        # presumably written by the iterative script -- confirm there
        stack_file = open_or_die2(
            '{}/precursors.fa_stack'.format(dir_tmp), 'rb')
        stack_height_min = stack_file.readline().strip()
        stack_file.close()

    end()

    precursors_fa = '{}/precursors.fa'.format(dir_tmp)
    # Abort unless the output exists as a plain file with content.
    if not (file_s(precursors_fa) and os.path.isfile(precursors_fa)):
        die("No precursors excised\n")

    return 0
def run_quantifier(mrna_hp, mature_this_file, collapsed_file, timestamp):
    '''
    Run quantifier.py through the shell with species tag "cel".

    The command line is echoed to stderr first; a non-zero exit status
    aborts via die().
    '''
    template = 'quantifier.py -p {0} -m {1} -r {2} -t cel -y {3}'
    cmd = template.format(mrna_hp, mature_this_file, collapsed_file,
                          timestamp)
    print_stderr(cmd, '\n')
    status = os.system(cmd)
    if status != 0:
        die('Run quantifier.py failed.\n')
def run_bedtools_cmd(run_times, fafile, mrna_places, mrna_hp):
    '''
    Run `bedtools getfasta` run_times times with identical arguments.

    Each invocation is echoed to stderr; the first non-zero exit status
    aborts via die().
    '''
    # The command does not depend on the iteration, so build it once.
    cmd = 'bedtools getfasta -fi {0} -bed {1} -name -fo {2}'.format(
        fafile, mrna_places, mrna_hp)
    for _ in range(run_times):
        print_stderr(cmd, '\n')
        if os.system(cmd) != 0:
            die('Run bedtools failed.\n')
def test_first_argument():
    '''
    Handle the special first command line argument.

    No argument, -h or --help: die with the usage text.
    -u: run `make_html.py -u -y 1` and exit with status 0.
    '''
    if len(sys.argv) < 2:
        die(usage)

    first = sys.argv[1]
    if first == '-u':
        os.system('make_html.py -u -y 1')
        sys.exit(0)

    if first in ('-h', '--help'):
        die(usage)
def test_bedtools():
    '''
    Abort with installation hints when `bedtools` is not on the PATH.

    Availability is probed with `which bedtools`; a non-zero exit status
    is treated as "not installed".
    '''
    if os.system('which bedtools > /dev/null 2>&1') == 0:
        return

    # The documentation link used to point to the non-existent host
    # bedtools.readthedoc.org; the rest of the message is unchanged.
    die('bedtools is not installed in your environment. '
        'For ubuntu/debian, run `apt-get install bedtools`. '
        'For Redhat/CentOS, run `yum install BEDTools`. '
        'For MacOS, run `brew install bedtools`. '
        'If you need more information about how to install bedtools, '
        'please visit https://bedtools.readthedocs.io. ')
def run_mapper(
        reads_file,
        clip_seq,
        fa_prefix,
        collapsed_file,
        mapping_file,
):
    '''
    Run mapper.py through the shell with the fixed option set
    "-c -j -k <clip_seq> -l 18 -m -p <fa_prefix> -s <collapsed_file>
    -t <mapping_file> -v".

    The command line is echoed to stderr first; a non-zero exit status
    aborts via die().
    '''
    template = 'mapper.py {0} -c -j -k {1} -l 18 -m -p {2} -s {3} -t {4} -v'
    cmd = template.format(reads_file, clip_seq, fa_prefix, collapsed_file,
                          mapping_file)
    print_stderr(cmd, '\n')
    status = os.system(cmd)
    if status != 0:
        die('Run mapper.py failed.\n')
def check_line(line):
    '''
    Die when one of the value-less switches -h, -i, -j, -m or -q is
    followed by an argument in `line`.

    A switch counts as "followed by an argument" when whitespace and then
    a word character (letter, digit or underscore) come right after it.
    Another option (starting with '-') after the switch is fine.

    The former patterns ended in a literal '/', a leftover of Perl regex
    delimiters, so they only matched when a slash followed the first
    character of the argument and ordinary misuse such as "-h 5" was
    never caught.
    '''
    for switch in ('-h', '-i', '-j', '-m', '-q'):
        # \w already covers digits, so one pattern handles both the
        # "integer" and the "string" case.
        if re.search(switch + r'\s+\w', line):
            die("option {} should not be given with an integer or string\n"
                .format(switch))
def core_algorithm():
    '''
    Run the moRNA Finder core algorithm and write <_dir>/output.mrd.

    The command is assembled from the signature and structure files in
    dir_tmp, optionally the other-species mature reference (-s) and,
    unless option -c is set, the randfold results (-y). With option -E a
    second run with -t writes <_dir>/error.output.mrd.

    If output.mrd ends up empty, the command is re-run with -t into
    error.output.mrd_<ltime> for debugging and the program dies.
    '''
    global _dir, dir_tmp, file_mature_ref_other_species, ltime

    pprint("#running moRNA Finder core algorithm\n")
    print_stderr("#running moRNA Finder core algorithm\n")

    # Default id width; replaced by the longest id of the species' own
    # mature reference when such a file was supplied.
    longest_id = 40
    if re.search('none', file_mature_ref_this_species,
                 re.IGNORECASE) is None:
        longest_id = get_longest_id("{}/{}".format(
            dir_tmp, file_mature_ref_this_species))

    start()

    parts = ["core_algorithm.py {}/signature.arf {}/precursors.str".format(
        dir_tmp, dir_tmp)]
    if re.search('none', file_mature_ref_other_species,
                 re.IGNORECASE) is None:
        parts.append("-s {}/{}".format(dir_tmp,
                                       file_mature_ref_other_species))
    parts.append("-v -50 -l {}".format(longest_id))
    if options.get('-c') != '':
        parts.append("-y {}/precursors_for_randfold.rand".format(dir_tmp))
    line = ' '.join(parts)

    cmd = "{} > {}/output.mrd\n".format(line, _dir)
    print_stderr(cmd)
    os.system(cmd)

    if options.get('-E'):
        os.system('{} -t > {}/error.output.mrd'.format(line, _dir))

    end()

    # An empty result file means every precursor was discarded.
    fname = "{}/output.mrd".format(_dir)
    if file_s(fname):
        return

    print_stderr("Error:\n\tFile {} is empty\n\n".format(fname))
    print_stderr(
        "Now running core_algorithm.py with option -t to see why all precursors were discarded\n"
    )
    os.system('{} -t > error.output.mrd_{}'.format(line, ltime))
    print_stderr(
        "The debug file is called error.output.mrd_{}\n".format(ltime))
    die("\nExiting now\n\n")
def read_handler(handle):
    '''
    Check that every row read from `handle` is a valid arf mapping line.

    Rows are stripped before matching. Reading stops at end of file or at
    the first row that is empty after stripping. The global `counter` is
    incremented per checked row and used as the line number when a
    malformed row is reported through die().
    '''
    global counter

    arf_row = re.compile(
        r'^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-])\s+(\d+)\s*([mDIM]*)$')

    while True:
        rin = handle.readline().strip()
        if not rin:
            return

        counter += 1
        if arf_row.match(rin) is None:
            die('\nWrong format in line {}: The row\n{}\ndoes not correspond to the format\nread_id_wo_whitespaces\tlength\tstart\tend\tread_sequence \tgenomicID_wo_whitspaces\tlength\tstart\tend\tgenomic_sequence \tstrand\t#mismatches\teditstring\ne.g. read_22_x10000 \t22 \t1 \t22 \tagtcgtgactgactgactgacg\tchromosomeIII_x12312312\t22 \t1001 \t1022\tagtcgtgactgactgactgacg\t+- \t0 \tmmmmmmmmmmmmmmmmmmmmm\nPlease make sure that all lines have the above described format.\n'.format(
                Nicenumber(counter),
                rin
            ))
def parse_file_command_line(file_command_line, file_structure, _dir):
    '''
    Return the first non-blank line of `file_command_line`, rewritten to
    use the permuted structure file.

    In that line the first occurrence of `file_structure` is replaced by
    "<_dir>/precursors_permuted.str" and everything from the first '>'
    that is followed by at least one character (the output redirection)
    is removed.

    Dies when the file holds no non-blank line.

    `file_structure` is matched literally. It used to be passed to re.sub
    as a pattern, so regex metacharacters in the path ('.', '+', '(' ...)
    could match the wrong text or raise, and backslashes in `_dir` were
    interpreted as replacement escapes. The file is now also closed on
    every exit path.
    '''
    permuted = '{}/precursors_permuted.str'.format(_dir)
    FILE = open_or_die(file_command_line, 'rb',
                       'can not open {}'.format(file_command_line))
    try:
        while True:
            line = FILE.readline()
            if not line:
                break
            if re.search(r'\S', line):
                line = line.strip()
                line = line.replace(file_structure, permuted, 1)
                return re.sub(r'>.+', '', line, count=1)
    finally:
        FILE.close()

    die('{} is empty\n'.format(file_command_line))
def fold_precursors():
    '''
    Fold <dir_tmp>/precursors.fa with RNAfold into
    <dir_tmp>/precursors.str.

    RNAfold is first called with the single-dash option -noPS (stderr
    appended to error_<ltime>.log). If that fails it is retried with
    --noPS; if the retry fails too, the program dies with the exit status.
    '''
    global dir_tmp, ltime

    pprint("#folding precursors\n")
    print_stderr("#folding precursors\n")
    print_stderr(
        "RNAfold < {}/precursors.fa -noPS > {}/precursors.str\n\n".format(
            dir_tmp, dir_tmp))

    start()

    first_try = "RNAfold < {}/precursors.fa -noPS > {}/precursors.str 2>>error_{}.log".format(
        dir_tmp, dir_tmp, ltime)
    status = os.system(first_try)
    if status:
        # Retry with the double-dash spelling of the same option.
        retry = "RNAfold < {}/precursors.fa --noPS > {}/precursors.str".format(
            dir_tmp, dir_tmp)
        status = os.system(retry)
    if status:
        die("Some RNAfold error occurred. Error {}\n".format(status))

    end()
def read_handler(handle):
    '''
    Validate fasta identifier and sequence lines read from `handle`.

    Every line is stripped first; reading stops at end of file or at the
    first line that is empty after stripping. The global `counter` is
    incremented per checked line and reported as the line number.

    Identifier lines ('>' followed by the id) must not contain
    whitespace; valid ids are counted in the global `hash_num`. Any other
    line must consist solely of the letters ACGTUN in either case.
    Violations are reported through die() together with the global
    `hint`.

    The sequence pattern used to be [A|C|G|T|U|N|a|c|g|t|u|n], a
    character class that also accepts a literal '|'; it is now
    [ACGTUNacgtun] as the error message promises.
    '''
    global counter

    while True:
        rin = handle.readline().strip()
        if not rin:
            break
        counter += 1

        header = re.match(r'^>(.+)$', rin)
        if header:
            _id = header.group(1)
            if re.search(r'\s+', _id):
                die('Error in line {}: The identifier\n {}\n\ncontains white spaces\n\n{}\n\nYou could run remove_white_space_in_id.py inputfile > newfile\nThis will remove everything from the id line after the first whitespace\n'
                    .format(Nicenumber(counter), _id, hint))
            else:
                # presumably makes sure hash_num[_id] exists (starting at
                # 0) before it is incremented -- confirm in the helper
                create_hash_key_chain(hash_num, 0, _id)
                hash_num[_id] += 1
        elif not re.match(r'^[ACGTUNacgtun]+$', rin):
            die('Error in line {}: The sequence\n{}\n\ncontains characters others than [acgtunACGTUN]\n\n{}'
                .format(Nicenumber(counter), rin, hint))
def test_installed_binaries():
    '''
    Verify that the external tools and data files needed are available.

    bowtie, RNAfold and randfold are probed in that order with checkBIN();
    a truthy result is treated as "not found" and aborts via die(). The
    file Rfam_for_moR.fa must exist in the scripts directory. Returns 0
    when everything is in place.
    '''
    global scripts

    stdm = 'If you used the install.py script make sure that you started a complete new shell window after installation.\nIf this did not help please restart youer workstation.\n\n'
    not_found = "Error: \t{0} not found\nCheck if {0} is correctly installed and all Pathes were set correctly.\n"

    # (probe command, text passed to checkBIN alongside it, tool name)
    probes = (
        ("bowtie --version", "version", "bowtie"),
        ("RNAfold -h", "gamma", "RNAfold"),
        ("randfold", "let7", "randfold"),
    )
    for probe_cmd, marker, tool in probes:
        if checkBIN(probe_cmd, marker):
            die(not_found.format(tool), stdm)

    # TODO: the Perl original also checked for the PDF::API2 package;
    # that check has not been ported.

    rfam = '{}/Rfam_for_moR.fa'.format(scripts)
    if not os.path.isfile(rfam):
        die("Error:\t Rfam_for_moR.fa not found in your moRNA Finder scripts directory\nPlease copy this file from the moRNA Finder archive to your moRNA Finder scripts directory\n\n"
            )

    return 0
# Directory (with trailing slash) that holds moR.py, taken from the path
# that `which` reports for it.
scripts = os.path.dirname(os.popen('which moR.py').read()) + '/'

# The invocation as typed, terminated by a newline.
command_line = "{} {}\n".format(sys.argv[0], ' '.join(sys.argv[1:]))

pprint('#Starting moRNA Finder\n')
print_stderr('#Starting moRNA Finder\n{}\n'.format(command_line))
print_stderr("moRNA Finder started at {}\n\n\n".format(sTimeG))

# Handles -u, -h and --help before the positional arguments are checked.
test_first_argument()

if len(sys.argv) < 7:
    die(usage)

# The six leading positional arguments; everything after them is left
# for separate option parsing.
parser = argparse.ArgumentParser(usage=usage)
parser.add_argument('reads', help='reads')
parser.add_argument('genome', help='genome')
parser.add_argument('mappings', help='mappings')
parser.add_argument('miRNAs_ref', help='miRNAs_ref')
parser.add_argument('miRNAs_other', help='miRNAs_other')
parser.add_argument('precursors', help='precursors')
args = parser.parse_args(sys.argv[1:7])

(file_reads, file_genome, file_reads_vs_genome,
 file_mature_ref_this_species, file_mature_ref_other_species,
 file_precursors) = (args.reads, args.genome, args.mappings,
                     args.miRNAs_ref, args.miRNAs_other, args.precursors)
survey_known(score) if options.get('-a'): survey_signal_to_noise(score) if options.get('-d'): read_stack_min = options.get('-d') pprint('\t{}'.format(read_stack_min)) pprint('\n') if __name__ == '__main__': if len(sys.argv) < 2: die(usage) parser = argparse.ArgumentParser(usage=usage) parser.add_argument('file_out', help='output file') args = parser.parse_args(sys.argv[1:2]) file_out = args.file_out opts, argss = getopt.getopt(sys.argv[2:], 'a:b:c:d:') options = dict(opts) if (options.get('-b') and not options.get('-c')) or (not options.get('-b') and options.get('-c')): die('options -b and -c must be used in conjunction\n') if options.get('-b') and options.get('-c'): parse_file_ref(options.get('-b'), hash_ref)
def _head_lines(handle, limit=4):
    '''
    Yield (line_number, stripped_line) for at most `limit` leading lines
    of `handle`, stopping early at end of file or at the first line that
    is empty after stripping.
    '''
    for number in range(1, limit + 1):
        text = handle.readline().strip()
        if not text:
            return
        yield number, text


def _check_head(handle, rules):
    '''
    Die with the rule's message when a leading line of `handle` does not
    match the pattern registered for its line number in `rules`
    (line_number -> (pattern, message)).
    '''
    for number, text in _head_lines(handle):
        pattern, message = rules[number]
        if not re.search(pattern, text):
            die(message)


def check_file_format_and_option(file_reads, aFormat):
    '''
    Check that the first (up to four) lines of `file_reads` look like the
    format selected on the command line; die with an explanation if not.

    aFormat is the format letter: 'a' (seq.txt), 'b' (qseq.txt),
    'c' (fasta) or 'e' (fastq). A leading dash ('-c') is accepted too.
    Any other value performs no check.

    Fixes compared to the previous version:
    * The fasta and fastq branches compared against '-c' / '-e' while
      check_options() passes 'c' / 'e', so these checks never ran.
    * The fastq checks for lines 2-4 were inverted (they died on valid
      lines).
    * The warning text contained the uninterpolated Perl placeholders
      $format and $file.
    * The fastq file handle was never closed; all handles are now closed
      even if a check raises.
    '''
    print_stderr('\n')

    fmt = aFormat.lstrip('-')
    warning = '''\n\n***** Please check if the option you used (options {}) designates the correct format of the supplied reads file {} *****\n\n [options] -a input file is seq.txt format -b input file is qseq.txt format -c input file is fasta format -e input file is fastq format -d input file is a config file (see moRNA Finder documentation). options -a, -b, -c or -e must be given with option -d. '''.format(
        '-' + fmt, file_reads)

    if fmt == 'a':
        IN = open_or_die(
            file_reads, 'rb',
            'Cannot open file {} supplied by option -a\n'.format(file_reads))
        try:
            for _, text in _head_lines(IN):
                if len(esplit(text)) != 5:
                    die('The seq.txt file does not contain 5 columns. Please make sure to follow the _seq.txt file format conventions\n{}'.format(warning))
        finally:
            IN.close()

    elif fmt == 'b':
        IN = open_or_die(
            file_reads, 'rb',
            'Cannot open qseq.txt file {} supplied by option -b\n'.format(file_reads))
        try:
            for _, text in _head_lines(IN):
                fields = esplit(text)
                if len(fields) != 11:
                    die('The qseq.txt file does not contain 11 columns but {}. Please make sure to follow the qseq.txt file format conventions\n{}'.format(
                        len(fields), warning))
                # Field 10 (index 9) is checked as the sequence column.
                if not re.search(r'^\S+', fields[9]):
                    die('The sequence field in the qseq.txt file is invalid. Please make sure to follow the qseq.txt file format conventions\n{}'.format(warning))
        finally:
            IN.close()

    elif fmt == 'c':
        mes = 'Please make sure your file is in accordance with the fasta format specifications and does not contain whitespace in IDs or sequences'
        rules = {
            1: (r'^>\S+$',
                "First line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                    mes, warning)),
            2: (r'^\S+$',
                "Second line of FASTA reads file contains whitespace in sequence\n{}\n".format(
                    mes)),
            3: (r'^>\S+$',
                "Second ID line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                    mes, warning)),
            4: (r'^\S+$',
                "Secdond sequence line of FASTA reads file contains whitespace in sequence\n{}\n{}".format(
                    mes, warning)),
        }
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTA file supplied by option -c\n')
        try:
            _check_head(IN, rules)
        finally:
            IN.close()

    elif fmt == 'e':
        mes = 'Please make sure your file is in accordance with the FASTQ format specifications'
        rules = {
            1: (r'^@\S+',
                "First line of FASTQ reads file is not in accordance with the fastq format specifications\n{}\n{}".format(
                    mes, warning)),
            2: (r'^\S+$',
                "Second line of FASTQ reads file contains whitespace in sequence\n{}\n{}".format(
                    mes, warning)),
            3: (r'^\+',
                "Third line of FASTQ reads file does not start with a '+' character.\n{}\n{}".format(
                    mes, warning)),
            4: (r'^\S+$',
                "Fourth line of FASTQ reads file contains whitespace\n{}\n{}".format(
                    mes, warning)),
        }
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTQ file supplied by option -e\n')
        try:
            _check_head(IN, rules)
        finally:
            IN.close()
def test_prefix(prefix):
    '''
    Die unless `prefix` is exactly three word characters without an
    underscore.

    The accepted set is what the pattern \\w allows minus '_', so digits
    pass as well even though the message speaks of alphabet letters.

    The error message used to print the literal text "$prefix" (an
    uninterpolated Perl variable); it now shows the offending value.
    '''
    well_formed = (re.search(r'^\w\w\w$', prefix)
                   and not re.search(r'_', prefix))
    if not well_formed:
        die('prefix {} does not contain exactly three alphabet letters\n'
            .format(prefix))
def check_options(options, file_reads):
    '''
    Validate the combination of command line options; die with an
    explanatory message on the first violation.

    options    -- dict built from getopt pairs: value-less switches map
                  to '', options with an argument map to that argument.
    file_reads -- reads file; its format is checked against the chosen
                  format switch unless -d (config file) is given.

    Fixes compared to the previous version:
    * "raw illumina output must be parsed" fired for -a or -b even when
      -h was given (missing parentheses around the `or` chain).
    * The "-d must be given with -a, -b, -c or -e" test was malformed and
      fired for every otherwise valid -d run.
    * The -q check looked up 'p' instead of '-p' and therefore always
      died when -q was used.
    * -o is only converted with int() when it consists of digits, so a
      value such as "3x" is reported instead of raising ValueError.
    '''
    def switch(name):
        # True when the value-less switch `name` was given.
        return options.get(name) == ''

    def value(name):
        # Argument of option `name`, or None when it was not given.
        return options.get(name)

    # Steps that need fasta input (everything except the -h parsing).
    any_fasta_step = bool(switch('-i') or switch('-j') or value('-k')
                          or value('-l') or switch('-m') or value('-p'))
    # Reads are (or will be made) fasta: given as fasta or parsed by -h.
    fasta_ready = switch('-c') or switch('-h')

    formats = 0
    for letter in ('a', 'b', 'c', 'e'):
        if switch('-' + letter):
            formats += 1
            # With -d, file_reads is a config file, not a reads file.
            if not switch('-d'):
                check_file_format_and_option(file_reads, letter)
    if formats != 1:
        die('exactly one input format (-a, -b , -e or -c) must be designated\n')

    if not (switch('-h') or any_fasta_step):
        die('at least one processing/mapping step (-h, -i, -j, -k, -l, -m or -p) must be designated\n')

    if '-o' in options:
        threads = options.get('-o')
        if not (re.search(r'^\d+$', threads) and int(threads) > 0):
            die('options -o must be positive integer\n')

    if not (value('-s') or value('-t')):
        die('at least one output file (-s or -t) must be designated\n')

    # Existing output files are only tolerated together with -n.
    for out_opt in ('-s', '-t'):
        out_file = value(out_opt)
        if out_file and os.path.exists(out_file) and not switch('-n'):
            die("file {} already exists\n".format(out_file))

    if (switch('-a') or switch('-b') or switch('-e')) and not switch('-h'):
        die("raw illumina output must be parsed to fasta format with options -h\n")

    if switch('-c') and switch('-h'):
        die("input file is already designated as a fasta file, so option -h should not be used\n")

    if switch('-c') and not any_fasta_step:
        die("at least one processing/mapping step (-i, -j, -k, -l, -m or -p) must be designated\n")

    if switch('-d') and not (switch('-a') or switch('-b') or switch('-c')
                             or switch('-e')):
        die("option -d must be given with option -a, -b, -c or -e \n")

    if switch('-d') and value('-g'):
        die("option -d and -g are mutually exclusive. If -d is given, the prefixes must be contained in the config file\n")

    if value('-g'):
        test_prefix(value('-g'))

    if switch('-i') and not fasta_ready:
        die("option -i must be used on reads in fasta format or with option -h \n")

    if switch('-j') and not fasta_ready:
        die("option -j must be used on reads in fasta format or with option -h \n")

    if value('-k') and not fasta_ready:
        die("option -k must be used on reads in fasta format or with option - h\n")

    if value('-l') and not fasta_ready:
        die("option -l must be used on reads in fasta format or with option -h \n")

    if switch('-m') and not fasta_ready:
        die("option -m must be used on reads in fasta format or with option -h \n")

    if value('-p') and not fasta_ready:
        die("option -p must be used on reads in fasta format or with option -h\n")

    if switch('-q') and not value('-p'):
        die("option -q must be given with option -p\n")

    if value('-s') and not (switch('-h') or any_fasta_step):
        die("at least one processing step (-h, -i, -j, -k, -l, -m or -p) must be designated if processed file should be output (-s)\n")

    if value('-t') and not value('-p'):
        die("reads must be mapped (-p) if mappings are to be output (-t)\n")

    # An argument that starts with '-' means the option swallowed the
    # next switch instead of receiving its own value.
    if value('-k') and re.search(r'^-', value('-k')):
        die("please make sure that the adapter sequence designated with the -k option is correct\n")

    if value('-l') and re.search(r'^-', value('-l')):
        die("please make sure that the int given with the -l option is correct\n")

    if value('-p') and re.search(r'ebwt$', value('-p')):
        die("please make sure that you are using the -p option correctly.\nThe argument given after -p must be the _prefix_ of the bowtie\nindexed files and should not contain 'ebwt'. For instance,\nif the first indexed file is called 'h_sapiens_37_asm.1.ebwt'\nthen the prefix is 'h_sapiens_37_asm'.\n")

    if value('-p') and re.search(r'^-', value('-p')):
        die("please make sure that the genome index designated with the -p option is correct\n")

    # Mapping needs bowtie; a non-zero exit status of `bowtie --version`
    # is taken as "not installed".
    if value('-p'):
        binst = os.system('bowtie --version 2>&1')
        if binst:
            printErr()
            die("Bowtie mapping tool not installed.\nPlease download from http://downloads.sourceforge.net/project/bowtie-bio/bowtie/ the latest version and install it.\n\n")

    if value('-s') and re.search(r'^-', value('-s')):
        die("please make sure that the output file designated with the -s option is correct\n")

    if value('-t') and re.search(r'^-', value('-t')):
        die("please make sure that the output file designated with the -t option is correct\n")
hash_seq[tag] except KeyError: hash_seq[tag] = {} hash_seq[tag][mm[0]] = counter else: die('Error in line {}: Either the sequence\n\n{}\n\ncontains less than 17 characters or contains characters others than [acgtunACGTUN]\n\n\nPlease make sure that your file only comprises sequences that have at least 17 characters\n\ncontaining letters [acgtunACGTUN]\n'.format( Nicenumber(counter), rin )) if __name__ == '__main__': if len(sys.argv) < 2: die(usage) in_file = sys.argv[1] # FILE = open_or_die(in_file, 'rb', "Can't open {}".format(in_file)) # while True: # aBuffer = FILE.read(4096) # if not aBuffer: # break # lines += int(tr(aBuffer, '\n', '')) # FILE.close() lines = os.popen("wc -l {}".format(in_file)).read().strip() lines = int(re.split(r'\s', lines)[0]) if lines / 2 > 5000000: rhash = {}
def read_handler(handle):
    '''
    Validate the identifier and sequence lines of a collapsed reads file.

    Lines are read until end of file; each is stripped and counted in the
    global `counter`, which is reported as the line number on errors.

    Identifier lines ('>' + three-character sample tag + rest) must be
    free of whitespace and have the form tag_number_xnumber. Every other
    line must be a sequence of at least 17 letters from ACGTUN (either
    case) and must be unique within the sample named by the most recent
    identifier line. The globals `hash_num` and `hash_seq` record the
    identifiers and, per sample, the line at which each sequence was
    first seen. Violations are reported through die().

    Fixes compared to the previous version:
    * The duplicate lookup used the undefined name `seq`; it now uses the
      sequence that was just matched.
    * The sequence pattern [A|C|...|n] also accepted a literal '|'.
    '''
    global counter

    while True:
        rin = handle.readline()
        if not rin:
            break
        rin = rin.strip()
        counter += 1

        header = re.match(r'^>(\S\S\S)(.+)$', rin)
        if header:
            # `tag` stays bound for the sequence lines that follow.
            tag, rest = header.groups()
            _id = '{}{}'.format(tag, rest)
            if re.search(r'\s+', _id):
                die('Error in line {}: The identifier\n \n{}\n \ncontains white spaces\n\n\nPlease make sure that none of the identifiers contain whitepaces.\nYou could run remove_white_space_in_id.py {} > newfile\nThis will remove everything from the id line after the first whitespace'.format(
                    Nicenumber(counter),
                    rin,
                    in_file
                ))
            elif not re.match(r'^(\S\S\S)_(\d+)_(x\d+)$', _id):
                die('Error in line {}: The identifier\n\n{}\n\nhas to have the format\nname_uniqueNumber_xnumber\n\n\nPlease make sure that all identifiers are unique and have the format described above.'.format(
                    Nicenumber(counter),
                    _id,
                ))
            else:
                # Counted under the part of the id after the sample tag.
                try:
                    hash_num[rest] += 1
                except KeyError:
                    hash_num[rest] = 1
            continue

        # NOTE(review): a sequence line that appears before any
        # identifier line still fails on the unbound `tag` -- confirm
        # whether such input should get a dedicated error message.
        matched = re.match(r'^([ACGTUNacgtun]{17,})$', rin)
        if not matched:
            die('Error in line {}: Either the sequence\n\n{}\n\ncontains less than 17 characters or contains characters others than [acgtunACGTUN]\n\n\nPlease make sure that your file only comprises sequences that have at least 17 characters\n\ncontaining letters [acgtunACGTUN]\n'.format(
                Nicenumber(counter),
                rin
            ))
            continue

        sequence = matched.group(1)
        try:
            first_seen = hash_seq[tag][sequence]
        except KeyError:
            first_seen = None

        if first_seen:
            die('Error in line {}: The sequence\n\n{}\n\noccures at least twice in sample {} in your reads file.\n\nAt first it occured at line {}\n\nPlease make sure that your reads file only contains unique sequences within each sample.\n'.format(
                Nicenumber(counter),
                sequence,
                tag,
                Nicenumber(first_seen)
            ))
        else:
            try:
                hash_seq[tag]
            except KeyError:
                hash_seq[tag] = {}
            hash_seq[tag][sequence] = counter
if not line: break l = esplit(line) if l[5] not in index.keys(): count += 1 index[l[5]] = count IKT.write('{}\t{}'.format(index[l[5]], line)) IK.close() IKT.close() if __name__ == '__main__': if len(sys.argv) < 4: die(usage) parser = argparse.ArgumentParser(usage) parser.add_argument('file_reads', help='file reads') parser.add_argument('file_precursors', help='file precursors') parser.add_argument('read_align_edit_distance', help='read align edit distance') args = parser.parse_args(sys.argv[1:4]) file_reads = args.file_reads file_precursors = args.file_precursors read_align_edit_distance = args.read_align_edit_distance opts, argss = getopt.getopt(sys.argv[4:], "a:bo:") options = dict(opts)
def _fail_input_check(message):
    '''Call printErr() and then abort through die() with `message`.'''
    printErr()
    die(message)


def _precheck_fasta(path, kind, id_kind):
    '''
    Quick look at the first two lines of the fasta file `path`.

    The first line must contain '>' followed by an id and no whitespace;
    the second line may only hold the letters ACGTUN (either case).
    `kind` and `id_kind` are the file descriptions used in the error
    messages. Returns the second line as read (not stripped).
    '''
    IN = open_or_die2(path, 'rb')
    line = IN.readline().strip()
    if not re.search(r'>\S+', line):
        _fail_input_check(
            "The first line of file {0} does not start with '>identifier'\n{1} file {0} is not a fasta file\n\n"
            .format(path, kind))
    if re.search(r'\s', line):
        _fail_input_check(
            "{1} file {0} has not allowed whitespaces in its first identifier\n\n"
            .format(path, id_kind))
    line = IN.readline()
    if not re.search(r'^[ACGTUNacgtun]*$', line):
        _fail_input_check(
            "File {0} contains not allowed characters in sequences\nAllowed characters are ACGTUN\n{1} file {0} is not a fasta file\n\n"
            .format(path, kind))
    IN.close()
    return line


def _run_sanity_check(cmd, path):
    '''
    Run the sanity-check command `cmd` and abort when it prints anything;
    the output is reported together with `path`.
    '''
    output = os.popen(cmd).read().strip()
    if output:
        _fail_input_check("problem with {} {}\n".format(path, output))


def test_input_files():
    '''
    Validate all input files and, if possible, pre-quantify known miRNAs.

    Phase 1 is a cheap pre-check of the first line(s) of the reads,
    genome, mapping (arf), mature reference and precursor files; it also
    verifies that every reference id in the mapping file is an id of the
    genome file. Files whose name contains "none" are skipped.

    Phase 2 runs the stringent sanity_check_*.py scripts on each file;
    any output from a script is treated as an error.

    Finally, when both a precursor file and this species' mature
    reference are given, quantifier.py is run and options['-q'] is set to
    the resulting miRBase.mrd path. options['-s'] is reset to 0 when the
    star-sequence file it names is empty or missing.

    Compared to the previous version the reads-file error message no
    longer prints the literal "$file_reads", and the repeated fasta
    pre-checks and sanity-check calls are shared through private helpers.
    '''
    global file_reads, file_reads_vs_genome, file_genome, file_precursors, minpreslen, file_mature_ref_other_species, file_mature_ref_this_species

    mirna_kind = 'miRNA reference this species'

    # --- reads: first line is '>id' without whitespace, then sequence ---
    IN = open_or_die2(file_reads, 'rb')
    line = IN.readline().strip()
    if not re.search(r'^>\S+', line):
        _fail_input_check(
            "The first line of file {} does not start with '>identifier'\nReads file {} is not a valid fasta file\n\n"
            .format(file_reads, file_reads))
    if re.search(r'\s', line):
        _fail_input_check(
            'File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nReads file {} is not a fasta file\n\n'
            .format(file_reads, file_reads))
    line = IN.readline()
    if not re.search(r'^[ACGTUNacgtun]*$', line):
        _fail_input_check(
            'File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nReads file {} is not a fasta file\n\n'
            .format(file_reads, file_reads))
    IN.close()

    # --- genome ---
    _precheck_fasta(file_genome, 'Genome', 'Genome')
    # All header lines of the genome, including the leading '>'.
    tmps = os.popen('grep ">" {}'.format(file_genome)).read().strip()
    genomeids = set(tmps.split("\n"))

    # --- mapping file: first line must be a valid arf row ---
    IN = open_or_die2(file_reads_vs_genome, 'rb')
    line = IN.readline()
    if not re.search(
            r'^(\S+_x\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-])\s+(\d+)\s*([mDIM]*)$',
            line):
        _fail_input_check(
            'Mapping file {} is not in arf format\n\nEach line of the mapping file must consist of the following fields\nreadID_wo_whitespaces length start end read_sequence genomicID_wo_whitspaces length start end genomic_sequence strand #mismatches editstring\nThe editstring is optional and must not be contained\nThe readID must end with _xNumber and is not allowed to contain whitespaces.\nThe genomeID is not allowed to contain whitespaces.'
            .format(file_reads_vs_genome))
    IN.close()

    # Every reference id used in the mappings must exist in the genome.
    tmps = os.popen(
        'cut -f6 {}|sort -u'.format(file_reads_vs_genome)).read().strip()
    for s in tmps.split("\n"):
        if ">{}".format(s) not in genomeids:
            die("The mapped reference id {} from file {} is not an id of the genome file {}\n\n"
                .format(s, file_reads_vs_genome, file_genome))

    # --- optional reference files ---
    if not re.search('none', file_mature_ref_this_species):
        _precheck_fasta(file_mature_ref_this_species, mirna_kind, mirna_kind)

    if not re.search('none', file_mature_ref_other_species):
        _precheck_fasta(file_mature_ref_other_species, mirna_kind,
                        mirna_kind)

    if not re.search('none', file_precursors):
        line = _precheck_fasta(file_precursors, mirna_kind, 'precursor')
        # Length of the second line as read, line ending included.
        if len(line) < minpreslen:
            _fail_input_check(
                "The precursor file {} does not contain sequences of at least {} nt\nPlease make sure that you provided the correct file and the correct parameter ordering when calling {}\nIf you have precursors with less than {} please use option -p <int> to specify this length\n"
                .format(file_precursors, minpreslen, sys.argv[0],
                        minpreslen))

    # #################################################
    # precheck finished
    # #################################################

    # do stringent testing of all input files
    pprint("#testing input files\n")
    print_stderr("#testing input files\n")

    if not re.search('none', file_mature_ref_this_species):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1\n\n".format(
            file_mature_ref_this_species)
        print_stderr(cmd)
        _run_sanity_check(cmd, file_mature_ref_this_species)
        end()

    if not re.search(r'none', file_mature_ref_other_species):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1\n\n".format(
            file_mature_ref_other_species)
        print_stderr(cmd)
        _run_sanity_check(cmd, file_mature_ref_other_species)
        end()

    cmd = "sanity_check_reads_ready_file.py {} 2>&1\n\n".format(file_reads)
    print_stderr(cmd)
    start()
    _run_sanity_check(cmd, file_reads)
    end()

    start()
    cmd = "sanity_check_genome.py {} 2>&1;\n\n".format(file_genome)
    print_stderr(cmd)
    _run_sanity_check(cmd, file_genome)
    end()

    start()
    cmd = "sanity_check_mapping_file.py {} 2>&1".format(file_reads_vs_genome)
    print_stderr(cmd)
    _run_sanity_check(cmd, file_reads_vs_genome)
    end()

    if re.search('none', file_precursors):
        print_stderr(
            "Pre-quantitation is skipped caused by missing file with known precursor miRNAs\n\n\n"
        )
        return

    start()
    cmd = "sanity_check_mature_ref.py {} 2>&1".format(file_precursors)
    print_stderr(cmd)
    _run_sanity_check(cmd, file_precursors)
    end()

    start()
    if re.search('none', file_mature_ref_this_species, re.IGNORECASE):
        print_stderr(
            "Pre-quantitation is skipped caused by missing file with known miRNAs\n\n\n"
        )
        return

    print_stderr("Quantitation of expressed miRNAs in data\n\n\n")

    species = ''
    if options.get('-t'):
        species = "-t {}".format(options.get('-t'))

    file_star = ''
    if options.get('-s'):
        if file_s(options.get('-s')):
            file_star = "-s {}".format(options.get('-s'))
        else:
            print_stderr(
                "File {} specified by option -s is empty or not found\n"
                .format(options.get('-s')))
            options['-s'] = 0

    print("#Quantitation of known miRNAs in data\n")

    dopt = "-d" if options.get('-d') == '' else ""
    Popt = "-P" if options.get('-P') == '' else ""

    quant = "quantifier.py -p {} -m {} -r {} {} {} -y {} -k {} {}".format(
        file_precursors, file_mature_ref_this_species, file_reads,
        file_star, species, ltime, dopt, Popt)
    print_stderr(quant, "\n")
    os.system(quant)

    # Result file of the quantifier run started above.
    options['-q'] = "expression_analyses/expression_analyses_{}/miRBase.mrd".format(
        ltime)
    end()
if re.search(r'(\S+)', line): line = line.strip() line = re.sub(file_structure, '{}/precursors_permuted.str'.format(_dir), line, count=1) line = re.sub(r'>.+', '', line, count=1) return line die('{} is empty\n'.format(file_command_line)) if __name__ == '__main__': if len(sys.argv) < 4: die(usage) parser = argparse.ArgumentParser(usage=usage) parser.add_argument('file_command_line', help='command list file') parser.add_argument('file_structure', help='structure file') parser.add_argument('rounds', help='rounds') args = parser.parse_args(sys.argv[1:4]) file_command_line = args.file_command_line file_structure = args.file_structure rounds = int(args.rounds) opt, argss = getopt.getopt(sys.argv[4:], "a") options = dict(opt) ltime = long(time.time())
#!/usr/bin/env python from __future__ import print_function import re import sys from port import die, esplit, open_or_die if __name__ == '__main__': if len(sys.argv) < 2: die('No csv file given for bed conversion\n') known, novel, _not, line, thres, score, line, strand, label, end = ( None, None, None, None, None, None, None, None, None, None) IN = open_or_die(sys.argv[1], 'r', 'cannot open {}\n'.format(sys.argv[1])) while True: line = IN.readline() if not line: break if re.search(r'novel miRNAs predicted by moRNA Finder', line): novel = 1 known = 0 _not = 0 elif re.search(r'mature miRBase miRNAs detected', line): novel = 0 known = 1 _not = 0 else: l = esplit(line)
def run_bowtie_cmd(fafile, fa_prefix):
    '''
    Run `bowtie-build <fafile> <fa_prefix>` through the shell.

    The command line is echoed to stderr first; a non-zero exit status
    aborts via die().
    '''
    cmd = 'bowtie-build {0} {1}'.format(fafile, fa_prefix)
    print_stderr(cmd, '\n')
    status = os.system(cmd)
    if status != 0:
        die('Run bowtie failed.\n')