def split_files(input_filename='in.fa', split_size=20):
    """
    Split input files into split_0/in.fa, split_1/in.fa...

    Records are written as two-line FASTA entries (">id" then the sequence).
    Each split directory is created in the current working directory if it
    does not exist yet and receives a copy of ./in.weights.
    split_0 is always created, even when the input holds no records.

    :param input_filename: FASTA file to split
    :param split_size: maximum number of records written to each split
    :return: the list of split directories, in creation order
    """
    def open_split(index):
        # Create split_<index>/, copy in.weights into it, then open its in.fa.
        split_dir = "split_" + str(index)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
        run_external_call("cp in.weights {0}/in.weights".format(split_dir))
        return split_dir, open(os.path.join(split_dir, 'in.fa'), 'w')

    i = 0
    count = 0
    d, f = open_split(i)
    split_dirs = [d]
    try:
        # The input handle is closed by the with-block; the current output
        # handle is closed by the finally clause even if parsing fails.
        with open(input_filename) as handle:
            for r in SeqIO.parse(handle, 'fasta'):
                if count >= split_size:
                    # current split is full: roll over to the next directory
                    f.close()
                    count = 0
                    i += 1
                    d, f = open_split(i)
                    split_dirs.append(d)
                f.write(">{0}\n{1}\n".format(r.id, r.seq))
                count += 1
    finally:
        f.close()
    return split_dirs
def split_files(input_filename="in.fa", split_size=20):
    """
    Split input files into split_0/in.fa, split_1/in.fa...

    Every record is written as ">id" followed by its sequence on the next
    line. Split directories are created under the current working directory
    when missing, and ./in.weights is copied into each of them.
    split_0 is created even if the input contains no records.

    :param input_filename: FASTA file to split
    :param split_size: maximum number of records per split directory
    :return: the list of split directories, in creation order
    """
    def start_split(index):
        # Prepare split_<index>/ (mkdir + copy of in.weights) and open in.fa.
        name = "split_" + str(index)
        if not os.path.exists(name):
            os.makedirs(name)
        run_external_call("cp in.weights {0}/in.weights".format(name))
        return name, open(os.path.join(name, "in.fa"), "w")

    i = 0
    count = 0
    d, f = start_split(i)
    split_dirs = [d]
    try:
        # with-block closes the input; finally closes whichever output
        # handle is current, so nothing leaks when an error interrupts us.
        with open(input_filename) as handle:
            for r in SeqIO.parse(handle, "fasta"):
                if count >= split_size:
                    # this split reached its quota, move on to the next one
                    f.close()
                    count = 0
                    i += 1
                    d, f = start_split(i)
                    split_dirs.append(d)
                f.write(">{0}\n{1}\n".format(r.id, r.seq))
                count += 1
    finally:
        f.close()
    return split_dirs
def sanity_check_mash_exists():
    """
    Check that the mash executable can be invoked.

    Runs "mash --version" through run_external_call. If that call raises,
    an error message is printed to stderr and sys.exit(-1) is called.
    """
    try:
        run_external_call("mash --version")
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C / SystemExit
        # are not misreported as a missing executable.
        print("mash executable does not exist. Please install mash first!", file=sys.stderr)
        sys.exit(-1)
def sanity_check_gmapl_exists():
    """
    Check that the gmapl executable can be invoked.

    GMAP version that comes with smrtanalysis 2.3 is old and does not contain gmapl.
    User must install newer version of gmap to get gmapl.

    Runs "gmapl --version" through run_external_call. If that call raises,
    an error message is printed to stderr and sys.exit(-1) is called.
    """
    try:
        run_external_call("gmapl --version")
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C / SystemExit
        # are not misreported as a missing executable.
        # The two literals are concatenated; the leading space on the second
        # keeps the sentences from running together ("GMAP.Please").
        print("gmapl executable does not exist. You probably have an old version of GMAP."
              " Please install a newer version of GMAP and try again.", file=sys.stderr)
        sys.exit(-1)
def sanity_check_gmapl_exists():
    """
    Check that the gmapl executable can be invoked.

    GMAP version that comes with smrtanalysis 2.3 is old and does not contain gmapl.
    User must install newer version of gmap to get gmapl.

    Runs "gmapl --version" through run_external_call. If that call raises,
    an error message is printed to stderr and sys.exit(-1) is called.
    """
    try:
        run_external_call("gmapl --version")
    except Exception:
        # print() function instead of the Python 2 "print >>" statement, which
        # raises TypeError on Python 3 and would hide the message entirely.
        # The leading space on the second literal separates the two sentences.
        print("gmapl executable does not exist. You probably have an old version of GMAP."
              " Please install a newer version of GMAP and try again.", file=sys.stderr)
        sys.exit(-1)
def run_Cogent_on_split_files(split_dirs, depth):
    """
    Run Cogent per split directory, then merge and post-process the results.

    1. run Cogent individually on each split directory
    2. combine all cogent2.fa from split directories, pretend they are the "INPUT",
       run Cogent on it (splitting and recursing first when the combined input
       has more than cc_settings.MAX_POST_SPLIT_IN_SIZE records and depth is
       below cc_settings.MAX_RECUR_DEPTH)
    3. write in.trimmed.fa from in.fa, run GMAP plus post_gmap_processing in
       post_combined/, and link post_combined/cogent2.fa as ./cogent2.fa

    All paths are relative to the current working directory, which is
    changed temporarily while each step runs.

    :param split_dirs: directories to process, e.g. as returned by split_files
    :param depth: current recursion depth
    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        try:
            if os.path.exists('cogent2.fa'):
                print("skipping {0} because done already".format(d), file=sys.stderr)
                continue
            run_Cogent_on_input()
            # clean up cogent in the split dir
            if os.path.exists('cogent') and os.path.isdir('cogent'):
                cleanup_gmap('cogent')
            if os.path.exists('cogent2') and os.path.isdir('cogent2'):
                cleanup_gmap('cogent2')
        finally:
            # always return to the starting directory, also on skip or failure
            os.chdir(olddir)

    if os.path.exists('combined'):
        run_external_call("rm -rf combined")
    os.makedirs('combined')
    # now combine all the cogent2 results and pretend they are the "INPUT"
    i = 0
    with open('combined/in.fa', 'w') as f, open('combined/in.weights', 'w') as f2:
        for d in split_dirs:
            with open(os.path.join(d, 'cogent2.fa')) as handle:
                for r in SeqIO.parse(handle, 'fasta'):
                    f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
                    f2.write("fake_input_path{0}\t1\n".format(i))
                    i += 1

    os.chdir('combined')
    if i > cc_settings.MAX_POST_SPLIT_IN_SIZE and depth < cc_settings.MAX_RECUR_DEPTH:
        # combined input is still too large: split it again and recurse
        dirs = split_files(input_filename='in.fa', split_size=cc_settings.MAX_POST_SPLIT_IN_SIZE)
        run_Cogent_on_split_files(dirs, depth+1)
    # NOTE(review): this call is taken to sit outside the `if` above, since
    # the steps below need combined/cogent2.fa in either case - confirm.
    run_Cogent_on_input()
    os.chdir('../')

    # now take the output from combined and run LP against it,
    # using the real input this time
    with open('in.trimmed.fa', 'w') as f, open('in.fa') as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            f.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))

    if os.path.exists('post_combined'):
        run_external_call("rm -rf post_combined")
    os.makedirs('post_combined')
    os.chdir('post_combined')
    run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
    run_external_call("ln -s ../in.weights in.weights")
    run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
    run_gmap()
    with open('in.trimmed.fa') as handle:
        seqrecs = list(SeqIO.parse(handle, 'fasta'))
    post_gmap_processing(seqrecs=seqrecs)
    os.chdir('../')

    # expose post_combined/cogent2.fa as ./cogent2.fa and map the trimmed input against it
    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_gmap(dbname='cogent2', infile='in.trimmed.fa')
    # NOTE(review): a post_gmap_processing() call was commented out here in the original - confirm it is not needed

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4-time1))
def run_Cogent_on_split_files(split_dirs, depth):
    """
    Run Cogent per split directory, then merge and post-process the results.

    1. run Cogent individually on each split directory
    2. combine all cogent2.fa from split directories, pretend they are the "INPUT",
       run Cogent on it (splitting and recursing first when the combined input
       has more than cc_settings.MAX_POST_SPLIT_IN_SIZE records and depth is
       below cc_settings.MAX_RECUR_DEPTH)
    3. run minimap2 plus post_minimap2_processing in post_combined/ using
       ./in.trimmed.fa, and link post_combined/cogent2.fa as ./cogent2.fa

    All paths are relative to the current working directory, which is
    changed temporarily while each step runs. ./in.trimmed.fa and
    ./in.weights are only symlinked here, not created.

    :param split_dirs: directories to process, e.g. as returned by split_files
    :param depth: current recursion depth
    :raises CycleDetectedException: propagated unchanged from run_Cogent_on_input
    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        try:
            if os.path.exists('cogent2.fa'):
                print("skipping {0} because done already".format(d), file=sys.stderr)
                continue
            # Any exception (including CycleDetectedException) propagates
            # as the original instance, keeping its message and traceback.
            run_Cogent_on_input()
        finally:
            # always return to the starting directory, also on skip or failure
            os.chdir(olddir)

    if os.path.exists('combined'):
        run_external_call("rm -rf combined")
    os.makedirs('combined')
    # now combine all the cogent2 results and pretend they are the "INPUT"
    i = 0
    with open('combined/in.trimmed.fa', 'w') as f, open('combined/in.weights', 'w') as f2:
        for d in split_dirs:
            with open(os.path.join(d, 'cogent2.fa')) as handle:
                for r in SeqIO.parse(handle, 'fasta'):
                    f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
                    f2.write("fake_input_path{0}\t1\n".format(i))
                    i += 1

    os.chdir('combined')
    if i > cc_settings.MAX_POST_SPLIT_IN_SIZE and depth < cc_settings.MAX_RECUR_DEPTH:
        # combined input is still too large: split it again and recurse
        dirs = split_files(input_filename='in.trimmed.fa', split_size=cc_settings.MAX_POST_SPLIT_IN_SIZE)
        run_Cogent_on_split_files(dirs, depth + 1)
    # NOTE(review): this call is taken to sit outside the `if` above, since
    # the steps below need combined/cogent2.fa in either case - confirm.
    run_Cogent_on_input()
    os.chdir('../')

    if os.path.exists('post_combined'):
        run_external_call("rm -rf post_combined")
    os.makedirs('post_combined')
    os.chdir('post_combined')
    run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
    run_external_call("ln -s ../in.weights in.weights")
    run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
    sam_file = run_minimap2('cogent.fa', 'in.trimmed.fa', 'SAM')
    with open('in.trimmed.fa') as handle:
        seqrecs = list(SeqIO.parse(handle, 'fasta'))
    post_minimap2_processing('cogent.fa', sam_file, 'cogent2', seqrecs=seqrecs)
    os.chdir('../')

    # expose post_combined/cogent2.fa as ./cogent2.fa and map the trimmed input against it
    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_minimap2('cogent2.fa', 'in.trimmed.fa', format='SAM')

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4 - time1))
def sanity_check_mash_exists():
    """
    Check that the mash executable can be invoked.

    Runs "mash --version" through run_external_call. If that call raises,
    an error message is printed to stderr and sys.exit(-1) is called.
    """
    try:
        run_external_call("mash --version")
    except Exception:
        # print() function instead of the Python 2 "print >>" statement, which
        # raises TypeError on Python 3 and would hide the message entirely.
        print("mash executable does not exist. Please install mash first!", file=sys.stderr)
        sys.exit(-1)
def sanity_check_minimap2_exists():
    """
    Check that the minimap2 executable can be invoked.

    Runs "minimap2 --version" through run_external_call. If that call raises,
    an error message is printed to stderr and sys.exit(-1) is called.
    """
    try:
        run_external_call("minimap2 --version")
    except Exception:
        # print() function instead of the Python 2 "print >>" statement, which
        # raises TypeError on Python 3 and would hide the message entirely.
        print("minimap2 executable does not exist. Please install minimap2 first!", file=sys.stderr)
        sys.exit(-1)
def run_Cogent_on_split_files(split_dirs):
    """
    Run Cogent per split directory, then merge and post-process the results.

    1. run Cogent individually on each split directory
    2. combine all cogent2.fa from split directories, pretend they are the "INPUT",
       run Cogent on it
    3. write in.trimmed.fa from in.fa, run GMAP plus post_gmap_processing in
       post_combined/, and link post_combined/cogent2.fa as ./cogent2.fa

    All paths are relative to the current working directory, which is
    changed temporarily while each step runs.

    :param split_dirs: directories to process, each relative to the current directory
    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        try:
            run_Cogent_on_input()
        finally:
            # always return to the starting directory, even if the run fails
            os.chdir(olddir)

    if os.path.exists('combined'):
        run_external_call("rm -rf combined")
    os.makedirs('combined')
    # now combine all the cogent2 results and pretend they are the "INPUT"
    i = 0
    with open('combined/in.fa', 'w') as f, open('combined/in.weights', 'w') as f2:
        for d in split_dirs:
            with open(os.path.join(d, 'cogent2.fa')) as handle:
                for r in SeqIO.parse(handle, 'fasta'):
                    f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
                    f2.write("fake_input_path{0}\t1\n".format(i))
                    i += 1

    os.chdir('combined')
    run_Cogent_on_input()
    os.chdir('../')

    # now take the output from combined and run LP against it,
    # using the real input this time
    with open('in.trimmed.fa', 'w') as f, open('in.fa') as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            f.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))

    if os.path.exists('post_combined'):
        run_external_call("rm -rf post_combined")
    os.makedirs('post_combined')
    os.chdir('post_combined')
    run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
    run_external_call("ln -s ../in.weights in.weights")
    run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
    run_gmap()
    with open('in.trimmed.fa') as handle:
        seqrecs = list(SeqIO.parse(handle, 'fasta'))
    post_gmap_processing(seqrecs=seqrecs)
    os.chdir('../')

    # expose post_combined/cogent2.fa as ./cogent2.fa and map the trimmed input against it
    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_gmap(dbname='cogent2', infile='in.trimmed.fa')
    # NOTE(review): a post_gmap_processing() call was commented out here in the original - confirm it is not needed

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4 - time1))
def run_Cogent_on_split_files(split_dirs):
    """
    Run Cogent per split directory, then merge and post-process the results.

    1. run Cogent individually on each split directory
    2. combine all cogent2.fa from split directories, pretend they are the "INPUT",
       run Cogent on it
    3. write in.trimmed.fa from in.fa, run GMAP plus post_gmap_processing in
       post_combined/, and link post_combined/cogent2.fa as ./cogent2.fa

    All paths are relative to the current working directory, which is
    changed temporarily while each step runs.

    :param split_dirs: directories to process, each relative to the current directory
    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        try:
            run_Cogent_on_input()
        finally:
            # always return to the starting directory, even if the run fails
            os.chdir(olddir)

    if os.path.exists("combined"):
        run_external_call("rm -rf combined")
    os.makedirs("combined")
    # now combine all the cogent2 results and pretend they are the "INPUT"
    i = 0
    with open("combined/in.fa", "w") as f, open("combined/in.weights", "w") as f2:
        for d in split_dirs:
            with open(os.path.join(d, "cogent2.fa")) as handle:
                for r in SeqIO.parse(handle, "fasta"):
                    f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
                    f2.write("fake_input_path{0}\t1\n".format(i))
                    i += 1

    os.chdir("combined")
    run_Cogent_on_input()
    os.chdir("../")

    # now take the output from combined and run LP against it,
    # using the real input this time
    with open("in.trimmed.fa", "w") as f, open("in.fa") as handle:
        for r in SeqIO.parse(handle, "fasta"):
            f.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))

    if os.path.exists("post_combined"):
        run_external_call("rm -rf post_combined")
    os.makedirs("post_combined")
    os.chdir("post_combined")
    run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
    run_external_call("ln -s ../in.weights in.weights")
    run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
    run_gmap()
    with open("in.trimmed.fa") as handle:
        seqrecs = list(SeqIO.parse(handle, "fasta"))
    post_gmap_processing(seqrecs=seqrecs)
    os.chdir("../")

    # expose post_combined/cogent2.fa as ./cogent2.fa and map the trimmed input against it
    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_gmap(dbname="cogent2", infile="in.trimmed.fa")
    # NOTE(review): a post_gmap_processing() call was commented out here in the original - confirm it is not needed

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4 - time1))