def zip_up_directory(run_date, dirPath, mode='a'):
    """
    This should be run at the end of each process to zip the files in each directory.
    """
    files_to_compress = ['fa', 'db', 'names', 'sff', 'fasta', 'fastq']
    assert os.path.isdir(dirPath)
    zipFilePath = os.path.join(dirPath, run_date + '.zip')
    zf = zipfile.ZipFile(zipFilePath, mode)
    archived_paths = []
    for (archiveDirPath, dirNames, fileNames) in os.walk(dirPath):
        for file_name in fileNames:
            if file_name.split('.')[-1] in files_to_compress:
                # join against the directory currently being walked,
                # not the top-level dirPath
                filePath = os.path.join(archiveDirPath, file_name)
                zf.write(filePath, compress_type=zipfile.ZIP_DEFLATED)
                archived_paths.append(filePath)
    for i in zf.infolist():
        dt = datetime.datetime(*(i.date_time))
        logger.debug("%s\tSize: %sb\tCompressed: %sb\t\tModified: %s"
                     % (i.filename, i.file_size, i.compress_size, dt.ctime()))
    zf.close()
    # remove the source files only after the archive is closed;
    # i.filename is an archive-relative name, so removing it directly is unreliable
    for filePath in archived_paths:
        os.remove(filePath)

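# Illustrative usage of zip_up_directory (the run date and path here are
# hypothetical): archives every .fa/.db/.names/.sff/.fasta/.fastq file under
# the directory into <run_date>.zip, then removes the archived originals.
#
#   zip_up_directory('20150223', '/path/to/run/analysis/trimming')
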
def __init__(self, lane_keys, runobj):
    self.inputFileName = {}
    self.orphans = {}
    self.lane_keys = lane_keys
    self.base_dir = runobj.output_dir
    self.trim_dir = os.path.join(self.base_dir, 'analysis/trimming')
    #self.chimera_dir = os.path.join(self.base_dir, 'analysis/chimera')
    self.deleted_ids = {}
    for lane_key in lane_keys:
        self.inputFileName[lane_key] = os.path.join(self.trim_dir, lane_key + ".trimmed.fa")
        self.orphans[lane_key] = {}
        deleted_file = os.path.join(self.trim_dir, lane_key + ".deleted.txt")
        self.deleted_ids[lane_key] = []
        if not (os.path.exists(deleted_file) and os.path.getsize(deleted_file) > 0):
            logger.debug("No deleted sequences for lane: " + lane_key)
            continue
        del_fh = open(deleted_file, "r")
        # the first whitespace-delimited field on each line is the read id
        for line in del_fh.readlines():
            lst = line.strip().split()
            self.deleted_ids[lane_key].append(lst[0])
        del_fh.close()

def write_clean_fasta_file(self):
    """
    Write a new fasta from the original fasta file using the deleted file.
    The deleted file contains the trimming deleted as well as the chimera deleted.
    Then write the uniques using Meren's fastalib.
    """
    sleep(2)
    for lane_key in self.lane_keys:
        logger.debug("write_clean_fasta_file working on lanekey: " + lane_key)
        original_trimmed_file = os.path.join(self.outputdir, lane_key + ".trimmed.fa")
        new_trimmed_file_name = os.path.join(self.outputdir, lane_key + ".newtrimmed.fa")
        new_trimmed_file = FastaOutput(new_trimmed_file_name)
        # open the trimmed file and read it a sequence at a time
        trimmedfasta = SequenceSource(original_trimmed_file)
        logger.debug("write_clean_fasta_file about to check trimmedfasta file")
        deleted_id_list = self.deleted_ids[lane_key]
        if len(deleted_id_list) == 0:
            continue
        while trimmedfasta.next():
            if trimmedfasta.id not in deleted_id_list:
                new_trimmed_file.store(trimmedfasta)
        new_trimmed_file.close()
        # rename: newtrimmed => trimmed
        os.rename(original_trimmed_file,
                  os.path.join(self.outputdir, lane_key + ".trimmed_with_chimera.fa"))
        os.rename(new_trimmed_file_name, original_trimmed_file)

def get_keys(runobj):
    try:
        idx_keys = convert_unicode_dictionary_to_str(
            json.loads(open(runobj.trim_status_file_name, "r").read()))["new_lane_keys"]
        # {"status": "success", "new_lane_keys": ["1_GATGA"]}
    except:
        # here we have no idx_keys - must create them from the run:
        # if illumina they are an index_runkey_lane concatenation
        # if 454 they are lane_key
        if runobj.vamps_user_upload:
            #print 'KEYS: '+' '.join(runobj.run_keys)
            idx_keys = runobj.samples.keys()
        else:
            if runobj.platform == 'illumina':
                idx_keys = runobj.idx_keys
                ct = 0
                for h in runobj.samples:
                    logger.debug(h)
                    # logger.debug(h, runobj.samples[h])  #TypeError: not all arguments converted during string formatting
                    ct += 1
            elif runobj.platform == '454':
                idx_keys = runobj.idx_keys
            elif runobj.platform == 'ion_torrent':
                idx_keys = runobj.idx_keys
            else:
                logger.debug("GAST: No keys found - Exiting")
                runobj.run_status_file_h.write("GAST: No keys found - Exiting\n")
                sys.exit()
        if type(idx_keys) is types.StringType:
            return idx_keys.split(',')
        elif type(idx_keys) is types.ListType:
            return idx_keys
        else:
            return None
    return idx_keys

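# `convert_unicode_dictionary_to_str` is defined elsewhere in the pipeline.
# A minimal sketch of what such a helper typically does -- an assumption, not
# the pipeline's actual implementation -- is to recursively turn the unicode
# objects that json.loads() returns under Python 2 back into plain str:
def convert_unicode_dictionary_to_str_sketch(obj):
    if isinstance(obj, dict):
        return dict((convert_unicode_dictionary_to_str_sketch(k),
                     convert_unicode_dictionary_to_str_sketch(v))
                    for k, v in obj.items())
    if isinstance(obj, list):
        return [convert_unicode_dictionary_to_str_sketch(i) for i in obj]
    if isinstance(obj, unicode):
        return obj.encode('utf-8')
    return obj
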
def trim_anchor_helper(anchor_name, expanded_anchor_sequences, freedom, length, start, sequence):
    exact = ''
    exactTrimmedOff = ''
    logger.debug('looking for anchor: ' + anchor_name + " start: " + str(start) + " length: " + str(length))
    max_divergence = C.max_divergence
    logger.debug('anchor_list: ' + str(expanded_anchor_sequences))
    list_of_tuples = anchortrim.generate_tuples(start, freedom, length,
                                                list_of_tuples=[], reversed_read=False)
    logger.debug('anchor tuples: ' + str(list_of_tuples))
    anchor, location = anchortrim.find_best_distance(sequence, expanded_anchor_sequences,
                                                     max_divergence, list_of_tuples)
    if anchor and location:
        logger.debug('anchor: ' + anchor + ' loc tuple: ' + str(location))
        trimmed_sequence = sequence[:location[1]]  # same thing here for the reversed == False
        exact = anchor
        exactTrimmedOff = sequence[location[1]:]
    else:
        logger.debug('no anchor location found')
        trimmed_sequence = sequence
    return exact, exactTrimmedOff, trimmed_sequence

def trim(run):
    # (re) create the trim status file
    run.trim_status_file_h = open(run.trim_status_file_name, "w")
    # do the trim work
    mytrim = TrimRun(run)
    # pass True to write out the straight fasta file of all trimmed non-deleted seqs
    # Remember: this is before chimera checking
    trim_codes = mytrim.trimrun(True)
    trim_results_dict = {}
    if trim_codes[0] == 'SUCCESS':
        # setup to write the status
        new_lane_keys = trim_codes[2]
        trim_results_dict['status'] = "success"
        trim_results_dict['new_lane_keys'] = new_lane_keys
        logger.debug("Trimming finished successfully")
        # write the data files
        mytrim.write_data_files(new_lane_keys)
        run.trim_status_file_h.write(json.dumps(trim_results_dict))
        run.trim_status_file_h.close()
    else:
        logger.debug("Trimming finished ERROR")
        trim_results_dict['status'] = "error"
        trim_results_dict['code1'] = trim_codes[1]
        trim_results_dict['code2'] = trim_codes[2]
        run.trim_status_file_h.write(json.dumps(trim_results_dict))
        run.trim_status_file_h.close()
        sys.exit()

def illumina_files(runobj):
    logger.debug("Start Overlap, filter and unique reads")
    utils = PipelneUtils()
    start = time.time()
    # illumina_files_demultiplex_only(runobj)
    illumina_files_inst = IlluminaFiles(runobj)
    if runobj.do_perfect:
        # illumina_files_inst.perfect_reads()
        script_file_name = illumina_files_inst.merge_perfect()
        utils.run_until_done_on_cluster(script_file_name)
        script_file_name = illumina_files_inst.trim_primers_perfect()
        utils.run_until_done_on_cluster(script_file_name)
    else:
        # illumina_files_inst.partial_overlap_reads()
        # TODO: test utils.run_until_done_on_cluster(illumina_files_inst.partial_overlap_reads_cluster())
        # TODO: add cutting to 251
        script_file_name = illumina_files_inst.partial_overlap_reads_cluster()
        utils.run_until_done_on_cluster(script_file_name)
        script_file_name = illumina_files_inst.filter_mismatches_cluster()
        utils.run_until_done_on_cluster(script_file_name)
        # illumina_files_inst.filter_mismatches()
        # illumina_files_inst.uniq_fa()
        script_file_name = illumina_files_inst.uniq_fa_cluster()
        utils.run_until_done_on_cluster(script_file_name)
        # illumina_chimera(runobj)
    elapsed = (time.time() - start)
    logger.debug("illumina_files time = %s" % str(elapsed))

def get_keys(runobj):
    try:
        idx_keys = convert_unicode_dictionary_to_str(
            json.loads(open(runobj.trim_status_file_name, "r").read()))["new_lane_keys"]
        # {"status": "success", "new_lane_keys": ["1_GATGA"]}
    except:
        # here we have no idx_keys - must create them from the run:
        # if illumina they are an index_runkey_lane concatenation
        # if 454 they are lane_key
        if runobj.platform == "illumina":
            idx_keys = runobj.idx_keys
            ct = 0
            for h in runobj.samples:
                # the original passed two args to logger.debug, which raises
                # "TypeError: not all arguments converted during string formatting"
                logger.debug("%s %s" % (h, runobj.samples[h]))
                ct += 1
            print ct
        elif runobj.platform == "454":
            idx_keys = runobj.idx_keys
        elif runobj.platform == "ion_torrent":
            idx_keys = runobj.idx_keys
        elif runobj.platform == "vamps":
            idx_keys = [runobj.user + runobj.run]
        else:
            logger.debug("GAST: No keys found - Exiting")
            runobj.run_status_file_h.write("GAST: No keys found - Exiting\n")
            sys.exit()
        if type(idx_keys) is types.StringType:
            return idx_keys.split(",")
        elif type(idx_keys) is types.ListType:
            return idx_keys
        else:
            return None
    return idx_keys

def chimera_checking(self, ref_or_novo):
    chimera_region_found = False
    output = {}
    for idx_key in self.input_file_names:
        input_file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
        output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])
        dna_region = self.runobj.samples[idx_key].dna_region
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)
            continue
        ref_db = self.get_ref_db(dna_region)
        uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
        print "\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd)
        try:
            logger.info("chimera checking command: " + str(uchime_cmd))
            output[idx_key] = subprocess.Popen(uchime_cmd, shell=True,
                                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except OSError, e:
            print "Problems with this command: %s" % (uchime_cmd)
            # the original repeated the same stderr print in both branches of
            # an is_local() check, so the branch is collapsed here
            print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
            raise

def process(runobj, steps):
    requested_steps = steps.split(",")
    if "clean" in requested_steps and len(requested_steps) > 1:
        sys.exit("The clean step cannot be combined with other steps - Exiting")
    # create the output directory:
    # this should have been created in pipeline-ui.py, but just in case....
    if not os.path.exists(runobj.output_dir):
        logger.debug("Creating output directory: " + runobj.output_dir)
        os.makedirs(runobj.output_dir)
    # Open the run STATUS file here, in append mode, because we may start the
    # run in the middle (say at the gast stage) and don't want to overwrite.
    # If we re-run trimming we'll get two trim status reports.
    runobj.run_status_file_h = open(runobj.run_status_file_name, "a")
    # loop through the official list...this way we execute the
    # user's requested steps in the correct order
    for step in C.existing_steps:
        if step in requested_steps:
            # look up the step's function by name and call it
            step_method = globals()[step]
            step_method(runobj)

def chimera_checking(self, ref_or_novo):
    chimera_region_found = False
    output = {}
    for idx_key in self.input_file_names:
        input_file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
        output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])
        dna_region = self.runobj.samples[idx_key].dna_region
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)
            continue
        ref_db = self.get_ref_db(dna_region)
        uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
        self.utils.print_both("\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd))
        try:
            logger.info("chimera checking command: " + str(uchime_cmd))
            output[idx_key] = subprocess.Popen(uchime_cmd, shell=True,
                                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except OSError, e:
            self.utils.print_both("Problems with this command: %s" % (uchime_cmd))
            if self.utils.is_local():
                print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
            else:
                print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                self.utils.print_both("Execution of %s failed: %s" % (uchime_cmd, e))
            raise

def is_local(self):
    curr_hostname = os.uname()[1]
    logger.debug("curr_hostname: %s" % curr_hostname)
    dev_comps = ["ashipunova.mbl.edu", "as-macbook.home", "as-macbook.local",
                 "Ashipunova.local", "Annas-MacBook-new.local", "Annas-MacBook.local",
                 "Anna's MacBook Pro",
                 # NOTE: the next entry fuses several hostnames into one
                 # space-separated string, so none of them can ever match:
                 "annasmacbooknew.mbl.edu mblad.mbl.edu printers.mbl.edu jbpc.mbl.edu jbpc-np.mbl.edu"]
    return curr_hostname in dev_comps

def is_vamps(self):
    curr_hostname = os.uname()[1]
    logger.debug("curr_hostname: %s" % curr_hostname)
    vamps_comps = ['bpcweb8', 'bpcweb7', 'bpcweb7.bpcservers.private',
                   'bpcweb8.bpcservers.private', 'vampsdev', 'vampsdb']
    return curr_hostname in vamps_comps

def insert_sequence_uniq_info_ill(self, fasta, gast_dict):
    (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts,
     max_pcts, na_pcts, refhvr_ids) = gast_dict[fasta.id]
    sequence_ill_id = self.seq_id_dict[fasta.seq]
    if taxonomy in self.tax_id_dict:
        try:
            taxonomy_id = self.tax_id_dict[taxonomy]
        except Exception, e:
            logger.debug("Error = %s" % e)
            raise

def get_ref_db(self, dna_region):
    ref_db = ''
    if dna_region.upper() == 'ITS':
        logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
        ref_db = self.its_refdb
    else:
        logger.debug("using standard refdb: " + self.refdb)
        ref_db = self.refdb
    return ref_db

def illumina_files_demultiplex_only(runobj):
    logger.debug("Start Demultiplex Illumina files by index/run_key/lane")
    start = time.time()
    illumina_files_inst = IlluminaFiles(runobj)
    illumina_files_inst.open_dataset_files()
    illumina_files_inst.split_files(compressed=runobj.compressed)
    elapsed = (time.time() - start)
    logger.debug("illumina_files demultiplex only time = %s" % str(elapsed))

def get_ref_db(self, dna_region):
    ref_db = ''
    if dna_region.upper() == 'ITS':
        ref_db = C.chimera_checking_its_refdb
        logger.debug("got an ITS dna region so using refdb: " + ref_db)
    else:
        ref_db = C.chimera_checking_refdb
        if self.utils.is_local():
            ref_db = C.chimera_checking_refdb_local
        logger.debug("using standard refdb: " + ref_db)
    return ref_db

def status(runobj):
    f = open(runobj.run_status_file_name)
    lines = f.readlines()
    f.close()
    logger.debug("=" * 40)
    logger.debug("STATUS LOG: ")
    for line in lines:
        line = line.strip()
        logger.debug("line in run_status_file: ")
        logger.debug(line)
    logger.debug("=" * 40 + "\n")

def trim_fuzzy_distal(anchors_list, seq, trim_type, start, end):
    """Look for a fuzzy (within edit-distance tolerance) distal anchor and trim at it."""
    max_distance = 3
    best_distance = max_distance + 1
    found_fuzzy = 0
    fuzzy_match = ""
    best_position = 0
    for anchor in anchors_list:
        anchor_length = len(anchor)
        for pos in range(start, end):
            # window of anchor_length characters starting at pos;
            # the original sliced seq[pos:anchor_length], which shrinks to an
            # empty window once pos reaches anchor_length
            seq_window = seq[pos:pos + anchor_length]
            dist1 = abs(levenshtein(anchor, seq_window))
            dist2 = abs(levenshtein(seq_window, anchor))
            dist = max(dist1, dist2)
            if (dist <= max_distance) and (dist < best_distance) and (seq_window[:2] == anchor[:2]):
                if seq_window[-3:] != anchor[-3:]:
                    # check for deletion
                    if seq_window[-4:][:3] == anchor[-3:]:
                        # drop the extra trailing base (the original called
                        # seq_window.strip(), a no-op here)
                        seq_window = seq_window[:-1]
                        logger.debug("Fuzzy with deletion " + seq_window)
                    # check for insertion
                    elif seq_window[-3:] == anchor[-4:][:3]:
                        seq_window = seq_window + anchor[-1:]
                        logger.debug("fuzzy with insertion " + seq_window)
                # Found a fuzzy match within tolerances, so store it
                found_fuzzy = 1
                best_distance = dist
                best_position = pos
                fuzzy_match = seq_window
                if dist == 0:
                    break
    fuzz_right = ''
    if found_fuzzy:
        whole_seq = seq
        if trim_type == 'internal':
            seq = seq[:best_position + len(fuzzy_match)]
        else:
            seq = seq[:best_position]
        # everything to the right of the trim point (the original assigned
        # fuzzy_right but sliced the never-set fuzz_right, always returning '')
        fuzz_right = whole_seq[len(seq):]
    return fuzz_right, best_distance, seq, fuzzy_match

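# `levenshtein` (used above) is assumed to be a plain edit-distance helper
# defined elsewhere in the module; a minimal Wagner-Fischer sketch:
def levenshtein_sketch(a, b):
    prev = list(range(len(b) + 1))
    for i in range(1, len(a) + 1):
        curr = [i] + [0] * len(b)
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            # each cell holds the edit distance between a[:i] and b[:j]
            curr[j] = min(prev[j] + 1,         # deletion
                          curr[j - 1] + 1,     # insertion
                          prev[j - 1] + cost)  # substitution
        prev = curr
    return prev[len(b)]

# e.g. levenshtein_sketch('ACGT', 'AGGT') == 1
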
def file_to_db_upload_seq(my_file_to_db_upload, filename, sequences):
    insert_seq_time_start = time.time()
    try:
        logger.debug("\n----------------\nfilename = %s" % filename)
        my_file_to_db_upload.seq.insert_seq(sequences)
        insert_seq_time = (time.time() - insert_seq_time_start)
        logger.debug("insert_seq() took %s sec to finish" % insert_seq_time)
    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:")
        logger.error(sys.exc_info()[0])  # info about the current exception (type, value, traceback)
        raise  # re-throw the caught exception

def is_local(self):
    curr_hostname = os.uname()[1]
    logger.debug("curr_hostname: %s" % curr_hostname)
    dev_comps = [
        "ashipunova.mbl.edu", "as-macbook.home", "as-macbook.local",
        "Ashipunova.local", "Annas-MacBook-new.local", "Annas-MacBook.local",
        "Anna's MacBook Pro",
        # NOTE: the next entry fuses several hostnames into one
        # space-separated string, so none of them can ever match:
        "annasmacbooknew.mbl.edu mblad.mbl.edu printers.mbl.edu jbpc.mbl.edu jbpc-np.mbl.edu",
        "AnnasMacBook.local"
    ]
    return curr_hostname in dev_comps

def get_gasta_result(self, filename):
    gast_file_name = self.gast_filename(filename)
    try:
        with open(gast_file_name) as fd:
            gast_dict = dict([(l.split("\t")[0], l.split("\t")[1:]) for l in fd])
        return gast_dict
    except IOError, e:
        logger.debug("errno = %s" % e.errno)
        if e.errno == 2:
            # suppress the "No such file or directory" error
            pass

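# Each line of a GAST result file is tab-separated: a read id followed by the
# fields unpacked elsewhere in this module as (taxonomy, distance, rank,
# refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts, refhvr_ids),
# so gast_dict maps read id -> that list of strings.
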
def create_chimera_cmd(self, ref_db):
    """
    Examples of the commands this method generates:

    /usr/local/bin/vsearch -uchime_denovo /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg -uchimeout /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt -chimeras /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt.chimeric.fa -notrunclabels

    /usr/local/bin/vsearch -uchime_ref /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg -uchimeout /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db -chimeras /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db.chimeric.fa -notrunclabels -strand plus -db /groups/g454/blastdbs/rRNA16S.gold.fasta
    """
    command_line = []
    ref_or_novo_options = {
        self.denovo_suffix: "-uchime_denovo",
        self.ref_suffix: "-uchime_ref"
    }
    for suff, opt in ref_or_novo_options.items():
        input_file_name = self.indir + "/$filename_base" + self.chg_suffix
        output_file_name = self.outdir + "/$filename_base" + self.chimeras_suffix + suff
        ref_add = ""
        if opt == "-uchime_ref":
            ref_add = "-strand plus -db %s" % ref_db
        uchime_cmd = """%s %s %s -uchimeout %s -chimeras %s%s -notrunclabels %s """ % (
            self.usearch_cmd, opt, input_file_name, output_file_name,
            output_file_name, self.chimeric_suffix, ref_add)
        logger.debug("UUU = uchime_cmd = %s" % uchime_cmd)
        logger.debug("+++")
        command_line.append(uchime_cmd)
    return command_line

def insert_taxonomy(self, fasta, gast_dict):
    if gast_dict:
        (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts,
         max_pcts, na_pcts, refhvr_ids) = gast_dict[fasta.id]
        if taxonomy in self.tax_id_dict:
            # if we already had this taxonomy in this run, skip the insert and
            # use the cached id (the original had a bare `next` here, a no-op
            # that left tax_id unbound)
            tax_id = self.tax_id_dict[taxonomy]
        else:
            tax_id = self.get_id("taxonomy", taxonomy)
            if tax_id:
                self.tax_id_dict[taxonomy] = tax_id
            else:
                my_sql = """INSERT IGNORE INTO taxonomy (taxonomy) VALUES ('%s')""" % (taxonomy.rstrip())
                tax_id = self.my_conn.execute_no_fetch(my_sql)
                self.tax_id_dict[taxonomy] = tax_id
        return tax_id
    else:
        print "ERROR: can't read gast files! No taxonomy information will be processed."
        logger.debug("ERROR: can't read gast files! No taxonomy information will be processed.")

def process(run, steps):
    requested_steps = steps.split(",")
    # create the output directory:
    if not os.path.exists(run.output_dir):
        logger.debug("Creating output directory: " + run.output_dir)
        os.makedirs(run.output_dir)
    # validate each requested step against the official list,
    # then execute the steps in the order requested
    for step in requested_steps:
        if step not in existing_steps:
            print "Invalid processing step: " + step
            sys.exit()
        else:
            # look up the step's function by name and call it
            step_method = globals()[step]
            step_method(run)

def check_if_array_job_is_done(self, job_name):
    cluster_done = False
    check_qstat_cmd_line = "qstat -r | grep %s | wc -l" % job_name
    logger.debug("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
    try:
        p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
        (output, err) = p.communicate()
        num_proc = int(output)
        logger.debug("qstat is running %s '%s' processes" % (num_proc, job_name))
        if num_proc == 0:
            cluster_done = True
    except:
        logger.error("%s can be done only on a cluster." % job_name)
        raise
    return cluster_done

def trim(runobj):
    # def is in utils.py
    # open_zipped_directory(runobj.run_date, runobj.output_dir)
    # (re) create the trim status file
    runobj.trim_status_file_h = open(runobj.trim_status_file_name, "w")
    # do the trim work
    mytrim = TrimRun(runobj)
    # pass True to write out the straight fasta file of all trimmed non-deleted seqs
    # Remember: this is before chimera checking
    if runobj.platform == "illumina":
        trim_codes = mytrim.trimrun_illumina(True)
    elif runobj.platform == "454":
        trim_codes = mytrim.trimrun_454(True)
    elif runobj.platform == "ion-torrent":
        trim_codes = mytrim.trimrun_ion_torrent(True)
    else:
        # three elements, since the error branch below reads trim_codes[2]
        trim_codes = ["ERROR", "No Platform Found", ""]
    trim_results_dict = {}
    if trim_codes[0] == "SUCCESS":
        # setup to write the status
        new_lane_keys = trim_codes[2]
        trim_results_dict["status"] = "success"
        trim_results_dict["new_lane_keys"] = new_lane_keys
        logger.debug("Trimming finished successfully")
        # write the data files
        mytrim.write_data_files(new_lane_keys)
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict))
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.run_status_file_h.close()
    else:
        logger.debug("Trimming finished ERROR")
        trim_results_dict["status"] = "error"
        trim_results_dict["code1"] = trim_codes[1]
        trim_results_dict["code2"] = trim_codes[2]
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict))
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.run_status_file_h.close()
        sys.exit("Trim Error")

def process(runobj, steps):
    requested_steps = steps.split(",")
    if 'clean' in requested_steps and len(requested_steps) > 1:
        sys.exit("The clean step cannot be combined with other steps - Exiting")
    # Open the run STATUS file here, in append mode, because we may start the
    # run in the middle (say at the gast stage) and don't want to overwrite.
    # If we re-run trimming we'll get two trim status reports.
    runobj.run_status_file_h = open(runobj.run_status_file_name, "a")
    # loop through the official list...this way we execute the
    # user's requested steps in the correct order
    for step in C.existing_steps:
        if step in requested_steps:
            logger.debug('RUN: %s' % step)
            step_method = globals()[step]
            step_method(runobj)

def file_to_db_upload_all_but_seq(my_file_to_db_upload, filename, no_run_info_list, full_upload):
    # note: despite the name, total_time accumulates the sequence count,
    # which is what the caller sums up
    total_time = 0
    try:
        my_file_to_db_upload.get_gast_result(os.path.basename(filename))
        filename_base_no_suff = get_filename_base_no_suff(filename)
        run_info_ill_id = my_file_to_db_upload.get_run_info_ill_id(filename_base_no_suff)
        if run_info_ill_id:
            my_file_to_db_upload.collect_project_ids(run_info_ill_id)
            seq_in_file = len(my_file_to_db_upload.seq.fasta_dict)
            my_file_to_db_upload.put_seq_statistics_in_file(filename, seq_in_file)
            total_time += seq_in_file

            start_fasta_next = time.time()

            start_insert_pdr_info_time = time.time()
            my_file_to_db_upload.insert_pdr_info(run_info_ill_id)
            insert_pdr_info_time = (time.time() - start_insert_pdr_info_time)

            start_insert_taxonomy_time = time.time()
            my_file_to_db_upload.insert_taxonomy()
            insert_taxonomy_time = (time.time() - start_insert_taxonomy_time)

            start_insert_sequence_uniq_info_time = time.time()
            my_file_to_db_upload.insert_sequence_uniq_info()
            insert_sequence_uniq_info_time = (time.time() - start_insert_sequence_uniq_info_time)

            logger.debug("start_fasta_loop took %s sec to finish" % (time.time() - start_fasta_next))
            logger.debug("insert_pdr_info took %s sec to finish" % insert_pdr_info_time)
            logger.debug("insert_taxonomy took %s sec to finish" % insert_taxonomy_time)
            logger.debug("insert_sequence_uniq_info took %s sec to finish" % insert_sequence_uniq_info_time)
            return total_time
        else:
            utils = PipelneUtils()
            no_run_info_list.append(filename_base_no_suff)
            utils.print_both("ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db" % filename)
            return 0
    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:")
        logger.error(sys.exc_info()[0])  # info about the current exception (type, value, traceback)
        raise  # re-throw the caught exception

def update_sequence_uniq_info_ill(self, fasta, gast_dict):
    if gast_dict:
        (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts,
         max_pcts, na_pcts, refhvr_ids) = gast_dict[fasta.id]
        seq_upper = fasta.seq.upper()
        sequence_ill_id = self.seq_id_dict[seq_upper]
        if taxonomy in self.tax_id_dict:
            try:
                taxonomy_id = self.tax_id_dict[taxonomy]
            except Exception, e:
                logger.debug("Error = %s" % e)
                raise
            my_sql = """INSERT IGNORE INTO sequence_uniq_info_ill
                            (sequence_ill_id, taxonomy_id, gast_distance, refssu_count, rank_id, refhvr_ids)
                        VALUES (%s, %s, '%s', '%s',
                                (SELECT rank_id FROM rank WHERE rank = '%s'),
                                '%s')
                        ON DUPLICATE KEY UPDATE
                            updated = (CASE WHEN taxonomy_id <> %s THEN NOW() ELSE updated END),
                            taxonomy_id = %s,
                            gast_distance = '%s',
                            refssu_count = '%s',
                            rank_id = (SELECT rank_id FROM rank WHERE rank = '%s'),
                            refhvr_ids = '%s'
                     """ % (sequence_ill_id, taxonomy_id, distance, refssu_count, rank, refhvr_ids.rstrip(),
                            taxonomy_id, taxonomy_id, distance, refssu_count, rank, refhvr_ids.rstrip())
            res_id = self.my_conn.execute_no_fetch(my_sql)
            return res_id

def chimera_checking(self):
    chimera_region_found = False
    file_list = self.dirs.get_all_files_by_ext(self.indir, self.chg_suffix)
    logger.debug("FFF = file_list = %s" % (file_list))
    # TODO: method
    dna_region = list(set([self.runobj.samples[idx_key].dna_region
                           for idx_key in self.input_file_names]))[0]
    if dna_region in C.regions_to_chimera_check:
        chimera_region_found = True
    else:
        logger.debug('region not checked: ' + dna_region)
    ref_db = self.get_ref_db(dna_region)
    command_line = self.create_chimera_cmd(ref_db)
    sh_script_file_name = self.create_job_array_script("chimera_checking", command_line,
                                                       self.indir, file_list)
    script_file_name_full = os.path.join(self.indir, sh_script_file_name)
    self.utils.call_sh_script(script_file_name_full, self.indir)
    self.utils.print_both("self.dirs.chmod_all(%s)" % (self.indir))
    self.dirs.chmod_all(self.indir)
    logger.debug('sh_script_file_name: ' + sh_script_file_name)
    if not chimera_region_found:
        return ('NOREGION', 'No regions found that need checking', '')
    else:
        return "The vsearch commands were created"

def env454upload_main(runobj, full_upload):
    """
    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s env454upload -l debug
    For now upload only Illumina data to env454 from files, assuming that all run info
    is already on env454 (run, run_key, dataset, project, run_info_ill tables).
    Tables:
        sequence_ill
        sequence_pdr_info_ill
        taxonomy
        sequence_uniq_info_ill
    """
    whole_start = time.time()
    my_env454upload = dbUpload(runobj)
    filenames = my_env454upload.get_fasta_file_names()
    if not filenames:
        logger.debug("\nThere is something wrong with the fasta files or their names; "
                     "please check paths, contents and suffixes in %s." % my_env454upload.fasta_dir)
    for filename in filenames:
        sequences = my_env454upload.make_seq_upper(filename)
        if full_upload:
            env454upload_seq(my_env454upload, filename, sequences)
        wrapped = wrapper(my_env454upload.get_seq_id_dict, sequences)
        get_seq_id_dict_time = timeit.timeit(wrapped, number=1)
        logger.debug("get_seq_id_dict() took %s sec to finish" % get_seq_id_dict_time)
    total_seq = env454upload_all_but_seq(my_env454upload, filenames, full_upload)
    my_env454upload.check_seq_upload()
    logger.debug("total_seq = %s" % total_seq)
    whole_elapsed = (time.time() - whole_start)
    print "The whole upload took %s s" % whole_elapsed

def vampsupload(runobj):
    """
    Upload data files to the VAMPS database.
    """
    # For vamps, 'new_lane_keys' will be the prefix of the uniques and names
    # file that was just created in vamps_gast.py, or we can get the
    # 'lane_keys' directly from the config_file.
    # For illumina, a unique idx_key is a concatenation of barcode_index and run_key.
    idx_keys = get_keys(runobj)
    # if(runobj.vamps_user_upload):
    #     idx_keys = [runobj.user + runobj.runcode]
    # else:
    #     idx_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name, "r").read()))["new_lane_keys"]
    # NOT NEEDED HERE: Find duplicate project names.
    # If a vamps user uploads, this has already been done and the project is
    # already in the vamps_upload_info table.
    # If the data come from a csv file (illumina and 454) this also is not
    # needed, as the data are checked in metadata.py.
    myvamps = Vamps(runobj, idx_keys)
    # create files
    myvamps.create_vamps_files()
    # put files in db
    result_code = myvamps.load_vamps_db()
    if result_code[:5] == 'ERROR':
        logger.error("load_vamps_db failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST_ERROR", result_code)
        sys.exit("load_vamps_db failed")
    elif runobj.vamps_user_upload:
        logger.debug("Finished loading VAMPS data. %s" % result_code)
        write_status_to_vamps_db(runobj.site, runobj.run, 'GAST_SUCCESS', 'Loading VAMPS Finished')

def chimera_reference(self, lane_keys):
    chimera_region_found = False
    output = {}
    cluster_id_list = []
    for lane_key in lane_keys:
        dna_region = self.run.samples[lane_key].dna_region
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)
            continue
        out_fileName = self.prefix[lane_key] + ".chimeras.db"
        # which ref db to use?
        ref_db = ''
        if dna_region.upper() == 'ITS':
            logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
            ref_db = self.its_refdb
        else:
            logger.debug("using standard refdb: " + self.refdb)
            ref_db = self.refdb
        uchime_cmd = ["clusterize"]
        uchime_cmd.append(self.usearch_cmd)
        uchime_cmd.append("--uchime")
        uchime_cmd.append(self.files[lane_key]['abund'])
        uchime_cmd.append("--uchimeout")
        uchime_cmd.append(out_fileName)
        uchime_cmd.append("--db")
        uchime_cmd.append(ref_db)
        try:
            print "chimera reference command: " + str(uchime_cmd)
            output[lane_key] = subprocess.check_output(uchime_cmd)
            cluster_id_list.append(output[lane_key].split()[2])
            if len(output[lane_key]) < 50 and len(output[lane_key]) > 40:
                logger.debug(lane_key + " uchime ref seems to have been submitted successfully")
            else:
                print >>sys.stderr, "uchime ref may be broken"
        except OSError, e:
            print >>sys.stderr, "Execution failed:", e

def trim(runobj):
    # def is in utils.py
    # open_zipped_directory(runobj.run_date, runobj.output_dir)
    # (re) create the trim status file
    runobj.trim_status_file_h = open(runobj.trim_status_file_name, "w")
    idx_keys = get_keys(runobj)
    # do the trim work
    mytrim = TrimRun(runobj, idx_keys)
    # pass True to write out the straight fasta file of all trimmed non-deleted seqs
    # Remember: this is before chimera checking
    # trim_codes should always be a tuple with 3 elements!
    if runobj.vamps_user_upload:
        trim_codes = mytrim.trimrun_vamps(True)
    else:
        if runobj.platform == 'illumina':
            trim_codes = mytrim.filter_illumina()
            # trim_codes = mytrim.trim_illumina(file_list=trim_codes[2])
        elif runobj.platform == '454':
            trim_codes = mytrim.trimrun_454(True)
        elif runobj.platform == 'ion-torrent':
            trim_codes = mytrim.trimrun_ion_torrent(True)
        else:
            trim_codes = ('ERROR', 'No Platform Found', '')
    trim_results_dict = {}
    if trim_codes[0] == 'SUCCESS':
        # setup to write the status
        new_lane_keys = trim_codes[2]
        trimmed_seq_count = trim_codes[1]
        if trimmed_seq_count == 0 or trimmed_seq_count == '0':
            trim_results_dict['status'] = "ERROR"
            logger.debug("Trimming finished: ERROR: no seqs passed trim")
        else:
            trim_results_dict['status'] = "success"
            logger.debug("Trimming finished successfully")
        trim_results_dict['new_lane_keys'] = new_lane_keys
        trim_results_dict['trimmed_seq_count'] = trimmed_seq_count
        # write the data files
        mytrim.write_data_files(new_lane_keys)
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.run_status_file_h.close()
    else:
        logger.debug("Trimming finished ERROR")
        trim_results_dict['status'] = "ERROR"
        trim_results_dict['code1'] = trim_codes[1]
        trim_results_dict['code2'] = trim_codes[2]
        runobj.trim_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.trim_status_file_h.close()
        runobj.run_status_file_h.write(json.dumps(trim_results_dict) + "\n")
        runobj.run_status_file_h.close()
        sys.exit("Trim Error")

def write_clean_fasta_file(self):
    """
    Write a new fasta from the original fasta file using the deleted file.
    The deleted file contains the trimming deleted as well as the chimera deleted.
    Then write the uniques using Meren's fastalib.
    """
    sleep(2)
    for lane_key in self.lane_keys:
        logger.debug("write_clean_fasta_file working on lanekey: " + lane_key)
        original_trimmed_file = os.path.join(self.trim_dir, lane_key + ".trimmed.fa")
        new_trimmed_file_name = os.path.join(self.trim_dir, lane_key + ".newtrimmed.fa")
        new_trimmed_file = fa.FastaOutput(new_trimmed_file_name)
        # open the trimmed file and read it a sequence at a time
        trimmedfasta = fa.SequenceSource(original_trimmed_file)
        logger.debug("write_clean_fasta_file about to check trimmedfasta file")
        deleted_id_list = self.deleted_ids[lane_key]
        if len(deleted_id_list) == 0:
            continue
        while trimmedfasta.next():
            if trimmedfasta.id not in deleted_id_list:
                new_trimmed_file.store(trimmedfasta)
        new_trimmed_file.close()
        # rename: newtrimmed => trimmed
        os.rename(original_trimmed_file,
                  os.path.join(self.trim_dir, lane_key + ".trimmed_with_chimera.fa"))
        os.rename(new_trimmed_file_name, original_trimmed_file)

def insert_sequence_uniq_info_ill(self, fasta, gast_dict):
    if gast_dict:
        (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts,
         max_pcts, na_pcts, refhvr_ids) = gast_dict[fasta.id]
        seq_upper = fasta.seq.upper()
        sequence_ill_id = self.seq_id_dict[seq_upper]
        if taxonomy in self.tax_id_dict:
            try:
                taxonomy_id = self.tax_id_dict[taxonomy]
            except Exception, e:
                logger.debug("Error = %s" % e)
                raise
            my_sql = """INSERT IGNORE INTO sequence_uniq_info_ill
                            (sequence_ill_id, taxonomy_id, gast_distance, refssu_count, rank_id, refhvr_ids)
                        VALUES (%s, %s, '%s', '%s',
                                (SELECT rank_id FROM rank WHERE rank = '%s'),
                                '%s')
                     """ % (sequence_ill_id, taxonomy_id, distance, refssu_count, rank, refhvr_ids.rstrip())
            res_id = self.my_conn.execute_no_fetch(my_sql)
            return res_id

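# The %-interpolation above is SQL-injection-prone. If the underlying
# connection exposed a DB-API cursor (an assumption -- execute_no_fetch() is
# the pipeline's own wrapper), a parameterized equivalent would look like:
#
#   cursor.execute(
#       "INSERT IGNORE INTO sequence_uniq_info_ill "
#       "(sequence_ill_id, taxonomy_id, gast_distance, refssu_count, rank_id, refhvr_ids) "
#       "VALUES (%s, %s, %s, %s, (SELECT rank_id FROM rank WHERE rank = %s), %s)",
#       (sequence_ill_id, taxonomy_id, distance, refssu_count, rank, refhvr_ids.rstrip()))
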
def trim_stop_seq(stop_seqs, seq, trim_type, start, end):
    for anchor in stop_seqs:
        anchor_length = len(anchor)
        logger.debug("trim_stop_seq: " + anchor + " " + str(start) + " " + str(end) + " " + str(len(seq)))
        for pos in range(start, end):
            seq_window = seq[pos:pos + anchor_length]
            dist = abs(Levenshtein.ratio(anchor, seq_window))
            if dist == 1.0:
                # perfect match
                # do I trim off before or after the anchor?
                # before:
                return anchor, seq[pos + anchor_length:], seq[:pos]
                # after (include anchor in trimmed seq):
                # return anchor, seq[pos:], seq[:pos + anchor_length]
            if dist >= C.max_divergence:
                pass
    return '', '', seq

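# Note: Levenshtein.ratio() returns a similarity score in [0.0, 1.0], not an
# edit distance, so `dist == 1.0` above means a perfect match and the abs()
# call is redundant. Illustration (python-Levenshtein package):
#
#   >>> Levenshtein.ratio('ACGT', 'ACGT')
#   1.0
#   >>> Levenshtein.ratio('ACGT', 'AGGT')
#   0.75
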
def run_until_done_on_cluster(self, job_name):
    start = time.time()
    time_before = self.get_time_now()
    logger.debug("time_before = %s" % time_before)
    logger.debug("Waiting for the cluster...")
    while True:
        if self.is_local():
            time.sleep(1)
        else:
            time.sleep(120)
        cluster_done = self.check_if_array_job_is_done(job_name)
        logger.debug("cluster_done = %s" % cluster_done)
        if cluster_done:
            break
    elapsed = (time.time() - start)
    logger.debug("Cluster is done with %s in: %s" % (job_name, elapsed))

def illumina_chimera_after_cluster(runobj):
    mychimera = Chimera(runobj)
    mychimera.illumina_rm_size_files()
    start = time.time()
    mychimera.illumina_size_to_freq_in_chimer()
    elapsed = (time.time() - start)
    logger.debug("illumina_size_to_freq_in_chimer time: %s" % elapsed)
    # start = time.time()
    # logger.debug("Check chimeric statistics. If ref > 15% and ratio ref to de-novo > 2 use only de-novo")
    # mychimera.check_chimeric_stats()
    # elapsed = (time.time() - start)
    # logger.debug("check_chimeric_stats time: %s" % elapsed)
    start = time.time()
    logger.debug("Creating nonchimeric files in %s" % mychimera.indir)
    mychimera.move_out_chimeric()
    elapsed = (time.time() - start)
    logger.debug("move_out_chimeric time: %s" % elapsed)
    logger.debug("illumina_chimera_after_cluster time = %s" % str(elapsed))

def clean(runobj):
    """
    Removes a run from the database and output directory.
    """
    answer = raw_input("\npress 'y' to delete the run '" + runobj.run_date + "': ")
    if answer == 'y' or answer == 'Y':
        logger.debug("Removing run: %s" % runobj.run_date)
        for (archiveDirPath, dirNames, file_names) in os.walk(runobj.output_dir):
            for f in file_names:
                # join against the directory currently being walked so that
                # files in subdirectories are removed too
                file_path = os.path.join(archiveDirPath, f)
                logger.debug("file_path: ")
                logger.debug(file_path)
                os.remove(file_path)

def check_projects_and_datasets(self, data):
    self.get_my_conn()
    projects = {}
    datasets = {}
    error = False
    warn = False
    for item in data:
        if item != 'general':
            datasets[data[item]['dataset']] = data[item]['project']
            projects[data[item]['project']] = 1
    for p in projects:
        my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            logger.warning("project '" + p + "' already exists in the database - is this okay?")
            warn = True
        else:
            logger.debug("project '" + p + "' is new")
        ds_found_count = 0
        for d in datasets:
            if datasets[d] == p:
                my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d)
                res = self.my_conn.execute_fetch_select(my_sql)
                if res:
                    ds_found_count += 1
                    if ds_found_count > 3:
                        logger.warning("\t\tPossibly more .... - Exiting after just three")
                        break
                    logger.warning("\tdataset '" + d + "' already exists in the database - is this okay?")
                    warn = True
                else:
                    logger.debug("\tdataset '" + d + "' is new")
        logger.debug("\tDataset Count: " + str(len(datasets)))
    return (error, warn)

def get_keys(runobj):
    try:
        idx_keys = convert_unicode_dictionary_to_str(
            json.loads(open(runobj.trim_status_file_name, "r").read()))["new_lane_keys"]
        # {"status": "success", "new_lane_keys": ["1_GATGA"]}
    except:
        # here we have no idx_keys - must create them from the run:
        # if illumina they are an index_runkey_lane concatenation
        # if 454 they are lane_key
        if runobj.vamps_user_upload:
            # logger.debug('KEYS: ' + ' '.join(runobj.run_keys))
            idx_keys = runobj.samples.keys()
        else:
            if runobj.platform == 'illumina':
                idx_keys = runobj.idx_keys
                ct = 0
                for h in runobj.samples:
                    logger.debug("get_keys, h:")
                    logger.debug(h)
                    # logger.debug(h, runobj.samples[h])  #TypeError: not all arguments converted during string formatting
                    ct += 1
            elif runobj.platform == '454':
                idx_keys = runobj.idx_keys
            elif runobj.platform == 'ion_torrent':
                idx_keys = runobj.idx_keys
            else:
                logger.debug("GAST: No keys found - Exiting")
                runobj.run_status_file_h.write("GAST: No keys found - Exiting\n")
                sys.exit()
    if isinstance(idx_keys, str):
        return idx_keys.split(',')
    elif isinstance(idx_keys, list):
        return idx_keys
    else:
        return None

v.convert_and_save_ini(data_object['output_dir'])
data_object = v.validate(data_object['output_dir'])
#general_data = v.get_general_data()
answer = v.get_confirmation(args.steps, data_object['general'])
if answer == 'q':
    sys.exit()
elif answer == 'v':
    # view CONFIG file contents
    fh = open(os.path.join(dirs.analysis_dir, data_object['general']['run'] + '.ini'))
    lines = fh.readlines()
    logger.debug("\n=== START ===\n")
    for line in lines:
        line = line.strip()
        logger.debug("line in INI: ")
        logger.debug(line)
    logger.debug("==== END ====\n")
    sys.exit()
elif answer != 'c':
    sys.exit()

##############
#
# CREATE THE RUN OBJECT (see runconfig.py for details)
#
##############
runobj = Run(data_object, os.path.dirname(os.path.realpath(__file__)))

def wait_for_cluster_to_finish(my_running_id_list):
    logger.debug('Max run time set to ' + str(C.cluster_max_wait) + ' seconds')
    logger.debug('These are my running qsub IDs ' + str(my_running_id_list))
    my_working_id_list = my_running_id_list
    counter = 0
    time.sleep(C.cluster_initial_check_interval)
    while my_working_id_list:
        qstat_codes = get_qstat_id_list()
        if not qstat_codes['id']:
            logger.debug("id list not found: may need to increase initial_interval "
                         "if you haven't seen running ids.")
            return ('SUCCESS', 'id list not found', '')
        if 'Eqw' in qstat_codes['code']:
            logger.debug("Check cluster: may have error code(s), but they may not be mine!")
        if my_working_id_list[0] in qstat_codes['id']:
            idx = qstat_codes['id'].index(my_working_id_list[0])
            name = qstat_codes['name'][idx]
            user = qstat_codes['user'][idx]
            code = qstat_codes['code'][idx]
            if code == 'Eqw':
                return ('FAIL', 'Found Eqw code', my_working_id_list[0])
            elif code == 'qw':
                logger.debug("id is still queued: " + str(my_working_id_list[0]) + " " + str(code))
            elif code == 'r':
                logger.debug("id is still running: " + str(my_working_id_list[0]) + " " + str(code))
            else:
                logger.debug('Unknown qstat code ' + str(code))
        else:
            my_working_id_list = my_working_id_list[1:]
            logger.debug('id finished ' + str(my_working_id_list))
            if not my_working_id_list:
                return ('SUCCESS', 'not my_working_id_list', '')
        time.sleep(C.cluster_check_interval)
        counter = counter + C.cluster_check_interval
        if counter >= C.cluster_max_wait:
            return ('FAIL', 'Max Time exceeded', C.cluster_max_wait)
    return ('FAIL', 'Unknown', 'Unknown')

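# `get_qstat_id_list` is another pipeline helper (not shown here). A sketch of
# the shape wait_for_cluster_to_finish() expects -- parallel lists keyed by
# 'id', 'name', 'user' and 'code' -- assuming standard SGE `qstat` output with
# two header lines and columns (job-ID, prior, name, user, state, ...):
def get_qstat_id_list_sketch():
    out = subprocess.Popen('qstat', shell=True, stdout=subprocess.PIPE).communicate()[0]
    qstat_codes = {'id': [], 'name': [], 'user': [], 'code': []}
    for line in out.splitlines()[2:]:  # skip the header and separator lines
        fields = line.split()
        if len(fields) >= 5:
            qstat_codes['id'].append(fields[0])
            qstat_codes['name'].append(fields[2])
            qstat_codes['user'].append(fields[3])
            qstat_codes['code'].append(fields[4])
    return qstat_codes
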
def delete_file(self, filename):
    try:
        os.remove(filename)
        logger.debug("DELETE %s" % (filename))
    except OSError:
        pass

def print_both(self, message):
    logger.debug("print_both: ")
    print(message)
    logger.debug(message)