def wrapper_bedtools_intersect2(bedfile1, bedfile2, outfile=None):
    """
    Use two bedfiles to get the intersection of pairs
    :param bedfile1:
    :param bedfile2:
    :return: path of the intersection bedfile
    """
    if outfile is None:
        prefix1 = get_file_prefix(bedfile1)
        prefix2 = get_file_prefix(bedfile2)
        location = get_file_location(bedfile1)
        outfile = location + "/" + "_".join([prefix1, prefix2]) + ".bed"

    sort_cmd1 = "bedtools sort -i {bed} > {bed}_s".format(bed=bedfile1)
    sort_cmd2 = "bedtools sort -i {bed} > {bed}_s".format(bed=bedfile2)
    _ = myexe(sort_cmd1)
    _ = myexe(sort_cmd2)

    # generate the bedfile
    cmd = "bedtools intersect -wa -wb -a {bedfile1}_s -b {bedfile2}_s > {out}".format(
        bedfile1=bedfile1, bedfile2=bedfile2, out=outfile)
    _ = myexe(cmd)

    ### cleanup: remove the original and the sorted intermediate bedfiles
    bed1s = bedfile1 + "_s"
    bed2s = bedfile2 + "_s"
    del_files([bedfile1, bedfile2, bed1s, bed2s])

    return outfile
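A minimal usage sketch for wrapper_bedtools_intersect2, assuming bedtools is on PATH and the helpers (myexe, get_file_prefix, get_file_location, del_files) come from the surrounding module; the file names below are hypothetical, and note that both input bedfiles are deleted by the call.

# hypothetical inputs written earlier by bigglist_to_bedfile or similar
reads_bed = "/tmp/nano_exon.bed"       # assumed to exist
annotation_bed = "/tmp/gff_exon.bed"   # assumed to exist

out = wrapper_bedtools_intersect2(reads_bed, annotation_bed)
print(out)  # e.g. "/tmp/nano_exon_gff_exon.bed", one "-wa -wb" row per overlapping pair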
def save_models(self, val_losses=None, del_prev=True, not_saved=()):
    """save models 1) during an epoch when val_loss hits a new low, 2) after one epoch"""
    # - delete previous weights
    if del_prev:
        del_files(self.opt.save_model_path)

    # - save new weights
    if val_losses is None:
        val_losses = self.val_losses_min
    has_depth = not isinstance(
        val_losses['da/a1'], (int, list))  # when not initialized, da/a1 == 10 or []
    weights_name = '_'.join(
        ['weights', str(val_losses['loss/total'].numpy())[2:5]])
    if not self.opt.disable_gt and has_depth:
        weights_name = '_'.join(
            [weights_name, str(val_losses['da/a1'].numpy())[2:4]])
    print('-> Saving weights with new low loss:\n', self.val_losses_min)
    weights_path = os.path.join(self.opt.save_model_path, weights_name)
    if not os.path.isdir(weights_path) and not self.opt.debug_mode:
        os.makedirs(weights_path)

    for m_name, model in self.models.items():
        if m_name not in not_saved:
            m_path = os.path.join(weights_path, m_name + '.h5')
            tf.print("saving {} to:".format(m_name), m_path, output_stream=sys.stdout)
            model.save_weights(m_path)
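A hedged sketch of how this method might be invoked from a training loop and how the weights folder name is derived; the trainer instance, its opt fields, and the tensor-valued loss dict are assumptions, not part of the original code.

import tensorflow as tf

# hypothetical losses from a validation pass, tensor-valued as the method expects
val_losses = {
    'loss/total': tf.constant(0.123),  # digits after "0." -> folder "weights_123"
    'da/a1': tf.constant(0.87),        # appended as "..._87" when depth GT is enabled
}
trainer.save_models(val_losses=val_losses, del_prev=True)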
def prefilter_smallexon(bigg_list, bigg_list_gff, cutoff=50):
    """
    remove two kinds of reads:
    1. not on the same strand as the current annotation
    2. with less than cutoff intersection with the current annotation

    :param bigg_list:
    :param cutoff: at least 50bp intersection with current annotation
    :return: retained list
    """
    ## Anti-sense reads should not be removed in direct RNA sequencing,
    ## or novel anti-sense transcripts would be lost, so filter 1 is disabled.
    ## Intergenic transcripts would be lost due to filter 2,
    ## so the whole function is not used in the main flow.
    if len(bigg_list_gff) == 0:
        return bigg_list

    ## # filter 1
    ## strand = bigg_list_gff[0].strand
    ## bigg_list_strand = [x for x in bigg_list if x.strand == strand]
    bigg_list_strand = bigg_list

    if len(bigg_list_strand) == 0:
        return None

    # filter 2
    nano_exon, nano_intron = bigglist_to_bedfile(bigg_list_strand)
    gff_exon, gff_intron = bigglist_to_bedfile(bigg_list_gff)

    exonfile = wrapper_bedtools_intersect2(nano_exon, gff_exon)
    out_d = pandas_summary(exonfile)

    keep_name = set()
    for k, intersection in out_d.items():
        nano_name, gff_name = k
        if intersection > cutoff:
            keep_name.add(nano_name)

    bigg_list_new = []
    for bigg in bigg_list:
        if bigg.name in keep_name:
            bigg_list_new.append(bigg)

    try:
        ### clean up
        del_files([exonfile, nano_intron, gff_intron])
        ### also clean up the exon files
        del_files([nano_exon, gff_exon])
    except Exception as e:
        print("Cleanup in prefilter_smallexon is not successful: ", e)

    return bigg_list_new
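A usage sketch, under the assumption that bigg_list holds nanopore read records and bigg_list_gff holds the reference annotation for the same locus (both bigGenePred-like objects with .strand and .name); the variable names here are illustrative only.

# hypothetical objects parsed elsewhere, e.g. from bigGenePred files
reads = bigg_list_from_locus        # nanopore reads for one gene region
annotation = bigg_gff_from_locus    # reference isoforms for the same region

kept = prefilter_smallexon(reads, annotation, cutoff=50)
if kept is None:
    kept = []  # guard against the empty-input case, which returns None
# otherwise kept contains only reads whose exons overlap the annotation by more than 50 bp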
def __init__(self, dir):
    os.makedirs(dir, exist_ok=True)
    utils.del_files(dir)  # delete previous log files
    self.dir = dir
    self.step = 1
    prefix = 'events'
    path = osp.join(osp.abspath(dir), prefix)

    import tensorflow as tf
    from tensorflow.python import pywrap_tensorflow
    from tensorflow.core.util import event_pb2
    from tensorflow.python.util import compat
    self.tf = tf
    self.event_pb2 = event_pb2
    self.pywrap_tensorflow = pywrap_tensorflow
    self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path))
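A construction sketch, assuming the enclosing class is a TensorBoard-style output writer (the class name TensorBoardWriter is made up for illustration) and that a TensorFlow 1.x build exposing pywrap_tensorflow.EventsWriter is installed.

log_dir = "./logs/run0"              # hypothetical run directory
writer = TensorBoardWriter(log_dir)  # wipes old event files, then opens a fresh EventsWriter
# events are written as "<log_dir>/events*" and can be inspected with:
#   tensorboard --logdir ./logs/run0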
def __init__(self, img_file, conf_file=None, verbose='vv'):
    Preprocess.__init__(self, img_file, conf_file=conf_file, verbose=verbose)
    ocr_dict.clear()
    cellName_xy_dict.clear()
    dict_texts.clear()
    self.ocr_files = []
    if not os.path.exists(ocr_dir):
        os.mkdir(ocr_dir)
    else:
        utils.del_files(ocr_dir)
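A hedged construction sketch; the subclass name OcrExtractor and the file paths are invented for illustration, and ocr_dir, ocr_dict, cellName_xy_dict and dict_texts are assumed to be module-level globals shared with Preprocess.

extractor = OcrExtractor("samples/table_page.png", conf_file="ocr.conf", verbose='vv')
# at this point ocr_dir exists and is empty: it is either freshly created
# or cleared of the previous run's intermediate OCR files
print(extractor.ocr_files)  # [] until OCR results are produced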
def save_models(self, val_losses=None, del_prev=True, not_saved=()):
    """save models 1) during an epoch when val_loss hits a new low, 2) after one epoch"""
    # ----------------------
    # To save all weights, all layers of the encoder must be turned to trainable
    # ----------------------
    def unfreeze_all_to_save_models():
        for m_name, model in self.models.items():
            model.trainable = True
    unfreeze_all_to_save_models()

    # - delete previous weights
    if del_prev:
        del_files(self.opt.save_model_path)

    # - save new weights
    if val_losses is None:
        val_losses = self.val_losses_min
    # has_depth = not isinstance(val_losses['da/a1'], (int, list))  # when not initialized, da/a1 == 10 or []
    weights_name = '_'.join(
        ['weights', str(val_losses['loss/total'].numpy())[2:5]])
    if not self.opt.disable_gt and self.has_depth_gt:
        weights_name = '_'.join(
            [weights_name, str(val_losses['da/a1'].numpy())[2:4]])
    print('-> Saving weights with new low loss:\n', self.val_losses_min)
    weights_path = os.path.join(self.opt.save_model_path, weights_name)
    if not os.path.isdir(weights_path) and not self.opt.debug_mode:
        os.makedirs(weights_path)

    for m_name, model in self.models.items():
        if m_name not in not_saved:
            m_path = os.path.join(weights_path, m_name + '.h5')
            tf.print("saving {} to:".format(m_name), m_path, output_stream=sys.stdout)
            model.save_weights(m_path)

    # -------------------
    # Restore unfreeze options for training
    # -------------------
    if self.opt.num_unfreeze is not None:
        self.unfreeze_partial_models()
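A hedged call sketch for the unfreeze-aware variant; the trainer instance, the model name 'pose_decoder', and opt.num_unfreeze are assumptions based only on the attributes this method touches.

# save every model except a hypothetical "pose_decoder" head;
# encoder layers are temporarily set trainable so all weights land in the .h5 files,
# then partial freezing is restored afterwards when opt.num_unfreeze is set
trainer.save_models(del_prev=False, not_saved=('pose_decoder',))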
def prefilter_smallexon(bigg_list, bigg_list_gff, cutoff=50):
    """
    remove two kinds of reads:
    1. not on the same strand as the current annotation
    2. with less than cutoff intersection with the current annotation

    :param bigg_list:
    :param cutoff: at least 50bp intersection with current annotation
    :return: retained list
    """
    if len(bigg_list_gff) == 0:
        return bigg_list

    # filter 1
    strand = bigg_list_gff[0].strand
    bigg_list_strand = [x for x in bigg_list if x.strand == strand]

    if len(bigg_list_strand) == 0:
        return None

    # filter 2
    nano_exon, nano_intron = bigglist_to_bedfile(bigg_list_strand)
    gff_exon, gff_intron = bigglist_to_bedfile(bigg_list_gff)

    exonfile = wrapper_bedtools_intersect2(nano_exon, gff_exon)
    out_d = pandas_summary(exonfile)

    keep_name = set()
    for k, intersection in out_d.items():
        nano_name, gff_name = k
        if intersection > cutoff:
            keep_name.add(nano_name)

    bigg_list_new = []
    for bigg in bigg_list:
        if bigg.name in keep_name:
            bigg_list_new.append(bigg)

    ### clean up
    del_files([exonfile, nano_intron, gff_intron])

    return bigg_list_new
def cal_distance(bigg_list, intronweight=0.5, by="ratio"):
    """
    :param bigg_list:
    :param intronweight: if 0, do not cal the intron to save time
    :param by: used to cal the distance between two bigg objects,
               can be "ratio", "ratio_short", "length", "length_short"
    :return: D: distance matrix
    """
    # wkdir = set_tmp()
    # os.chdir(wkdir)
    bigg_list.sort(key=operator.attrgetter("chromStart"))

    for i in bigg_list:
        i.get_exon()
        i.to_bedstr()

    length = len(bigg_list)
    D_exon = scipy.zeros([length, length])
    D_intron = scipy.zeros([length, length])

    # get a pos combination and the name of bigg for each i
    # ij_list = getij(bigg_list)
    pos_dic = get_pos_dic(bigg_list)

    # flow begin
    file_exon, file_intron = bigglist_to_bedfile(bigg_list)

    exon_out = wrapper_bedtools_intersect2(file_exon, file_exon)
    exon_i = pandas_summary(exon_out)
    del_files([file_exon, exon_out])

    intron_out = wrapper_bedtools_intersect2(file_intron, file_intron)
    intron_i = pandas_summary(intron_out)
    del_files([file_intron, intron_out])

    for k, intersection in exon_i.items():
        name1, name2 = k
        i = pos_dic[name1]
        j = pos_dic[name2]
        min_length = min(bigg_list[i].exonlen, bigg_list[j].exonlen)
        union = bigg_list[i].exonlen + bigg_list[j].exonlen - intersection

        # debug insanity
        if union <= 0:
            print("exon", name1, name2, bigg_list[i].exonlen, bigg_list[j].exonlen, union, intersection)
        # debug over

        if by == "ratio":
            # exon could be 0?
            if min_length == 0:
                D_exon[i, j] = 1
            else:
                similar = float(intersection) / union
                D_exon[i, j] = 1 - similar
        elif by == "ratio_short":
            # exon could be 0
            if min_length == 0:
                D_exon[i, j] = 1
            else:
                D_exon[i, j] = 1 - float(intersection) / min_length

    for k, intersection in intron_i.items():
        name1, name2 = k
        i = pos_dic[name1]
        j = pos_dic[name2]
        min_length = min(bigg_list[i].intronlen, bigg_list[j].intronlen)
        union = bigg_list[i].intronlen + bigg_list[j].intronlen - intersection

        #### debug
        ## intron union could equal 0 for single-exon transcripts
        if union <= 0:
            print("intron", name1, name2, bigg_list[i].intronlen, bigg_list[j].intronlen, union, intersection)
        #### debug over

        if by == "ratio":
            # intron could be 0
            if min_length == 0:
                D_intron[i, j] = 1
            else:
                # print(union)
                similar = float(intersection) / union
                D_intron[i, j] = 1 - similar
        elif by == "ratio_short":
            # intron could be 0
            if min_length == 0:
                D_intron[i, j] = 1
            else:
                D_intron[i, j] = 1 - float(intersection) / min_length

    D = (D_exon + intronweight * D_intron) / float(1 + intronweight)

    print("exon_out is ", exon_out)
    print("intron_out is ", intron_out)
    print("file_exon is ", file_exon)
    print("file_intron is ", file_intron)

    ## try:
    ##     # cleanup
    ##     del_files([exon_out, intron_out, file_exon, file_intron])
    ## except Exception as e:
    ##     print("Cleanup in flow_cluster is not successful: ", e)

    # debug:
    # print("D_exon", D_exon)
    # print("D_intron", D_intron)
    # print("D", D)
    # cleanup(remove_all=True)

    return D, bigg_list
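A minimal sketch of calling cal_distance on a cluster of reads; the `reads` objects are hypothetical bigGenePred-like records with exonlen/intronlen and bed strings. The matrix is filled pairwise from the bedtools self-intersection, so each entry holds (D_exon + intronweight * D_intron) / (1 + intronweight) for an intersecting pair.

import numpy as np

# hypothetical reads for one locus, parsed elsewhere
D, ordered_reads = cal_distance(reads, intronweight=0.5, by="ratio")

# D is an n x n array of dissimilarities in [0, 1];
# row/column order follows ordered_reads (sorted by chromStart), not the input order
assert D.shape == (len(ordered_reads), len(ordered_reads))
assert np.all((D >= 0) & (D <= 1))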