Example #1
def wrapper_bedtools_intersect2(bedfile1, bedfile2, outfile=None):
    """
    Using two bedfile to get the intsersection of pairs
    :param bigg_one:
    :param bigg_two:
    :return:
    """
    if outfile is None:
        prefix1 = get_file_prefix(bedfile1)
        prefix2 = get_file_prefix(bedfile2)
        location = get_file_location(bedfile1)

        outfile = location + "/" + "_".join([prefix1, prefix2]) + ".bed"

    sort_cmd1 = "bedtools sort -i {bed} > {bed}_s".format(bed=bedfile1)
    sort_cmd2 = "bedtools sort -i {bed} > {bed}_s".format(bed=bedfile2)

    _ = myexe(sort_cmd1)
    _ = myexe(sort_cmd2)

    # intersect the two sorted bedfiles, keeping both records (-wa -wb)
    cmd = "bedtools intersect -wa -wb -a {bedfile1}_s -b {bedfile2}_s > {out}".format(
        bedfile1=bedfile1, bedfile2=bedfile2, out=outfile)

    _ = myexe(cmd)

    ### cleanup: note this removes the input bedfiles as well as the sorted temporaries
    bed1s = bedfile1 + "_s"
    bed2s = bedfile2 + "_s"
    del_files([bedfile1, bedfile2, bed1s, bed2s])

    return outfile
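None of the helpers called above are shown in this snippet. Minimal sketches consistent with how they are used, assuming plain os/subprocess implementations (only the names come from the example; the bodies are guesses):

import os
import subprocess

def get_file_prefix(path):
    # filename without directory or extension
    return os.path.splitext(os.path.basename(path))[0]

def get_file_location(path):
    # absolute directory containing the file
    return os.path.dirname(os.path.abspath(path))

def myexe(cmd):
    # run a shell command and return its stdout bytes
    return subprocess.run(cmd, shell=True, capture_output=True).stdout

def del_files(file_list):
    # remove files, ignoring any that are already gone
    for p in file_list:
        try:
            os.remove(p)
        except OSError:
            pass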
Example #2
    def save_models(self, val_losses=None, del_prev=True, not_saved=()):
        """save models in 1) during epoch val_loss hits new low, 2) after one epoch"""
        # - delete previous weights
        if del_prev:
            del_files(self.opt.save_model_path)

        # - save new weights
        if val_losses is None:
            val_losses = self.val_losses_min

        has_depth = not isinstance(
            val_losses['da/a1'],
            (int, list))  # when not initialized, da/a1 == 10 or []
        weights_name = '_'.join(
            ['weights', str(val_losses['loss/total'].numpy())[2:5]])
        if not self.opt.disable_gt and has_depth:
            weights_name = '_'.join(
                [weights_name,
                 str(val_losses['da/a1'].numpy())[2:4]])

        print('-> Saving weights with new low loss:\n', self.val_losses_min)
        weights_path = os.path.join(self.opt.save_model_path, weights_name)
        if not os.path.isdir(weights_path) and not self.opt.debug_mode:
            os.makedirs(weights_path)

        for m_name, model in self.models.items():
            if m_name not in not_saved:
                m_path = os.path.join(weights_path, m_name + '.h5')
                tf.print("saving {} to:".format(m_name),
                         m_path,
                         output_stream=sys.stdout)
                model.save_weights(m_path)
Example #3
def prefilter_smallexon(bigg_list,bigg_list_gff, cutoff=50):
    """
    remove two kind of reads:

    1. not same strand as in current annotation
    2. with less than cutoff intersection with current annotation

    :param bigg_list:
    :param cutoff: at least 50bp intersection with current annotation
    :return: retained list
    """

    ## Anti-sense reads should not be removed in direct RNA
    ## sequencing, or novel anti-sense transcripts would be lost,
    ## so filter 1 is disabled

    ## Intergenic transcripts would be lost due to filter 2,
    ## so this function is not used in the main flow

    if len(bigg_list_gff) == 0:
        return bigg_list

    ## filter 1 (disabled; see note above)
    ## strand = bigg_list_gff[0].strand
    ## bigg_list_strand = [x for x in bigg_list if x.strand == strand]
    bigg_list_strand = bigg_list

    if len(bigg_list_strand) == 0:
        return None

    # filter 2
    nano_exon, nano_intron = bigglist_to_bedfile(bigg_list_strand)
    gff_exon, gff_intron = bigglist_to_bedfile(bigg_list_gff)

    exonfile = wrapper_bedtools_intersect2(nano_exon, gff_exon)
    out_d = pandas_summary(exonfile)
    keep_name = set()
    for k, intersection in out_d.items():
        nano_name, gff_name = k
        if intersection > cutoff:
            keep_name.add(nano_name)

    bigg_list_new = []
    for bigg in bigg_list:
        if bigg.name in keep_name:
            bigg_list_new.append(bigg)

    try:
        ### clean up
        del_files([exonfile, nano_intron, gff_intron])
        ### Also clean up exon files
        del_files([nano_exon, gff_exon])
    except Exception as e:
        print("Cleanup in prefilter_smallexon is not successful: ", e)

    return bigg_list_new
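pandas_summary is not defined in this snippet. A plausible sketch of it, assuming the bedtools intersect -wa -wb output carries two 4-column BED records per line and that the overlap length is recomputed from the coordinate columns (the column layout is an assumption):

import pandas as pd

def pandas_summary(intersect_file):
    cols = ["chrom1", "start1", "end1", "name1",
            "chrom2", "start2", "end2", "name2"]
    df = pd.read_csv(intersect_file, sep="\t", header=None, names=cols)
    # overlap of two intervals reported on the same line
    df["overlap"] = df[["end1", "end2"]].min(axis=1) - df[["start1", "start2"]].max(axis=1)
    # total overlap per (name1, name2) pair, matching the keys unpacked above
    return df.groupby(["name1", "name2"])["overlap"].sum().to_dict()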
Example #4
    def __init__(self, dir):
        os.makedirs(dir, exist_ok=True)
        utils.del_files(dir)  # delete previous log files
        self.dir = dir
        self.step = 1
        prefix = 'events'
        path = osp.join(osp.abspath(dir), prefix)
        import tensorflow as tf
        from tensorflow.python import pywrap_tensorflow
        from tensorflow.core.util import event_pb2
        from tensorflow.python.util import compat
        self.tf = tf
        self.event_pb2 = event_pb2
        self.pywrap_tensorflow = pywrap_tensorflow
        self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path))
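The constructor above only opens the events file; nothing in the snippet writes to it. A minimal write method in the style this class implies, assuming the TF1.x protobuf APIs imported above (the method name and its dict-of-scalars interface are assumptions, not the project's actual code):

    def writekvs(self, kvs):
        # pack a dict of scalar metrics into a single TensorBoard event
        import time
        summary = self.tf.Summary(value=[
            self.tf.Summary.Value(tag=k, simple_value=float(v))
            for k, v in kvs.items()
        ])
        event = self.event_pb2.Event(wall_time=time.time(), summary=summary)
        event.step = self.step
        self.writer.WriteEvent(event)
        self.writer.Flush()
        self.step += 1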
Example #5
    def __init__(self, img_file, conf_file=None, verbose='vv'):
        Preprocess.__init__(self,
                            img_file,
                            conf_file=conf_file,
                            verbose=verbose)

        ocr_dict.clear()
        cellName_xy_dict.clear()
        dict_texts.clear()

        self.ocr_files = []

        if not os.path.exists(ocr_dir):
            os.mkdir(ocr_dir)
        else:
            utils.del_files(ocr_dir)
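Note that here, as in Examples #2 and #4, del_files receives a directory rather than a list of paths, so the helper in these projects presumably empties a directory. A sketch of that variant (an assumption, not the projects' actual code):

import os

def del_files(dir_path):
    # remove every regular file directly inside dir_path
    for name in os.listdir(dir_path):
        full = os.path.join(dir_path, name)
        if os.path.isfile(full):
            os.remove(full)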
Example #6
    def save_models(self, val_losses=None, del_prev=True, not_saved=()):
        """save models in 1) during epoch val_loss hits new low, 2) after one epoch"""

        # ----------------------
        # To save all weights, all layers of the encoder must be set to trainable
        # ----------------------
        def unfreeze_all_to_save_models():
            for m_name, model in self.models.items():
                model.trainable = True

        unfreeze_all_to_save_models()
        # - delete previous weights
        if del_prev:
            del_files(self.opt.save_model_path)

        # - save new weights
        if val_losses is None:
            val_losses = self.val_losses_min

        # has_depth = not isinstance(val_losses['da/a1'], (int, list))    # when not initialized, da/a1 == 10 or []
        weights_name = '_'.join(
            ['weights', str(val_losses['loss/total'].numpy())[2:5]])
        if not self.opt.disable_gt and self.has_depth_gt:
            weights_name = '_'.join(
                [weights_name,
                 str(val_losses['da/a1'].numpy())[2:4]])

        print('-> Saving weights with new low loss:\n', self.val_losses_min)
        weights_path = os.path.join(self.opt.save_model_path, weights_name)
        if not os.path.isdir(weights_path) and not self.opt.debug_mode:
            os.makedirs(weights_path)

        for m_name, model in self.models.items():
            if m_name not in not_saved:
                m_path = os.path.join(weights_path, m_name + '.h5')
                tf.print("saving {} to:".format(m_name),
                         m_path,
                         output_stream=sys.stdout)
                model.save_weights(m_path)

        # -------------------
        # Restore the partial-unfreeze setting for training
        # -------------------
        if self.opt.num_unfreeze is not None:
            self.unfreeze_partial_models()
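A plausible call site, assuming a training loop that tracks self.val_losses_min as the running best (the model name in not_saved is hypothetical):

        # hypothetical usage at the end of a validation pass
        if val_losses['loss/total'] < self.val_losses_min['loss/total']:
            self.val_losses_min = val_losses
            self.save_models(val_losses=val_losses, not_saved=('pose_enc',))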
Example #7
def prefilter_smallexon(bigg_list, bigg_list_gff, cutoff=50):
    """
    remove two kind of reads:

    1. not same strand as in current annotation
    2. with less than cutoff intersection with current annotation

    :param bigg_list:
    :param cutoff: at least 50bp intersection with current annotation
    :return: retained list
    """
    if len(bigg_list_gff) == 0:
        return bigg_list

    # filter 1
    strand = bigg_list_gff[0].strand
    bigg_list_strand = [x for x in bigg_list if x.strand == strand]

    if len(bigg_list_strand) == 0:
        return None

    # filter 2
    nano_exon, nano_intron = bigglist_to_bedfile(bigg_list_strand)
    gff_exon, gff_intron = bigglist_to_bedfile(bigg_list_gff)

    exonfile = wrapper_bedtools_intersect2(nano_exon, gff_exon)
    out_d = pandas_summary(exonfile)
    keep_name = set()
    for k, intersection in out_d.items():
        nano_name, gff_name = k
        if intersection > cutoff:
            keep_name.add(nano_name)

    bigg_list_new = []
    for bigg in bigg_list:
        if bigg.name in keep_name:
            bigg_list_new.append(bigg)

    ### clean up
    del_files([exonfile, nano_intron, gff_intron])

    return bigg_list_new
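A hedged usage example, assuming bigg_list holds the nanopore reads and bigg_list_gff the reference annotation records for one locus (variable names are illustrative):

kept = prefilter_smallexon(bigg_list, bigg_list_gff, cutoff=50)
if kept is None:
    kept = []  # no read was on the annotation strand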
Example #8
import operator

import numpy as np


def cal_distance(bigg_list, intronweight=0.5, by="ratio"):
    """
    :param bigg_list:
    :param intronweight: if 0, do not cal the intron to save time
    :param by: used to cal the distance between two bigg object, can be "ratio", "ratio_short", "length", "length_short"
    :return: D: distance matrix
    """
    #wkdir=set_tmp()
    #os.chdir(wkdir)

    bigg_list.sort(key=operator.attrgetter("chromStart"))

    for i in bigg_list:
        i.get_exon()
        i.to_bedstr()

    length = len(bigg_list)
    D_exon = np.zeros([length, length])    # scipy.zeros was an alias removed from modern scipy
    D_intron = np.zeros([length, length])

    # map each bigg name to its matrix index
    # ij_list=getij(bigg_list)
    pos_dic = get_pos_dic(bigg_list)

    # flow begin
    file_exon, file_intron = bigglist_to_bedfile(bigg_list)

    exon_out = wrapper_bedtools_intersect2(file_exon, file_exon)
    exon_i = pandas_summary(exon_out)
    del_files([file_exon, exon_out])

    intron_out = wrapper_bedtools_intersect2(file_intron, file_intron)
    intron_i = pandas_summary(intron_out)
    del_files([file_intron, intron_out])

    for k, intersection in exon_i.items():
        name1, name2 = k
        i = pos_dic[name1]
        j = pos_dic[name2]
        min_length = min(bigg_list[i].exonlen, bigg_list[j].exonlen)
        union = bigg_list[i].exonlen + bigg_list[j].exonlen - intersection
        # sanity check
        if union <= 0:
            print("exon", name1, name2, bigg_list[i].exonlen, bigg_list[j].exonlen, union, intersection)
        # sanity check over

        if by == "ratio":
            # exon could be 0?
            if min_length == 0:
                D_exon[i, j] = 1
            else:
                similar = float(intersection) / union
                D_exon[i, j] = 1 - similar

        elif by == "ratio_short":
            # exon could be 0
            if min_length == 0:
                D_exon[i, j] = 1
            else:
                D_exon[i, j] = 1 - float(intersection) / min_length

    for k, intersection in intron_i.items():
        name1, name2 = k
        i = pos_dic[name1]
        j = pos_dic[name2]
        min_length = min(bigg_list[i].intronlen, bigg_list[j].intronlen)
        union = bigg_list[i].intronlen + bigg_list[j].intronlen - intersection

        #### sanity check
        ## intron union can equal 0 for single-exon transcripts
        if union <= 0:
            print("intron", name1, name2, bigg_list[i].intronlen, bigg_list[j].intronlen, union, intersection)
        #### sanity check over

        if by == "ratio":
            # intron could be 0
            if min_length == 0:
                D_intron[i, j] = 1
            else:
                #print union
                similar = float(intersection) / union
                D_intron[i, j] = 1 - similar

        elif by == "ratio_short":
            # intron could be 0
            if min_length == 0:
                D_intron[i, j] = 1
            else:
                D_intron[i, j] = 1 - float(intersection) / min_length


    D = (D_exon + intronweight * D_intron) / float(1 + intronweight)

    print("exon_out is ", exon_out)
    print("intron_out is ", intron_out)
    print("file_exon is ", file_exon)
    print("file_intron is ", file_intron)
    ## try:
    ##     # cleanup
    ##     del_files([exon_out, intron_out, file_exon, file_intron])
    ## except Exception as e:
    ##     print("Cleanup in flow_cluster is not successful: ", e)

    # debug:
    #print("D_exon",D_exon)
    #print("D_intron", D_intron)
    #print("D",D)

    #cleanup(remove_all=True)

    return D, bigg_list
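The returned matrix can feed a standard hierarchical clustering step. A minimal sketch, assuming D should be symmetrized first (each pair may be filled on only one side) and using an illustrative cutoff:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

D, bigg_list = cal_distance(bigg_list, intronweight=0.5, by="ratio")
Ds = np.maximum(D, D.T)        # keep the filled side of each pair
np.fill_diagonal(Ds, 0.0)      # squareform expects a zero diagonal
Z = linkage(squareform(Ds, checks=False), method="average")
labels = fcluster(Z, t=0.05, criterion="distance")  # 0.05 is illustrative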