Example #1
0
    def __init__(self,
                 parent,
                 dataset_list=None,
                 annotation="H37Rv.prot_table",
                 gene="",
                 scale=None,
                 feature_hashes=None,
                 feature_data=None):
        """Build the frame, load the datasets and draw the initial view.

        Args:
            parent: Parent object; forwarded to
                ``view_trash.MainFrame.__init__`` and ``self.updateFunc``.
            dataset_list: Dataset paths to load. ``None`` selects the
                historical default ``["H37Rv_Sassetti_glycerol.wig"]``.
            annotation: Annotation path passed to
                ``transit_tools.get_gene_info`` and
                ``transit_tools.get_pos_hash``.
            gene: Optional search term. When non-empty it is written to the
                search box and ``self.searchFunc`` is run with it.
            scale: One scale value per dataset. A falsy value means 150 for
                every dataset.
            feature_hashes: List stored as ``self.feature_hashes``. ``None``
                creates a fresh empty list for this instance.
            feature_data: List stored as ``self.feature_data``. ``None``
                creates a fresh empty list for this instance.
        """
        # Resolve defaults per call. The previous ``[]`` defaults were shared
        # by every instance, so appending to self.feature_hashes /
        # self.feature_data in one frame leaked into the next one.
        if dataset_list is None:
            dataset_list = ["H37Rv_Sassetti_glycerol.wig"]
        if feature_hashes is None:
            feature_hashes = []
        if feature_data is None:
            feature_data = []

        # Initialize parent class
        view_trash.MainFrame.__init__(self, parent)

        self.parent = parent
        self.size = wx.Size(1500, 800)
        self.start = 1
        self.end = 10000

        self.orf2data = transit_tools.get_gene_info(annotation)
        self.hash = transit_tools.get_pos_hash(annotation)

        self.features = []

        # Data to facilitate search: lower-cased gene name -> list of ORF ids
        self.name2id = {}
        for orf, (name, _desc, _start, _end, _strand) in self.orf2data.items():
            self.name2id.setdefault(name.lower(), []).append(orf)

        # Lower-cased ORF id -> ORF id as spelled in the annotation
        self.lowerid2id = {orf.lower(): orf for orf in self.orf2data}
        self.labels = [fetch_name(d) for d in dataset_list]
        (self.fulldata, self.position) = tnseq_tools.get_data(dataset_list)

        # Save normalized data
        (self.fulldata_norm,
         self.factors) = norm_tools.normalize_data(self.fulldata,
                                                   method="nzmean")
        self.wasNorm = False

        self.feature_hashes = feature_hashes
        self.feature_data = feature_data

        if not scale:
            scale = [150] * len(dataset_list)
        self.scale = scale
        self.globalScale = False

        self.datasetChoice.SetItems(self.labels)
        self.datasetChoice.SetSelection(0)

        if gene:
            self.searchText.SetValue(gene)
            self.searchFunc(gene)

        self.updateFunc(parent)
        self.Fit()
Example #2
0
    def Run(self):
        """Export the control datasets as an IGV-style tab-separated table.

        The datasets in ``self.ctrldata`` are loaded, normalized with
        ``self.normalization`` and written to ``self.output``: a
        ``#``-prefixed header (normalization info, source files, column
        names) followed by one row per position.  The output is closed and
        ``self.finish()`` is called at the end.
        """
        self.transit_message("Starting IGV Export")
        start_time = time.time()

        # Load the read counts, then normalize them
        self.transit_message("Getting Data")
        raw_counts, position = tnseq_tools.get_data(self.ctrldata)
        fulldata, factors = norm_tools.normalize_data(
            raw_counts, self.normalization, self.ctrldata,
            self.annotation_path)
        position = position.astype(int)

        # Annotation lookups are loaded here but not consumed below
        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        emit = self.output.write

        # Header: provenance and normalization details
        self.transit_message("Normalizing")
        emit("#Converted to IGV with TRANSIT.\n")
        if self.normalization != "nonorm":
            emit("#Reads normalized using '%s'\n" % self.normalization)
            if type(factors[0]) == type(0.0):
                factor_text = "\t".join(
                    ["%s" % f for f in factors.flatten()])
            else:
                factor_text = " ".join(
                    [",".join(["%s" % bx for bx in b]) for b in factors])
            emit("#Normalization Factors: %s\n" % factor_text)

        # Header: source files and column names
        emit("#Files:\n")
        for path in self.ctrldata:
            emit("#%s\n" % path)

        names = [transit_tools.fetch_name(path) for path in self.ctrldata]
        emit("#Chromosome\tStart\tEnd\tFeature\t%s\tTAs\n" % "\t".join(names))
        chrom = transit_tools.fetch_name(self.annotation_path)

        # Body: one row per position, one count column per dataset
        (K, N) = fulldata.shape
        self.progress_range(N)
        for i, pos in enumerate(position):
            counts = "\t".join(["%1.1f" % row[i] for row in fulldata])
            emit("%s\t%s\t%s\tTA%s\t%s\t1\n" %
                 (chrom, pos, pos + 1, pos, counts))

            # Update progress
            self.progress_update(
                "Running Export Method... %5.1f%%" % (100.0 * i / N), i)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")
Example #3
0
    def Run(self):
        """Export the mean read count per gene for every control dataset.

        Writes a ``#``-prefixed header (normalization info, source files,
        column names) to ``self.output`` followed by one row per gene from
        ``tnseq_tools.Genes``: ORF id, name, ``gene.n`` and the per-dataset
        mean of ``gene.reads`` (zeros when ``gene.n`` is 0).  The output is
        closed and ``self.finish()`` is called at the end.
        """
        self.transit_message("Starting Gene Mean Counts Export")
        start_time = time.time()

        # Load the read counts, then normalize them
        self.transit_message("Getting Data")
        raw_counts, position = tnseq_tools.get_data(self.ctrldata)
        fulldata, factors = norm_tools.normalize_data(
            raw_counts, self.normalization, self.ctrldata,
            self.annotation_path)
        position = position.astype(int)

        # Annotation lookups are loaded here but not consumed below
        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        emit = self.output.write

        # Header: provenance and normalization details
        self.transit_message("Normalizing")
        emit("#Summarized to Mean Gene Counts with TRANSIT.\n")
        if self.normalization != "nonorm":
            emit("#Reads normalized using '%s'\n" % self.normalization)
            if type(factors[0]) == type(0.0):
                factor_text = "\t".join(
                    ["%s" % f for f in factors.flatten()])
            else:
                factor_text = " ".join(
                    [",".join(["%s" % bx for bx in b]) for b in factors])
            emit("#Normalization Factors: %s\n" % factor_text)

        emit("#Files:\n")
        for path in self.ctrldata:
            emit("#%s\n" % path)

        K, Nsites = fulldata.shape
        # Get Gene objects
        G = tnseq_tools.Genes(self.ctrldata, self.annotation_path,
                              norm=self.normalization)
        N = len(G)
        self.progress_range(N)
        names = [transit_tools.fetch_name(path) for path in self.ctrldata]
        emit("#Orf\tName\tNumber of TA sites\t%s\n" % "\t".join(names))

        for i, gene in enumerate(G):
            # Genes with no sites get a zero for every dataset
            if gene.n > 0:
                means = numpy.mean(gene.reads, 1)
            else:
                means = numpy.zeros(K)
            data_str = "\t".join(["%1.2f" % m for m in means])
            emit("%s\t%s\t%s\t%s\n" %
                 (gene.orf, gene.name, gene.n, data_str))

            # Update progress
            self.progress_update(
                "Running Export Method... %5.1f%%" % (100.0 * i / N), i)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")
Example #4
0
    def Run(self):
        """Export the control datasets as a single combined-wig table.

        Loads and normalizes the datasets in ``self.ctrldata``, writes a
        ``#``-prefixed header (normalization method and factors, source
        files, column names) to ``self.output`` and then one row per
        position: the coordinate, one count per dataset and the genes that
        ``transit_tools.get_pos_hash`` maps to that coordinate, rendered as
        ``orf (name)``.  The output is closed and ``self.finish()`` is
        called at the end.
        """
        self.transit_message("Starting Combined Wig Export")

        # Get orf data
        self.transit_message("Getting Data")
        (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
        (fulldata,
         factors) = norm_tools.normalize_data(fulldata, self.normalization,
                                              self.ctrldata,
                                              self.annotation_path)
        position = position.astype(int)

        pos_hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        self.transit_message("Normalizing")
        self.output.write("#Converted to CombinedWig with TRANSIT.\n")
        self.output.write("#normalization method: %s\n" % self.normalization)
        if self.normalization != "nonorm":
            if type(factors[0]) == type(0.0):
                self.output.write(
                    "#Normalization Factors: %s\n" %
                    "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                self.output.write("#Normalization Factors: %s\n" % " ".join(
                    [",".join(["%s" % bx for bx in b]) for b in factors]))

        (_, N) = fulldata.shape
        for f in self.ctrldata:
            self.output.write("#File: %s\n" % f)
        self.output.write("#TAcoord\t%s\n" % ('\t'.join(self.ctrldata)))

        # Declare the progress range before reporting progress, as the other
        # exporters do; previously progress_update ran without it.
        self.progress_range(N)

        # No decimals if raw counts; the choice does not change per row.
        value_fmt = "%1.1f" if self.normalization != 'nonorm' else "%d"

        for i, pos in enumerate(position):
            vals = "\t".join([value_fmt % c for c in fulldata[:, i]])
            genes = ",".join([
                "%s (%s)" % (orf, rv2info.get(orf, ["-"])[0])
                for orf in pos_hash.get(pos, [])
            ])
            self.output.write("%d\t%s\t%s\n" % (pos, vals, genes))

            # Update progress
            text = "Running Export Method... %5.1f%%" % (100.0 * i / N)
            self.progress_update(text, i)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")
Example #5
0
    def Run(self):
        """Write the control datasets to ``self.output`` in IGV table form.

        Steps: load the datasets in ``self.ctrldata``, normalize them with
        ``self.normalization``, write a ``#``-prefixed header (normalization
        info, source files, column names), then one tab-separated row per
        position.  Finally the output is closed and ``self.finish()`` runs.
        """
        self.transit_message("Starting IGV Export")
        start_time = time.time()

        # Load the read counts, then normalize them
        self.transit_message("Getting Data")
        raw_counts, position = tnseq_tools.get_data(self.ctrldata)
        fulldata, factors = norm_tools.normalize_data(
            raw_counts, self.normalization, self.ctrldata,
            self.annotation_path)
        position = position.astype(int)

        # Annotation lookups are loaded here but not consumed below
        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        out = self.output

        # Header: provenance and normalization details
        self.transit_message("Normalizing")
        out.write("#Converted to IGV with TRANSIT.\n")
        if self.normalization != "nonorm":
            out.write("#Reads normalized using '%s'\n" % self.normalization)
            if type(factors[0]) == type(0.0):
                factor_text = "\t".join(
                    ["%s" % f for f in factors.flatten()])
            else:
                factor_text = " ".join(
                    [",".join(["%s" % bx for bx in b]) for b in factors])
            out.write("#Normalization Factors: %s\n" % factor_text)

        # Header: source files and column names
        out.write("#Files:\n")
        for path in self.ctrldata:
            out.write("#%s\n" % path)

        names = [transit_tools.fetch_name(path) for path in self.ctrldata]
        out.write("#Chromosome\tStart\tEnd\tFeature\t%s\tTAs\n" %
                  "\t".join(names))
        chrom = transit_tools.fetch_name(self.annotation_path)

        # Body: one row per position, one count column per dataset
        (K, N) = fulldata.shape
        self.progress_range(N)
        for i, pos in enumerate(position):
            counts = "\t".join(["%1.1f" % row[i] for row in fulldata])
            out.write("%s\t%s\t%s\tTA%s\t%s\t1\n" %
                      (chrom, pos, pos + 1, pos, counts))

            # Update progress
            self.progress_update(
                "Running Export Method... %5.1f%%" % (100.0 * i / N), i)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")
Example #6
0
    def Run(self):
        """Export the control datasets as a single combined-wig table.

        Loads and normalizes the datasets in ``self.ctrldata``, writes a
        ``#``-prefixed header (normalization method and factors, source
        files, column names) to ``self.output`` and then one row per
        position: the coordinate, one count per dataset and the genes that
        ``transit_tools.get_pos_hash`` maps to that coordinate, rendered as
        ``orf (name)``.  The output is closed and ``self.finish()`` is
        called at the end.
        """
        self.transit_message("Starting Combined Wig Export")

        # Get orf data
        self.transit_message("Getting Data")
        (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
        (fulldata, factors) = norm_tools.normalize_data(
            fulldata, self.normalization, self.ctrldata, self.annotation_path)
        position = position.astype(int)

        pos_hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        self.transit_message("Normalizing")
        self.output.write("#Converted to CombinedWig with TRANSIT.\n")
        self.output.write("#normalization method: %s\n" % self.normalization)
        if self.normalization != "nonorm":
            if type(factors[0]) == type(0.0):
                factor_text = "\t".join(["%s" % f for f in factors.flatten()])
            else:
                factor_text = " ".join(
                    [",".join(["%s" % bx for bx in b]) for b in factors])
            self.output.write("#Normalization Factors: %s\n" % factor_text)

        (_, N) = fulldata.shape
        for f in self.ctrldata:
            self.output.write("#File: %s\n" % f)
        self.output.write("#TAcoord\t%s\n" % ('\t'.join(self.ctrldata)))

        # Declare the progress range before reporting progress, as the other
        # exporters do; previously progress_update ran without it.
        self.progress_range(N)

        # No decimals if raw counts; the choice does not change per row.
        if self.normalization != 'nonorm':
            value_fmt = "%1.1f"
        else:
            value_fmt = "%d"

        for i, pos in enumerate(position):
            vals = "\t".join([value_fmt % c for c in fulldata[:, i]])
            genes = ",".join(["%s (%s)" % (orf, rv2info.get(orf, ["-"])[0])
                              for orf in pos_hash.get(pos, [])])
            self.output.write("%d\t%s\t%s\n" % (pos, vals, genes))

            # Update progress
            text = "Running Export Method... %5.1f%%" % (100.0 * i / N)
            self.progress_update(text, i)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")
Example #7
0
 def addFeatureFunc(self, event):
     """Ask for an annotation file and append it as an extra feature.

     The file chosen through ``self.OpenFile`` is loaded either with
     ``self.getHMMHash`` (when ``self.checkHMMFeature`` accepts it) or with
     ``transit_tools.get_pos_hash`` / ``transit_tools.get_gene_info``.  The
     resulting pair is appended to ``self.feature_hashes`` and
     ``self.feature_data`` and the view is refreshed.  Any exception is
     reported through ``self.track_message`` and its traceback printed.

     Args:
         event: Event that triggered the call; not used.
     """
     wc = u"Known Annotation Formats (*.prot_table,*.gff3,*.gff)|*.prot_table;*.gff3;*.gff;|\nProt Table (*.prot_table)|*.prot_table;|\nGFF3 (*.gff,*.gff3)|*.gff;*.gff3;|\nAll files (*.*)|*.*"
     path = self.OpenFile(DIR=self.parent.workdir, FILE="", WC=wc)
     try:
         if not path:
             self.track_message("No feature added")
             return

         # Load both halves before touching either list.  Appending the
         # hash first left the two lists with different lengths whenever
         # loading the gene info raised afterwards.
         if self.checkHMMFeature(path):
             feature_hash, feature_info = self.getHMMHash(path)
         else:
             feature_hash = transit_tools.get_pos_hash(path)
             feature_info = transit_tools.get_gene_info(path)

         self.feature_hashes.append(feature_hash)
         self.feature_data.append(feature_info)
         self.updateFunc(self.parent)
         self.Fit()
     except Exception as e:
         self.track_message("ERROR: %s" % e)
         traceback.print_exc()
Example #8
0
    def Run(self):
        """Run the four-state HMM and write site- and gene-level output.

        Steps, in order:
          1. Load the datasets in ``self.ctrldata``; optionally normalize
             them and apply LOESS correction.
          2. Combine replicates into one observation sequence ``O`` (shifted
             by +1 for scipy's geometric distribution).
          3. Build geometric emission distributions, a log-space transition
             matrix and the initial state distribution.
          4. Run ``self.viterbi`` plus the forward/backward procedures.
          5. Write a ``#``-prefixed header and one row per site to
             ``self.output`` (position, reads, state probabilities, state
             label, genes at the site), then close it.
          6. Call ``self.post_process_genes`` to produce the ``_genes`` file
             and register both files with ``self.add_file``.
        """
        self.transit_message("Starting HMM Method")

        # Get data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)
        (K, _) = data.shape

        # Normalize data
        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             _factors) = norm_tools.normalize_data(data, self.normalization,
                                                   self.ctrldata,
                                                   self.annotation_path)

        # Do LOESS
        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        pos_hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        if len(self.ctrldata) > 1:
            self.transit_message("Combining Replicates as '%s'" %
                                 self.replicates)
        O = tnseq_tools.combine_replicates(
            data, method=self.replicates
        ) + 1  # Adding 1 to because of shifted geometric in scipy

        # Parameters
        Nstates = 4
        label = {0: "ES", 1: "GD", 2: "NE", 3: "GA"}

        # State means are derived from the lower 95% of the non-zero reads
        reads = O - 1
        reads_nz = sorted(reads[reads != 0])
        size = len(reads_nz)
        mean_r = numpy.average(reads_nz[:int(0.95 * size)])
        mu = numpy.array([1 / 0.99, 0.01 * mean_r + 2, mean_r, mean_r * 5.0])
        L = 1.0 / mu
        B = []  # Emission Probability Distributions
        for i in range(Nstates):
            B.append(scipy.stats.geom(L[i]).pmf)

        pins = self.calculate_pins(O - 1)
        pins_obs = sum([1 for rd in O if rd >= 2]) / float(len(O))
        pnon = 1.0 - pins

        # Smallest r in [0, 99] with pnon**r < 0.01 (99 if none qualifies)
        for r in range(100):
            if pnon**r < 0.01: break

        # Transition matrix, kept in log space: `a` on the diagonal and `b`
        # everywhere else.
        A = numpy.zeros((Nstates, Nstates))
        p_mid = B[int(Nstates / 2)](1)
        a = math.log1p(-p_mid**r)
        b = r * math.log(p_mid) + math.log(
            1.0 / 3)  # change to Nstates-1?
        for i in range(Nstates):
            A[i] = [b] * Nstates
            A[i][i] = a

        PI = numpy.zeros(Nstates)  # Initial state distribution
        PI[0] = 0.7
        PI[1:] = 0.3 / (Nstates - 1)

        self.progress_range(self.maxiterations)

        ###############
        ### VITERBI ###
        (Q_opt, _delta, _Q) = self.viterbi(A, B, PI, O)
        ###############

        ##################
        ### ALPHA PASS ###
        (_log_prob_obs, alpha,
         C) = self.forward_procedure(numpy.exp(A), B, PI, O)
        ##################

        #################
        ### BETA PASS ###
        beta = self.backward_procedure(numpy.exp(A), B, PI, O, C)
        #################

        # Count how many sites the Viterbi path assigns to each state
        T = len(O)
        total = 0
        state2count = dict.fromkeys(range(Nstates), 0)
        for t in range(T):
            state2count[Q_opt[t]] += 1
            total += 1

        self.output.write("#HMM - Sites\n")
        self.output.write("# Tn-HMM\n")

        if self.wxobj:
            # Plain str formatting: encoding to bytes first made Python 3
            # write b'...' reprs into the header.
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                (",".join(self.ctrldata), self.annotation_path,
                 self.output.name))
        else:
            self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

        self.output.write("# \n")
        self.output.write("# Mean:\t%2.2f\n" % (numpy.average(reads_nz)))
        self.output.write("# Median:\t%2.2f\n" % numpy.median(reads_nz))
        self.output.write("# Normalization:\t%s\n" % self.normalization)
        self.output.write("# LOESS Correction:\t%s\n" % str(self.LOESS))
        self.output.write("# pins (obs):\t%f\n" % pins_obs)
        self.output.write("# pins (est):\t%f\n" % pins)
        self.output.write("# Run length (r):\t%d\n" % r)
        self.output.write("# State means:\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %8.4f" % (label[i], mu[i]) for i in range(Nstates)]))
        self.output.write("# Self-Transition Prob:\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %2.4e" % (label[i], A[i][i]) for i in range(Nstates)]))
        self.output.write("# State Emission Parameters (theta):\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %1.4f" % (label[i], L[i]) for i in range(Nstates)]))
        # Newline added so the values sit on their own line like the
        # sections above; they used to be glued onto this header line.
        self.output.write("# State Distributions:\n")
        self.output.write("#    %s\n" % "   ".join([
            "%s: %2.2f%%" % (label[i], state2count[i] * 100.0 / total)
            for i in range(Nstates)
        ]))

        # One row per site
        states = [int(Q_opt[t]) for t in range(T)]
        for t in range(T):
            s_lab = label.get(states[t], "Unknown State")
            # Per-state probability at site t from the alpha/beta passes
            gamma_t = (alpha[:, t] * beta[:, t]) / numpy.sum(
                alpha[:, t] * beta[:, t])
            genes_at_site = pos_hash.get(position[t], [""])
            genestr = ""
            # A lone empty entry means "no gene": leave the column blank
            if not (len(genes_at_site) == 1 and not genes_at_site[0]):
                genestr = ",".join([
                    "%s_(%s)" % (g, rv2info.get(g, "-")[0])
                    for g in genes_at_site
                ])

            self.output.write("%s\t%s\t%s\t%s\t%s\n" %
                              (int(position[t]), int(O[t]) - 1, "\t".join(
                                  ["%-9.2e" % g
                                   for g in gamma_t]), s_lab, genestr))

        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Finished HMM - Sites Method")
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="HMM - Sites")

        # Gene Files: "<name>_genes.<ext>" next to the sites file
        self.transit_message("Creating HMM Genes Level Output")
        genes_path = ".".join(self.output.name.split(
            ".")[:-1]) + "_genes." + self.output.name.split(".")[-1]

        tempObs = numpy.zeros((1, len(O)))
        tempObs[0, :] = O - 1
        self.post_process_genes(tempObs, position, states, genes_path)

        self.transit_message("Adding File: %s" % (genes_path))
        self.add_file(path=genes_path, filetype="HMM - Genes")
        self.finish()
        self.transit_message("Finished HMM Method")