def compute_annotations_per_iteration(self, depth=5, saveas=None, bf_thresh=0.0, num_iterations=10, homologs=False):
    '''
    Given a file with iterative gene removal data, this function will compute
    the GO annotations at a given level (depth) for each iteration.
    @param depth: The GO ontology depth to use.
    @param saveas: Provide a filename to save the GO annotation results. This will
    be a python pickle file. Specify None to avoid saving the results to a file.
    @param bf_thresh: The bayes factor threshold. An annotation must have at least
    this bayes factor before being counted in the results.
    @param num_iterations: The number of iterations to process
    @param homologs: Select whether to include gene homologs in the analysis.
    '''
    iters = self.removed[:num_iterations]
    res = []
    print("Starting analysis...")
    for i, genelist in enumerate(iters):
        ifr.print_progress(i, len(iters))
        bcdat = self._gen_bar_chart_data(genelist, depth, bf_thresh=bf_thresh, homologs=homologs)
        res.append((i, bcdat, genelist))
    print("")
    if saveas is not None:
        # use a context manager so the pickle file is closed even on error
        # (original left the handle dangling: cPickle.dump(res, open(...)))
        with open(saveas, "wb") as f:
            cPickle.dump(res, f)
        print("Results saved to file: %s" % saveas)
    self.annotation_dat = res
    self.depth = depth  # for convenience, remember the depth used to generate the analysis
def get_geneids_from_affy(affy_id_list, affy_file=None):
    """
    Returns a dictionary mapping affy probe ids to the tuple
    (genebank, unigene, symbol), parsed from an affymetrix csv key file.
    @param affy_id_list: A list of strings like '1000_at'...
    NOTE(review): this parameter is currently unused -- the returned dict
    contains an entry for EVERY probe in the key file, not just the requested
    ids. Kept as-is to preserve caller-visible behavior; confirm intent.
    @param affy_file: If none, then the function get_affy_key_file() will be
    called to get the full file name and path to the csv file, else specify
    the filename/path.
    @return: dict { affy_probe_id : (genebank, unigene, symbol) }
    """
    if affy_file is None:
        affy_file = get_affy_key_file()
    affy_dict = {}
    with open(affy_file, "r") as f:
        # omit '#' comment lines; lines[0] is the column-header row
        lines = [ln for ln in f if ln[0] != "#"]
    data_lines = lines[1:]
    for i, ln in enumerate(data_lines):
        # progress total is the number of rows actually iterated
        # (original passed len(lines), which includes the header row,
        # so the progress bar could never reach 100%)
        ifr.print_progress(i, len(data_lines))
        tmp = ifr.smart_split(ln, sep=",")
        key = tmp[0]
        genebank = tmp[8]
        unigene = tmp[10]
        symbol = tmp[14]
        affy_dict[key] = (genebank, unigene, symbol)
    return affy_dict
def all_IFR_pairs_classification(ifr, numIter=20):
    '''
    Computes the classification accuracy using all pairs of IFR iterations
    from the flu_genelist SSVM IFR.
    @param ifr: Object/module providing load_flu_mat, load_H1N1_mat,
    get_removed_features, and print_progress.
    @param numIter: Use the first numIter IFR iterations.
    @return: (res, test_acc_list) where res is the per-pair result tuples
    sorted in descending order, and test_acc_list holds the single-iteration
    L2 SVM test accuracies.
    '''
    acc = {}  # the classifier accuracy combining iteration i with j
    (D, L, _, _) = ifr.load_flu_mat()
    (D2, L2, _, _) = ifr.load_H1N1_mat()
    removed = ifr.get_removed_features()
    # compute L2 SVM test accuracies for all iterations up to numIter
    print("Computing L2 SVM test accuracies for each IFR iteration.")
    test_acc_list = []
    for x in range(numIter):
        glx = removed[x]
        tmp = pathway_classification(glx, (D, L), (D2, L2))
        test_acc_list.append(tmp[0])
    print("Computing L2 SVM test accuracies for all pairs of IFR iterations.")
    cur = 0
    # number of unordered pairs is C(numIter, 2) = numIter*(numIter-1)/2.
    # The original computed (numIter/2)*(numIter-1), which truncates first
    # and undercounts the total whenever numIter is odd.
    total = (numIter * (numIter - 1)) // 2
    for i in range(numIter):
        for j in range(i + 1, numIter):
            ifr.print_progress(cur, total)
            cur += 1
            gl = removed[i] + removed[j]
            a1 = test_acc_list[i]
            a2 = test_acc_list[j]
            max_acc = max(a1, a2)
            # compute combined accuracy using genes from both iterations
            rc = pathway_classification(gl, (D, L), (D2, L2))
            # store the results
            acc[(i, j)] = (rc[0], "IFR %d + IFR %d" % (i, j), a1, a2, max_acc)
    res = sorted(acc.values(), reverse=True)
    return res, test_acc_list
def gen_affy_to_geneId_dict(affy_file_subdir="HG_U95A.na33.annot", affy_fn="HG_U95A.na33.annot.csv"):
    """
    Converts a list of affymetric probe set ids into a genelist with names
    suitable for querying gather or kegg. Generates a dictionary with entries
    { affy_id : gene_id_list }. Most times, gene_id_list will have only a
    single entry, but several probes have multiple Gene IDs given.
    @param affy_file_subdir: The subdirectory of the ifr.DATA_DIR that has the
    HG_U95A.na33.annot.csv file.
    @param affy_fn: The csv file in the subdirectory with the data. The
    parameter is provided in case the file was renamed from the orginal name
    of "HG_U95A.na33.annot.csv"
    @note: Relies on a data file called HG_U95A.na33.annot.csv that must be
    present in the HG_U95A.na33.annot subdirectory of the linked Data
    directory. It would be most efficient to use this function once and save
    the resulting dictionary in a pickle file for later use instead of having
    to re-parse the data.
    """
    affy_dict = {}
    affy_file = os.path.join(ifr.DATA_DIR, affy_file_subdir, affy_fn)
    with open(affy_file, "r") as f:
        # omit '#' comment lines; lines[0] is the column-header row
        lines = [ln for ln in f if ln[0] != "#"]
    data_lines = lines[1:]
    for i, ln in enumerate(data_lines):
        # progress total is the number of rows actually iterated
        # (original passed len(lines), which includes the header row,
        # so the progress bar could never reach 100%)
        ifr.print_progress(i, len(data_lines))
        tmp = ifr.smart_split(ln, sep=",")
        key = tmp[0]
        val = tmp[14]
        if val == "---":
            # this affy id has no gene symbol
            genelist = []
        elif "///" in val:
            # there are more than one GeneIds for this probe
            genelist = [x.strip() for x in val.split("///") if x.strip() != ""]
        else:
            genelist = [val]
        # remove duplicates. NOTE(review): set() does not preserve the file's
        # gene order -- fine if callers treat the value as a set; confirm.
        affy_dict[key] = list(set(genelist))
    return affy_dict