def fix_affy_dict(affy_dict):
    """
    Given an affy dict in form { probe_id:(genebank,unigene,symbol)},
    such as generated by get_geneids_from_affy() by reading a CSV file,
    this attempts to fix those entries where the symbol was missing or
    multiple-valued by doing a live query against the latest symbol
    information from ncbi web sources.
    """
    # N = len(affy_dict)
    new_dict = {}
    for ix, k in enumerate(affy_dict.keys()):
        lookup = False
        # svo_util.print_progress(ix, N)
        (genebank, unigene, symbol) = affy_dict[k]
        if symbol == "---":
            print "(%d) %s unknown" % ((ix + 1), k),
            lookup = True
        elif "///" in symbol:
            print "(%d) %s ambiguous" % ((ix + 1), k),
            lookup = True

        if lookup:
            if (unigene == "---") or ("///" in unigene):
                symbol = ifr.get_official_name(genebank)
            else:
                # if we have a unigene name, try it first,
                # but if it doesn't work, then try the genebank
                symbol = ifr.get_official_name(unigene)
                if symbol == unigene:
                    symbol = ifr.get_official_name(genebank)
            print " = %s" % symbol

        new_dict[k] = (genebank, unigene, symbol)

    return new_dict
 def _pathway_classification(self, genelist, common_scale=False):
     '''
     Run the standard (L2) SVM classifier on the features named by a gene list.

     Each gene in genelist (for example the genes belonging to one pathway) is mapped to its
     position in self.feature_names. Entries of self.known_aliases are followed first, and genes
     listed in self.known_bad are skipped. A gene that is not a feature name is looked up with
     ifr.get_official_name(); the outcome is recorded in self.known_aliases (a different name was
     returned) or self.known_bad (nothing usable was returned), so both collections may be
     modified by this call. Genes that still cannot be mapped are left out.

     @param genelist: iterable of gene names
     @param common_scale: passed through unchanged to ifr.svm_engine
     @return: the return value of ifr.svm_engine for self.traindat / self.testdat, with perm set
     to the list of resolved feature indexes
     '''
     (train_data, train_labels) = self.traindat
     (test_data, test_labels) = self.testdat
     names = self.feature_names

     selected = []
     for gene in genelist:
         # An alias can itself be the key of another alias, so follow the chain to its end.
         while gene in self.known_aliases:
             gene = self.known_aliases[gene]

         # Skip the names already known to be unresolvable.
         if gene in self.known_bad:
             continue

         try:
             column = names.index(gene)
         except ValueError:
             column = None

         if column is None:
             print("Gene %s is not a known feature name. Querying for official name..." % gene)
             official = ifr.get_official_name(gene, entrezgene=True)

             if (official is None) or (official == gene):
                 # Remember the failure so this name is not queried again.
                 self.known_bad.append(gene)
                 print("No other official name found, gene will be skipped.")
                 continue

             print("Substituting %s for %s." % (official, gene))
             self.known_aliases[gene] = official

             try:
                 column = names.index(official)
             except ValueError:
                 # An official name was found, but it is still not in the feature list.
                 continue

         selected.append(column)

     return ifr.svm_engine((train_data, train_labels), (test_data, test_labels), perm=selected,
                           common_scale=common_scale, verbose=False, no_normalization=False,
                           loss="L2", penalty="L2", C=1.0)
def pathway_classification(genelist, feature_names=None, traindat=None, testdat=None, return_err_idxs=False,
                           common_scale=False, verbose=True):
    '''
    Given a genelist from the flu data set, such as a collection of genes belonging to a specific
    pathway, this function computes the accuracy of a standard (L2) SVM classifier.

    Each gene is mapped to its index in the feature names, and the resulting index list is passed
    to ifr.svm_engine as perm. A gene that is not a feature name is looked up with
    ifr.get_official_name(); the outcome is stored in the module-level PC_KNOWN_ALIASES (a
    different name was returned) or PC_KNOWN_BAD (nothing usable was returned), so both are
    modified as a side effect and consulted before any new query. Genes that cannot be mapped
    are skipped.

    @param genelist: The list of genes forming the pathway
    @param feature_names: The list of feature (gene) names associated with the columns of test/train data.
    The feature_names must contain the genes in genelist, or a lookup will be performed to find the matching alias.
    If None, then feature names will be the genes from the Duke Influenza data.
    @param traindat: If None, then H3N2 data will be used. Else, specify the tuple (D,L) where D is the
    data matrix (samples in rows) and L is the label vector
    @param testdat: If None, then H1N1 data will be used. Else, specify (D2,L2) tuple for test data.
    @param return_err_idxs: If true, then the indexes in the test set where the classifier is wrong will
    be returned.
    @param common_scale: If true, then the test data will be scaled using the training data mean/std, else
    it will be scaled using its own mean/std.
    @param verbose: If true, the alias-resolution progress messages are printed. If false, this
    function prints nothing itself.
    @return: Either returns rc or rc, err_set, where rc is the return from the svm engine, which is the tuple
    (test_accuracy, factors, clf, train_accuracy). See L{ifr.svm_engine}. err_set is the set of
    test-sample indexes where rc[4] differs from the test labels.
    NOTE(review): the tuple described above has four elements, yet rc[4] is read below, so the
    engine presumably returns the test predictions as a fifth element -- confirm against
    ifr.svm_engine.
    '''
    global PC_KNOWN_BAD
    global PC_KNOWN_ALIASES

    if traindat is None:
        (D, L, _, _) = ifr.load_flu_mat()
    else:
        (D, L) = traindat

    if testdat is None:
        (D2, L2, _, _) = ifr.load_H1N1_mat()
    else:
        (D2, L2) = testdat

    if feature_names is not None:
        G = feature_names
    else:
        G = ifr.load_gene_ids(short_name_only=True)

    idxs = []
    for gene in genelist:
        # An alias could be the key to another alias, so go down the chain until no
        # further substitution is found.
        while gene in PC_KNOWN_ALIASES:
            target = PC_KNOWN_ALIASES[gene]
            if verbose:
                print("%s is known alias to %s." % (gene, target))
            gene = target

        if gene in PC_KNOWN_BAD:
            if verbose:
                print("%s is known bad, skipping gene." % gene)
            continue

        try:
            idx = G.index(gene)
        except ValueError:
            idx = None

        if idx is None:
            # gene is probably an alias to a gene name in G
            if verbose:
                print("Gene %s is not a known feature name. Querying for official name..." % gene)
            official = ifr.get_official_name(gene, entrezgene=True)

            if (official is None) or (official == gene):
                if verbose:
                    print("No other official name found, gene will be skipped.")
                # Remember the failure so later calls do not repeat the query.
                PC_KNOWN_BAD.append(gene)
                continue

            # This message used to be printed even with verbose=False; it now honours the
            # flag like every other message in this function.
            if verbose:
                print("Substituting %s for %s." % (official, gene))
            PC_KNOWN_ALIASES[gene] = official

            try:
                idx = G.index(official)
            except ValueError:
                # happens when an official name is found, but it is still not in the list
                continue

        idxs.append(idx)

    rc = ifr.svm_engine((D, L), (D2, L2), perm=idxs, common_scale=common_scale, verbose=False,
                        no_normalization=False, loss="L2", penalty="L2", C=1.0)

    if not return_err_idxs:
        return rc

    IX = sp.array(range(len(L2)))
    errs = IX[rc[4] != L2]  # indexes where prediction is not correct
    err_set = set(errs)
    return rc, err_set