def sdmaleph_runner(examples, mapping, ontologies=[], posClassVal=None,
                    cutoff=None, relations=[], minPos=defaults['minpos'],
                    noise=defaults['noise'], clauseLen=defaults['clauselength'],
                    dataFormat='tab'):
    """
    SDM-Aleph web service.

    Inputs:
        - examples : str, a .tab dataset or a list of pairs,
        - mapping : str, a mapping between examples and ontological terms,
        - ontologies : a list of {'ontology' : str} dicts,
        - relations : a list of {'relation' : str} dicts,
        - posClassVal : str, if the data is class-labeled, this is the target class,
        - cutoff : int, if the data is ranked, this is the cutoff value for splitting it into two classes,
        - minPos : int >= 1, minimum number of true positives per rule,
        - noise : int >= 0, false positives allowed per rule,
        - clauseLen : int >= 1, number of predicates per clause,
        - dataFormat : str, legal values are 'tab' or 'list'.

    Output:
        - str, the induced theory.

    @author: Anze Vavpetic, 2011 <*****@*****.**>
    """
    examples = StructuredFormat.parseInput(examples, dataFormat)
    mapping = StructuredFormat.parseMapping(mapping)
    relations = StructuredFormat.parseRelations(relations)

    # Split the examples into positive and negative sets, either by the target
    # class value (labeled data) or by the cutoff (ranked data).
    pos, neg = [], []
    if posClassVal:
        for id, val in examples:
            if val == posClassVal:
                pos.append((id, val))
            else:
                neg.append((id, val))
    elif cutoff:
        pos, neg = examples[:cutoff], examples[cutoff:]
    else:
        raise Exception('You must specify either the cutoff or the positive class value.')

    # Translate the ontologies, mapping and relations into Aleph input:
    # positive/negative examples and the background knowledge.
    posEx, negEx, b = OWL2X.get_aleph_input([ont['ontology'] for ont in ontologies],
                                            mapping,
                                            [rel['relation'] for rel in relations],
                                            pos, neg)
    filestem = str(uuid.uuid4())
    print '4'
    runner = Aleph()
    print '5'

    # Set parameters
    for setting, val in defaults.items():
        runner.set(setting, val)
    if minPos >= 1:
        runner.set('minpos', minPos)
    else:
        raise Exception('minPos must be >= 1.')
    if noise >= 0:
        runner.set('noise', noise)
    else:
        raise Exception('noise must be >= 0.')
    if clauseLen >= 1:
        runner.set('clauselength', clauseLen)
    else:
        raise Exception('clauseLen must be >= 1.')

    # Set eval script
    print '5.2'
    str_rules, dump = runner.induce(defaults['mode'], posEx, negEx, b, filestem=filestem)
    rules = __conv(dump, pos, neg)
    #rules_json = json.dumps(__conv(dump, pos, neg))
    print '5.5'

    # Serialize the induced rules together with their support, coverage and WRAcc scores.
    rules_w_scores = ''
    for rule in rules:
        rules_w_scores += '%s [sup=%d, cov=%d, wracc=%.3f]\n' % (rule['clause'],
                                                                 len(rule['posCovered']),
                                                                 len(rule['covered']),
                                                                 rule['wracc'])
    print '6'
    return rules_w_scores
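
# Illustrative sketch only, not part of the original service code: one way the
# sdmaleph_runner call above could be exercised with class-labeled .tab data.
# The example rows, URIs and the 'example_ontology.owl' file name below are
# hypothetical placeholders; the exact formats accepted by StructuredFormat and
# OWL2X are defined elsewhere in this package.
def _sdmaleph_usage_example():
    examples = 'gene1\tpos\ngene2\tneg\ngene3\tpos\n'                 # id<TAB>class rows
    mapping = ('gene1\thttp://example.org/onto#TermA\n'
               'gene2\thttp://example.org/onto#TermB\n'
               'gene3\thttp://example.org/onto#TermA\n')
    ontologies = [{'ontology': open('example_ontology.owl').read()}]  # hypothetical file
    # Labeled data, so posClassVal is given instead of a cutoff.
    return sdmaleph_runner(examples, mapping,
                           ontologies=ontologies,
                           posClassVal='pos',
                           minPos=2, noise=0, clauseLen=4,
                           dataFormat='tab')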
def run(self,
        inputData,            # List of the form [..., (id_i, rank_i or label_i), ...] or str.
        mapping,              # List of the form [..., (id_i, URI1, URI2, ...), ...] where id_i is annotated with the listed URIs, or str.
        ont1,                 # OWL ontologies as strings
        ont2 = None,
        ont3 = None,
        ont4 = None,
        interactions = [],    # List of the form [..., (id_i, id_j), ...] where id_i interacts with id_j, or str.
        generalTerms = [],
        legacy = False,
        posClassVal = None,
        cutoff = None,
        wracc_k = defaults[WRACC_K],
        minimalSetSize = defaults[MIN_SET_SIZE],
        maxNumTerms = defaults[MAX_NUM_TERMS],
        maxReported = defaults[MAX_REPORTED],
        maximalPvalue = defaults[MAX_P_VALUE],
        weightFisher = defaults[WEIGHT_FISHER],
        weightGSEA = defaults[WEIGHT_GSEA],
        weightPAGE = defaults[WEIGHT_PAGE],
        summarizeDescriptions = defaults[SUMMARIZE],
        randomSeed = defaults[RANDOM_SEED],
        level_ont1 = defaults[LEVEL_ONT1],
        level_ont2 = defaults[LEVEL_ONT2],
        level_ont3 = defaults[LEVEL_ONT3],
        level_ont4 = defaults[LEVEL_ONT4],
        dataFormat = StructuredFormat.FORMAT_TAB,
        progressFname = 'progress.txt',
        ):
    logger.info("Starting SDM-SEGS.")

    # Check if we have properly structured inputs or strings
    if type(inputData) in [str, unicode]:
        inputData = StructuredFormat.parseInput(inputData, dataFormat)
    if type(interactions) in [str, unicode]:
        interactions = StructuredFormat.parseInteractions(interactions)
    if type(mapping) in [str, unicode]:
        mapping = StructuredFormat.parseMapping(mapping)
    if type(generalTerms) in [str, unicode]:
        generalTerms = StructuredFormat.parseGeneralTerms(generalTerms)

    if posClassVal:
        # Labelled data
        pos, neg = [], []
        # Assure pos class instances are listed first.
        for iid, label in inputData:
            if label == posClassVal:
                pos.append((iid, label))
            else:
                neg.append((iid, label))
        cutoff = len(pos)
        pos.extend(neg)
        data = [[], []]
        for iid, label in pos:
            data[0].append(int(iid))
            data[1].append(0.5)
    else:
        # Assume ranked data
        if not cutoff:
            raise MissingParameterException("Cutoff needs to be specified for ranked data by the user!")
        data = [[], []]
        for iid, rank in inputData:
            data[0].append(int(iid))
            data[1].append(rank)
    inputData = data

    # Parse interactions into an id -> [interacting ids] map, then into the
    # sorted gene-to-gene list expected by SEGS.
    idToList = dict()
    for id1, id2 in interactions:
        if id1 not in idToList:
            idToList[id1] = []
        idToList[id1].append(id2)
    g2g = []
    for iid, idList in sorted(idToList.items(), key=lambda p: p[0]):
        g2g.append([iid, idList])

    if not legacy:
        import segs
        ont, g2ont = OWL2X.get_segs_input(filter(None, [ont1, ont2, ont3, ont4]), mapping)
        numOfOnt = len(filter(None, [ont1, ont2, ont3, ont4]))
    else:
        import segs_legacy as segs
        # Legacy input of segs - we assume it is already properly formatted
        g2ont = []
        for entry in mapping:
            g2ont.append([entry[0], entry[1]])
        ont = []
        for entry in StringIO.StringIO(ont1):
            ont.append(eval(entry))
        numOfOnt = 4

    # Create a map from GO terms to human-readable descriptions
    ontDict = dict()
    for entry in ont:
        goID = entry[0]
        name = entry[1][1]
        ontDict[goID] = name

    logger.info("Running SEGS subsystem.")
    segs_result = segs.runSEGS(generalTerms = generalTerms,
                               ontology = ont,
                               g2g = g2g,
                               g2ont = g2ont,
                               progressFname = progressFname,
                               inputData = inputData,
                               useMolFunctions = True,
                               useBioProcesses = numOfOnt > 1,
                               useCellComponents = numOfOnt > 2,
                               useKEGG = numOfOnt > 3,
                               useGeneInteractions = 1 if interactions else 0,
                               summarize = summarizeDescriptions,
                               cutoff = cutoff,
                               minSizeGS = minimalSetSize,
                               maxNumTerms = maxNumTerms,
                               GSEAfactor = SDMSEGS.locked[SDMSEGS.GSEA_FACTOR],
                               numIters = 0,
                               PrintTopGS = maxReported,
                               p_value = maximalPvalue if legacy else 1,
                               weightFisher = weightFisher,
                               weightGSEA = weightGSEA,
                               weightPAGE = weightPAGE,
                               randomSeed = randomSeed,
                               wracc_k = wracc_k,
                               level_ont1 = level_ont1,
                               level_ont2 = level_ont2,
                               level_ont3 = level_ont3,
                               level_ont4 = level_ont4)
    del segs
    logger.info("SDM-SEGS finished.")

    # Keep only rules with a positive WRAcc score and translate term IDs into
    # human-readable names.
    rules = []
    for _, segs_rule in segs_result['A']['WRAcc'].items():
        if segs_rule['scores']['wracc'] <= 0:
            continue
        rule = {
            'support' : segs_rule['topGenes'],
            'coverage' : segs_rule['allGenes'],
            'scores' : segs_rule['scores'],
            'terms' : [ontDict[term] for term in segs_rule['terms']
                       if isinstance(term, str)],
            'interacting_terms' : [ontDict[term] for term_list in segs_rule['terms']
                                   if isinstance(term_list, list)
                                   for term in term_list],
        }
        rules.append(rule)
    return rules
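
# Illustrative sketch only, not part of the original module: one way the run()
# method above might be invoked for ranked data, assuming it belongs to the
# SDMSEGS class referenced via SDMSEGS.locked. The identifiers, URIs, the
# 'example_go.owl' file name and the zero-argument construction of SDMSEGS are
# hypothetical placeholders.
def _sdmsegs_usage_example():
    ranked = [('1', 0.92), ('2', 0.85), ('3', 0.10)]     # (id, rank) pairs, best ranked first
    mapping = [('1', 'http://example.org/go#GO_0008150'),
               ('2', 'http://example.org/go#GO_0003674'),
               ('3', 'http://example.org/go#GO_0008150')]
    ont1 = open('example_go.owl').read()                 # hypothetical ontology file
    sdm = SDMSEGS()                                      # hypothetical construction
    rules = sdm.run(ranked, mapping, ont1,
                    cutoff=2,                            # ranked data, so a cutoff must be given
                    maxReported=20)
    for rule in rules:
        # Each rule is a dict with 'terms', 'interacting_terms', 'scores',
        # 'support' and 'coverage' keys.
        print rule['terms'], rule['scores']['wracc']
    return rules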