Ejemplo n.º 1
0
def sdmaleph_runner(examples, mapping, ontologies=[], posClassVal=None, cutoff=None, relations=[],
        minPos=defaults['minpos'], noise=defaults['noise'], clauseLen=defaults['clauselength'], dataFormat='tab'):
    """
    SDM-Aleph web service.

    Splits the examples into positives and negatives, converts them together
    with the ontologies, mapping and relations into Aleph input, runs Aleph
    and returns the induced rules annotated with their scores.

    Inputs:
        - examples: str, a .tab dataset or a list of pairs
        - mapping : str, a mapping between examples and ontological terms,
        - ontologies : a list of {'ontology' : str} dicts
        - relations : a list of {'relation' : str} dicts
        - posClassVal : str, if the data is class-labeled, this is the target class,
        - cutoff : int, if the data is ranked, the first `cutoff` examples are
          taken as positives and the rest as negatives (ignored when
          posClassVal is given),
        - minPos : int >= 1, minimum number of true positives per rule
        - noise : int >= 0, false positives allowed per rule
        - clauseLen : int >= 1, number of predicates per clause,
        - dataFormat : str, legal values are 'tab' or 'list'
    Output:
        - str, one line per induced rule of the form
          '<clause> [sup=<n>, cov=<n>, wracc=<x.xxx>]'.
    Raises:
        - Exception, if neither posClassVal nor cutoff is given, or if minPos,
          noise or clauseLen is out of range.

    @author: Anze Vavpetic, 2011 <*****@*****.**>
    """
    examples = StructuredFormat.parseInput(examples, dataFormat)
    mapping = StructuredFormat.parseMapping(mapping)
    relations = StructuredFormat.parseRelations(relations)

    # Split the examples into positives and negatives.
    pos, neg = [], []
    if posClassVal:
        for ex_id, val in examples:
            if val == posClassVal:
                pos.append((ex_id, val))
            else:
                neg.append((ex_id, val))
    elif cutoff:
        pos, neg = examples[:cutoff], examples[cutoff:]
    else:
        raise Exception('You must specify either the cutoff or the positive class value.')

    # Validate the learner parameters before the (potentially expensive)
    # ontology conversion, so that bad input fails fast.
    if not minPos >= 1:
        raise Exception('minPos must be >= 1.')
    if not noise >= 0:
        raise Exception('noise must be >= 0.')
    if not clauseLen >= 1:
        raise Exception('clauseLen must be >= 1.')

    posEx, negEx, b = OWL2X.get_aleph_input(
        [ont['ontology'] for ont in ontologies],
        mapping,
        [rel['relation'] for rel in relations],
        pos,
        neg)
    # Unique file stem handed to Aleph for this run.
    filestem = str(uuid.uuid4())

    runner = Aleph()
    # Start from the module defaults, then override with the caller's values.
    for setting, val in defaults.items():
        runner.set(setting, val)
    runner.set('minpos', minPos)
    runner.set('noise', noise)
    runner.set('clauselength', clauseLen)

    # Only the dump is needed; the textual theory returned by Aleph is
    # replaced below by the score-annotated version.
    _, dump = runner.induce(defaults['mode'], posEx, negEx, b, filestem=filestem)
    rules = __conv(dump, pos, neg)

    return ''.join(
        '%s [sup=%d, cov=%d, wracc=%.3f]\n' % (
            rule['clause'],
            len(rule['posCovered']),
            len(rule['covered']),
            rule['wracc'])
        for rule in rules)
Ejemplo n.º 2
0
    def run(self,
            inputData,           # List of the form [..., (id_i, rank_i or label_i), ...] or str.
            mapping,             # List of the form [..., (id_i, URI1, URI2, ...), ...] where id_i is annotated with the listed URIs, or str.
            ont1,                # OWL ontologies as strings
            ont2 = None,
            ont3 = None,
            ont4 = None,
            interactions = [],        # List of the form [..., (id_i, id_j), ...] where id_i interacts with id_j or str.
            generalTerms = [],
            legacy = False,
            posClassVal = None,
            cutoff = None,
            wracc_k = defaults[WRACC_K],
            minimalSetSize = defaults[MIN_SET_SIZE],
            maxNumTerms = defaults[MAX_NUM_TERMS],
            maxReported = defaults[MAX_REPORTED],
            maximalPvalue = defaults[MAX_P_VALUE],
            weightFisher = defaults[WEIGHT_FISHER],
            weightGSEA = defaults[WEIGHT_GSEA],
            weightPAGE = defaults[WEIGHT_PAGE],
            summarizeDescriptions = defaults[SUMMARIZE],
            randomSeed = defaults[RANDOM_SEED],
            level_ont1 = defaults[LEVEL_ONT1],
            level_ont2 = defaults[LEVEL_ONT2],
            level_ont3 = defaults[LEVEL_ONT3],
            level_ont4 = defaults[LEVEL_ONT4],
            dataFormat = StructuredFormat.FORMAT_TAB,
            progressFname = 'progress.txt',
            ):
        """Run SDM-SEGS and return the rules whose WRAcc score is positive.

        Inputs given as strings (inputData, interactions, mapping,
        generalTerms) are first parsed with the StructuredFormat helpers.

        Data preparation:
            - If posClassVal is given, the data is treated as class-labelled:
              positive instances are listed first, every instance gets the
              constant score 0.5, and cutoff is overwritten with the number
              of positives.
            - Otherwise the data is treated as ranked and cutoff is required.
            - Instance ids are converted with int() in both cases.

        Ontology input:
            - legacy=False: the non-empty ontologies among ont1..ont4 are
              converted together with mapping by OWL2X.get_segs_input.
            - legacy=True: mapping and ont1 are assumed to be already in the
              SEGS format; each line of ont1 is evaluated as a Python
              expression, and four ontologies are assumed.

        The remaining parameters are forwarded to segs.runSEGS; maximalPvalue
        is only used in legacy mode (otherwise a p-value of 1 is passed).

        Returns:
            A list of dicts with the keys 'support', 'coverage', 'scores',
            'terms' and 'interacting_terms'.

        Raises:
            MissingParameterException: if neither posClassVal nor cutoff is
                given.
        """
        logger.info("Starting SDM-SEGS.")

        # Inputs may arrive as raw text; parse those into structured form.
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of the string types.
        string_types = (str, unicode)
        if isinstance(inputData, string_types):
            inputData = StructuredFormat.parseInput(inputData, dataFormat)
        if isinstance(interactions, string_types):
            interactions = StructuredFormat.parseInteractions(interactions)
        if isinstance(mapping, string_types):
            mapping = StructuredFormat.parseMapping(mapping)
        if isinstance(generalTerms, string_types):
            generalTerms = StructuredFormat.parseGeneralTerms(generalTerms)

        if posClassVal:
            # Labelled data: make sure positive class instances come first.
            pos_ids, neg_ids = [], []
            for iid, label in inputData:
                if label == posClassVal:
                    pos_ids.append(iid)
                else:
                    neg_ids.append(iid)
            cutoff = len(pos_ids)
            ordered_ids = pos_ids + neg_ids
            # Every labelled instance receives the same score.
            data = [[int(iid) for iid in ordered_ids], [0.5] * len(ordered_ids)]
        else:
            # Assume ranked data
            if not cutoff:
                raise MissingParameterException("Cutoff needs to be specified for ranked data by the user!")
            data = [[], []]
            for iid, rank in inputData:
                data[0].append(int(iid))
                data[1].append(rank)
        inputData = data

        # Group interactions into [id, [interacting ids...]] entries, ordered by id.
        idToList = {}
        for id1, id2 in interactions:
            idToList.setdefault(id1, []).append(id2)
        g2g = [[iid, idToList[iid]] for iid in sorted(idToList)]

        if not legacy:
            import segs
            # Only the ontologies that were actually supplied.
            ontologies = [o for o in (ont1, ont2, ont3, ont4) if o]
            ont, g2ont = OWL2X.get_segs_input(ontologies, mapping)
            numOfOnt = len(ontologies)
        else:
            import segs_legacy as segs
            # Legacy input of segs - we assume it is already properly formatted
            g2ont = [[entry[0], entry[1]] for entry in mapping]
            ont = []
            # SECURITY NOTE: eval() executes arbitrary Python code. ont1 must
            # come from a trusted source; if it can be user-supplied, replace
            # this with a safe parser (e.g. ast.literal_eval) after confirming
            # the legacy format consists of literals only.
            for entry in StringIO.StringIO(ont1):
                ont.append(eval(entry))
            numOfOnt = 4

        # Create a map from go terms to human-readable descriptions
        ontDict = dict((entry[0], entry[1][1]) for entry in ont)

        logger.info("Running SEGS subsystem.")
        segs_result = segs.runSEGS(
            generalTerms = generalTerms,
            ontology = ont,
            g2g = g2g,
            g2ont = g2ont,
            progressFname = progressFname,
            inputData = inputData,
            useMolFunctions = True,
            useBioProcesses = numOfOnt > 1,
            useCellComponents = numOfOnt > 2,
            useKEGG = numOfOnt > 3,
            useGeneInteractions = 1 if interactions else 0,
            summarize = summarizeDescriptions,
            cutoff = cutoff,
            minSizeGS = minimalSetSize,
            maxNumTerms = maxNumTerms,
            GSEAfactor = SDMSEGS.locked[SDMSEGS.GSEA_FACTOR],
            numIters = 0,
            PrintTopGS = maxReported,
            p_value = maximalPvalue if legacy else 1,
            weightFisher = weightFisher,
            weightGSEA = weightGSEA,
            weightPAGE = weightPAGE,
            randomSeed = randomSeed,
            wracc_k = wracc_k,
            level_ont1 = level_ont1,
            level_ont2 = level_ont2,
            level_ont3 = level_ont3,
            level_ont4 = level_ont4)
        del segs
        logger.info("SDM-SEGS finished.")

        # Keep only rules with a positive WRAcc and translate term ids to names.
        rules = []
        for segs_rule in segs_result['A']['WRAcc'].values():
            if segs_rule['scores']['wracc'] <= 0:
                continue
            terms = segs_rule['terms']
            # Plain entries are single terms; list entries hold interacting terms.
            # NOTE(review): unicode term ids would not pass the str check
            # below - confirm that term ids are always byte strings.
            rules.append({
                'support' : segs_rule['topGenes'],
                'coverage' : segs_rule['allGenes'],
                'scores' : segs_rule['scores'],
                'terms' : [ontDict[term] for term in terms if isinstance(term, str)],
                'interacting_terms' : [ontDict[term] for term_list in terms if isinstance(term_list, list) for term in term_list],
            })
        return rules