def buildExamplesForDocuments(self,
                                  documentSentences,
                                  output,
                                  idFileTag=None):
        """
        Build examples for each document and write them to a file.

        documentSentences -- sequence of documents; each document is passed
            whole to self.buildExamples, and the "id" of the ``sentence``
            element of its first item is shown in the progress message
        output -- path of the example file (opened with mode "wt", so an
            existing file is overwritten)
        idFileTag -- if not None, class names are saved to
            idFileTag + ".class_names" and feature names to
            idFileTag + ".feature_names"
        """
        counter = ProgressCounter(len(documentSentences), "Build examples")

        exampleCount = 0
        # The with-block guarantees that the file is closed even if building
        # the examples of some document raises an exception.
        with open(output, "wt") as outfile:
            for document in documentSentences:
                counter.update(
                    1, "Building examples (" +
                    document[0].sentence.get("id") + "): ")
                examples = self.buildExamples(document)
                exampleCount += len(examples)
                # preProcessExamples is not applied here; the examples are
                # written as returned by buildExamples.
                ExampleUtils.appendExamples(examples, outfile)

        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag is not None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
Ejemplo n.º 2
0
    def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
        """
        Build examples for each sentence and write them to a file.

        sentences -- sequence of sentences; the first item of each entry is
            the object passed to self.buildExamples (its getSentenceId() is
            shown in the progress message)
        goldSentences -- None, or a sequence parallel to sentences whose
            entries' first items are passed as the gold counterpart
        output -- path of the example file
        idFileTag -- if not None, class names are saved to
            idFileTag + ".class_names" and feature names to
            idFileTag + ".feature_names"
        append -- if True the output file is opened in append mode ("at"),
            otherwise it is overwritten ("wt"); the flag is also forwarded
            to self.buildExamples
        """
        counter = ProgressCounter(len(sentences), "Build examples")

        exampleCount = 0
        # The with-block closes the file even if example building fails.
        with open(output, "at" if append else "wt") as outfile:
            for i, sentence in enumerate(sentences):
                # Without gold data, None is passed as the gold sentence
                goldSentence = [None]
                if goldSentences is not None:
                    goldSentence = goldSentences[i]
                counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
                examples = self.buildExamples(sentence[0], goldSentence[0], append=append)
                # The count is taken before preprocessing
                exampleCount += len(examples)
                examples = self.preProcessExamples(examples)
                ExampleUtils.appendExamples(examples, outfile)

        print >>sys.stderr, "Examples built:", exampleCount
        print >>sys.stderr, "Features:", len(self.featureSet.getNames())
        # IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        # ENDIF
        # Save Ids
        if idFileTag is not None:
            print >>sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
Ejemplo n.º 3
0
    def buildExamplesForSentences(self,
                                  sentences,
                                  goldSentences,
                                  output,
                                  idFileTag=None,
                                  append=False):
        """
        Build examples for each sentence and write them to a file.

        sentences -- sequence of sentences; the first item of each entry is
            the object passed to self.buildExamples (its getSentenceId() is
            shown in the progress message)
        goldSentences -- None, or a sequence parallel to sentences whose
            entries' first items are passed as the gold counterpart
        output -- path of the example file
        idFileTag -- if not None, class names are saved to
            idFileTag + ".class_names" and feature names to
            idFileTag + ".feature_names"
        append -- if True the output file is opened in append mode ("at"),
            otherwise it is overwritten ("wt"); the flag is also forwarded
            to self.buildExamples
        """
        counter = ProgressCounter(len(sentences), "Build examples")

        fileMode = "at" if append else "wt"
        exampleCount = 0
        # The with-block closes the file even if example building fails.
        with open(output, fileMode) as outfile:
            for i, sentence in enumerate(sentences):
                # Without gold data, None is passed as the gold sentence
                goldSentence = [None]
                if goldSentences is not None:
                    goldSentence = goldSentences[i]
                counter.update(
                    1,
                    "Building examples (" + sentence[0].getSentenceId() + "): ")
                examples = self.buildExamples(sentence[0],
                                              goldSentence[0],
                                              append=append)
                # The count is taken before preprocessing
                exampleCount += len(examples)
                examples = self.preProcessExamples(examples)
                ExampleUtils.appendExamples(examples, outfile)

        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag is not None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    """
    Add pairwise combination features to the examples of a file.

    For every example read from exampleFile, each pair (wI, wJ) of weight
    features, with wI listed before wJ in weightFeatures, whose ids are
    both present in the example's feature dictionary adds a feature named
    wI + "_AND_" + wJ with value 1. The examples are then written to
    outFile.

    exampleFile -- path of the example file to read
    outFile -- path of the example file to write (overwritten)
    weightFeatures -- indexable sequence of feature names to combine
    idSet -- object mapping feature names to ids via getId(); every weight
        feature must already have an id in it

    Terminates via sys.exit with an error message if a weight feature has
    no id in idSet.
    """
    # The with-block closes the output even when sys.exit is called below
    # or when reading/writing the examples fails.
    with open(outFile, "wt") as outStream:
        addCount = 0

        # Count the lines first so that progress can be reported
        with open(exampleFile) as f:
            numExamples = sum(1 for _ in f)
        counter = ProgressCounter(numExamples, "Polynomize examples", step=0)

        weightFeatureIds = {}
        for weightFeature in weightFeatures:
            wId = idSet.getId(weightFeature, False)
            if wId is None:
                # sys.exit accepts a single argument, so the message has to
                # be one string
                sys.exit("Weight vector feature " + str(weightFeature) +
                         " not in id file")
            weightFeatureIds[weightFeature] = wId

        # Single-string print calls give the same output on Python 2 and 3
        print("Polynomizing " + str(exampleFile))
        exampleCache = []
        for example in ExampleUtils.readExamples(exampleFile):
            counter.update(1, "Processing example (" + example[0] + "): ")
            features = example[2]
            for i in range(len(weightFeatures) - 1):
                wI = weightFeatures[i]
                if weightFeatureIds[wI] not in features:
                    continue
                for j in range(i + 1, len(weightFeatures)):
                    wJ = weightFeatures[j]
                    if weightFeatureIds[wJ] not in features:
                        continue
                    # Make polynomial feature
                    features[idSet.getId(wI + "_AND_" + wJ)] = 1
                    addCount += 1
            # Write the examples out in batches
            exampleCache.append(example)
            if len(exampleCache) > 50:
                ExampleUtils.appendExamples(exampleCache, outStream)
                exampleCache = []
        # Flush the last, partial batch
        ExampleUtils.appendExamples(exampleCache, outStream)
    print("Added " + str(addCount) + " polynomial features")
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    """
    Add pairwise combination features to the examples of a file.

    For every example read from exampleFile, each pair (wI, wJ) of weight
    features, with wI listed before wJ in weightFeatures, whose ids are
    both present in the example's feature dictionary adds a feature named
    wI + "_AND_" + wJ with value 1. The examples are then written to
    outFile.

    exampleFile -- path of the example file to read
    outFile -- path of the example file to write (overwritten)
    weightFeatures -- indexable sequence of feature names to combine
    idSet -- object mapping feature names to ids via getId(); every weight
        feature must already have an id in it

    Terminates via sys.exit with an error message if a weight feature has
    no id in idSet.
    """
    # The with-block closes the output even when sys.exit is called below
    # or when reading/writing the examples fails.
    with open(outFile, "wt") as outStream:
        addCount = 0

        # Count the lines first so that progress can be reported
        with open(exampleFile) as f:
            numExamples = sum(1 for _ in f)
        counter = ProgressCounter(numExamples, "Polynomize examples", step=0)

        weightFeatureIds = {}
        for weightFeature in weightFeatures:
            wId = idSet.getId(weightFeature, False)
            if wId is None:
                # sys.exit accepts a single argument, so the message has to
                # be one string
                sys.exit("Weight vector feature " + str(weightFeature) +
                         " not in id file")
            weightFeatureIds[weightFeature] = wId

        # Single-string print calls give the same output on Python 2 and 3
        print("Polynomizing " + str(exampleFile))
        exampleCache = []
        for example in ExampleUtils.readExamples(exampleFile):
            counter.update(1, "Processing example ("+example[0]+"): ")
            features = example[2]
            for i in range(len(weightFeatures)-1):
                wI = weightFeatures[i]
                if weightFeatureIds[wI] not in features:
                    continue
                for j in range(i + 1, len(weightFeatures)):
                    wJ = weightFeatures[j]
                    if weightFeatureIds[wJ] not in features:
                        continue
                    # Make polynomial feature
                    features[idSet.getId(wI + "_AND_" + wJ)] = 1
                    addCount += 1
            # Write the examples out in batches
            exampleCache.append(example)
            if len(exampleCache) > 50:
                ExampleUtils.appendExamples(exampleCache, outStream)
                exampleCache = []
        # Flush the last, partial batch
        ExampleUtils.appendExamples(exampleCache, outStream)
    print("Added " + str(addCount) + " polynomial features")
Ejemplo n.º 6
0
def buildExamples(exampleBuilder, sentences, outfilename):
    """
    Build examples for all sentences with exampleBuilder and write them
    to the file outfilename.

    exampleBuilder -- object providing styles, buildExamples(),
        preProcessExamples() and featureSet
    sentences -- sequence of sentences; the first item of each entry is the
        object passed to exampleBuilder.buildExamples
    outfilename -- path of the example file (overwritten)

    Progress, the number of examples and features, and the elapsed time
    are reported on stderr.
    """
    timer = Timer()
    if "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")
    
    calculatePredictedRange(exampleBuilder, sentences)
    
    exampleCount = 0
    # The with-block closes the file even if example building fails.
    with open(outfilename, "wt") as outfile:
        for sentence in sentences:
            counter.update(1, "Building examples ("+sentence[0].getSentenceId()+"): ")
            examples = exampleBuilder.buildExamples(sentence[0])
            # The count is taken before preprocessing
            exampleCount += len(examples)
            examples = exampleBuilder.preProcessExamples(examples)
            Example.appendExamples(examples, outfile)

    print >> sys.stderr, "Examples built:", str(exampleCount)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
Ejemplo n.º 7
0
def buildExamples(exampleBuilder, sentences, outfilename):
    """
    Build examples for all sentences with exampleBuilder and write them
    to the file outfilename.

    exampleBuilder -- object providing styles, buildExamples(),
        preProcessExamples() and featureSet
    sentences -- sequence of sentences; the first item of each entry is the
        object passed to exampleBuilder.buildExamples
    outfilename -- path of the example file (overwritten)

    Progress, the number of examples and features, and the elapsed time
    are reported on stderr.
    """
    timer = Timer()
    if "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")

    calculatePredictedRange(exampleBuilder, sentences)

    exampleCount = 0
    # The with-block closes the file even if example building fails.
    with open(outfilename, "wt") as outfile:
        for sentence in sentences:
            counter.update(
                1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = exampleBuilder.buildExamples(sentence[0])
            # The count is taken before preprocessing
            exampleCount += len(examples)
            examples = exampleBuilder.preProcessExamples(examples)
            Example.appendExamples(examples, outfile)

    print >> sys.stderr, "Examples built:", str(exampleCount)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
Ejemplo n.º 8
0
    def buildExamplesFromGraph(self,
                               sentenceGraph,
                               outfile,
                               goldGraph=None,
                               structureAnalyzer=None):
        """
        Build one example for each token of the sentence that passes the
        style-dependent filters, and append each example to outfile.

        Every example is a tuple (id, category, features, extra), where id
        is the sentence id followed by ".x" and a running index, category
        is the class id of the token's merged entity type ("neg" for tokens
        that are not entity heads), features maps feature ids to values and
        extra holds metadata such as the token id.

        sentenceGraph -- the sentence to process; its tokens, entities,
            dependencyGraph, tokenIsName, tokenIsEntityHead and
            sentenceElement are read here
        outfile -- output handle passed to ExampleUtils.appendExamples
        goldGraph -- not used by this method
        structureAnalyzer -- optional; if given and it reports no
            ("GIVEN", "ENTITY") group class, examples are built also for
            sentences without given entities (unless overridden by the
            "build_for_nameless" / "skip_for_nameless" styles)

        Returns the number of examples written. Returns 0 without writing
        anything if the sentence's origId is in self.skiplist, or if the
        "names" style is off, the sentence has no given entities and
        building for such sentences is not enabled.
        """
        # Sentences listed in self.skiplist produce no examples
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get(
                "origId")
            return 0  #[]

        #examples = []
        exampleIndex = 0

        # Reset the per-sentence token feature dictionaries (not read in
        # this method; presumably used by helper methods -- verify)
        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass(
                "GIVEN", "ENTITY"
        ):  # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]:  # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]:  # manually force the setting
            buildForNameless = False

        # determine whether sentences with no given entities should be skipped
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get(
                        "given"
                ) == "True":  # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not buildForNameless:  # no names, no need for triggers
                return 0  #[]

            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(
                    sentenceGraph)
        else:
            # With the "names" style no token is treated as a name
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False

        # Sentence-level bag-of-words counts; tokens flagged as names also
        # get an additional "ne_"-prefixed entry
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        # Convert the counts to feature ids once; the same dictionary is
        # merged into the feature vector of every example below. The keys
        # are visited in sorted order (presumably to request feature ids
        # in a deterministic order).
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

        # Store the dependency edges of each token on self; inEdgesByToken
        # and outEdgesByToken are read below for the attached-edge features
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        # Build (at most) one example per token
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(
                    sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)

            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles[
                    "names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            # With "pos_only", negative tokens produce no example
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            # Tokens whose class name has no id in the class set are skipped
            category = self.classSet.getId(categoryName)
            if category == None:
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue

            # NOTE(review): this value is not read by the active code below;
            # tokenText is reassigned inside the attached-edge loops
            tokenText = token.get("text").lower()
            #            if "stem_gazetteer" in self.styles:
            #                tokenText = PorterStemmer.stem(tokenText)
            #            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            #                features = {}
            #                features[self.featureSet.getId("exclude_gazetteer")] = 1
            #                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            #                if entityIds != None:
            #                    extra["goldIds"] = entityIds
            #                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            #                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
            #                exampleIndex += 1
            #                continue

            # FEATURES
            features = {}

            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(
                ",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" +
                                           normalizedText[len(norStem):])] = 1

            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_" + stringLower)] = 1
                features[self.featureSet.getId(
                    "substringstem_" + PorterStemmer.stem(stringLower))] = 1

            if not self.styles["no_context"]:
                # Linear order features
                # NOTE(review): the "> 0" test excludes the token at index 0
                # from the linear context. This may be intentional or an
                # off-by-one -- confirm before changing, since it affects
                # the generated features.
                for index in [-3, -2, -1, 1, 2, 3]:
                    if i + index > 0 and i + index < len(sentenceGraph.tokens):
                        self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                      str(index), features)

                # Linear n-grams
                if self.styles["linear_ngrams"]:
                    self.buildLinearNGram(max(0, i - 1), i, sentenceGraph,
                                          features)
                    self.buildLinearNGram(max(0, i - 2), i, sentenceGraph,
                                          features)

            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:3].lower())] = 1

            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            # i > 0: an upper-case start is not marked for the first token
            # of the sentence
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            if not self.styles["no_context"]:
                # Incoming edges: features describe the token at edge[0]
                t1InEdges = self.inEdgesByToken[token]
                for edge in t1InEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HIn_" +
                                                   edge[0].get("POS"))] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   edge[0].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[0])
                    features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + norStem + "_" +
                                                   edgeType + "_" +
                                                   tokenStem)] = 1
                # Outgoing edges: features describe the token at edge[1]
                t1OutEdges = self.outEdgesByToken[token]
                for edge in t1OutEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HOut_" +
                                                   edge[1].get("POS"))] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   edge[1].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[1])
                    features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + norStem + "_" +
                                                   edgeType + "_" +
                                                   tokenStem)] = 1

            # REL features
            # The external feature builders below write directly into this
            # example's feature dictionary and are detached again afterwards
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(
                    sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)

            # DDI13 features
            # Every prefix and suffix of the normalized text, tagged with
            # its position index
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" +
                                                   str(index) + "_" +
                                                   normalizedText[:index +
                                                                  1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" +
                                                   str(index) + "_" +
                                                   normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)
                self.drugFeatureBuilder.setFeatureVector(None)

            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(
                    tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_" + wordNetFeature)] = 1
                #print

            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(
                    token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)

            if self.styles["ontobiotope_features"]:
                self.ontobiotopeFeatureBuilder.setFeatureVector(features)
                self.ontobiotopeFeatureBuilder.buildOBOFeaturesForToken(token)
                self.ontobiotopeFeatureBuilder.setFeatureVector(None)

            # Metadata stored with the example
            extra = {"xtype": "token", "t": token.get("id")}
            if self.styles["bb_features"]:
                extra[
                    "trigex"] = "bb"  # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi"  # Request trigger type unmerging
            if entityIds != None:
                extra[
                    "goldIds"] = entityIds  # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

            if self.styles["bb_spans"]:
                # Only spans whose headOffset equals the token's charOffset
                # contribute features
                for span in sentenceGraph.sentenceElement.iter("span"):
                    if span.get("headOffset") != token.get("charOffset"):
                        continue
                    #if span.get("source") != "spec":
                    #    continue
                    #print span.get("headOffset"), token.get("charOffset"), span.get("source"), token.get("id")
                    features[self.featureSet.getId("span_found")] = 1
                    features[self.featureSet.getId(
                        "span_count")] = 1 + features.get(
                            self.featureSet.getId("span_count"), 0)
                    features[self.featureSet.getId("span_identifier" +
                                                   span.get("identifier"))] = 1
                    features[self.featureSet.getId("span_type" +
                                                   span.get("type"))] = 1
                    features[self.featureSet.getId("span_category" +
                                                   span.get("category"))] = 1
                    features[self.featureSet.getId("span_source" +
                                                   span.get("source"))] = 1

                    # Keep in extra["define_offset"] the charOffset of the
                    # widest matching span seen so far
                    if "define_offset" in extra:
                        prevOffset = [
                            int(x) for x in extra["define_offset"].split("-")
                        ]
                        assert len(prevOffset) == 2
                        newOffset = [
                            int(x) for x in span.get("charOffset").split("-")
                        ]
                        assert len(newOffset) == 2
                        prevOffsetRange = abs(prevOffset[0] - prevOffset[1])
                        newOffsetRange = abs(newOffset[0] - newOffset[1])
                        if newOffsetRange > prevOffsetRange:
                            extra["define_offset"] = span.get("charOffset")
                    else:
                        extra["define_offset"] = span.get("charOffset")
                # The number of matching spans is also encoded in a feature
                # name ("span_count_0" when there were none)
                features[self.featureSet.getId("span_count_" + str(
                    features.get(self.featureSet.getId("span_count"), 0)))] = 1

            # chains
            if not self.styles["no_context"]:
                self.buildChains(token, sentenceGraph, features)

            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            if self.styles["wordvector"]:
                self.wordVectorFeatureBuilder.setFeatureVector(features)
                self.wordVectorFeatureBuilder.buildFeatures(token)
                self.wordVectorFeatureBuilder.setFeatureVector(None)

            # The example is written out immediately instead of being
            # collected into a list
            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
Ejemplo n.º 9
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None):
        """
        Build edge examples for a single sentence and append them to outfile.

        Candidate pairs are all unordered pairs (i < j) of nodes, where the
        nodes are the sentence tokens when the "token_nodes" style is set and
        the merged entities otherwise. Every pair that passes the filters is
        handed to self.buildExamplesForPair, and each example it returns is
        written with ExampleUtils.appendExamples (see Core/ExampleUtils for
        the example format).

        Arguments:
        sentenceGraph -- graph of the sentence; mergeInteractionGraph(True) is
                         called on it as a side effect
        outfile -- output passed on to ExampleUtils.appendExamples
        goldGraph -- optional gold graph; when given, the merged entities are
                     mapped to its entities with EvaluateInteractionXML.mapEntities
        structureAnalyzer -- optional; asked for the directedness when neither
                             the "directed" nor the "undirected" style is set,
                             and forwarded to buildExamplesForPair

        Returns the number of examples written (not a list of examples).

        Raises ValueError if both the "directed" and the "undirected" style
        are explicitly disabled, as that leaves no directedness to use.
        """
        exampleIndex = 0

        # Example directionality. Every combination of the two styles must
        # bind examplesAreDirected: previously an explicitly falsy style
        # (e.g. directed=False with undirected unset) matched no branch and
        # the first candidate pair failed with an UnboundLocalError.
        directedStyle = self.styles["directed"]
        undirectedStyle = self.styles["undirected"]
        if directedStyle is None and undirectedStyle is None: # determine directedness from corpus
            examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer is not None else True
        elif directedStyle:
            assert undirectedStyle in [None, False]
            examplesAreDirected = True
        elif undirectedStyle:
            assert directedStyle in [None, False]
            examplesAreDirected = False
        elif undirectedStyle is None: # only "directed" was given, and it is switched off
            examplesAreDirected = False
        elif directedStyle is None: # only "undirected" was given, and it is switched off
            examplesAreDirected = True
        else:
            raise ValueError("Styles 'directed' and 'undirected' cannot both be disabled")

        if not self.styles["no_trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)

        # Filter entities, if needed
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        entityToGold = None
        if goldGraph is not None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

        paths = None
        if not self.styles["no_path"]:
            paths = sentenceGraph.dependencyGraph.toUndirected()
            if self.styles["filter_shortest_path"] is not None: # For DDI use filter_shortest_path=conj_and
                paths.resetAnalyses() # just in case
                paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})

        # Generate examples based on interactions between entities or interactions between tokens
        useTokenNodes = self.styles["token_nodes"]
        if useTokenNodes:
            loopRange = len(sentenceGraph.tokens)
        else:
            loopRange = len(entities)
        for i in range(loopRange - 1):
            for j in range(i + 1, loopRange):
                eI = None
                eJ = None
                if useTokenNodes:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                else:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") is not None or eJ.get("source") is not None:
                            continue
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue

                examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
                for categoryName, features, extra in examples:
                    if self.styles["binary"]:
                        # Binary mode collapses every non-negative class into +1
                        category = 1 if categoryName != "neg" else -1
                        extra["categoryName"] = "i"
                    else:
                        category = self.classSet.getId(categoryName)
                    example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra]
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1

        return exampleIndex
Ejemplo n.º 10
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build unmerging examples for a single sentence and append them to outfile.

        For each entity of the sentence (the merged entities unless the
        "no_merge" style is set), the outgoing interactions marked
        event="True" are grouped by type, and every combination of them
        allowed by structureAnalyzer.getArgLimits becomes one candidate event.
        Candidates accepted by structureAnalyzer are written with
        ExampleUtils.appendExamples (see Core/ExampleUtils for the example
        format); rejected candidates are only recorded in self.exampleStats.

        Arguments:
        sentenceGraph -- graph of the sentence to process
        outfile -- output passed on to ExampleUtils.appendExamples
        goldGraph -- optional gold graph used to label the candidates; without
                     it every candidate gets the class "neg". Its tokens must
                     match those of sentenceGraph (asserted).
        structureAnalyzer -- defaults to None but is dereferenced without a
                             check once a candidate is evaluated, so it must be
                             supplied for sentences that have entities

        Side effect: self.interactionLenghts is replaced with the result of
        self.getInteractionEdgeLengths for this sentence.

        Returns the number of examples written (not a list of examples).
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        paths = sentenceGraph.dependencyGraph.toUndirected()

        # Get argument order
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

        goldEntitiesByOffset = {}
        if goldGraph is not None:
            # Check that the tokenizations match
            for i, token in enumerate(sentenceGraph.tokens):
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            # Map gold entities to their head offsets
            for goldEntity in goldGraph.entities:
                offset = goldEntity.get("headOffset")
                assert offset is not None
                goldEntitiesByOffset.setdefault(offset, []).append(goldEntity)

        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            # The entities include both named entities and event triggers.
            # Merging converts the annotation, where one trigger can carry
            # several entity annotations, into one entity per trigger.
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

        exampleIndex = 0
        for entity in entities:
            if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
                continue

            entityType = entity.get("type")
            assert entityType is not None, entity.attrib
            eType = str(entityType)

            # Outgoing edges of the current entity
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions = self.sortInteractionsById(interactions)
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                interactionType = interaction.get("type")
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                if interaction.get("e2") in sentenceGraph.entitiesById:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    if interactionType in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[interactionType].append(interaction)
                else: # intersentence
                    validInteractionsByType[interactionType].append(interaction)
                interactionCounts[interactionType] += 1
            interactionCountString = ",".join(key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts))
            if self.debug:
                print >> sys.stderr, entity.get("id"), entityType, "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

            # For each argument type the event can have, collect every
            # combination with a valid argument count (possibly zero-length),
            # e.g. Theme:[a,b], Cause:[d] -> [[(), (d,)], [(a,), (b,)]]
            intCombinations = []
            for intType in sorted(validInteractionsByType.keys()):
                minArgs, maxArgs = structureAnalyzer.getArgLimits(entityType, intType)
                singleTypeCombinations = []
                for combLen in range(minArgs, maxArgs+1):
                    singleTypeCombinations.extend(combinations(validInteractionsByType[intType], combLen))
                intCombinations.append(singleTypeCombinations)
            validIntTypeCount = len(intCombinations)

            # intCombinations now holds one list per argument type. Next, make
            # all valid combinations across the argument types.
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            for i in range(len(argCombinations)):
                # Flatten the per-type tuples into a single tuple of arguments
                argCombinations[i] = sum(argCombinations[i], ())
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations

            for argCombination in argCombinations:
                if goldGraph is not None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
                    if self.styles["binary"]:
                        category = "pos"
                    else:
                        category = entityType
                    assert category is not None
                else:
                    category = "neg"
                self.exampleStats.beginExample(category)

                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entityType)
                    if self.debug:
                        print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
                elif structureAnalyzer.isValidEntity(entity) or structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne=self.styles["no_arg_count_upper_limit"], issues=issues):
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                    argString = ",".join(arg.get("type") + "=" + arg.get("id") for arg in argCombination)
                    extra = {"xtype":"um","e":entity.get("id"),"i":argString,"etype":eType,"class":category}
                    extra["allInt"] = interactionCountString
                    assert type(extra["etype"]) in types.StringTypes, extra
                    assert type(extra["class"]) in types.StringTypes, category
                    assert type(extra["i"]) in types.StringTypes, argString
                    example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                    example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                    example[1] = self.classSet.getId(category)
                    example[3] = extra
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1
                else: # not a valid event or valid entity
                    if len(issues) == 0: # must be > 0 so that it gets filtered
                        if not structureAnalyzer.isValidEntity(entity):
                            issues["INVALID_ENTITY:"+eType] += 1
                        else:
                            issues["UNKNOWN_ISSUE_FOR:"+eType] += 1
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
                self.exampleStats.endExample()

        return exampleIndex
    def buildExamplesFromGraph(self,
                               sentenceGraph,
                               outfile,
                               goldGraph=None,
                               structureAnalyzer=None):
        """
        Build one example for each phrase in the sentence and append it to outfile.

        The phrases are the NP/WHADVP/WHNP phrases of the parse, extended with
        the sub-phrases produced by MapPhrases.makeINSubPhrases and
        MapPhrases.makeTokenSubPhrases. Each phrase that passes the optional
        "co_limits" filter becomes one example written with
        ExampleUtils.appendExamples.

        Arguments:
        sentenceGraph -- graph of the sentence to process
        outfile -- output passed on to ExampleUtils.appendExamples
        goldGraph -- not used by this builder
        structureAnalyzer -- not used by this builder

        Side effects: the phrase type counts of this sentence are added to
        self.phraseTypeCounts, and self.exampleStats is updated.

        Returns the number of examples written.
        """
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        exampleIndex = 0

        # Prepare phrases, create subphrases
        phrases = MapPhrases.getPhrases(sentenceGraph.parseElement,
                                        sentenceGraph.tokens,
                                        set(["NP", "WHADVP", "WHNP"]))
        phraseDict = MapPhrases.getPhraseDict(phrases)
        phrases.extend(
            MapPhrases.makeINSubPhrases(phrases, sentenceGraph.tokens,
                                        phraseDict, ["NP"]))
        phrases.extend(
            MapPhrases.makeTokenSubPhrases(sentenceGraph.tokens, phraseDict))
        phraseToEntity = MapPhrases.getPhraseEntityMapping(
            sentenceGraph.entities, phraseDict)

        # Accumulate this sentence's phrase type counts into the builder-wide totals
        phraseTypeCounts = MapPhrases.getPhraseTypeCounts(phrases)
        for key in phraseTypeCounts:
            if key not in self.phraseTypeCounts:
                self.phraseTypeCounts[key] = 0
            self.phraseTypeCounts[key] += phraseTypeCounts[key]
        self.exampleStats.addVariable(
            "Phrase type counts", self.phraseTypeCounts
        )  # can be added on each loop, will always point to the same thing

        # Build one example for each phrase
        for phrase in phrases:
            features = {}
            self.triggerFeatureBuilder.setFeatureVector(features)

            categoryName = self.getCategoryName(phrase, phraseToEntity)
            category = self.classSet.getId(categoryName)
            phraseTokens = self.getPhraseTokens(phrase, sentenceGraph)
            phraseHeadToken = self.getPhraseHeadToken(phrase, phraseTokens)
            self.exampleStats.beginExample(categoryName)

            if self.styles["co_limits"] and not self.isPotentialCOTrigger(
                    phrase, phraseTokens, sentenceGraph):
                self.exampleStats.filter("co_limits")
                self.exampleStats.endExample()
                continue

            # Sentence level features
            features.update(self.triggerFeatureBuilder.bowFeatures)

            # Whole phrase features
            self.buildLinearNGram(phraseTokens, sentenceGraph, features)
            phraseType = phrase.get("type")
            features[self.featureSet.getId("pType_" + phraseType)] = 1
            for split in phraseType.split("-"):
                features[self.featureSet.getId("pSubType_" + split)] = 1
            # Check named entities
            nameCount = sum(1 for token in phraseTokens
                            if sentenceGraph.tokenIsName[token])
            features[self.featureSet.getId("phraseNames_" +
                                           str(nameCount))] = 1
            features[self.featureSet.getId("phraseNameCount")] = nameCount

            # Head token features
            self.triggerFeatureBuilder.setTag("head_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken)
            self.triggerFeatureBuilder.buildAttachedEdgeFeatures(
                phraseHeadToken, sentenceGraph)
            self.triggerFeatureBuilder.setTag()

            # Features for all phrase tokens: one untagged-position set plus
            # one set tagged with the position counted from the start and one
            # with the (negative) position counted from the end.
            # NOTE(review): features are built from phraseHeadToken on every
            # iteration and the loop's own token is never used; this looks
            # unintended, but switching to the loop token would change the
            # generated feature vectors, so it is left as is - confirm.
            self.triggerFeatureBuilder.setTag("ptok_")
            phraseLength = len(phraseTokens)
            for phraseTokenPos, _token in enumerate(phraseTokens):
                for tag in ("ptok_",
                            "ptok_" + str(phraseTokenPos) + "_",
                            "ptok_" + str(phraseTokenPos - phraseLength) + "_"):
                    self.triggerFeatureBuilder.setTag(tag)
                    self.triggerFeatureBuilder.buildFeatures(phraseHeadToken,
                                                             linear=False,
                                                             chains=False)
            self.triggerFeatureBuilder.setTag()

            extra = {
                "xtype": "phrase",
                "t": phraseHeadToken.get("id"),
                "p": phrase.get("id"),
                "ptype": phrase.get("type")
            }
            extra["charOffset"] = phrase.get("charOffset")
            if phrase not in phraseToEntity:
                extra["eids"] = "neg"
            else:
                extra["eids"] = ",".join(
                    [x.get("id") for x in phraseToEntity[phrase]])
            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            self.exampleStats.endExample()
            exampleIndex += 1

        # Mark missed entities in exampleStats. These are counted as a plain
        # value rather than as filtered examples, because examples are per
        # phrase whereas these are entities.
        linkedEntities = set()
        for mappedEntities in phraseToEntity.values():
            linkedEntities.update(mappedEntities)
        for entity in sentenceGraph.entities:
            if entity.get("given") != "True" and entity not in linkedEntities:
                self.exampleStats.addValue("Entities with no phrase", 1)
        return exampleIndex
Ejemplo n.º 12
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0

        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)

        # Filter entities, if needed
        #mergedIds = None
        #duplicateEntities = None
        #entities = sentenceGraph.entities
        #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped",
                                   len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(
                entities, goldGraph.entities)

        paths = None
        if not self.styles["no_path"]:
            ##undirected = sentenceGraph.getUndirectedDependencyGraph()
            #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
            ###undirected = sentenceGraph.dependencyGraph.to_undirected()
            ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
            paths = undirected

        #for edge in sentenceGraph.dependencyGraph.edges:
        #    assert edge[2] != None
        #for edge in undirected.edges:
        #    assert edge[2] != None
        #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
        #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange - 1):
            for j in range(i + 1, loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get(
                                "source") != None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(
                            sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue

                if self.styles["directed"]:
                    # define forward
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eI, eJ, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(
                                goldGraph, entityToGold, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tI, tJ, True)
                    # make forward
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles[
                            "genia_limits"] and not self.isPotentialGeniaInteraction(
                                eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (
                            eI.get("type") == "Entity"
                            or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles[
                            "rel_limits"] and not self.isPotentialRELInteraction(
                                eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles[
                            "co_limits"] and not self.isPotentialCOInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles[
                            "bb_limits"] and not self.isPotentialBBInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" +
                                                     categoryName + ":" +
                                                     eI.get("type") + "/" +
                                                     eJ.get("type") + ")")
                    if self.styles[
                            "bi_limits"] and not self.isPotentialBIInteraction(
                                eI, eJ, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles[
                            "epi_limits"] and not self.isPotentialEPIInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles[
                            "id_limits"] and not self.isPotentialIDInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                        ExampleUtils.appendExamples([
                            self.buildExample(tI, tJ, paths, sentenceGraph,
                                              categoryName, exampleIndex, eI,
                                              eJ)
                        ], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()

                    # define reverse
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eJ, eI, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(
                                goldGraph, entityToGold, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tJ, tI, True)
                    # make reverse
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles[
                            "genia_limits"] and not self.isPotentialGeniaInteraction(
                                eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (
                            eI.get("type") == "Entity"
                            or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles[
                            "rel_limits"] and not self.isPotentialRELInteraction(
                                eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles[
                            "co_limits"] and not self.isPotentialCOInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles[
                            "bb_limits"] and not self.isPotentialBBInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" +
                                                     categoryName + ":" +
                                                     eJ.get("type") + "/" +
                                                     eI.get("type") + ")")
                    if self.styles[
                            "bi_limits"] and not self.isPotentialBIInteraction(
                                eJ, eI, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles[
                            "epi_limits"] and not self.isPotentialEPIInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles[
                            "id_limits"] and not self.isPotentialIDInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                        ExampleUtils.appendExamples([
                            self.buildExample(tJ, tI, paths, sentenceGraph,
                                              categoryName, exampleIndex, eJ,
                                              eI)
                        ], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                else:
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths,
                                                       sentenceGraph,
                                                       categoryName,
                                                       exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        reverseExample = self.buildExample(
                            tJ, tI, paths, sentenceGraph, categoryName,
                            exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    #examples.append(forwardExample)
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()

        #return examples
        return exampleIndex
Ejemplo n.º 13
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build one task 3 example for each non-name entity of the sentence.

        Every example is written to outfile with ExampleUtils.appendExamples
        as soon as it has been built; no example list is kept in memory.

        Parameters:
            sentenceGraph -- the sentence whose tokens, entities and dependency
                graph the features are built from
            outfile -- the example file handed to ExampleUtils.appendExamples
            goldGraph -- optional gold graph for the same sentence. When given,
                the "speculation" and "negation" classification styles take
                the class from the first gold entity mapped to each entity.
                The "multiclass" style does not use it.

        Returns the number of examples written for this sentence.

        Raises ValueError if self.styles["classification"] is not one of
        "multiclass", "speculation" or "negation".

        NOTE(review): presumably self.featureSet.getId allocates ids on first
        use, in which case the order of the getId calls below determines the
        feature ids and must not be changed -- confirm before reordering.
        """
        exampleIndex = 0

        self.tokenFeatures = {}

        if goldGraph is not None:
            entityToGold = EvaluateInteractionXML.mapEntities(
                sentenceGraph.entities, goldGraph.entities)

        # Sentence level counts of named and other entities (known data which
        # can be used for features)
        namedEntityCount = 0
        entityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True":
                namedEntityCount += 1
            else:
                entityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        entityCountFeature = "entityCount_" + str(entityCount)

        # Sentence level bag of words
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            bagOfWords[text] = bagOfWords.get(text, 0) + 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                bagOfWords[text] = bagOfWords.get(text, 0) + 1
            # NOTE(review): this condition is the same for every token of the
            # sentence (it does not look at the current token), and the "ge_"
            # prefix is stacked on top of a possible "ne_" prefix. It looks
            # like a per-token check was intended; left unchanged because it
            # defines the existing feature names -- confirm before changing.
            if len(sentenceGraph.tokenIsEntityHead) > 0:
                text = "ge_" + text
                bagOfWords[text] = bagOfWords.get(text, 0) + 1

            text = token.get("text")
            if self.styles["speculation_words"] and text in self.specWords:
                specKey = "spec_bow_" + text
                bagOfWords[specKey] = bagOfWords.get(specKey, 0) + 1
                bagOfWords["spec_sentence"] = 1

        # pre-calculate bow _features_ once, they are shared by all examples
        bowFeatures = {}
        for k, v in bagOfWords.items():
            bowFeatures[self.featureSet.getId(k)] = v

        # Cache the dependency edges attached to each token
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for entity in sentenceGraph.entities:
            token = sentenceGraph.entityHeadTokenByEntity[entity]
            # Recognize only non-named entities (i.e. interaction words)
            if entity.get("isName") == "True":
                continue

            # CLASS
            classification = self.styles["classification"]
            if classification == "multiclass":
                task3Type = "multiclass"
                categoryName = ""
                if entity.get("negation") == "True":
                    categoryName += "negation"
                if entity.get("speculation") == "True":
                    if categoryName != "":
                        categoryName += "---"
                    categoryName += "speculation"
                if categoryName == "":
                    categoryName = "neg"
                category = self.classSet.getId(categoryName)
            elif classification in ("speculation", "negation"):
                # Binary case: the style name is also the name of the entity
                # attribute and of the positive class. Class id 1 is used for
                # everything else.
                task3Type = classification
                if entity.get(classification) == "True":
                    category = self.classSet.getId(classification)
                else:
                    category = 1
                if goldGraph is not None:
                    # The gold annotation overrides the entity's own attribute
                    goldEntities = entityToGold[entity]
                    if len(goldEntities) > 0 and goldEntities[0].get(
                            classification) == "True":
                        category = self.classSet.getId(classification)
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            else:
                # Previously this fell through and failed on an unbound
                # categoryName; fail with an explicit message instead.
                raise ValueError("Unknown classification style: " +
                                 str(classification))
            self.exampleStats.beginExample(categoryName)

            # FEATURES
            features = {}

            # ENTITY TYPE
            #entityType = self.classSet.getId(self.getMergedEntityType(entity))
            #del self.classSet.Ids[self.getMergedEntityType(entity)]
            #IF LOCAL
            # There's a mistake here. The entityType should be the string, not
            # the id of the type. But there's also another issue. getMergedEntityType
            # expects a list, not an item. Therefore the type is always empty ->
            # types don't get used in classification. But this is the code used in
            # the publication, so it will now be published as is, and fixed in a later
            # release.
            #
            # Besides, using the classSet here generates an unneeded
            # additional class, that shows up in evaluations etc. However, to be
            # able to publish the exact models used for the publication experiments,
            # this can't be fixed so it breaks feature id consistency. Therefore I'll
            # now just remove the redundant class id from the classSet.
            #ENDIF
            #features[self.featureSet.getId(entityType)] = 1

            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features[self.featureSet.getId(entityCountFeature)] = 1
            features.update(bowFeatures)

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            if self.styles["speculation_words"]:
                if text in self.specWords:
                    features[self.featureSet.getId("ent_spec")] = 1
                if stem in self.specWordStems:
                    features[self.featureSet.getId("ent_spec_stem")] = 1

            # Linear order features
            # Find the position of the head token. If the token is not found,
            # i stays at the last index, exactly as before.
            for i in range(len(sentenceGraph.tokens)):
                if token == sentenceGraph.tokens[i]:
                    break
            for index in [-3, -2, -1, 1, 2, 3]:
                # NOTE(review): "> 0" never builds features for the first
                # token of the sentence (position 0); ">= 0" may have been
                # intended. Left unchanged to keep the feature space stable.
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                  str(index), features)

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j, char in enumerate(text):
                if j > 0 and char.isalpha() and char.isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if char.isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif char == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif char == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif char == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    duplet = text[j - 1:j + 1].lower()
                    features[self.featureSet.getId("dt_" + duplet)] = 1
                # triplets
                if j > 1:
                    triplet = text[j - 2:j + 1].lower()
                    features[self.featureSet.getId("tt_" + triplet)] = 1

            # Attached edges (Hanging in and out edges). For an in-edge the
            # neighbouring token is edge[0], for an out-edge it is edge[1].
            for tag, edges, otherEnd in (
                ("t1HIn_", self.inEdgesByToken[token], 0),
                ("t1HOut_", self.outEdgesByToken[token], 1)):
                for edge in edges:
                    edgeType = edge[2].get("type")
                    otherToken = edge[otherEnd]
                    otherPOS = otherToken.get("POS")
                    features[self.featureSet.getId(tag + edgeType)] = 1
                    features[self.featureSet.getId(tag + otherPOS)] = 1
                    features[self.featureSet.getId(tag + edgeType + "_" +
                                                   otherPOS)] = 1
                    tokenText = sentenceGraph.getTokenText(otherToken)
                    features[self.featureSet.getId(tag + tokenText)] = 1
                    features[self.featureSet.getId(tag + edgeType + "_" +
                                                   tokenText)] = 1

            self.buildChains(token, sentenceGraph, features)

            extra = {
                "xtype": "task3",
                "t3type": task3Type,
                "t": token.get("id"),
                "entity": entity.get("id")
            }
            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        return exampleIndex
Ejemplo n.º 14
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build event examples (xtype "um") for a single sentence.

        One example is built for every argument combination that
        self.getArgumentCombinations returns for each entity. Every example is
        written to outfile with ExampleUtils.appendExamples as soon as it has
        been built. See Core/ExampleUtils for example format.

        Parameters:
            sentenceGraph -- the sentence to build examples for
            outfile -- the example file handed to ExampleUtils.appendExamples
            goldGraph -- optional gold graph for the same sentence. Its
                tokenization must match sentenceGraph token by token. Without
                it every example gets the class "neg".

        Returns the number of examples written (not a list of examples).
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        # The undirected dependency graph is what gets passed around as "paths"
        paths = sentenceGraph.dependencyGraph.toUndirected()

        # Get argument order
        self.interactionLenghts = self.getInteractionEdgeLengths(
            sentenceGraph, paths)

        goldEntitiesByOffset = {}
        if goldGraph is not None:
            # check that the tokenizations match
            for i, token in enumerate(sentenceGraph.tokens):
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get(
                    "charOffset") == goldToken.get("charOffset")
            # Map gold entities to their head offsets
            for goldEntity in goldGraph.entities:
                offset = goldEntity.get("headOffset")
                assert offset is not None
                goldEntitiesByOffset.setdefault(offset, []).append(goldEntity)

        # Generate examples based on interactions between entities or
        # interactions between tokens
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities

        exampleIndex = 0
        for entity in entities:
            eType = entity.get("type")
            assert eType is not None, entity.attrib
            eType = str(eType)

            interactions = [
                x[2]
                for x in sentenceGraph.getOutInteractions(entity, mergeInput)
            ]
            argCombinations = self.getArgumentCombinations(
                eType, interactions, entity.get("id"))
            assert argCombinations is not None, (entity.get("id"),
                                                 entity.get("type"))
            for argCombination in argCombinations:
                # Only "Process" entities may have an empty combination
                if eType != "Process":
                    assert len(argCombination
                               ) > 0, eType + ": " + str(argCombinations)
                # Originally binary classification
                isGoldEvent = False
                if goldGraph is not None:
                    isGoldEvent = self.eventIsGold(entity, argCombination,
                                                   sentenceGraph, goldGraph,
                                                   goldEntitiesByOffset)
                # Named (multi-)class
                if not isGoldEvent:
                    category = "neg"
                elif "egulation" in eType:
                    category = "All_regulation"
                elif eType == "Binding":
                    category = "Binding"
                else:
                    category = "Other"  #"simple6"

                extra = {
                    "xtype": "um",
                    "e": entity.get("id"),
                    "i": ",".join(arg.get("id") for arg in argCombination),
                    "etype": eType,
                    "class": category
                }
                assert isinstance(extra["etype"], str), extra
                self.exampleStats.addExample(category)
                example = self.buildExample(sentenceGraph, paths, entity,
                                            argCombination, interactions)
                # buildExample leaves id, class and extra to be filled in here
                example[0] = sentenceGraph.getSentenceId() + ".x" + str(
                    exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1

        return exampleIndex
Ejemplo n.º 15
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one example for each phrase in the sentence

        Phrases rejected by the "co_limits" style are counted as filtered and
        produce no example. Every other example is written to outfile with
        ExampleUtils.appendExamples as soon as it has been built.

        Parameters:
            sentenceGraph -- the sentence whose parse, tokens and entities are used
            outfile -- the example file handed to ExampleUtils.appendExamples
            goldGraph -- accepted but not used by this method
            structureAnalyzer -- accepted but not used by this method

        Returns the number of examples written for this sentence.
        """
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        exampleIndex = 0

        # Prepare phrases, create subphrases
        phrases = MapPhrases.getPhrases(sentenceGraph.parseElement, sentenceGraph.tokens, set(["NP", "WHADVP", "WHNP"]))
        phraseDict = MapPhrases.getPhraseDict(phrases)
        phrases.extend(MapPhrases.makeINSubPhrases(phrases, sentenceGraph.tokens, phraseDict, ["NP"]))
        phrases.extend(MapPhrases.makeTokenSubPhrases(sentenceGraph.tokens, phraseDict))
        phraseToEntity = MapPhrases.getPhraseEntityMapping(sentenceGraph.entities, phraseDict)
        # Add this sentence's phrase type counts to the running totals
        sentencePhraseTypeCounts = MapPhrases.getPhraseTypeCounts(phrases)
        for key in sentencePhraseTypeCounts:
            if key not in self.phraseTypeCounts:
                self.phraseTypeCounts[key] = 0
            self.phraseTypeCounts[key] += sentencePhraseTypeCounts[key]
        # can be added on each call, will always point to the same thing
        self.exampleStats.addVariable("Phrase type counts", self.phraseTypeCounts)

        # Build one example for each phrase
        for phrase in phrases:
            features = {}
            self.triggerFeatureBuilder.setFeatureVector(features)

            categoryName = self.getCategoryName(phrase, phraseToEntity)
            category = self.classSet.getId(categoryName)
            phraseTokens = self.getPhraseTokens(phrase, sentenceGraph)
            phraseHeadToken = self.getPhraseHeadToken(phrase, phraseTokens)
            self.exampleStats.beginExample(categoryName)

            if self.styles["co_limits"] and not self.isPotentialCOTrigger(phrase, phraseTokens, sentenceGraph):
                self.exampleStats.filter("co_limits")
                self.exampleStats.endExample()
                continue

            # Sentence level features
            features.update(self.triggerFeatureBuilder.bowFeatures)

            # Whole phrase features
            self.buildLinearNGram(phraseTokens, sentenceGraph, features)
            phraseType = phrase.get("type")
            features[self.featureSet.getId("pType_" + phraseType)] = 1
            for split in phraseType.split("-"):
                features[self.featureSet.getId("pSubType_" + split)] = 1
            # Check named entities
            nameCount = 0
            for token in phraseTokens:
                if sentenceGraph.tokenIsName[token]:
                    nameCount += 1
            features[self.featureSet.getId("phraseNames_" + str(nameCount))] = 1
            features[self.featureSet.getId("phraseNameCount")] = nameCount

            # Head token features
            self.triggerFeatureBuilder.setTag("head_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken)
            self.triggerFeatureBuilder.buildAttachedEdgeFeatures(phraseHeadToken, sentenceGraph)
            self.triggerFeatureBuilder.setTag()

            # Features for all phrase tokens, tagged without a position, with
            # the position from the phrase start and with the (negative)
            # position from the phrase end.
            # NOTE(review): every call below passes phraseHeadToken, so the
            # loop variable is never used and the head token's features are
            # repeated under each positional tag. Passing the current token
            # was presumably intended; left unchanged because fixing it would
            # change the generated features -- confirm before changing.
            self.triggerFeatureBuilder.setTag("ptok_")
            phraseTokenCount = len(phraseTokens)
            for phraseTokenPos, token in enumerate(phraseTokens):
                for tag in ("ptok_",
                            "ptok_" + str(phraseTokenPos) + "_",
                            "ptok_" + str(phraseTokenPos - phraseTokenCount) + "_"):
                    self.triggerFeatureBuilder.setTag(tag)
                    self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
            self.triggerFeatureBuilder.setTag()

            extra = {
                "xtype": "phrase",
                "t": phraseHeadToken.get("id"),
                "p": phrase.get("id"),
                "ptype": phrase.get("type"),
            }
            extra["charOffset"] = phrase.get("charOffset")
            if phrase not in phraseToEntity:
                extra["eids"] = "neg"
            else:
                extra["eids"] = ",".join([x.get("id") for x in phraseToEntity[phrase]])
            example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            self.exampleStats.endExample()
            exampleIndex += 1

        # Mark missed entities in exampleStats
        linkedEntities = set()
        for phraseEntities in phraseToEntity.values():
            linkedEntities.update(phraseEntities)
        for entity in sentenceGraph.entities:
            if entity.get("given") != "True" and entity not in linkedEntities:
                # Counted as a plain value: marking these as filtered examples
                # was misleading, as examples are per phrase, and these are
                # entities.
                self.exampleStats.addValue("Entities with no phrase", 1)
        return exampleIndex
Ejemplo n.º 16
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build event examples (xtype "um") for a single sentence.

        One example is built for every argument combination that
        self.getArgumentCombinations returns for each entity. Every example is
        written to outfile with ExampleUtils.appendExamples as soon as it has
        been built. See Core/ExampleUtils for example format.

        Parameters:
            sentenceGraph -- the sentence to build examples for
            outfile -- the example file handed to ExampleUtils.appendExamples
            goldGraph -- optional gold graph for the same sentence. Its
                tokenization must match sentenceGraph token by token. Without
                it every example gets the class "neg".

        Returns the number of examples written (not a list of examples).
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        # The undirected dependency graph is what gets passed around as "paths"
        paths = sentenceGraph.dependencyGraph.toUndirected()

        # Get argument order
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

        goldEntitiesByOffset = {}
        if goldGraph is not None:
            # check that the tokenizations match
            for i, token in enumerate(sentenceGraph.tokens):
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            # Map gold entities to their head offsets
            for goldEntity in goldGraph.entities:
                offset = goldEntity.get("headOffset")
                assert offset is not None
                goldEntitiesByOffset.setdefault(offset, []).append(goldEntity)

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities

        exampleIndex = 0
        for entity in entities:
            eType = entity.get("type")
            assert eType is not None, entity.attrib
            eType = str(eType)

            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
            assert argCombinations is not None, (entity.get("id"), entity.get("type"))
            for argCombination in argCombinations:
                # Only "Process" entities may have an empty combination
                if eType != "Process":
                    assert len(argCombination) > 0, eType + ": " + str(argCombinations)
                # Originally binary classification
                isGoldEvent = False
                if goldGraph is not None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset)
                # Named (multi-)class
                if not isGoldEvent:
                    category = "neg"
                elif "egulation" in eType:
                    category = "All_regulation"
                elif eType == "Binding":
                    category = "Binding"
                else:
                    category = "Other" #"simple6"

                argIds = ",".join(arg.get("id") for arg in argCombination)
                extra = {"xtype":"um","e":entity.get("id"),"i":argIds,"etype":eType,"class":category}
                assert isinstance(extra["etype"], str), extra
                self.exampleStats.addExample(category)
                example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                # buildExample leaves id, class and extra to be filled in here
                example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1

        return exampleIndex
Ejemplo n.º 17
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one trigger example for each token of the sentence.

        Every token that passes the style-dependent filters becomes an
        (example id, class id, feature dict, extra dict) tuple which is
        written to outfile with ExampleUtils.appendExamples.

        sentenceGraph: the sentence whose tokens, entities and dependency
            graph are used for building the examples
        outfile: destination handed to ExampleUtils.appendExamples
        goldGraph: accepted for interface compatibility, not used here
        structureAnalyzer: optional; if given and it has no "GIVEN" "ENTITY"
            group class, examples are also built for sentences that have no
            given entities

        Returns the number of examples written (0 when the sentence is skipped).
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
            return 0

        exampleIndex = 0

        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass("GIVEN", "ENTITY"): # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]: # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]: # manually force the setting
            buildForNameless = False

        # determine whether sentences with no given entities should be skipped
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("given") == "True": # known data which can be used for features
                    namedEntityCount += 1
            # Only defined (and only used below) when the "names" style is off
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not buildForNameless: # no names, no need for triggers
                return 0

            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        else:
            # With the "names" style no token is treated as a name
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False

        # Sentence-level bag-of-words counts, shared by all examples of the sentence
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            bagOfWords[text] = bagOfWords.get(text, 0) + 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                bagOfWords[text] = bagOfWords.get(text, 0) + 1
        # Pre-calculate the bow features; ids are requested in sorted key order
        # so that the order does not depend on dict iteration order
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

        # Cache the dependency edges attached to each token
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i, token in enumerate(sentenceGraph.tokens):
            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)

            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            category = self.classSet.getId(categoryName)
            if category is None:
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue

            # FEATURES
            features = {}

            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features.update(bowFeatures)

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
            if normalizedText == "bound": # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_"+normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_"+norStem)] = 1
            features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1

            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_"+stringLower)] = 1
                features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1

            # Linear order features
            # NOTE(review): "i + index > 0" means the first token of the
            # sentence (index 0) is never used as a linear-order neighbour;
            # ">= 0" looks like the intended bound. It is kept as is because
            # changing it would alter the generated feature vectors.
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

            # Linear n-grams
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
                self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)

            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1

            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            # (a capitalized first letter is not counted for the first token of the sentence)
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j, char in enumerate(text):
                if j > 0 and char.isalpha() and char.isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if char.isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif char == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif char == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif char == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
                # quadruplets were tried but don't work (slight decrease, 0.5 pp, on f-score)

            # Attached edges (Hanging in and out edges)
            # For an in-edge the neighbouring token is edge[0], for an out-edge edge[1]
            self._buildAttachedEdgeFeatures(sentenceGraph, self.inEdgesByToken[token], "t1HIn_", 0, norStem, features)
            self._buildAttachedEdgeFeatures(sentenceGraph, self.outEdgesByToken[token], "t1HOut_", 1, norStem, features)

            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)

            # DDI13 features
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" + str(index) + "_" + normalizedText[:index+1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" + str(index) + "_" + normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)
                self.drugFeatureBuilder.setFeatureVector(None)

            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    features[self.featureSet.getId("WN_"+wordNetFeature)] = 1

            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)

            extra = {"xtype":"token","t":token.get("id")}
            if self.styles["bb_features"]:
                extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi" # Request trigger type unmerging
            if entityIds is not None:
                extra["goldIds"] = entityIds # The entities to which this example corresponds

            # chains
            self.buildChains(token, sentenceGraph, features)

            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        return exampleIndex

    def _buildAttachedEdgeFeatures(self, sentenceGraph, edges, tag, neighbourIndex, norStem, features):
        """
        Add features for the dependency edges attached to the current token.

        edges: the edges to process; edge[2] is the dependency element and
            edge[neighbourIndex] the token at the other end of the edge
        tag: feature name prefix ("t1HIn_" or "t1HOut_")
        neighbourIndex: 0 for in-edges, 1 for out-edges
        norStem: stem of the normalized text of the current token
        features: feature dict that is updated in place
        """
        getId = self.featureSet.getId
        for edge in edges:
            edgeType = edge[2].get("type")
            neighbour = edge[neighbourIndex]
            neighbourPOS = neighbour.get("POS")
            features[getId(tag+edgeType)] = 1
            features[getId(tag+neighbourPOS)] = 1
            features[getId(tag+edgeType+"_"+neighbourPOS)] = 1
            tokenText = sentenceGraph.getTokenText(neighbour)
            features[getId(tag+tokenText)] = 1
            features[getId(tag+edgeType+"_"+tokenText)] = 1
            tokenStem = PorterStemmer.stem(tokenText)
            features[getId(tag+tokenStem)] = 1
            features[getId(tag+edgeType+"_"+tokenStem)] = 1
            features[getId(tag+norStem+"_"+edgeType+"_"+tokenStem)] = 1
Ejemplo n.º 18
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None):
        """
        Build edge examples for a single sentence.

        A candidate example is built for each pair of merged entities (with
        the "entities" style) or each pair of tokens. With the "directed"
        style both directions of a pair get their own example, otherwise the
        features of both directions are combined into one example. Examples
        are written to outfile with ExampleUtils.appendExamples.
        See Core/ExampleUtils for example format.

        sentenceGraph: the sentence to process; its interaction graph is
            merged by this call (mergeInteractionGraph)
        outfile: destination handed to ExampleUtils.appendExamples
        goldGraph: optional gold-standard graph; when given, the classes of
            directed entity examples are taken from it

        Returns the number of examples written.
        """
        exampleIndex = 0

        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)

        # Use merged entities, so that duplicate entities produce only one set of examples
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        entityToGold = None
        if goldGraph is not None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

        # The undirected dependency graph is passed to buildExample as the "paths" argument
        paths = None
        if not self.styles["no_path"]:
            paths = sentenceGraph.dependencyGraph.toUndirected()

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") is not None or eJ.get("source") is not None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue

                if self.styles["directed"]:
                    # One example per direction: first forward (I -> J), then reverse (J -> I)
                    for t1, t2, e1, e2 in ((tI, tJ, eI, eJ), (tJ, tI, eJ, eI)):
                        # define the class
                        if self.styles["entities"]:
                            categoryName = self.getCategoryName(sentenceGraph, e1, e2, True)
                            if goldGraph is not None:
                                categoryName = self.getGoldCategoryName(goldGraph, entityToGold, e1, e2, True)
                        else:
                            categoryName = self.getCategoryNameFromTokens(sentenceGraph, t1, t2, True)
                        # make the example
                        self.exampleStats.beginExample(categoryName)
                        if self._passesExampleLimits(categoryName, e1, e2, sentenceGraph):
                            ExampleUtils.appendExamples([self.buildExample(t1, t2, paths, sentenceGraph, categoryName, exampleIndex, e1, e2)], outfile)
                            exampleIndex += 1
                        self.exampleStats.endExample()
                else:
                    # NOTE(review): unlike the directed case, neither the gold graph
                    # nor the *_limits styles are applied here - confirm this is intended.
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        # Combine the features of both directions into a single example
                        reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()

        return exampleIndex

    def _passesExampleLimits(self, categoryName, e1, e2, sentenceGraph):
        """
        Apply the style-dependent filters to a directed candidate example.

        Every filter that rejects the candidate is recorded in
        self.exampleStats; all filters are evaluated even after the first
        rejection so that the statistics cover each of them.

        categoryName: class name of the candidate example
        e1, e2: the entities of the candidate, in the direction e1 -> e2
            (NOTE(review): these are None when the "entities" style is off;
            the entity-based filters presumably require that style - confirm)
        sentenceGraph: the sentence being processed

        Returns True if an example should be made for the candidate.
        """
        makeExample = True
        if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(e1, e2):
            makeExample = False
            self.exampleStats.filter("genia_limits")
        if self.styles["genia_task1"] and (e1.get("type") == "Entity" or e2.get("type") == "Entity"):
            makeExample = False
            self.exampleStats.filter("genia_task1")
        if self.styles["rel_limits"] and not self.isPotentialRELInteraction(e1, e2):
            makeExample = False
            self.exampleStats.filter("rel_limits")
        if self.styles["co_limits"] and not self.isPotentialCOInteraction(e1, e2, sentenceGraph):
            makeExample = False
            self.exampleStats.filter("co_limits")
        if self.styles["bb_limits"] and not self.isPotentialBBInteraction(e1, e2, sentenceGraph):
            makeExample = False
            self.exampleStats.filter("bb_limits")
            if categoryName != "neg":
                self.exampleStats.filter("bb_limits(" + categoryName + ":" + e1.get("type") + "/" + e2.get("type") + ")")
        # isPotentialBIInteraction receives exampleStats itself, so no filter is recorded here
        if self.styles["bi_limits"] and not self.isPotentialBIInteraction(e1, e2, sentenceGraph, self.exampleStats):
            makeExample = False
        if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(e1, e2, sentenceGraph):
            makeExample = False
            self.exampleStats.filter("epi_limits")
        if self.styles["id_limits"] and not self.isPotentialIDInteraction(e1, e2, sentenceGraph):
            makeExample = False
            self.exampleStats.filter("id_limits")
        if self.styles["pos_only"] and categoryName == "neg":
            makeExample = False
            self.exampleStats.filter("pos_only")
        return makeExample
Ejemplo n.º 19
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one example for each token of the sentence
        """
        examples = []
        exampleIndex = 0
        
        self.tokenFeatures = {}

        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
        
        namedEntityCount = 0
        entityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True": # known data which can be used for features
                namedEntityCount += 1
            else: # known data which can be used for features
                entityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        entityCountFeature = "entityCount_" + str(entityCount)
        
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            if len(sentenceGraph.tokenIsEntityHead) > 0:
                text = "ge_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            
            text = token.get("text")
            if self.styles["speculation_words"] and text in self.specWords:
                if not bagOfWords.has_key("spec_bow_"+text):
                    bagOfWords["spec_bow_"+text] = 0
                bagOfWords["spec_bow_"+text] += 1
                bagOfWords["spec_sentence"] = 1
        
        bowFeatures = {}
        for k,v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v
        
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        
        for entity in sentenceGraph.entities:
            #token = sentenceGraph.tokens[i]
            token = sentenceGraph.entityHeadTokenByEntity[entity]
            # Recognize only non-named entities (i.e. interaction words)
            if entity.get("given") == "True":
                continue
            
            # CLASS
            if self.styles["classification"] == "multiclass":
                task3Type = "multiclass"
                categoryName = ""
                if entity.get("negation") == "True":
                    categoryName += "negation"
                if entity.get("speculation") == "True":
                    if categoryName != "":
                        categoryName += "---"
                    categoryName += "speculation"
                if categoryName == "":
                    categoryName = "neg"
                category = self.classSet.getId(categoryName)  
            elif self.styles["classification"] == "speculation":
                task3Type = "speculation"
                if entity.get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                        category = self.classSet.getId("speculation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            elif self.styles["classification"] == "negation":
                task3Type = "negation"
                if entity.get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                        category = self.classSet.getId("negation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            self.exampleStats.beginExample(categoryName)

            # FEATURES
            features = {}

            # ENTITY TYPE
            #entityType = self.classSet.getId(self.getMergedEntityType(entity))
            #del self.classSet.Ids[self.getMergedEntityType(entity)]
#IF LOCAL
            # There's a mistake here. The entityType should be the string, not
            # the id of the type. But there's also another issue. getMergedEntityType
            # expects a list, not an item. Therefore the type is always empty ->
            # types don't get used in classification. But this is the code used in
            # the publication, so it will now be published as is, and fixed in a later
            # release.
            #
            # Besides, using the classSet here generates an unneeded
            # additional class, that shows up in evaluations etc. However, to be 
            # able to publish the exact models used for the publication experiments,
            # this can't be fixed so it breaks feature id consistency. Therefore I'll
            # now just remove the redundant class id from the classSet.
#ENDIF            
            #features[self.featureSet.getId(entityType)] = 1
            
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features[self.featureSet.getId(entityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
        
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
            
            if self.styles["speculation_words"]:
                if text in self.specWords:
                    features[self.featureSet.getId("ent_spec")] = 1
                if stem in self.specWordStems:
                    features[self.featureSet.getId("ent_spec_stem")] = 1
            
            # Linear order features
            for i in range(len(sentenceGraph.tokens)):
                if token == sentenceGraph.tokens[i]:
                    break
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            
            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1

            self.buildChains(token, sentenceGraph, features)
             
            extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1            
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build examples for a single sentence and write them to outfile.
        See Core/ExampleUtils for example format.

        For each entity, the outgoing interactions marked event="True" are grouped
        by type, the argument combinations allowed by structureAnalyzer.getArgLimits
        are enumerated, and every combination accepted by structureAnalyzer.isValidEvent
        is written out as one example with ExampleUtils.appendExamples. Entities that
        are given ("given" == "True") and have no valid interaction types are filtered
        as leaves.

        sentenceGraph     -- the sentence whose entities and interactions are processed
        outfile           -- output handle passed on to ExampleUtils.appendExamples
        goldGraph         -- optional gold graph for the same sentence. When given, its
                             tokens must match those of sentenceGraph (asserted) and
                             self.eventIsGold decides the class of each example. When
                             None, every example gets the class "neg".
        structureAnalyzer -- provides getValidEdgeTypes, getArgLimits and isValidEvent.
                             NOTE: despite the None default it is dereferenced without
                             a None check while entities are processed, so callers
                             have to pass a real analyzer.

        Returns the number of examples written to outfile (not a list of examples).
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
        
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        
        # Get argument order
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)
        
        # Check that the tokenizations match
        if goldGraph is not None:
            for i, token in enumerate(sentenceGraph.tokens):
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        if goldGraph is not None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset is not None
                goldEntitiesByOffset.setdefault(offset, []).append(entity)
        
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        exampleIndex = 0
        for entity in entities: # sentenceGraph.entities:
            if isinstance(entity, types.StringTypes): # dummy entity for intersentence interactions
                continue
            
            eType = entity.get("type")
            assert eType is not None, entity.attrib
            eType = str(eType)
            
            # Collect the event interactions leaving this entity, grouped by type
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions = self.sortInteractionsById(interactions)
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                interactionType = interaction.get("type")
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                e2Id = interaction.get("e2")
                if e2Id in sentenceGraph.entitiesById:
                    e2 = sentenceGraph.entitiesById[e2Id]
                    if interactionType in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[interactionType].append(interaction)
                else: # intersentence
                    validInteractionsByType[interactionType].append(interaction)
                # Counted whether or not the edge type was valid
                interactionCounts[interactionType] += 1
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts)])
            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
            
            intCombinations = []
            for intType in sorted(validInteractionsByType): # for each argument type the event can have
                minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
                # for each valid argument count, get all possible combinations.
                # note that there may be a zero-length combination
                singleTypeArgCombinations = []
                for combLen in range(minArgs, maxArgs + 1):
                    singleTypeArgCombinations.extend(combinations(validInteractionsByType[intType], combLen))
                intCombinations.append(singleTypeArgCombinations)
            validIntTypeCount = len(intCombinations)
            # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
            # of one argument type. Next, we'll make all valid combinations of multiple argument types
            # (combine.combine is a project helper; presumably a cartesian product -- confirm there)
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            # Flatten each combination of per-type tuples into a single tuple of interactions
            argCombinations = [sum(argCombination, ()) for argCombination in argCombinations]
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations
            
            for argCombination in argCombinations:
                # Originally binary classification
                if goldGraph is not None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
                    if self.styles["binary"]:
                        category = "pos"
                    else:
                        category = entity.get("type")
                    assert category is not None
                else:
                    category = "neg"
                self.exampleStats.beginExample(category)
                
                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
                elif not structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, issues=issues):
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
                else:
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"                
                    argString = ",".join([arg.get("type") + "=" + arg.get("id") for arg in argCombination])
                    extra = {"xtype":"um","e":entity.get("id"),"i":argString,"etype":eType,"class":category}
                    extra["allInt"] = interactionCountString
                    assert isinstance(extra["etype"], types.StringTypes), extra
                    assert isinstance(extra["class"], types.StringTypes), category
                    assert isinstance(extra["i"], types.StringTypes), argString
                    # buildExample supplies the features; id, class and extra are filled in here
                    example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                    example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                    example[1] = self.classSet.getId(category)
                    example[3] = extra
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
            
        return exampleIndex
Ejemplo n.º 21
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build examples for a single sentence and write them to outfile.
        See Core/ExampleUtils for example format.

        For each entity, the outgoing interactions marked event="True" are grouped
        by type and the argument combinations allowed by structureAnalyzer.getArgLimits
        are enumerated. A combination is written out as one example (with
        ExampleUtils.appendExamples) when structureAnalyzer.isValidEntity accepts the
        entity or structureAnalyzer.isValidEvent accepts the combination; otherwise
        the reasons are recorded through self.exampleStats.filter. Entities that are
        given ("given" == "True") and have no valid interaction types are filtered
        as leaves.

        sentenceGraph     -- the sentence whose entities and interactions are processed
        outfile           -- output handle passed on to ExampleUtils.appendExamples
        goldGraph         -- optional gold graph for the same sentence. When given, its
                             tokens must match those of sentenceGraph (asserted) and
                             self.eventIsGold decides the class of each example. When
                             None, every example gets the class "neg".
        structureAnalyzer -- provides getValidEdgeTypes, getArgLimits, isValidEntity
                             and isValidEvent. NOTE: despite the None default it is
                             dereferenced without a None check while entities are
                             processed, so callers have to pass a real analyzer.

        Returns the number of examples written to outfile (not a list of examples).
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
        
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        
        # Get argument order
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)
        
        # Check that the tokenizations match
        if goldGraph is not None:
            for i, token in enumerate(sentenceGraph.tokens):
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        if goldGraph is not None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset is not None
                goldEntitiesByOffset.setdefault(offset, []).append(entity)
        
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        exampleIndex = 0
        for entity in entities: # sentenceGraph.entities:
            if isinstance(entity, types.StringTypes): # dummy entity for intersentence interactions
                continue
            
            eType = entity.get("type")
            assert eType is not None, entity.attrib
            eType = str(eType)
            
            # Collect the event interactions leaving this entity, grouped by type
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions = self.sortInteractionsById(interactions)
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                interactionType = interaction.get("type")
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                e2Id = interaction.get("e2")
                if e2Id in sentenceGraph.entitiesById:
                    e2 = sentenceGraph.entitiesById[e2Id]
                    if interactionType in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[interactionType].append(interaction)
                else: # intersentence
                    validInteractionsByType[interactionType].append(interaction)
                # Counted whether or not the edge type was valid
                interactionCounts[interactionType] += 1
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts)])
            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
            
            intCombinations = []
            for intType in sorted(validInteractionsByType): # for each argument type the event can have
                minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
                # for each valid argument count, get all possible combinations.
                # note that there may be a zero-length combination
                singleTypeArgCombinations = []
                for combLen in range(minArgs, maxArgs + 1):
                    singleTypeArgCombinations.extend(combinations(validInteractionsByType[intType], combLen))
                intCombinations.append(singleTypeArgCombinations)
            validIntTypeCount = len(intCombinations)
            # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
            # of one argument type. Next, we'll make all valid combinations of multiple argument types
            # (combine.combine is a project helper; presumably a cartesian product -- confirm there)
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            # Flatten each combination of per-type tuples into a single tuple of interactions
            argCombinations = [sum(argCombination, ()) for argCombination in argCombinations]
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations
            
            for argCombination in argCombinations:
                # Originally binary classification
                if goldGraph is not None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
                    if self.styles["binary"]:
                        category = "pos"
                    else:
                        category = entity.get("type")
                    assert category is not None
                else:
                    category = "neg"
                self.exampleStats.beginExample(category)
                
                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
                else:
                    # Keep the entity check result so it is not recomputed when the example is rejected
                    entityIsValid = structureAnalyzer.isValidEntity(entity)
                    if entityIsValid or structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne=self.styles["no_arg_count_upper_limit"], issues=issues):
                        if self.debug:
                            print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                        argString = ",".join([arg.get("type") + "=" + arg.get("id") for arg in argCombination])
                        extra = {"xtype":"um","e":entity.get("id"),"i":argString,"etype":eType,"class":category}
                        extra["allInt"] = interactionCountString
                        assert isinstance(extra["etype"], types.StringTypes), extra
                        assert isinstance(extra["class"], types.StringTypes), category
                        assert isinstance(extra["i"], types.StringTypes), argString
                        # buildExample supplies the features; id, class and extra are filled in here
                        example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                        example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                        example[1] = self.classSet.getId(category)
                        example[3] = extra
                        ExampleUtils.appendExamples([example], outfile)
                        exampleIndex += 1
                    else: # not a valid event or valid entity
                        # This branch is only reached when the entity check failed, so if
                        # isValidEvent reported nothing, the invalid entity is the issue.
                        if len(issues) == 0: # must be > 0 so that it gets filtered
                            issues["INVALID_ENTITY:"+eType] += 1
                        for key in issues:
                            self.exampleStats.filter(key)
                        if self.debug:
                            print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
                self.exampleStats.endExample()
            
        return exampleIndex