def mapRootSentiment(source, tigerHelper, force=False):
    """Maps top-level sentiment between source and target sentences.

    If the target root node is not aligned and only has the default
    sentiment, we apply the sentiment value of the source root node to the
    target root node. We assume that root nodes always are implicitly
    aligned.

    The optional force parameter specifies whether the root sentiment is
    always mapped. This will override any previous mapping based on node
    alignments.

    @param source {Iterable{nltk.trees.Tree}} Source PTB trees
    @param tigerHelper {th.TigerHelper} Helper wrapping the target TigerXML
        tree; its tree is modified in place
    @param force {boolean} If True, always map sentiment between root nodes
    @returns None; the target TigerXML tree inside tigerHelper is modified
        in place
    """
    # izip_longest + sentinel fillvalue makes a length mismatch between the
    # two corpora visible (the sentinel string has no .node attribute and
    # would fail loudly) instead of being silently truncated like izip.
    for (sourceSentence, targetSentence) in itertools.izip_longest(
            source, th.getSentences(tigerHelper.tree),
            fillvalue="LIST_LENGTH_NOT_EQUAL"):
        rootNode = tigerHelper.getSentenceRoot(targetSentence)
        metaS = rootNode.get("x-sentiment")
        # we will typically get here before default sentiment values
        # have been applied, so metaS might be None.
        #assert metaS is not None
        if not force and metaS == th.SEN_MAPPED:
            # Root already received a sentiment via node alignment; keep it.
            continue
        logger.debug("Mapping root sentiment %s for target %s",
                     sourceSentence.node, th.getNodeID(rootNode))
        th.setSentiment(rootNode, sourceSentence.node, th.SEN_MAPPED_ROOT)
def extract_features(self, trees, projected, extractLabels):
    """
    Turns a set of parse trees into feature vectors.

    @param trees {Iterable<nltk.tree.Tree>} Source data
    @param projected {basestring} Filename of projected trees (TigerXML)
    @param extractLabels {bool} Whether to return node labels or not
    @returns {tuple} Tuple of lists for features and labels
    """
    # Projected (cross-lingually transferred) sentences are only loaded when
    # the PROJ feature is enabled; otherwise the list stays empty.
    projectedSentences = []
    if PROJ in self.features:
        projectedTiger = th.TigerHelper(projected)
        projectedSentences = th.getSentences(projectedTiger.tree)
    nodeLabels = []       # gold sentiment label per node (only if extractLabels)
    data = []             # one feature vector per tree node, across all sentences
    rootIndices = []      # index into `data` where each sentence's nodes begin
    # NOTE(review): if PROJ is not in self.features, projectedSentences is
    # empty, so for any non-empty `trees` the first iteration hits the
    # fillvalue sentinel and raises "projectedSentences too short!" —
    # presumably PROJ is always enabled in practice; confirm with callers.
    for (projectedSentence, treeSentence) in itertools.izip_longest(projectedSentences, trees, fillvalue= "LISTLEN_NEQ"):
        # The sentinel fillvalue turns a length mismatch between the two
        # corpora into a hard error instead of silent truncation.
        if projectedSentence == "LISTLEN_NEQ":
            raise ValueError("projectedSentences too short!")
        if treeSentence == "LISTLEN_NEQ":
            raise ValueError("trees too short!")
        # Record where this sentence starts in the flat `data` list *before*
        # its vectors are appended, so callers can split per sentence.
        rootIndices.append(len(data))
        # Variables are only set if features are to be extracted
        projS = None
        labelsS = None
        sentiWSPosS = None
        sentiWSNegS = None
        pos = None
        # Phrase-predictor features are always extracted.
        (phraseS, regScores, counts) = self.ppE.extract_phrase_predictor_sentiment(
            treeSentence)
        if PROJ in self.features:
            projS = self.projE.extract_projection_sentiment(
                projectedTiger, projectedSentence)
        # Does the tree contain gold or is it just any old tree?
        # To clarify: a tree is not necessarily annotated with a sentiment
        # label. It could also be a vanilla parse tree.
        if extractLabels:
            labelsS = self.goldE.extract_gold_sentiment(treeSentence)
            # One gold label per extracted phrase node, or something is wrong.
            assert len(labelsS) == len(phraseS)
        if SENTIWS in self.features:
            # SentiWS lexicon scores, split into positive and negative parts.
            (sentiWSPosS, sentiWSNegS) = self.sentiWSE.extract_sentiWS(treeSentence)
        if POSF in self.features:
            pos = self.posE.extract_POS(treeSentence)
        # Combine the per-feature extractions into one vector per node;
        # disabled features are passed as None.
        vectors = self.build_item_vectors(phraseS, projS, regScores, counts,
                                          sentiWSPosS, sentiWSNegS, pos)
        data.extend(vectors)
        if extractLabels:
            nodeLabels.extend(labelsS)
    # nodeLabels stays empty when extractLabels is False.
    return (numpy.asarray(data), numpy.asarray(nodeLabels), rootIndices)
def read_tiger_items(fileName):
    """Reads a TigerXML file and returns its sentences serialized as strings.

    @param fileName {basestring} Path to a TigerXML file
    @returns {list} One serialized XML string per sentence
    """
    tiger = th.TigerHelper(fileName)
    # Serialize each sentence node: numpy.array_split would otherwise treat
    # the individual sentence nodes as iterables and return weird splits.
    return [etree.tostring(sentence)
            for sentence in th.getSentences(tiger.tree)]
def evaluate(predictFile, goldFile, showPercentages, dumpFile, csvFile, runName):
    """Compares predicted sentiment (TigerXML) against gold trees (PTB).

    Prints statistics for all nodes, for mapped (non-default) nodes only,
    and for root nodes, then writes the aggregated stats to a CSV file.

    @param predictFile {basestring} Filename of predicted TigerXML trees
    @param goldFile {basestring} Filename of gold PTB trees
    @param showPercentages {bool} Passed through to printStats
    @param dumpFile {basestring|None} If given, mispredicted nodes are
        dumped to this file
    @param csvFile {basestring} Output CSV for aggregated statistics
    @param runName {basestring} Run identifier inserted into the stats rows
    """
    # NOTE(review): the parameter is rebound from filename to file object;
    # the file is not closed if an exception occurs before the close below.
    if dumpFile is not None:
        dumpFile = open(dumpFile, "w")
    rootPredictedLabels = []     # predicted sentiment of each sentence root
    rootGoldLabels = []          # gold sentiment of each sentence root
    predictedLabels = []         # predicted sentiment for every node
    goldLabels = []              # gold sentiment for every node
    mappedPredictedLabels = []   # predictions for non-default (mapped) nodes only
    mappedGoldLabels = []        # gold labels for those same mapped nodes
    predictTiger = th.TigerHelper(predictFile)
    predictSentences = th.getSentences(predictTiger.tree)
    gold = shared.ma_util.readPenn(goldFile)
    # Sentinel fillvalue surfaces corpus-length mismatches instead of
    # silently truncating like izip would.
    for (predictSentence, goldSentence) in itertools.izip_longest(predictSentences, gold, fillvalue= "LIST_LENGTH_NOT_EQUAL"):
        rootPredictedLabels.append(predictTiger.getSentenceSentiment(
            predictSentence, forceSentiment=True))
        rootGoldLabels.append(goldSentence.node)
        # print "#" * 16
        # Walk both trees in parallel pre-order; nodes are assumed to line
        # up one-to-one between the predicted and gold trees.
        for (predictNode, goldNode) in itertools.izip_longest(
                predictTiger.preOrder(predictSentence, forceSentiment=True),
                shared.ma_util.walkTree(goldSentence),
                fillvalue="LIST_LENGTH_NOT_EQUAL"):
            # predictNode is a tuple: [1] = sentiment value,
            # [2] = how the sentiment was assigned (mapped vs. default).
            predictedSentiment = predictNode[1]
            #if (predictedSentiment is None):
            #print predictNode
            predictedLabels.append(predictedSentiment)
            # print "=" * 8
            # print goldNode
            # print "-" * 8
            # print predictNode
            # print "=" * 8
            goldSentiment = goldNode.node
            goldLabels.append(goldSentiment)
            # Track separately the nodes whose sentiment was actually
            # mapped (i.e. not left at the default value).
            if predictNode[2] != th.SEN_DEFAULT:
                mappedPredictedLabels.append(predictedSentiment)
                mappedGoldLabels.append(goldSentiment)
            # Dump mispredictions (compared on the string label scale).
            if dumpFile and (mapNumToS(predictedSentiment) != mapNumToS(goldSentiment)):
                dumpFile.write("=" * 8)
                dumpFile.write("\n")
                dumpFile.write("Prediction error.\n")
                dumpFile.write("gold: ")
                dumpFile.write(str(goldNode))
                dumpFile.write("\n")
                dumpFile.write("predicted:")
                dumpFile.write(str(predictNode[1]))
                dumpFile.write("\n")
                dumpFile.write("=" * 8)
                dumpFile.write("\n")
    if dumpFile is not None:
        dumpFile.close()
    print "All node labels"
    allNodes = printStats(goldLabels, predictedLabels, showPercentages)
    print ""
    print "Mapped node labels only (No default)"
    noDefault = printStats(mappedGoldLabels, mappedPredictedLabels,
                           showPercentages, prefix="noDefault")
    print "Skipped %s default labels" % (len(goldLabels) -
                                         len(mappedGoldLabels))
    print ""
    print "Root labels"
    rootLabels = printStats(rootGoldLabels, rootPredictedLabels,
                            showPercentages, prefix="root")
    # Tag the stat rows with the run name and evaluation type.
    allNodes = shared.evaluate.ins(['run', "type"], [runName, "all nodes"], allNodes)
    noDefault = shared.evaluate.ins(['run', 'type'], [runName, "no default"], noDefault)
    # NOTE(review): noDefault is tagged above but never merged into
    # allNodes, so it does not reach the CSV output — confirm whether the
    # "no default" stats are intentionally print-only.
    allNodes.update(rootLabels)
    shared.evaluate.statsToFile(allNodes, csvFile)