def test2(self):
    """
    Checks the node-ID-to-sentiment mapping on a unary-collapsed tree.

    Collapses tests/1.xml into a temporary file, pairs its first
    sentence with the first tree of tests/1-binarization-yv.ptb and
    compares the resulting mapping against the expectation stored in
    tests/1.newcollapse.expected.yml.
    """
    sourceFile = "tests/1.xml"
    collapsedFile = th.collapseToTmp(sourceFile)
    print("Unary-collapsed tree at %s" % collapsedFile)
    # Show the complete diff if the comparison below fails.
    self.maxDiff = None
    # Only the first sentence of each treebank is compared.
    penn = next(shared.ma_util.readPenn("tests/1-binarization-yv.ptb"))
    tiger = next(project.readTiger(collapsedFile))
    res = project.getMappingFromNodeIDToSentiment(tiger, penn)
    # Close the fixture deterministically instead of leaking the handle.
    # NOTE(review): yaml.load can construct arbitrary objects; the
    # fixture lives in the repository, but yaml.safe_load would be the
    # safer choice if the file only holds plain types - confirm.
    with open("tests/1.newcollapse.expected.yml") as expectedFh:
        expected = yaml.load(expectedFh)
    self.assertEqual(self._mapToString(res),
                     self._mapToString(expected))
def main(inputFile, annotations, alignment, targetFile, output,
         stripTargetIDPrefix, applyParentSentiment, projectRootSentiment,
         alignTypes):
    """
    Projects sentiment labels from a source tree to a target tree
    using an alignment between source and target nodes.

    The source treebank is unary-collapsed into a temporary file, a
    mapping from source node ID to sentiment is extracted from it and
    the annotated Penn treebank, and that mapping is applied to the
    target tree through the alignment. The target tree is then
    unary-collapsed, nodes still lacking a label are filled in, and the
    tree is written to the output file. Node statistics are printed to
    standard output along the way.

    @param inputFile {basestring} Filename of source treebank
                                  in TigerXML format
    @param annotations {basestring} Filename of treebank with sentiment
                                    labels in Penn Treebank format
    @param alignment {basestring} Filename of mapping between source and
                                  target nodes in Stockholm Treealigner
                                  format
    @param targetFile {basestring} Filename of target treebank
                                   in TigerXML format
    @param output {basestring} Filename for resulting output file
    @param stripTargetIDPrefix {boolean} Whether to strip alphabetic
                                         prefixes from node IDs in
                                         target tree
    @param applyParentSentiment {boolean} Whether to infer sentiment
                                          labels for unaligned nodes
                                          from ancestor nodes
    @param projectRootSentiment {boolean} Whether to perform implicit
                                          alignment between source and
                                          target root nodes if unaligned
    @param alignTypes {list} Which link types to include: good, fuzzy
                             or both
    """
    mapping = {}

    logger.info("Loading alignment.")
    alignmentData = readAlignment(alignment, alignTypes)
    logger.info("Done loading alignment.")
    logger.info("Alignment source was: %s", alignmentData["source"])
    logger.info("Alignment target was: %s", alignmentData["target"])
    # Only the node links themselves are needed from here on.
    nodeAlignment = alignmentData["alignment"]

    logger.info("Collapsing unary nodes for source file")
    # Get node statistics from the source/input side. This means the
    # file is loaded an extra time through TigerHelper; the helper is
    # dropped again right after reading the count.
    inputHelper = TigerHelper(inputFile)
    print("Source has %s nodes (T, NT) before unary-collapsing nodes"
          % inputHelper.count)
    del inputHelper

    # From here on the unary-collapsed temporary copy is the source.
    collapsedInput = th.collapseToTmp(inputFile, nodeAlignment.keys())
    logger.info("Wrote unary-collapsed source tigerXML to %s",
                collapsedInput)

    logger.info("Extracting mapping from source ID to sentiment value.")
    # If the two treebanks differ in length, the shorter stream is padded
    # with the marker string, which is then handed on as a "sentence".
    # NOTE(review): presumably getMappingFromNodeIDToSentiment fails
    # loudly on that marker - confirm.
    sentencePairs = itertools.izip_longest(
        readTiger(collapsedInput),
        ma_util.readPenn(annotations),
        fillvalue="LIST_LENGTH_NOT_EQUAL")
    for (tigerSentence, pennSentence) in sentencePairs:
        mapping.update(
            getMappingFromNodeIDToSentiment(tigerSentence, pennSentence))
    logger.info("Done extracting mapping.")

    # The context manager closes the handle even if parsing fails.
    with open(targetFile, "r") as fh:
        target = etree.parse(fh)
    tigerHelper = TigerHelper(target, stripTargetIDPrefix)
    print("Target has %s nodes (T, NT)" % tigerHelper.count)

    logger.info("Applying mapping to target.")
    applyMappingToTarget(
        mapping, nodeAlignment, tigerHelper, stripTargetIDPrefix)
    # The three counters are not assigned in this function; they are read
    # from module scope (presumably maintained by applyMappingToTarget).
    print("Source nodes with sentiment, not in alignment: %s"
          % countSourceNotInAlignment)
    print("Nodes with sentiment and alignment, but not found "
          + "in target tree: %s" % countTargetNotFound)
    print("Sentiment label projected using alignment for %s nodes"
          % countMappingApplied)
    logger.info("Done applying mapping.")

    logger.info("Unary-collapsing nodes in target tree.")
    tigerHelper.collapseUnary()
    logger.info("Done collapsing unary nodes.")
    print("After collapsing unary nodes, Target has %s nodes (T, NT)"
          % tigerHelper.count)

    logger.info("Fixing up remaining nodes")
    # Root sentiment has to be mapped before the parent lookup so the
    # lookup can use the new information.
    if projectRootSentiment:
        logger.info("Projecting root sentiment for unaligned root nodes.")
        mapRootSentiment(ma_util.readPenn(annotations), tigerHelper)
        logger.info("Done projecting root sentiment.")

    if applyParentSentiment:
        logger.info("Using parent lookup for nodes with "
                    + "unknown sentiment values.")
        (_, count) = tigerHelper.applyParentSentimentValue()
        print("Applied parent sentiment value for %s nodes" % count)
    else:
        logger.info("Using default for nodes with unknown sentiment values.")
        (_, count) = tigerHelper.applyDefaultSentimentValue()
        print("Applied default sentiment value for %s nodes" % count)
    logger.info("Done fixing up remaining nodes.")

    logger.info("Saving to disk...")
    tigerHelper.tree.write(output)
    logger.info("Done!")