Example #1
0
 def test2(self):
     """
     Compares the node-ID-to-sentiment mapping computed for the
     unary-collapsed version of tests/1.xml with the expected mapping
     stored in tests/1.newcollapse.expected.yml.

     Only the first sentence of the Penn file and of the collapsed
     TigerXML file is used.
     """
     collapsedFile = th.collapseToTmp("tests/1.xml")
     print("Unary-collapsed tree at %s" % collapsedFile)
     # Do not truncate the diff if the comparison below fails
     self.maxDiff = None
     penn = next(shared.ma_util.readPenn("tests/1-binarization-yv.ptb"))
     tiger = next(project.readTiger(collapsedFile))
     res = project.getMappingFromNodeIDToSentiment(tiger, penn)
     # Read the fixture inside a with-block so the handle is closed even
     # if parsing fails.
     # NOTE(review): yaml.load can construct arbitrary objects; this is
     # acceptable for a trusted local fixture, consider yaml.safe_load.
     with open("tests/1.newcollapse.expected.yml") as fh:
         expected = yaml.load(fh)
     self.assertEqual(self._mapToString(res), self._mapToString(expected))
Example #2
0
def main(inputFile, annotations, alignment, targetFile, output,
         stripTargetIDPrefix, applyParentSentiment, projectRootSentiment,
         alignTypes):
    """
    Projects sentiment labels from a source tree to a target tree
    using an alignment between source and target nodes.

    The source treebank is unary-collapsed into a temporary file before
    the sentiment mapping is extracted; the target tree is unary-collapsed
    after the mapping has been applied. Progress is reported through the
    module logger and node statistics are printed to stdout.

    @param inputFile {basestring} Filename of source treebank in TigerXML
           format
    @param annotations {basestring} Filename of treebank with sentiment labels
           in Penn Treebank format
    @param alignment {basestring} Filename of mapping between source and
           target nodes in Stockholm Treealigner format
    @param targetFile {basestring} Filename of target treebank in TigerXML
           format
    @param output {basestring} Filename for resulting output file
    @param stripTargetIDPrefix {boolean} Whether to strip alphabetic prefixes
           from node IDs in target tree
    @param applyParentSentiment {boolean} Whether to infer sentiment labels
           for unaligned nodes from ancestor nodes
    @param projectRootSentiment {boolean} Whether to perform implicit alignment
           between source and target root nodes if unaligned
    @param alignTypes {list} Which link types to include: good, fuzzy or both
    """
    mapping = {}

    logger.info("Loading alignment.")
    alignmentData = readAlignment(alignment, alignTypes)
    logger.info("Done loading alignment.")
    logger.info("Alignment source was: %s", alignmentData["source"])
    logger.info("Alignment target was: %s", alignmentData["target"])
    links = alignmentData["alignment"]

    logger.info("Collapsing unary nodes for source file")
    # Get node statistics for the source/input side first. This loads the
    # file a second time, so the helper is dropped again right afterwards.
    inputHelper = TigerHelper(inputFile)
    # Fixed label: this count belongs to the source treebank, not the target.
    print("Source has %s nodes (T, NT) before unary-collapsing nodes"
          % inputHelper.count)
    del inputHelper
    # From here on only the unary-collapsed temporary copy is used.
    collapsedInput = th.collapseToTmp(inputFile, links.keys())
    logger.info("Wrote unary-collapsed source tigerXML to %s", collapsedInput)

    logger.info("Extracting mapping from source ID to sentiment value.")
    # izip_longest pads the shorter stream with a marker string, so a
    # sentence-count mismatch reaches getMappingFromNodeIDToSentiment
    # instead of being silently truncated.
    sentencePairs = itertools.izip_longest(
        readTiger(collapsedInput), ma_util.readPenn(annotations),
        fillvalue="LIST_LENGTH_NOT_EQUAL")
    for (tigerSentence, pennSentence) in sentencePairs:
        mapping.update(
            getMappingFromNodeIDToSentiment(tigerSentence, pennSentence))
    logger.info("Done extracting mapping.")

    # The with-block closes the handle even if parsing raises.
    with open(targetFile, "r") as fh:
        target = etree.parse(fh)
    tigerHelper = TigerHelper(target, stripTargetIDPrefix)
    print("Target has %s nodes (T, NT)" % tigerHelper.count)

    logger.info("Applying mapping to target.")
    applyMappingToTarget(
        mapping, links, tigerHelper, stripTargetIDPrefix)
    # NOTE(review): the three counters below are not defined in this
    # function; presumably module-level values updated by
    # applyMappingToTarget - confirm.
    print("Source nodes with sentiment, not in alignment: %s"
          % countSourceNotInAlignment)
    print(("Nodes with sentiment and alignment, but not found "
           "in target tree: %s") % countTargetNotFound)
    print("Sentiment label projected using alignment for %s nodes"
          % countMappingApplied)
    logger.info("Done applying mapping.")

    logger.info("Unary-collapsing nodes in target tree.")
    tigerHelper.collapseUnary()
    logger.info("Done collapsing unary nodes.")
    print("After collapsing unary nodes, Target has %s nodes (T, NT)"
          % tigerHelper.count)

    logger.info("Fixing up remaining nodes")
    # Need to map root sentiment before looking up parent sentiment
    # so we can use the new information
    if projectRootSentiment:
        logger.info("Projecting root sentiment for unaligned root nodes.")
        mapRootSentiment(ma_util.readPenn(annotations), tigerHelper)
        logger.info("Done projecting root sentiment.")
    if applyParentSentiment:
        # Fixed message: the two halves were previously joined without a
        # space ("withunknown").
        logger.info("Using parent lookup for nodes with "
                    "unknown sentiment values.")
        (_unusedTree, count) = tigerHelper.applyParentSentimentValue()
        print("Applied parent sentiment value for %s nodes" % count)
    else:
        logger.info("Using default for nodes with unknown sentiment values.")
        (_unusedTree, count) = tigerHelper.applyDefaultSentimentValue()
        print("Applied default sentiment value for %s nodes" % count)
    logger.info("Done fixing up remaining nodes.")

    logger.info("Saving to disk...")
    tigerHelper.tree.write(output)
    logger.info("Done!")