Example #1
0
def main(treeFile):
    dump = set(['4 -> 1 3',
               '0 -> 1 3'])
    prods = {}
    pe = evaluate_productions.ProdEvaluator(True, ma_util.GRANULARITY_FINE)
    for tree in ma_util.readPenn(treeFile):
        for subTree in ma_util.walkTree(tree):
            l = pe.getInterestingLabel(subTree)
            if l:
                if not l in prods:
                    prods[l] = set()

                prod = '%s -> %s %s' % (subTree.node, subTree[0].node,
                                              subTree[1].node)
                prods[l].add(prod)
                if prod in dump:
                    print "DESIRED: %s" % l
                    print subTree.pprint().encode('utf-8')
                # don't print everything
                if len(subTree.leaves()) < 5:
                    print "L: %s" % l
                    print subTree.pprint().encode('utf-8')
    print "-" * 8
    for rule in prods:
        print "RULE: %s" % rule
        for prod in prods[rule]:
            print prod
        print "=" * 8
Example #2
0
 def __init__(self, features, goldFile, projectedFile, foldsFile,
              mznAlphabet, mznModel, mznArgs,
              sentiWSPos, sentiWSNeg, toClassifyFile, toClassifyProjFile,
              toClassifyOutFile, toClassifyIsTest, doCV, evalCSV,
              weightsCSV, learners):
     """
     @param features {iterable{string}} List of features
     @param goldFile Filename of gold sentences (PTB format)
     @param projectedFile Filename of projected trees (TigerXML)
     @param foldsFile Filename of fold definitions (presumably for
            cross-validation; stored, not read here -- verify against users)
     @param mznAlphabet Alphabet file (Pickle) (see PhrasePredictor)
     @param mznModel Model file (Pickle) (see PhrasePredictor)
     @param mznArgs Argsparse dump (Pickle) (see PhrasePredictor)
     @param sentiWSPos SentiWS file with positive words
     @param sentiWSNeg SentiWS file with negative words
     @param toClassifyFile Optional filename of trees to classify
            (PTB format; read eagerly into a list if given)
     @param toClassifyProjFile Filename of projected counterpart of
            toClassifyFile (stored only; NOTE(review): format not visible here)
     @param toClassifyOutFile Filename for classification output
     @param toClassifyIsTest {boolean} Whether toClassifyFile is test data
     @param doCV {boolean} Whether to run cross-validation -- TODO confirm
     @param evalCSV Filename for evaluation results (CSV; written from
            self.evalData once processing is done)
     @param weightsCSV Filename for weights output (CSV; written from
            self.weightsData once processing is done)
     @param learners Learner configuration (stored as-is; see callers)
     """
     self.sentiWSPos = sentiWSPos
     self.sentiWSNeg = sentiWSNeg
     self.features = features
     self.goldFile = goldFile
     self.gold = ma_util.readPenn(goldFile)
     self.projectedFile = projectedFile
     self.foldsFile = foldsFile
     self.pp = PhrasePredictor(mznAlphabet, mznModel, mznArgs, False)
     self.toClassifyFile = toClassifyFile
     self.toClassify = None
     if self.toClassifyFile:
         # Materialized into a list (readPenn appears to yield lazily --
         # see the re-init below; TODO confirm).
         self.toClassify = list(ma_util.readPenn(self.toClassifyFile))
     self.toClassifyProjFile = toClassifyProjFile
     self.toClassifyOutFile = toClassifyOutFile
     self.toClassifyIsTest = toClassifyIsTest
     self._init_extractors()
     # re-init: _init_extractors presumably consumes the self.gold
     # iterator, so re-read the gold file to leave a fresh one behind.
     self.gold = ma_util.readPenn(goldFile)
     self.doCV = doCV
     self.learners = learners
     self.evalCSV = evalCSV
     self.weightsCSV = weightsCSV
     # eval data is stored and written once we're done
     self.evalData = []
     self.weightsData = []
     self._load_data()
Example #3
0
def getTreeLabelsAtLengthFile(fileName, granularity):
    """File-based convenience wrapper around getTreeLabelsAtLength.

    @param fileName Filename of a treebank in Penn Treebank format
    @param granularity Label granularity, forwarded unchanged
    """
    trees = ma_util.readPenn(fileName)
    return getTreeLabelsAtLength(trees, granularity)
Example #4
0
def getCoarseGrainedTreeLabelsFile(fileName):
    """File-based convenience wrapper around getCoarseGrainedTreeLabels.

    @param fileName Filename of a treebank in Penn Treebank format
    """
    trees = ma_util.readPenn(fileName)
    return getCoarseGrainedTreeLabels(trees)
Example #5
0
def readPenn(treeBank):
    """Re-export of ma_util.readPenn under a local name.

    @param treeBank Filename of a treebank in Penn Treebank format
    """
    result = ma_util.readPenn(treeBank)
    return result
Example #6
0
def main(inputFile, annotations, alignment, targetFile, output,
         stripTargetIDPrefix, applyParentSentiment, projectRootSentiment,
         alignTypes):
    """
    Projects sentiment labels from a source tree to a target tree
    using an alignment between source and target nodes.

    @param inputFile {basestring} Filename of source treebank in TigerXML
           format
    @param annotations {basestring} Filename of treebank with sentiment labels
           in Penn Treebank format
    @param alignment {basestring} Filename of mapping between source and
           target nodes in Stockholm Treealigner format
    @param targetFile {basestring} Filename of target treebank in TigerXML
           format
    @param output {basestring} Filename for resulting output file
    @param stripTargetIDPrefix {boolean} Whether to strip alphabetic prefixes
           from node IDs in target tree
    @param applyParentSentiment {boolean} Whether to infer sentiment labels
          for unaligned nodes from ancestor nodes
    @param projectRootSentiment {boolean} Whether to perform implicit alignment
    between source and target root nodes if unaligned
    @param alignTypes {list} Which link types to include: good, fuzzy or both
    """
    # Maps source node IDs to sentiment values.
    mapping = {}
    logger.info("Loading alignment.")
    # NOTE: 'alignment' is rebound from the filename to the parsed dict here.
    alignment = readAlignment(alignment, alignTypes)
    logger.info("Done loading alignment.")
    logger.info("Alignment source was: %s", alignment["source"])
    logger.info("Alignment target was: %s", alignment["target"])
    # Narrow further: keep only the node-to-node alignment mapping itself.
    alignment = alignment["alignment"]
    logger.info("Collapsing unary nodes for source file")
    # Now get some node statistic from source/input side
    # This means we have to load the file again in tigerHelper
    inputHelper = TigerHelper(inputFile)
    print ("Target has %s nodes (T, NT) before unary-collapsing nodes"
           % inputHelper.count)
    # Drop the helper early; it was only needed for the node count.
    del inputHelper
    # Now overwrite inputFile variable!
    # collapseToTmp writes a unary-collapsed copy and returns its path,
    # so everything below reads the collapsed temporary file instead.
    inputFile = th.collapseToTmp(inputFile, alignment.keys())
    logger.info("Wrote unary-collapsed source tigerXML to %s", inputFile)
    logger.info("Extracting mapping from source ID to sentiment value.")
    # izip_longest with a string sentinel: if the two treebanks differ in
    # length, the sentinel reaches getMappingFromNodeIDToSentiment and
    # surfaces the mismatch rather than silently truncating (zip behavior).
    for (tigerSentence, pennSentence) in itertools.izip_longest(
            readTiger(inputFile), ma_util.readPenn(annotations),
            fillvalue="LIST_LENGTH_NOT_EQUAL"):
        mapping.update(
            getMappingFromNodeIDToSentiment(tigerSentence, pennSentence))
    logger.info("Done extracting mapping.")
    fh = open(targetFile, "r")
    target = etree.parse(fh)
    fh.close()
    tigerHelper = TigerHelper(target, stripTargetIDPrefix)
    print "Target has %s nodes (T, NT)" % tigerHelper.count
    logger.info("Applying mapping to target.")
    applyMappingToTarget(
        mapping, alignment, tigerHelper, stripTargetIDPrefix)
    # The three counters below are presumably module-level counters
    # updated by applyMappingToTarget -- they are not defined in this
    # function; verify against the rest of the module.
    print ("Source nodes with sentiment, not in alignment: %s"
           % countSourceNotInAlignment)
    print ("Nodes with sentiment and alignment, but not found "
           + "in target tree: %s" % countTargetNotFound)
    print ("Sentiment label projected using alignment for %s nodes"
           % countMappingApplied)
    logger.info("Done applying mapping.")
    logger.info("Unary-collapsing nodes in target tree.")
    tigerHelper.collapseUnary()
    logger.info("Done collapsing unary nodes.")
    print ("After collapsing unary nodes, Target has %s nodes (T, NT)"
           % tigerHelper.count)
    logger.info("Fixing up remaining nodes")
    # Need to map root sentiment before looking up parent sentiment
    # so we can use the new information
    if projectRootSentiment:
        logger.info("Projecting root sentiment for unaligned root nodes.")
        # annotations re-read from disk: the earlier izip_longest loop
        # consumed the previous readPenn iteration.
        mapRootSentiment(ma_util.readPenn(annotations), tigerHelper)
        logger.info("Done projecting root sentiment.")
    if applyParentSentiment:
        logger.info("Using parent lookup for nodes with"
                    + "unknown sentiment values.")
        (modTree, count) = tigerHelper.applyParentSentimentValue()
        print "Applied parent sentiment value for %s nodes" % count
    else:
        logger.info("Using default for nodes with unknown sentiment values.")
        (modTree, count) = tigerHelper.applyDefaultSentimentValue()
        print "Applied default sentiment value for %s nodes" % count
    logger.info("Done fixing up remaining nodes.")
    logger.info("Saving to disk...")
    tigerHelper.tree.write(output)
    logger.info("Done!")