def handleTree(self, tree1, tree2):
     """Compare a gold tree against a predicted tree node-by-node.

     Walks both trees in lockstep, records gold/predicted productions via
     self.count()/self.gold/self.predicted, and -- when self.interesting
     is set -- tallies per-rule accuracy of coarse-grained sentiment
     labels in self.totalI/self.correctI.

     @param tree1 {PTB tree} gold tree with sentiment annotation
     @param tree2 {PTB tree} predicted tree with sentiment annotation
     @raises ValueError if the two trees yield different node counts
     """
     # Hoisted out of the loop: this mapping is invariant per call.
     # Collapses the fine-grained labels to a coarse NEG/POS scheme;
     # plain-neutral labels are deliberately absent.
     coarse_map = {ma_util.VERY_NEG: ma_util.NEG,
                   ma_util.SLIGHTLY_NEG: ma_util.NEG,
                   ma_util.VERY_POS: ma_util.POS,
                   ma_util.SLIGHTLY_POS: ma_util.POS}
     for (n1, n2) in itertools.izip_longest(ma_util.walkTree(tree1),
                                            ma_util.walkTree(tree2), fillvalue=F):
         # F is the fill sentinel: seeing it means one walk ran dry first.
         if n1 == F or n2 == F:
             raise ValueError('Tree length not equal or other breakage')
         prod1 = self.getProduction(n1)
         prod2 = self.getProduction(n2)
         goldLabel = int(n1.node)
         predLabel = int(n2.node)
         if prod1:
             self.count(prod1, prod2)
             self.gold.append(prod1)
             self.predicted.append(prod2)
         if not self.interesting:
             # NOTE(review): this returns after handling only the FIRST
             # node's productions. Presumably intentional short-circuit,
             # but confirm -- a pre-loop check would skip all per-node
             # work instead.
             return
         ruleLabel = self.getInterestingLabel(n1)
         if ruleLabel and goldLabel in coarse_map:
             self.totalI[ruleLabel] = self.totalI.get(ruleLabel, 0) + 1
             # Correct iff both labels map to the same coarse polarity.
             if (predLabel in coarse_map and
                     coarse_map[goldLabel] == coarse_map[predLabel]):
                 self.correctI[ruleLabel] = self.correctI.get(ruleLabel, 0) + 1
# --- Beispiel #2 (score: 0) ---
 def extract_POS(self, goldSentence, tagged=None):
     """Return a per-span POS-tag histogram for a gold sentence tree.

     @param goldSentence {PTB tree} sentence tree whose leaves are tokens
     @param tagged {list of (token, tag)} optional pre-tagged tokens; when
         omitted, looked up in self.posCache by the sentence's pprint form
     @returns {list of dict} one {POS tag: count} dict per tree node, in
         ma_util.walkTree order
     @raises ValueError when the sentence is missing from the cache
     """
     if tagged is None:
         # Use .get() so a cache miss falls through to the explicit
         # ValueError below instead of raising a bare KeyError that
         # hides the diagnostic message.
         tagged = self.posCache.get(goldSentence.pprint().encode('utf-8'))
     if tagged is None:
         raise ValueError("Should have seen sentence in cache: %s" %
                          goldSentence)
     leaves = goldSentence.leaves()
     if not len(leaves) == len(tagged):
         # Log and continue: downstream lookups may still mostly work.
         logger.error("leaves do not correspond to tagged!")
         logger.error("leaves: %s, tagged: %s", leaves, tagged)
     # TODO: identical leaf strings share one dict slot, so a repeated
     # token keeps only the LAST tag seen -- good enough for now.
     leafDict = {}
     for (leaf, pos) in itertools.izip(leaves, tagged):
         # pos is a (token, tag) pair; keep only the tag.
         leafDict[leaf] = pos[1]
     items = []
     for goldNode in ma_util.walkTree(goldSentence):
         res = {}
         for subTreeLeaf in goldNode.leaves():
             key = leafDict[subTreeLeaf]
             res[key] = res.get(key, 0) + 1
         items.append(res)
     return items
# --- Beispiel #3 (score: 0) ---
    def extract_phrase_predictor_sentiment(self, goldSentence,
                                           returnSpans=False):
        """Extracts features from PhrasePredictor.

        The PhrasePredictor returns three features:
            - ScoreSum - sum over learned word weights
            - RegressionScore - predicted Amazon Review Star Rating
            - Token count
        @param goldSentence {PTB tree} Parse tree with sentiment annotation
        @param returnSpans {bool} also return the raw span per node
        @returns {tuple} 3-tuple (or 4-tuple) of per-node feature lists
        """
        sentiments = []
        regressions = []
        tokenCounts = []
        nodeSpans = []
        for node in ma_util.walkTree(goldSentence):
            span = self.pp.getSpan(node)
            prediction = self.pp.main(span, True)
            # Discretize the score sum into a sentiment label.  Feeding
            # the raw sum (prediction[0]) works about as well -- the
            # difference looks like noise -- and spreads feature
            # importance a bit more evenly, but the discretized form is
            # what we use.
            label = self.pp.score_sum_to_sentiment(prediction[0])
            sentiments.append(ma_util.strSen(label))
            regressions.append(prediction[1])
            # Token count is taken from the tree itself rather than
            # prediction[2].
            tokenCounts.append(len(node.leaves()))
            nodeSpans.append(span)
        if returnSpans:
            return (sentiments, regressions, tokenCounts, nodeSpans)
        else:
            return (sentiments, regressions, tokenCounts)
# --- Beispiel #4 (score: 0) ---
 def extract_xgrams_from_tree(self, tree):
     """Vectorize every subtree of a single document tree.

     @param tree {PTB tree} one document
     @returns {np.ndarray or None} 2-d array with one row per subtree
         (walkTree order), or None when the tree yields no nodes
     """
     # NOTE(review): an old comment promised a scipy.sparse.csr_matrix,
     # but this code builds a dense numpy array -- confirm callers only
     # rely on the dense form.
     rows = []
     for subTree in ma_util.walkTree(tree):
         xgrams = self.handleGrams(subTree.leaves())
         rows.append(self.convert_document_to_vector(xgrams))
     # Stack once at the end: np.concatenate per subtree re-copies the
     # whole accumulator each iteration, which is quadratic in the
     # number of nodes.
     if not rows:
         return None
     return np.asarray(rows)
# --- Beispiel #5 (score: 0) ---
    def extract_gold_sentiment(self, goldSentence, extractLength=False):
        """Extracts gold label.

        @param goldSentence {PTB tree} Parse tree with sentiment annotation
        @param extractLength {bool} also record each span's token count
        @returns {list} List of labels, or of (label, length) tuples
        """
        labels = []
        for node in ma_util.walkTree(goldSentence):
            sentiment = ma_util.sen(node.node, self.granularity)
            if extractLength:
                labels.append((sentiment, len(node.leaves())))
            else:
                labels.append(sentiment)
        return labels
# --- Beispiel #6 (score: 0) ---
    def extract_sentiWS(self, goldSentence):
        """Extracts features from SentiWS.

        For each span in the goldSentence tree, the positive and negative
        weights (if any) are added in separate features.

        @param goldSentence {PTB tree} Parse tree with sentiment annotation
        @returns {tuple({list}, {list})} Positive and negative features per
            span
        """
        positives = []
        negatives = []
        for node in ma_util.walkTree(goldSentence):
            negScore, posScore = self.getSentiWSScore(node.leaves())
            positives.append(posScore)
            negatives.append(negScore)
        return (positives, negatives)