Beispiel #1
0
    def getEdgeFeats(self, k_edge, v_edge, tag, curr_filename, my_nodes, my_edges):
        """
        Build the tag-conjoined feature vector of one edge.

        The tag-independent features of an edge are cached per file in
        self.curr_feats, so requesting the same edge again (e.g. under the
        other tag) only repeats the conjoining step.

        k_edge: edge key such as (1,2), used as the cache key
        v_edge: AmrEdge handed to every function in self.edge_feat_funcs
        tag: value whose str() is prepended to every feature key
        curr_filename: file the edge belongs to; a new name resets the cache
        my_nodes: (1,) -> AmrNode, my_edges: (2,1) -> AmrEdge (not used here)

        Returns a new FeatureVector keyed by (str(tag),) + original key.
        """
        same_file = self.curr_filename == curr_filename

        if same_file and k_edge in self.curr_feats:
            # cache hit: raw features of this edge were extracted before
            raw_feats = self.curr_feats[k_edge]
        else:
            if not same_file:
                # first edge of another file: drop the previous file's cache
                self.curr_filename = curr_filename
                self.curr_feats = {}

            raw_feats = FeatureVector()
            for extract in self.edge_feat_funcs:
                raw_feats += extract(v_edge)
            self.curr_feats[k_edge] = raw_feats

        # conjoin into a fresh vector so the cached one stays tag-free
        tag_prefix = (str(tag),)
        tagged_feats = FeatureVector()
        for key, value in raw_feats.iteritems():
            tagged_feats[tag_prefix + key] = value

        return tagged_feats
Beispiel #2
0
    def getEdgeFeats(self, k_edge, v_edge, tag, curr_filename, my_nodes,
                     my_edges):
        """
        Return the features of one edge, each key prefixed with the tag.

        Raw (untagged) edge features are memoised in self.curr_feats for the
        file named by self.curr_filename; switching to another file empties
        that cache.

        k_edge: edge key such as (1,2); v_edge: AmrEdge
        tag: conjoined with every feature key as str(tag)
        curr_filename: name of the file currently being processed
        my_nodes: (1,) -> AmrNode, my_edges: (2,1) -> AmrEdge (not used here)
        """
        # a different file invalidates everything cached so far
        if self.curr_filename != curr_filename:
            self.curr_filename = curr_filename
            self.curr_feats = {}

        # cache miss: run every edge feature function once and remember the sum
        if k_edge not in self.curr_feats:
            extracted = FeatureVector()
            for edge_func in self.edge_feat_funcs:
                extracted += edge_func(v_edge)
            self.curr_feats[k_edge] = extracted

        untagged = self.curr_feats[k_edge]

        # the cached vector is left untouched; tagging goes into a new vector
        conjoined = FeatureVector()
        label = str(tag)
        for feat_key, feat_val in untagged.iteritems():
            conjoined[(label, ) + feat_key] = feat_val

        return conjoined
Beispiel #3
0
    def ffEdgeNodeDepth(self, edge):
        """
        Extract graph-depth features for the two endpoints of an edge.

        The ffNodeDepth features of edge.node1 and edge.node2 are copied
        under the key prefixes ('e', 'n1') and ('e', 'n2'). One extra
        indicator combines the smallest depth of each endpoint.
        """
        feat_vec = FeatureVector()

        node1 = edge.node1
        node2 = edge.node2

        for label, node in (('n1', node1), ('n2', node2)):
            for key, value in self.ffNodeDepth(node).iteritems():
                feat_vec[('e', label) + key] = value

        # depth of an occurrence = number of '.'-separated parts of graph_idx;
        # the 1e10 sentinel survives when a node has no sources
        shallowest = []
        for node in (node1, node2):
            depths = [len(source.graph_idx.split('.'))
                      for source in node.sources]
            shallowest.append(min([1e10] + depths))

        # joint indicator over the smallest depth of node1 and node2
        joint_key = ('e', 'n1', 'dep', 'fmst', str(shallowest[0]),
                     'n2', 'dep', 'fmst', str(shallowest[1]))
        feat_vec[joint_key] = 1.0
        return feat_vec
Beispiel #4
0
    def oracle(self, instance):
        """
        Collect the features and the model score of the oracle graph.

        Every edge and node of the instance is tagged 1 when its key is in
        the oracle selection and 0 otherwise; the tag-conjoined features of
        all of them are summed into a single vector.

        the instance provides:
        filename: passed through to the feature extractor
        nodes: (my_nodes, oracle_nodes, third element that is ignored here)
        edges: (my_edges, oracle_edges)
        my_nodes: (1,) -> AmrNode1, (2,) -> AmrNode2, ...
        my_edges: (1,2) -> AmrEdge1, (2,1) -> AmrEdge2,...

        Returns (features, oracle_nodes, oracle_edges, score) where score is
        self.weights.dot(features).
        """
        logger.debug('start oracle decoding...')

        filename = instance.filename
        my_nodes, oracle_nodes, _ = instance.nodes
        my_edges, oracle_edges = instance.edges
        extractor = self.feat_extr

        oracle_feats = FeatureVector()

        # edges first, then nodes, each with its oracle tag (1 = selected)
        for edge_key, edge in my_edges.iteritems():
            edge_tag = int(edge_key in oracle_edges)
            oracle_feats += extractor.getEdgeFeats(edge_key, edge, edge_tag,
                                                   filename, my_nodes,
                                                   my_edges)

        for node_key, node in my_nodes.iteritems():
            node_tag = int(node_key in oracle_nodes)
            oracle_feats += extractor.getNodeFeats(node_key, node, node_tag,
                                                   filename, my_nodes,
                                                   my_edges)

        oracle_score = self.weights.dot(oracle_feats)

        return oracle_feats, oracle_nodes, oracle_edges, oracle_score
Beispiel #5
0
 def ffNodeBias(self, node):
     """
     Return the constant bias feature of a node.

     The node itself is not inspected: the result always holds the single
     entry ('n', 'bias') = 1.0.
     """
     bias_feats = FeatureVector()
     bias_key = ('n', 'bias')
     bias_feats[bias_key] = 1.0
     return bias_feats
Beispiel #6
0
    def ffEdgeNonNullFreq(self, edge):
        """
        Extract binary frequency features over the non-NULL sources of an edge.

        Only sources whose relation differs from 'NULL' are counted. The
        features are: count == 0, count >= 1, count >= 2, count >= 5 and
        count >= 10, each stored as 1.0 or 0.0.
        """
        feat_vec = FeatureVector()

        non_null_count = sum(1 for source in edge.sources
                             if source.relation != 'NULL')

        # the '0' bucket is an equality test, all other buckets are lower bounds
        feat_vec[('e', 'freq', 'non_null', '0')] = (
            1.0 if non_null_count == 0 else 0.0)
        for threshold in (1, 2, 5, 10):
            bucket_key = ('e', 'freq', 'non_null', str(threshold))
            feat_vec[bucket_key] = 1.0 if non_null_count >= threshold else 0.0

        return feat_vec
Beispiel #7
0
 def ffEdgeBias(self, edge):
     """
     Return the constant bias feature of an edge.

     The edge itself is not inspected: the result always holds the single
     entry ('e', 'bias') = 1.0.
     """
     bias_feats = FeatureVector()
     bias_key = ('e', 'bias')
     bias_feats[bias_key] = 1.0
     return bias_feats
Beispiel #8
0
    def ffEdgeNodeSpan(self, edge):
        """
        Extract span features for the two endpoints of an edge.

        The ffNodeSpan features of edge.node1 and edge.node2 are copied
        under the key prefixes ('e', 'n1') and ('e', 'n2'). One extra
        indicator combines the longest span of each endpoint.
        """
        feat_vec = FeatureVector()

        node1 = edge.node1
        node2 = edge.node2

        for label, node in (('n1', node1), ('n2', node2)):
            for key, value in self.ffNodeSpan(node).iteritems():
                feat_vec[('e', label) + key] = value

        # span of an occurrence = end_idx - start_idx;
        # the -1 sentinel survives when a node has no sources
        longest = []
        for node in (node1, node2):
            spans = [source.end_idx - source.start_idx
                     for source in node.sources]
            longest.append(max([-1] + spans))

        # joint indicator over the longest span of node1 and node2
        joint_key = ('e', 'n1', 'span', 'lgst', str(longest[0]),
                     'n2', 'span', 'lgst', str(longest[1]))
        feat_vec[joint_key] = 1.0

        return feat_vec
Beispiel #9
0
    def ffEdgeNodePos(self, edge):
        """
        Extract position features for the two endpoints of an edge.

        The ffNodePos features of edge.node1 and edge.node2 are copied
        under the key prefixes ('e', 'n1') and ('e', 'n2'). One extra
        indicator combines the smallest line_num of each endpoint.
        """
        feat_vec = FeatureVector()

        node1 = edge.node1
        node2 = edge.node2

        for label, node in (('n1', node1), ('n2', node2)):
            for key, value in self.ffNodePos(node).iteritems():
                feat_vec[('e', label) + key] = value

        # the 1e10 sentinel survives when a node has no sources
        foremost = []
        for node in (node1, node2):
            line_nums = [source.line_num for source in node.sources]
            foremost.append(min([1e10] + line_nums))

        # joint indicator over the foremost occurrence of node1 and node2
        joint_key = ('e', 'n1', 'posit', 'fmst', str(foremost[0]), 'n2',
                     'posit', 'fmst', str(foremost[1]))
        feat_vec[joint_key] = 1.0

        return feat_vec
Beispiel #10
0
    def ffNodeConcept(self, node):
        """
        Return an indicator feature for the concept of a node.

        The result holds the single entry ('n', 'cpt', node.concept) = 1.0.
        """
        concept_feats = FeatureVector()
        concept_key = ('n', 'cpt', node.concept)
        concept_feats[concept_key] = 1.0
        return concept_feats
Beispiel #11
0
    def ffNodeCollapsedEntity(self, node):
        """
        Extract entity indicators from the concept string of a node.

        ('n', 'nam-ent') is 1.0 when the concept contains an underscore
        (treated here as the marker of a named entity), else 0.0.
        ('n', 'date-ent') is 1.0 when the concept starts with 'date-entity',
        else 0.0.
        """
        feat_vec = FeatureVector()

        concept = node.concept
        has_underscore = '_' in concept
        is_date_entity = concept.startswith('date-entity')

        feat_vec[('n', 'nam-ent')] = 1.0 if has_underscore else 0.0
        feat_vec[('n', 'date-ent')] = 1.0 if is_date_entity else 0.0

        return feat_vec
Beispiel #12
0
 def ffEdgeIsNull(self, edge):
     """
     Return a binary feature telling whether an edge is a NULL edge.

     ('e', 'is_null') is 0.0 as soon as one source has a relation other
     than 'NULL', and 1.0 otherwise (including an edge without sources).
     """
     feat_vec = FeatureVector()
     has_real_relation = any(source.relation != 'NULL'
                             for source in edge.sources)
     feat_vec[('e', 'is_null')] = 0.0 if has_real_relation else 1.0
     return feat_vec
Beispiel #13
0
    def ffEdgeNodeConcept(self, edge):
        """
        Extract concept features for the two endpoints of an edge.

        The ffNodeConcept features of edge.node1 and edge.node2 are copied
        under the key prefixes ('e', 'n1') and ('e', 'n2').
        """
        feat_vec = FeatureVector()

        endpoints = (('n1', edge.node1), ('n2', edge.node2))
        for label, node in endpoints:
            for key, value in self.ffNodeConcept(node).iteritems():
                feat_vec[('e', label) + key] = value

        return feat_vec
Beispiel #14
0
    def ffEdgeNodeFreq(self, edge):
        """
        Extract frequency features for the two endpoints of an edge.

        The ffNodeFreq features of edge.node1 and edge.node2 are copied
        under the key prefixes ('e', 'n1') and ('e', 'n2').
        """
        feat_vec = FeatureVector()

        endpoints = (('n1', edge.node1), ('n2', edge.node2))
        for label, node in endpoints:
            for key, value in self.ffNodeFreq(node).iteritems():
                feat_vec[('e', label) + key] = value

        return feat_vec
Beispiel #15
0
    def ffEdgeNodeCollapsedEntity(self, edge):
        """
        Extract collapsed-entity features for the two endpoints of an edge.

        The ffNodeCollapsedEntity features of edge.node1 and edge.node2 are
        copied under the key prefixes ('e', 'n1') and ('e', 'n2').

        NOTE: this used to call ffNodeSpan, which re-emitted the keys that
        ffEdgeNodeSpan already produces and never emitted the entity
        indicators. Weights trained with the old behaviour contain no
        ('e', 'n*', 'n', 'nam-ent' / 'date-ent') entries.
        """
        feat_vec = FeatureVector()

        node1 = edge.node1
        node2 = edge.node2

        for label, node in (('n1', node1), ('n2', node2)):
            for k, v in self.ffNodeCollapsedEntity(node).iteritems():
                feat_vec[('e', label) + k] = v

        return feat_vec
Beispiel #16
0
    def ffNodeFreq(self, node):
        """
        Extract binary frequency features for a node.

        The frequency is the number of entries in node.sources. The features
        are: freq == 0, freq >= 1, freq >= 2, freq >= 5 and freq >= 10, each
        stored as 1.0 or 0.0.
        """
        feat_vec = FeatureVector()

        occurrences = len(node.sources)

        # the '0' bucket is an equality test, all other buckets are lower bounds
        feat_vec[('n', 'freq', '0')] = 1.0 if occurrences == 0 else 0.0
        for threshold in (1, 2, 5, 10):
            bucket_key = ('n', 'freq', str(threshold))
            feat_vec[bucket_key] = 1.0 if occurrences >= threshold else 0.0

        return feat_vec
Beispiel #17
0
    def ffEdgeRel(self, edge):
        """
        Extract binary features from the relation labels of an edge.

        The relations of all edge sources are counted. For the most common
        (primary) relation and, when present, the second most common
        (secondary) relation, the method emits:
          - an indicator of the relation label,
          - three indicators 'p1'..'p3' telling whether the relation's share
            of all sources reaches 0.5 / 0.66 / 0.75 (primary) or
            0.25 / 0.33 / 0.5 (secondary),
        plus one indicator for the (primary, secondary) pair.

        Returns an empty FeatureVector when the edge has no sources.
        """
        feat_vec = FeatureVector()
        edge_freq = len(edge.sources)

        # relation label -> number of sources carrying it, most frequent first
        rel_freq = Counter(source.relation for source in edge.sources)
        rels = rel_freq.most_common()

        if not rels:
            return feat_vec

        # primary relation
        (rel, count) = rels[0]
        feat_vec[('e', 'rel', 'fst', rel)] = 1.0

        # relative frequency; float() keeps this a true division under
        # Python 2, where int / int would truncate to 0 or 1
        per_fst_rel = float(count) / edge_freq

        feat_vec[('e', 'rel', 'fst', rel,
                  'p1')] = 1.0 if per_fst_rel >= 0.5 else 0.0
        feat_vec[('e', 'rel', 'fst', rel,
                  'p2')] = 1.0 if per_fst_rel >= 0.66 else 0.0
        feat_vec[('e', 'rel', 'fst', rel,
                  'p3')] = 1.0 if per_fst_rel >= 0.75 else 0.0

        # secondary relation and relative frequency
        if len(rels) > 1:
            (sec_rel, sec_count) = rels[1]
            feat_vec[('e', 'rel', 'sec', sec_rel)] = 1.0

            per_sec_rel = float(sec_count) / edge_freq

            feat_vec[('e', 'rel', 'sec', sec_rel,
                      'p1')] = 1.0 if per_sec_rel >= 0.25 else 0.0
            feat_vec[('e', 'rel', 'sec', sec_rel,
                      'p2')] = 1.0 if per_sec_rel >= 0.33 else 0.0
            feat_vec[('e', 'rel', 'sec', sec_rel,
                      'p3')] = 1.0 if per_sec_rel >= 0.5 else 0.0

            # combine first and secondary relation
            feat_vec[('e', 'rel', 'fst', rel, 'sec', sec_rel)] = 1.0

        return feat_vec
Beispiel #18
0
    def ffNodeSpan(self, node):
        """
        Extract binary span features for a node.

        The span of one occurrence is source.end_idx - source.start_idx.
        For a node with at least one source, the method emits indicators
        telling whether the longest span ('lgst') and the average span
        ('avg') reach the thresholds 0, 1, 2, 5 and 10.

        Returns an empty FeatureVector when the node has no sources.

        NOTE: the 'avg' indicators used to be computed from the longest
        span, making them copies of the 'lgst' ones; they now use the
        average as intended.
        """
        feat_vec = FeatureVector()

        spans = [source.end_idx - source.start_idx for source in node.sources]
        node_freq = len(node.sources)

        if node_freq > 0:
            # -1 is kept as the lower bound the original scan started from
            node_longest = max([-1] + spans)
            # the 0.0 start value forces float arithmetic under Python 2
            node_average = sum(spans, 0.0) / node_freq

            thresholds = (0, 1, 2, 5, 10)

            for bound in thresholds:
                feat_vec[('n', 'span', 'lgst',
                          str(bound))] = 1.0 if node_longest >= bound else 0.0

            for bound in thresholds:
                feat_vec[('n', 'span', 'avg',
                          str(bound))] = 1.0 if node_average >= bound else 0.0

        return feat_vec
Beispiel #19
0
    def ffNodePos(self, node):
        """
        Extract binary position features for a node.

        Positions are the line_num values of the node's sources. For a node
        with at least one source, the method emits indicators telling
        whether the foremost (smallest) position and the average position
        reach the thresholds 5, 6, 7, 10 and 15.

        Returns an empty FeatureVector when the node has no sources.
        """
        feat_vec = FeatureVector()

        positions = [source.line_num for source in node.sources]
        occurrences = len(node.sources)

        if occurrences > 0:
            foremost = min([1e10] + positions)
            average = sum(positions, 0.0) / occurrences

            thresholds = (5, 6, 7, 10, 15)

            # foremost occurrence
            for bound in thresholds:
                feat_vec[('n', 'posit', 'fmst',
                          str(bound))] = 1.0 if foremost >= bound else 0.0

            # average occurrence
            for bound in thresholds:
                feat_vec[('n', 'posit', 'avg',
                          str(bound))] = 1.0 if average >= bound else 0.0

        return feat_vec
Beispiel #20
0
    def ffNodeDepth(self, node):
        """
        Extract binary depth features for a node.

        The depth of one occurrence is the number of '.'-separated parts of
        source.graph_idx. For a node with at least one source, the method
        emits indicators telling whether the smallest depth ('fmst') and the
        average depth ('avg') reach the thresholds 1, 2, 3, 4 and 5.

        Returns an empty FeatureVector when the node has no sources.
        """
        feat_vec = FeatureVector()

        depths = [len(source.graph_idx.split('.')) for source in node.sources]
        occurrences = len(node.sources)

        if occurrences > 0:
            shallowest = min([1e10] + depths)
            average = sum(depths, 0.0) / occurrences

            thresholds = (1, 2, 3, 4, 5)

            for bound in thresholds:
                feat_vec[('n', 'dep', 'fmst',
                          str(bound))] = 1.0 if shallowest >= bound else 0.0

            for bound in thresholds:
                feat_vec[('n', 'dep', 'avg',
                          str(bound))] = 1.0 if average >= bound else 0.0

        return feat_vec
Beispiel #21
0
    def ffEdgeFreq(self, edge):
        """
        Extract binary frequency features for an edge.

        The frequency is the number of entries in edge.sources. The features
        are: freq == 0, freq >= 1, freq >= 2, freq >= 5 and freq >= 10, each
        stored as 1.0 or 0.0.
        """
        feat_vec = FeatureVector()

        occurrences = len(edge.sources)

        # the '0' bucket is an equality test, all other buckets are lower bounds
        feat_vec[('e', 'freq', '0')] = 1.0 if occurrences == 0 else 0.0
        for threshold in (1, 2, 5, 10):
            bucket_key = ('e', 'freq', str(threshold))
            feat_vec[bucket_key] = 1.0 if occurrences >= threshold else 0.0

        return feat_vec
Beispiel #22
0
    def ffEdgePos(self, edge):
        """
        Extract binary position features for an edge.

        Positions are the line_num values of the edge's sources. For an edge
        with at least one source, the method emits indicators telling
        whether the foremost (smallest) position and the average position
        reach the thresholds 5, 6, 7, 10 and 15.

        Returns an empty FeatureVector when the edge has no sources.
        """
        feat_vec = FeatureVector()

        positions = [source.line_num for source in edge.sources]
        occurrences = len(edge.sources)

        if occurrences > 0:  # edge exists
            foremost = min([1e10] + positions)
            average = sum(positions, 0.0) / occurrences

            thresholds = (5, 6, 7, 10, 15)

            # foremost occurrence
            for bound in thresholds:
                feat_vec[('e', 'posit', 'fmst',
                          str(bound))] = 1.0 if foremost >= bound else 0.0

            # average occurrence
            for bound in thresholds:
                feat_vec[('e', 'posit', 'avg',
                          str(bound))] = 1.0 if average >= bound else 0.0

        return feat_vec
Beispiel #23
0
        feat_vec[('n', 'nam-ent')] = 1.0 if '_' in node.concept else 0.0
        feat_vec[('n', 'date-ent')] = 1.0 if (node.concept).startswith('date-entity') else 0.0
            
        return feat_vec   
    

if __name__ == '__main__':
    # Manual driver: build the corpus from the dev split and accumulate the
    # node features of every instance under both tags.
    # NOTE(review): input_dir is a machine-specific absolute path.
    input_dir = '/Users/user/Data/SemanticSumm/Proxy/gold/split/dev/'
    body_file = 'aligned-amr-release-1.0-dev-proxy-body.txt'
    summ_file = 'aligned-amr-release-1.0-dev-proxy-summary.txt'

    body_path = os.path.join(input_dir, body_file)
    summ_path = os.path.join(input_dir, summ_file)
    corpus = buildCorpus(body_path, summ_path)

    feat_extr = FeatureExtractor()
    feat_vec = FeatureVector()

    for inst in corpus:
        curr_filename = inst.filename
        # the selected nodes/edges are unpacked but not needed by this driver
        my_nodes, s_nodes = inst.nodes
        my_edges, s_edges = inst.edges

        # the analogous loop over my_edges with getEdgeFeats is disabled

        logger.debug('extracting features for file: %s' % curr_filename)
        for k_node, v_node in my_nodes.iteritems():
            for tag in (0, 1):
                feat_vec += feat_extr.getNodeFeats(k_node, v_node, tag,
                                                   curr_filename, my_nodes,
                                                   my_edges)
Beispiel #24
0
    def learnParamsAdaGrad(self,
                           decoder,
                           corpus,
                           param_file,
                           loss_func,
                           num_passes=10,
                           oracle_len='nolen'):
        """
        Learn decoder weights over several passes through the corpus.

        decoder: object with decode(...), oracle(...) and a weights vector;
            the weights are updated in place after every instance
        corpus: list of instances; it is shuffled in place before training
        param_file: if truthy, path that is overwritten after every pass
            with the pass number and the averaged weights
        loss_func: string whose prefix selects the update
            'perceptron' - plain update with step size eta
            'ramp'       - cost-augmented decoding on both sides, AdaGrad
            'hinge'      - cost-augmented decoding vs. oracle, AdaGrad
            any other value leaves the selections used for scoring undefined
            and fails on the first instance
        num_passes: number of passes over the corpus
        oracle_len: passed through to decoder.decode

        Returns a FeatureVector holding the mean of the weights observed at
        the end of each pass.

        NOTE: the averaging factor is computed as 1.0 / n. With the former
        1 / n, Python 2 integer division made the factor 0 for every pass
        after the first, zeroing both the saved and the returned weights.
        """
        logger.debug('start learning parameters...')
        shuffle(corpus)  # shuffle corpus

        avg_weights = FeatureVector()
        curr_instances = 0

        node_perf = PerfScore(0.0, 0.0, 0.0)  # node performance
        edge_perf = PerfScore(0.0, 0.0, 0.0)  # edge performance

        eta = 1.0  # stepsize
        l2reg = 0.0  # L2 regularisation strength (0.0 disables it)
        node_cost_scaling = 1.0  # cost scaling factor
        edge_cost_scaling = 1.0  # cost scaling factor

        # per-feature running sum of squared gradients, shared by all updates
        sumSq = FeatureVector()

        def _adagradStep(gradient):
            # apply one AdaGrad update (plus optional L2 term) to the weights
            for k, v in gradient.iteritems():
                if v == 0.0:
                    continue
                sumSq[k] = sumSq.get(k, 0.0) + v * v
                decoder.weights[k] = decoder.weights.get(
                    k, 0.0) - eta * v / sqrt(sumSq[k])

            if l2reg != 0.0:
                # only values of existing keys change, so iterating is safe
                for k, v in decoder.weights.iteritems():
                    if v == 0.0:
                        continue
                    value = l2reg * v
                    sumSq[k] = sumSq.get(k, 0.0) + value * value
                    decoder.weights[k] = v - eta * value / sqrt(sumSq[k])

        for curr_num_passes in xrange(1, num_passes + 1):
            logger.debug('#curr_num_passes#: %d' % curr_num_passes)

            for instance in corpus:
                curr_instances += 1
                logger.debug('processing instance %d...' % curr_instances)

                # perceptron loss
                if loss_func.startswith('perceptron'):
                    gradient, selected_nodes, selected_edges, score_pred = decoder.decode(
                        instance, oracle_len)
                    plus_feats, oracle_nodes, oracle_edges, score_true = decoder.oracle(
                        instance)

                    curr_loss = score_pred - score_true  # @UnusedVariable
                    gradient -= plus_feats
                    decoder.weights -= eta * gradient

                # ramp loss + cost-augmented decoding
                elif loss_func.startswith('ramp'):
                    node_cost, edge_cost = node_cost_scaling, edge_cost_scaling
                    gradient, _, _, score_plus_cost = decoder.decode(
                        instance, oracle_len, node_cost, edge_cost)
                    # the selections used for P/R/F come from this second decode
                    plus_feats, selected_nodes, selected_edges, score_minus_cost = decoder.decode(
                        instance, oracle_len, -1.0 * node_cost,
                        -1.0 * edge_cost)
                    _, oracle_nodes, oracle_edges, score_true = decoder.oracle(
                        instance)

                    curr_loss = score_plus_cost - score_minus_cost  # @UnusedVariable
                    gradient -= plus_feats
                    _adagradStep(gradient)

                # hinge-loss + cost-augmented decoding
                elif loss_func.startswith('hinge'):
                    node_cost, edge_cost = node_cost_scaling, edge_cost_scaling
                    gradient, selected_nodes, selected_edges, score_plus_cost = decoder.decode(
                        instance, oracle_len, node_cost, edge_cost)
                    plus_feats, oracle_nodes, oracle_edges, score_true = decoder.oracle(
                        instance)

                    curr_loss = score_plus_cost - score_true  # @UnusedVariable
                    gradient -= plus_feats
                    _adagradStep(gradient)

                # use gold nodes and edges to calculate P/R/F
                num_gold_nodes, num_gold_edges = instance.gold
                # P/R/F scores of nodes and edges, for current instance
                # Edge recall can not reach %100 since decoding produces only tree structure
                intersect_nodes = set(selected_nodes) & set(oracle_nodes)
                curr_node_perf = getPRFScores(len(intersect_nodes),
                                              len(selected_nodes),
                                              num_gold_nodes)
                logPRFScores('train_node', curr_node_perf)

                intersect_edges = set(selected_edges) & set(oracle_edges)
                curr_edge_perf = getPRFScores(len(intersect_edges),
                                              len(selected_edges),
                                              num_gold_edges)
                logPRFScores('train_edge', curr_edge_perf)

                # running sums of P/R/F; divided by curr_instances when logged
                node_perf = PerfScore(
                    *[sum(x) for x in zip(node_perf, curr_node_perf)])
                edge_perf = PerfScore(
                    *[sum(x) for x in zip(edge_perf, curr_edge_perf)])

                logPRFScores(
                    'train_node_avg',
                    PerfScore(node_perf.prec / curr_instances,
                              node_perf.rec / curr_instances,
                              node_perf.fscore / curr_instances))
                logPRFScores(
                    'train_edge_avg',
                    PerfScore(edge_perf.prec / curr_instances,
                              edge_perf.rec / curr_instances,
                              edge_perf.fscore / curr_instances))

            # accumulate the weights reached at the end of this pass
            avg_weights += decoder.weights

            # output averaged weight vectors to file
            curr_weights = FeatureVector()
            curr_weights += avg_weights * (1.0 / curr_num_passes)
            if param_file:
                with codecs.open(param_file, 'w', 'utf-8') as outfile:
                    outfile.write('#curr_num_passes#: %d\n' % curr_num_passes)
                    outfile.write('%s\n' % curr_weights.toString())

        final_weights = FeatureVector()
        final_weights += avg_weights * (1.0 / num_passes)
        return final_weights
Beispiel #25
0
 def __init__(self):
     """
     Create the collaborators of this object: a GurobiILP instance, a
     fresh FeatureVector of weights and a FeatureExtractor.
     """
     self.gilp = GurobiILP()
     self.weights = FeatureVector()
     self.feat_extr = FeatureExtractor()
Beispiel #26
0
 def __init__(self):
     """
     Set up the instance with its own GurobiILP object, a new
     FeatureVector for the weights and a new FeatureExtractor.
     """
     self.gilp = GurobiILP()
     self.weights = FeatureVector()
     self.feat_extr = FeatureExtractor()
Beispiel #27
0
        feat_vec[('n', 'date-ent')] = 1.0 if (
            node.concept).startswith('date-entity') else 0.0

        return feat_vec


if __name__ == '__main__':

    input_dir = '/Users/user/Data/SemanticSumm/Proxy/gold/split/dev/'
    body_file = 'aligned-amr-release-1.0-dev-proxy-body.txt'
    summ_file = 'aligned-amr-release-1.0-dev-proxy-summary.txt'

    corpus = buildCorpus(os.path.join(input_dir, body_file),
                         os.path.join(input_dir, summ_file))
    feat_extr = FeatureExtractor()
    feat_vec = FeatureVector()

    for inst in corpus:
        curr_filename = inst.filename
        my_nodes, s_nodes = inst.nodes
        my_edges, s_edges = inst.edges

        #         logger.debug('extracting features for file: %s' % curr_filename)
        #         for k_edge, v_edge in my_edges.iteritems():
        #             for tag in [0,1]:
        #                 feat_vec += feat_extr.getEdgeFeats(k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)

        logger.debug('extracting features for file: %s' % curr_filename)
        for k_node, v_node in my_nodes.iteritems():
            for tag in [0, 1]:
                feat_vec += feat_extr.getNodeFeats(k_node, v_node, tag,
Beispiel #28
0
class Decoder(object):
    """
    Decoder for structured prediction.

    Scores each (node, tag) and (edge, tag) pair of an instance with the
    current weight vector and passes those scores to the ILP solver, which
    returns the selected nodes and edges.
    """
    def __init__(self):
        """Create the ILP solver, the weight vector and the feature extractor."""
        self.gilp = GurobiILP()
        self.weights = FeatureVector()
        self.feat_extr = FeatureExtractor()

    def decode(self, instance, oracle_len='nolen', node_cost=None, edge_cost=None):
        """
        Decode one instance and return the features of the decoded graph.

        The instance provides:
        filename: identifier forwarded to the feature extractor
        nodes: (my_nodes, oracle_nodes, root_nodes)
               my_nodes: (1,) -> AmrNode1, (2,) -> AmrNode2, ...
        edges: (my_edges, oracle_edges)
               my_edges: (1,2) -> AmrEdge1, (2,1) -> AmrEdge2, ...
        gold: (num_gold_nodes, num_gold_edges)

        oracle_len: 'nodes' passes the gold node count to the ILP, 'edges'
                    passes the gold edge count; any other value passes 0.
        node_cost, edge_cost: when not None, the cost is added to the score of
                    every (item, tag) pair whose tag disagrees with the oracle
                    (cost-augmented decoding).

        Returns (feat_vec, selected_nodes, selected_edges, score_pred).
        """
        logger.debug('start feature extraction...')

        curr_filename = instance.filename
        my_nodes, oracle_nodes, root_nodes = instance.nodes
        my_edges, oracle_edges = instance.edges
        num_gold_nodes, num_gold_edges = instance.gold

        extractor = self.feat_extr

        # score every (edge, tag) pair
        edge_weights = {}
        num_nonnegative_edges = 0
        for k_edge, v_edge in my_edges.iteritems():
            for tag in (0, 1):
                feats = extractor.getEdgeFeats(k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)
                score = self.weights.dot(feats)
                if edge_cost is not None:
                    # false positive / false negative pay the cost, correct tags pay nothing
                    disagrees = (tag == 1) != (k_edge in oracle_edges)
                    score += edge_cost if disagrees else 0
                edge_weights[k_edge + (tag,)] = score
                if tag == 1 and score > 0.0:
                    num_nonnegative_edges += 1

        logger.debug('[num_nonnegative_edges]: %d' % num_nonnegative_edges)

        # score every (node, tag) pair
        node_weights = {}
        num_nonnegative_nodes = 0
        for k_node, v_node in my_nodes.iteritems():
            for tag in (0, 1):
                feats = extractor.getNodeFeats(k_node, v_node, tag, curr_filename, my_nodes, my_edges)
                score = self.weights.dot(feats)
                if node_cost is not None:
                    disagrees = (tag == 1) != (k_node in oracle_nodes)
                    score += node_cost if disagrees else 0
                node_weights[k_node + (tag,)] = score
                if tag == 1 and score > 0.0:
                    num_nonnegative_nodes += 1

        logger.debug('[num_nonnegative_nodes]: %d' % num_nonnegative_nodes)

        # hand the scores to the ILP; a length is only imposed when requested
        logger.debug('start ILP decoding...')
        if oracle_len == 'nodes':
            num_selected_nodes = num_gold_nodes
        else:
            num_selected_nodes = 0
        if oracle_len == 'edges':
            num_selected_edges = num_gold_edges
        else:
            num_selected_edges = 0
        selected_nodes, selected_edges, score_pred = self.gilp.decode(
            node_weights, edge_weights, root_nodes,
            num_selected_nodes=num_selected_nodes,
            num_selected_edges=num_selected_edges)
        logger.debug('[num_gold_nodes]: %d' % num_gold_nodes)
        logger.debug('[num_selected_nodes]: %d' % len(selected_nodes))
        logger.debug('[num_gold_edges]: %d' % num_gold_edges)
        logger.debug('[num_selected_edges]: %d' % len(selected_edges))

        # accumulate the features of the decoded graph (tag = decoded decision)
        feat_vec = FeatureVector()

        for k_edge, v_edge in my_edges.iteritems():
            tag = int(k_edge in selected_edges)
            feat_vec += extractor.getEdgeFeats(k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)

        for k_node, v_node in my_nodes.iteritems():
            tag = int(k_node in selected_nodes)
            feat_vec += extractor.getNodeFeats(k_node, v_node, tag, curr_filename, my_nodes, my_edges)

        return feat_vec, selected_nodes, selected_edges, score_pred

    def oracle(self, instance):
        """
        Build the feature vector of the oracle graph and score it.

        The instance provides:
        filename: identifier forwarded to the feature extractor
        nodes: (my_nodes, oracle_nodes, root_nodes); root_nodes is ignored here
               my_nodes: (1,) -> AmrNode1, (2,) -> AmrNode2, ...
        edges: (my_edges, oracle_edges)
               my_edges: (1,2) -> AmrEdge1, (2,1) -> AmrEdge2, ...

        Returns (feat_vec, oracle_nodes, oracle_edges, score_true), where
        score_true is the dot product of the current weights with feat_vec.
        """
        logger.debug('start oracle decoding...')

        curr_filename = instance.filename
        my_nodes, oracle_nodes, _ = instance.nodes
        my_edges, oracle_edges = instance.edges

        extractor = self.feat_extr
        feat_vec = FeatureVector()

        # tag = 1 when the edge/node belongs to the oracle graph
        for k_edge, v_edge in my_edges.iteritems():
            tag = int(k_edge in oracle_edges)
            feat_vec += extractor.getEdgeFeats(k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)

        for k_node, v_node in my_nodes.iteritems():
            tag = int(k_node in oracle_nodes)
            feat_vec += extractor.getNodeFeats(k_node, v_node, tag, curr_filename, my_nodes, my_edges)

        score_true = self.weights.dot(feat_vec)

        return feat_vec, oracle_nodes, oracle_edges, score_true
Beispiel #29
0
    def decode(self,
               instance,
               oracle_len='nolen',
               node_cost=None,
               edge_cost=None):
        """
        Run (optionally cost-augmented) decoding on a single instance.

        The instance provides:
        filename: identifier forwarded to the feature extractor
        nodes: (my_nodes, oracle_nodes, root_nodes)
               my_nodes: (1,) -> AmrNode1, (2,) -> AmrNode2, ...
        edges: (my_edges, oracle_edges)
               my_edges: (1,2) -> AmrEdge1, (2,1) -> AmrEdge2, ...
        gold: (num_gold_nodes, num_gold_edges)

        oracle_len selects which gold count ('nodes' or 'edges') is passed to
        the ILP as the required summary size; otherwise 0 is passed.
        node_cost / edge_cost, when not None, are added to the score of each
        tag that disagrees with the oracle.

        Returns (feat_vec, selected_nodes, selected_edges, score_pred).
        """
        logger.debug('start feature extraction...')

        curr_filename = instance.filename
        my_nodes, oracle_nodes, root_nodes = instance.nodes
        my_edges, oracle_edges = instance.edges
        num_gold_nodes, num_gold_edges = instance.gold

        # weights for edges, keyed by edge id + (tag,)
        edge_weights = {}
        num_nonnegative_edges = 0
        for k_edge, v_edge in my_edges.iteritems():
            for tag in (0, 1):
                key = k_edge + (tag, )
                edge_feats = self.feat_extr.getEdgeFeats(
                    k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)
                weight = self.weights.dot(edge_feats)
                if edge_cost is not None:
                    # correct tags cost nothing; false positives and
                    # false negatives are charged edge_cost
                    is_gold = k_edge in oracle_edges
                    if (tag == 1) == is_gold:
                        weight += 0
                    else:
                        weight += edge_cost
                edge_weights[key] = weight
                if tag == 1 and weight > 0.0:
                    num_nonnegative_edges += 1

        logger.debug('[num_nonnegative_edges]: %d' % num_nonnegative_edges)

        # weights for nodes, keyed by node id + (tag,)
        node_weights = {}
        num_nonnegative_nodes = 0
        for k_node, v_node in my_nodes.iteritems():
            for tag in (0, 1):
                key = k_node + (tag, )
                node_feats = self.feat_extr.getNodeFeats(
                    k_node, v_node, tag, curr_filename, my_nodes, my_edges)
                weight = self.weights.dot(node_feats)
                if node_cost is not None:
                    is_gold = k_node in oracle_nodes
                    if (tag == 1) == is_gold:
                        weight += 0
                    else:
                        weight += node_cost
                node_weights[key] = weight
                if tag == 1 and weight > 0.0:
                    num_nonnegative_nodes += 1

        logger.debug('[num_nonnegative_nodes]: %d' % num_nonnegative_nodes)

        # ILP decoding; the summary length is constrained only on request
        logger.debug('start ILP decoding...')
        num_selected_nodes = 0
        num_selected_edges = 0
        if oracle_len == 'nodes':
            num_selected_nodes = num_gold_nodes
        if oracle_len == 'edges':
            num_selected_edges = num_gold_edges
        selected_nodes, selected_edges, score_pred = self.gilp.decode(
            node_weights,
            edge_weights,
            root_nodes,
            num_selected_nodes=num_selected_nodes,
            num_selected_edges=num_selected_edges)
        logger.debug('[num_gold_nodes]: %d' % num_gold_nodes)
        logger.debug('[num_selected_nodes]: %d' % len(selected_nodes))
        logger.debug('[num_gold_edges]: %d' % num_gold_edges)
        logger.debug('[num_selected_edges]: %d' % len(selected_edges))

        # sum the features of every edge and node under its decoded tag
        feat_vec = FeatureVector()

        for k_edge, v_edge in my_edges.iteritems():
            decoded_tag = 1 if k_edge in selected_edges else 0
            feat_vec += self.feat_extr.getEdgeFeats(
                k_edge, v_edge, decoded_tag, curr_filename, my_nodes, my_edges)

        for k_node, v_node in my_nodes.iteritems():
            decoded_tag = 1 if k_node in selected_nodes else 0
            feat_vec += self.feat_extr.getNodeFeats(
                k_node, v_node, decoded_tag, curr_filename, my_nodes, my_edges)

        return feat_vec, selected_nodes, selected_edges, score_pred
Beispiel #30
0
    def learnParamsAdaGrad(self, decoder, corpus, param_file, loss_func, num_passes=10, oracle_len='nolen'):
        """
        Learn decoder weights with a perceptron, ramp or hinge loss.

        The perceptron loss applies a plain gradient step; the ramp and hinge
        losses use cost-augmented decoding followed by an AdaGrad update.
        After every pass the decoder weights are added to a running sum, and
        the average so far is written to param_file (overwriting it).

        decoder: object exposing decode(), oracle() and a mutable `weights`
        corpus: list of instances; it is shuffled IN PLACE
        param_file: output path for the averaged weights, or a falsy value
                    to skip writing
        loss_func: string starting with 'perceptron', 'ramp' or 'hinge'
        num_passes: number of passes over the corpus
        oracle_len: forwarded to decoder.decode()

        Returns the weights averaged over all passes.
        Raises ValueError when loss_func matches none of the known losses
        (previously this surfaced as an UnboundLocalError).
        """
        logger.debug('start learning parameters...')
        shuffle(corpus) # shuffle corpus

        avg_weights = FeatureVector()
        curr_instances = 0

        node_perf = PerfScore(0.0, 0.0, 0.0) # node performance
        edge_perf = PerfScore(0.0, 0.0, 0.0) # edge performance

        eta = 1.0 # stepsize
        l2reg = 0.0 # L2 regularization strength (0.0 disables it)
        node_cost_scaling = 1.0 # cost scaling factor
        edge_cost_scaling = 1.0 # cost scaling factor

        sumSq = FeatureVector() # per-feature sum of squared gradients

        def adagrad_step(gradient):
            # per-feature step scaled by the root of the accumulated squared gradient
            for k, v in gradient.iteritems():
                if v == 0.0: continue
                sumSq[k] = sumSq.get(k, 0.0) + v * v
                decoder.weights[k] = decoder.weights.get(k, 0.0) - eta * v / sqrt(sumSq[k])

            if l2reg != 0.0:
                # only existing keys are reassigned, so iterating is safe
                for k, v in decoder.weights.iteritems():
                    if v == 0.0: continue
                    value = l2reg * v
                    sumSq[k] = sumSq.get(k, 0.0) + value * value
                    decoder.weights[k] = v - eta * value / sqrt(sumSq[k])

        for curr_num_passes in xrange(1, num_passes+1):
            logger.debug('#curr_num_passes#: %d' % curr_num_passes)

            for instance in corpus:
                curr_instances += 1
                logger.debug('processing instance %d...' % curr_instances)

                # perceptron loss (score_pred - score_true): plain gradient step
                if loss_func.startswith('perceptron'):
                    gradient, selected_nodes, selected_edges, _ = decoder.decode(instance, oracle_len)
                    plus_feats, oracle_nodes, oracle_edges, _ = decoder.oracle(instance)

                    gradient -= plus_feats
                    decoder.weights -= eta * gradient

                # ramp loss: cost-augmented decoding minus cost-diminished decoding
                elif loss_func.startswith('ramp'):
                    node_cost, edge_cost = node_cost_scaling, edge_cost_scaling
                    gradient, _, _, _ = decoder.decode(instance, oracle_len, node_cost, edge_cost)
                    plus_feats, selected_nodes, selected_edges, _ = decoder.decode(instance, oracle_len, -1.0 * node_cost, -1.0 * edge_cost)
                    # the oracle is only needed here for the P/R/F report below
                    _, oracle_nodes, oracle_edges, _ = decoder.oracle(instance)

                    gradient -= plus_feats
                    adagrad_step(gradient)

                # hinge loss: cost-augmented decoding minus oracle
                elif loss_func.startswith('hinge'):
                    node_cost, edge_cost = node_cost_scaling, edge_cost_scaling
                    gradient, selected_nodes, selected_edges, _ = decoder.decode(instance, oracle_len, node_cost, edge_cost)
                    plus_feats, oracle_nodes, oracle_edges, _ = decoder.oracle(instance)

                    gradient -= plus_feats
                    adagrad_step(gradient)

                else:
                    raise ValueError('unknown loss function: %s' % loss_func)

                # use gold nodes and edges to calculate P/R/F
                num_gold_nodes, num_gold_edges = instance.gold
                # P/R/F scores of nodes and edges, for current instance
                # Edge recall can not reach 100% since decoding produces only tree structure
                intersect_nodes = set(selected_nodes) & set(oracle_nodes)
                curr_node_perf = getPRFScores(len(intersect_nodes), len(selected_nodes), num_gold_nodes)
                logPRFScores('train_node', curr_node_perf)

                intersect_edges = set(selected_edges) & set(oracle_edges)
                curr_edge_perf = getPRFScores(len(intersect_edges), len(selected_edges), num_gold_edges)
                logPRFScores('train_edge', curr_edge_perf)

                # running sums of P/R/F; divided by curr_instances when logged
                node_perf = PerfScore(*[sum(x) for x in zip(node_perf, curr_node_perf)])
                edge_perf = PerfScore(*[sum(x) for x in zip(edge_perf, curr_edge_perf)])

                logPRFScores('train_node_avg',
                             PerfScore(node_perf.prec/curr_instances, node_perf.rec/curr_instances,
                                       node_perf.fscore/curr_instances))
                logPRFScores('train_edge_avg',
                             PerfScore(edge_perf.prec/curr_instances, edge_perf.rec/curr_instances,
                                       edge_perf.fscore/curr_instances))

            # averaging weight vectors
            avg_weights += decoder.weights

            # output averaged weight vectors to file
            # 1.0 forces true division: under Python 2, 1/curr_num_passes is 0
            # for every pass after the first, which zeroed the written weights
            curr_weights = FeatureVector()
            curr_weights += avg_weights * (1.0 / curr_num_passes)
            if param_file:
                with codecs.open(param_file, 'w', 'utf-8') as outfile:
                    outfile.write('#curr_num_passes#: %d\n' % curr_num_passes)
                    outfile.write('%s\n' % curr_weights.toString())

        final_weights = FeatureVector()
        final_weights += avg_weights * (1.0 / num_passes)
        return final_weights