# -*- coding: utf-8 -*-
# Module-level imports. FeatureVector, FeatureExtractor, GurobiILP, PerfScore,
# getPRFScores, logPRFScores, and buildCorpus are project-internal helpers
# assumed to be imported from the surrounding package.
import os
import codecs
import logging
from math import sqrt
from random import shuffle
from collections import Counter

logger = logging.getLogger(__name__)


def getEdgeFeats(self, k_edge, v_edge, tag, curr_filename, my_nodes, my_edges):
    """
    extract features for one edge. cache features before conjoining with
    the edge tag.
    k_edge: (1,2), v_edge: AmrEdge
    my_nodes: (1,) -> AmrNode, my_edges: (2,1) -> AmrEdge
    """
    feat_vec = FeatureVector()

    # edge features have already been extracted for this edge
    if self.curr_filename == curr_filename and k_edge in self.curr_feats:
        feat_vec = self.curr_feats[k_edge]
    else:  # new edge
        # if this is a new file, clear the cache
        if self.curr_filename != curr_filename:
            self.curr_filename = curr_filename
            self.curr_feats = {}
        # extract features and add them to the cache
        for feat_func in self.edge_feat_funcs:
            feat_vec += feat_func(v_edge)
        self.curr_feats[k_edge] = feat_vec

    # conjoin features with the tag
    new_feat_vec = FeatureVector()
    for k, v in feat_vec.iteritems():
        new_feat_vec[(str(tag),) + k] = v

    # return edge features conjoined with the tag
    return new_feat_vec
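
# Illustration (not part of the original module): a minimal sketch of the
# tag-conjoining step in getEdgeFeats, using a plain dict in place of the
# project's FeatureVector. The _demo_ name is hypothetical; it only shows how
# the same cached base feature receives a distinct key (and hence a distinct
# weight) under tag 0 versus tag 1.
def _demo_conjoin_with_tag(base_feats, tag):
    """e.g. {('e', 'bias'): 1.0} with tag 1 -> {('1', 'e', 'bias'): 1.0}"""
    return dict(((str(tag),) + k, v) for k, v in base_feats.items())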
def ffEdgeNonNullFreq(self, edge):
    """
    extract binary features for edge frequency (non-NULL edges only):
    freq == 0, freq >= 1, freq >= 2, freq >= 5, freq >= 10
    """
    feat_vec = FeatureVector()

    edge_freq = 0
    for source in edge.sources:
        if source.relation != 'NULL':
            edge_freq += 1

    # binary features for edge frequency
    feat_vec[('e', 'freq', 'non_null', '0')] = 1.0 if edge_freq == 0 else 0.0
    feat_vec[('e', 'freq', 'non_null', '1')] = 1.0 if edge_freq >= 1 else 0.0
    feat_vec[('e', 'freq', 'non_null', '2')] = 1.0 if edge_freq >= 2 else 0.0
    feat_vec[('e', 'freq', 'non_null', '5')] = 1.0 if edge_freq >= 5 else 0.0
    feat_vec[('e', 'freq', 'non_null', '10')] = 1.0 if edge_freq >= 10 else 0.0

    return feat_vec
def oracle(self, instance):
    """
    an instance includes:
    my_nodes: (1,) -> AmrNode1, (2,) -> AmrNode2, ...
    my_edges: (1,2) -> AmrEdge1, (2,1) -> AmrEdge2, ...
    root_nodes: (1,), (3,), ... nodes that are sentence roots
    selected_nodes: (1,), (3,), ... nodes contained in the summary graph
    selected_edges: (1,2), (3,1), ... edges contained in the summary graph
    """
    logger.debug('start oracle decoding...')

    curr_filename = instance.filename
    my_nodes, oracle_nodes, _ = instance.nodes  # nodes and selected nodes
    my_edges, oracle_edges = instance.edges     # edges and selected edges

    # features that are associated with the oracle graph
    feat_vec = FeatureVector()

    for k_edge, v_edge in my_edges.iteritems():
        tag = 1 if k_edge in oracle_edges else 0  # use oracle tag
        feat_vec += self.feat_extr.getEdgeFeats(k_edge, v_edge, tag,
                                                curr_filename, my_nodes, my_edges)

    for k_node, v_node in my_nodes.iteritems():
        tag = 1 if k_node in oracle_nodes else 0  # use oracle tag
        feat_vec += self.feat_extr.getNodeFeats(k_node, v_node, tag,
                                                curr_filename, my_nodes, my_edges)

    score_true = self.weights.dot(feat_vec)

    # return features associated with the oracle graph
    return feat_vec, oracle_nodes, oracle_edges, score_true
def ffEdgeNodeSpan(self, edge):
    """
    extract features from node spans of edge
    """
    feat_vec = FeatureVector()

    node1 = edge.node1
    node2 = edge.node2

    for k, v in self.ffNodeSpan(node1).iteritems():
        feat_vec[('e', 'n1') + k] = v
    for k, v in self.ffNodeSpan(node2).iteritems():
        feat_vec[('e', 'n2') + k] = v

    node1_longest = -1
    node2_longest = -1

    for source in node1.sources:
        node_span = source.end_idx - source.start_idx
        if node1_longest < node_span:
            node1_longest = node_span

    for source in node2.sources:
        node_span = source.end_idx - source.start_idx
        if node2_longest < node_span:
            node2_longest = node_span

    # conjoin longest spans of node1 and node2
    feat_vec[('e', 'n1', 'span', 'lgst', str(node1_longest),
              'n2', 'span', 'lgst', str(node2_longest))] = 1.0

    return feat_vec
def ffEdgeNodeDepth(self, edge):
    """
    extract node depth features for edge
    """
    feat_vec = FeatureVector()

    node1 = edge.node1
    node2 = edge.node2

    for k, v in self.ffNodeDepth(node1).iteritems():
        feat_vec[('e', 'n1') + k] = v
    for k, v in self.ffNodeDepth(node2).iteritems():
        feat_vec[('e', 'n2') + k] = v

    node1_foremost = 1e10
    node2_foremost = 1e10

    for source in node1.sources:
        node_depth = len(source.graph_idx.split('.'))
        if node1_foremost > node_depth:
            node1_foremost = node_depth

    for source in node2.sources:
        node_depth = len(source.graph_idx.split('.'))
        if node2_foremost > node_depth:
            node2_foremost = node_depth

    # conjoin foremost (minimum) depths of node1 and node2
    feat_vec[('e', 'n1', 'dep', 'fmst', str(node1_foremost),
              'n2', 'dep', 'fmst', str(node2_foremost))] = 1.0

    return feat_vec
def ffEdgeBias(self, edge):
    """
    add a bias term to edge
    """
    feat_vec = FeatureVector()
    feat_vec[('e', 'bias')] = 1.0
    return feat_vec
def ffEdgeNodePos(self, edge):
    """
    extract node position features for edge
    """
    feat_vec = FeatureVector()

    node1 = edge.node1
    node2 = edge.node2

    for k, v in self.ffNodePos(node1).iteritems():
        feat_vec[('e', 'n1') + k] = v
    for k, v in self.ffNodePos(node2).iteritems():
        feat_vec[('e', 'n2') + k] = v

    node1_foremost = 1e10
    node2_foremost = 1e10

    for source in node1.sources:
        if node1_foremost > source.line_num:
            node1_foremost = source.line_num

    for source in node2.sources:
        if node2_foremost > source.line_num:
            node2_foremost = source.line_num

    # conjoin foremost occurrences of node1 and node2
    feat_vec[('e', 'n1', 'posit', 'fmst', str(node1_foremost),
              'n2', 'posit', 'fmst', str(node2_foremost))] = 1.0

    return feat_vec
def ffNodeBias(self, node):
    """
    add a bias term to node
    """
    feat_vec = FeatureVector()
    feat_vec[('n', 'bias')] = 1.0
    return feat_vec
def ffNodeConcept(self, node):
    """
    extract node concept feature
    """
    feat_vec = FeatureVector()
    # node concept
    feat_vec[('n', 'cpt', node.concept)] = 1.0
    return feat_vec
def ffNodeCollapsedEntity(self, node):
    """
    extract features from collapsed concept node
    """
    feat_vec = FeatureVector()
    # named entity or not
    feat_vec[('n', 'nam-ent')] = 1.0 if '_' in node.concept else 0.0
    feat_vec[('n', 'date-ent')] = 1.0 if node.concept.startswith('date-entity') else 0.0
    return feat_vec
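
# Illustration (not part of the original module): the two indicators above
# key off the surface form of collapsed concepts -- a collapsed named entity
# contains '_' joining its tokens, and a collapsed date concept starts with
# 'date-entity'. The concept strings implied here are hypothetical examples.
def _demo_collapsed_entity_flags(concept):
    return {'nam-ent': '_' in concept,
            'date-ent': concept.startswith('date-entity')}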
def ffEdgeIsNull(self, edge):
    """
    extract a binary feature indicating whether this is a NULL edge
    """
    feat_vec = FeatureVector()

    is_null = 1.0
    for source in edge.sources:
        if source.relation != 'NULL':
            is_null = 0.0
            break

    feat_vec[('e', 'is_null')] = is_null
    return feat_vec
def ffEdgeNodeConcept(self, edge):
    """
    extract node concept features for edge
    """
    feat_vec = FeatureVector()

    node1 = edge.node1
    node2 = edge.node2

    for k, v in self.ffNodeConcept(node1).iteritems():
        feat_vec[('e', 'n1') + k] = v
    for k, v in self.ffNodeConcept(node2).iteritems():
        feat_vec[('e', 'n2') + k] = v

    return feat_vec
def ffEdgeNodeFreq(self, edge):
    """
    extract node frequency features for an edge
    """
    feat_vec = FeatureVector()

    node1 = edge.node1
    node2 = edge.node2

    for k, v in self.ffNodeFreq(node1).iteritems():
        feat_vec[('e', 'n1') + k] = v
    for k, v in self.ffNodeFreq(node2).iteritems():
        feat_vec[('e', 'n2') + k] = v

    return feat_vec
def ffNodeFreq(self, node):
    """
    extract node frequency features
    """
    feat_vec = FeatureVector()

    # node frequency
    node_freq = len(node.sources)
    feat_vec[('n', 'freq', '0')] = 1.0 if node_freq == 0 else 0.0
    feat_vec[('n', 'freq', '1')] = 1.0 if node_freq >= 1 else 0.0
    feat_vec[('n', 'freq', '2')] = 1.0 if node_freq >= 2 else 0.0
    feat_vec[('n', 'freq', '5')] = 1.0 if node_freq >= 5 else 0.0
    feat_vec[('n', 'freq', '10')] = 1.0 if node_freq >= 10 else 0.0

    return feat_vec
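
# Illustration (not part of the original module): the freq/span/posit/dep
# features in this file all follow one pattern -- a numeric statistic turned
# into cumulative binary indicators. A hypothetical helper capturing that
# pattern (the real feature functions inline it, with varying thresholds):
def _demo_threshold_feats(prefix, value, thresholds=(1, 2, 5, 10)):
    feats = {prefix + ('0',): 1.0 if value == 0 else 0.0}
    for t in thresholds:
        feats[prefix + (str(t),)] = 1.0 if value >= t else 0.0
    return feats

# e.g. _demo_threshold_feats(('n', 'freq'), 3) sets '1' and '2' to 1.0 and
# '0', '5', '10' to 0.0 -- matching ffNodeFreq for a node with three sources.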
def ffEdgeNodeCollapsedEntity(self, edge):
    """
    extract collapsed-concept features for the nodes of an edge
    """
    feat_vec = FeatureVector()

    node1 = edge.node1
    node2 = edge.node2

    for k, v in self.ffNodeCollapsedEntity(node1).iteritems():
        feat_vec[('e', 'n1') + k] = v
    for k, v in self.ffNodeCollapsedEntity(node2).iteritems():
        feat_vec[('e', 'n2') + k] = v

    return feat_vec
def ffEdgeRel(self, edge):
    """
    extract binary features for edge relations
    """
    feat_vec = FeatureVector()

    edge_freq = len(edge.sources)

    # primary and secondary relation (edge relation entropy)?
    rel_freq = Counter()
    for source in edge.sources:
        rel_freq[source.relation] += 1
    rels = rel_freq.most_common()

    if rels:
        # primary relation
        (rel, count) = rels[0]
        feat_vec[('e', 'rel', 'fst', rel)] = 1.0

        # relative frequency (float division; plain / truncates in Python 2)
        per_fst_rel = float(count) / edge_freq
        feat_vec[('e', 'rel', 'fst', rel, 'p1')] = 1.0 if per_fst_rel >= 0.5 else 0.0
        feat_vec[('e', 'rel', 'fst', rel, 'p2')] = 1.0 if per_fst_rel >= 0.66 else 0.0
        feat_vec[('e', 'rel', 'fst', rel, 'p3')] = 1.0 if per_fst_rel >= 0.75 else 0.0

        # secondary relation and its relative frequency
        if len(rels) > 1:
            (sec_rel, sec_count) = rels[1]
            feat_vec[('e', 'rel', 'sec', sec_rel)] = 1.0

            per_sec_rel = float(sec_count) / edge_freq
            feat_vec[('e', 'rel', 'sec', sec_rel, 'p1')] = 1.0 if per_sec_rel >= 0.25 else 0.0
            feat_vec[('e', 'rel', 'sec', sec_rel, 'p2')] = 1.0 if per_sec_rel >= 0.33 else 0.0
            feat_vec[('e', 'rel', 'sec', sec_rel, 'p3')] = 1.0 if per_sec_rel >= 0.5 else 0.0

            # conjoin primary and secondary relations
            feat_vec[('e', 'rel', 'fst', rel, 'sec', sec_rel)] = 1.0

    return feat_vec
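
# Illustration (not part of the original module): how the primary/secondary
# relation features fire. With source relations ['ARG0', 'ARG0', 'ARG1'],
# Counter.most_common() yields [('ARG0', 2), ('ARG1', 1)], so
# per_fst_rel = 2/3 and per_sec_rel = 1/3: the primary 'p1' (>= 0.5) and
# 'p2' (>= 0.66) indicators fire, but 'p3' (>= 0.75) does not.
def _demo_relation_stats(relations):
    from collections import Counter  # local import keeps the sketch self-contained
    freq = Counter(relations).most_common()
    total = float(len(relations))
    return [(rel, count / total) for rel, count in freq]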
def ffNodeSpan(self, node):
    """
    extract features from node spans
    """
    feat_vec = FeatureVector()

    # longest and average span of node
    node_longest = -1
    node_average = 0.0
    node_freq = len(node.sources)

    for source in node.sources:
        node_span = source.end_idx - source.start_idx
        node_average += node_span
        if node_longest < node_span:
            node_longest = node_span

    if node_freq > 0:
        node_average /= node_freq

        feat_vec[('n', 'span', 'lgst', '0')] = 1.0 if node_longest >= 0 else 0.0
        feat_vec[('n', 'span', 'lgst', '1')] = 1.0 if node_longest >= 1 else 0.0
        feat_vec[('n', 'span', 'lgst', '2')] = 1.0 if node_longest >= 2 else 0.0
        feat_vec[('n', 'span', 'lgst', '5')] = 1.0 if node_longest >= 5 else 0.0
        feat_vec[('n', 'span', 'lgst', '10')] = 1.0 if node_longest >= 10 else 0.0

        feat_vec[('n', 'span', 'avg', '0')] = 1.0 if node_average >= 0 else 0.0
        feat_vec[('n', 'span', 'avg', '1')] = 1.0 if node_average >= 1 else 0.0
        feat_vec[('n', 'span', 'avg', '2')] = 1.0 if node_average >= 2 else 0.0
        feat_vec[('n', 'span', 'avg', '5')] = 1.0 if node_average >= 5 else 0.0
        feat_vec[('n', 'span', 'avg', '10')] = 1.0 if node_average >= 10 else 0.0

    return feat_vec
def ffNodePos(self, node):
    """
    extract node position features
    """
    feat_vec = FeatureVector()

    # foremost and average occurrence positions of node
    node_foremost = 1e10
    node_average = 0.0
    node_freq = len(node.sources)

    for source in node.sources:
        node_average += source.line_num
        if node_foremost > source.line_num:
            node_foremost = source.line_num

    if node_freq > 0:
        node_average /= node_freq

        # foremost occurrence
        feat_vec[('n', 'posit', 'fmst', '5')] = 1.0 if node_foremost >= 5 else 0.0
        feat_vec[('n', 'posit', 'fmst', '6')] = 1.0 if node_foremost >= 6 else 0.0
        feat_vec[('n', 'posit', 'fmst', '7')] = 1.0 if node_foremost >= 7 else 0.0
        feat_vec[('n', 'posit', 'fmst', '10')] = 1.0 if node_foremost >= 10 else 0.0
        feat_vec[('n', 'posit', 'fmst', '15')] = 1.0 if node_foremost >= 15 else 0.0

        # average occurrence
        feat_vec[('n', 'posit', 'avg', '5')] = 1.0 if node_average >= 5 else 0.0
        feat_vec[('n', 'posit', 'avg', '6')] = 1.0 if node_average >= 6 else 0.0
        feat_vec[('n', 'posit', 'avg', '7')] = 1.0 if node_average >= 7 else 0.0
        feat_vec[('n', 'posit', 'avg', '10')] = 1.0 if node_average >= 10 else 0.0
        feat_vec[('n', 'posit', 'avg', '15')] = 1.0 if node_average >= 15 else 0.0

    return feat_vec
def ffNodeDepth(self, node):
    """
    depth of node in graph topology
    """
    feat_vec = FeatureVector()

    # foremost (minimum) and average depth of node
    node_foremost = 1e10
    node_average = 0.0
    node_freq = len(node.sources)

    for source in node.sources:
        node_depth = len(source.graph_idx.split('.'))
        node_average += node_depth
        if node_foremost > node_depth:
            node_foremost = node_depth

    if node_freq > 0:
        node_average /= node_freq

        feat_vec[('n', 'dep', 'fmst', '1')] = 1.0 if node_foremost >= 1 else 0.0
        feat_vec[('n', 'dep', 'fmst', '2')] = 1.0 if node_foremost >= 2 else 0.0
        feat_vec[('n', 'dep', 'fmst', '3')] = 1.0 if node_foremost >= 3 else 0.0
        feat_vec[('n', 'dep', 'fmst', '4')] = 1.0 if node_foremost >= 4 else 0.0
        feat_vec[('n', 'dep', 'fmst', '5')] = 1.0 if node_foremost >= 5 else 0.0

        feat_vec[('n', 'dep', 'avg', '1')] = 1.0 if node_average >= 1 else 0.0
        feat_vec[('n', 'dep', 'avg', '2')] = 1.0 if node_average >= 2 else 0.0
        feat_vec[('n', 'dep', 'avg', '3')] = 1.0 if node_average >= 3 else 0.0
        feat_vec[('n', 'dep', 'avg', '4')] = 1.0 if node_average >= 4 else 0.0
        feat_vec[('n', 'dep', 'avg', '5')] = 1.0 if node_average >= 5 else 0.0

    return feat_vec
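
# Illustration (not part of the original module): depth is read off the
# dotted graph index, so a root index '0' has depth 1 and a grandchild such
# as '0.2.1' has depth 3. The index strings here are hypothetical examples.
def _demo_graph_depth(graph_idx):
    return len(graph_idx.split('.'))  # _demo_graph_depth('0.2.1') == 3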
def ffEdgeFreq(self, edge):
    """
    extract binary features for edge frequency:
    freq == 0, freq >= 1, freq >= 2, freq >= 5, freq >= 10
    """
    feat_vec = FeatureVector()

    # edge frequency
    edge_freq = len(edge.sources)

    # binary features for edge frequency
    feat_vec[('e', 'freq', '0')] = 1.0 if edge_freq == 0 else 0.0
    feat_vec[('e', 'freq', '1')] = 1.0 if edge_freq >= 1 else 0.0
    feat_vec[('e', 'freq', '2')] = 1.0 if edge_freq >= 2 else 0.0
    feat_vec[('e', 'freq', '5')] = 1.0 if edge_freq >= 5 else 0.0
    feat_vec[('e', 'freq', '10')] = 1.0 if edge_freq >= 10 else 0.0

    return feat_vec
def ffEdgePos(self, edge):
    """
    extract features from edge occurrences
    """
    feat_vec = FeatureVector()

    # foremost and average position across all edge occurrences
    foremost = 1e10
    average = 0.0
    edge_freq = len(edge.sources)

    for source in edge.sources:
        average += source.line_num
        if foremost > source.line_num:
            foremost = source.line_num

    if edge_freq > 0:  # edge exists
        average /= edge_freq

        # foremost occurrence
        feat_vec[('e', 'posit', 'fmst', '5')] = 1.0 if foremost >= 5 else 0.0
        feat_vec[('e', 'posit', 'fmst', '6')] = 1.0 if foremost >= 6 else 0.0
        feat_vec[('e', 'posit', 'fmst', '7')] = 1.0 if foremost >= 7 else 0.0
        feat_vec[('e', 'posit', 'fmst', '10')] = 1.0 if foremost >= 10 else 0.0
        feat_vec[('e', 'posit', 'fmst', '15')] = 1.0 if foremost >= 15 else 0.0

        # average occurrence
        feat_vec[('e', 'posit', 'avg', '5')] = 1.0 if average >= 5 else 0.0
        feat_vec[('e', 'posit', 'avg', '6')] = 1.0 if average >= 6 else 0.0
        feat_vec[('e', 'posit', 'avg', '7')] = 1.0 if average >= 7 else 0.0
        feat_vec[('e', 'posit', 'avg', '10')] = 1.0 if average >= 10 else 0.0
        feat_vec[('e', 'posit', 'avg', '15')] = 1.0 if average >= 15 else 0.0

    return feat_vec
def learnParamsAdaGrad(self, decoder, corpus, param_file, loss_func,
                       num_passes=10, oracle_len='nolen'):
    """
    learn parameters with AdaGrad, using a structured perceptron, ramp,
    or hinge loss (selected via loss_func)
    """
    logger.debug('start learning parameters...')
    shuffle(corpus)  # shuffle corpus

    avg_weights = FeatureVector()
    curr_instances = 0

    node_perf = PerfScore(0.0, 0.0, 0.0)  # node performance
    edge_perf = PerfScore(0.0, 0.0, 0.0)  # edge performance

    eta = 1.0    # step size
    l2reg = 0.0  # L2 regularization strength
    node_cost_scaling = 1.0  # cost scaling factor
    edge_cost_scaling = 1.0  # cost scaling factor
    sumSq = FeatureVector()  # running sum of squared gradients (AdaGrad)

    for curr_num_passes in xrange(1, num_passes + 1):
        logger.debug('#curr_num_passes#: %d' % curr_num_passes)

        for instance in corpus:
            curr_instances += 1
            logger.debug('processing instance %d...' % curr_instances)

            # perceptron loss
            if loss_func.startswith('perceptron'):
                gradient, selected_nodes, selected_edges, score_pred = decoder.decode(
                    instance, oracle_len)
                plus_feats, oracle_nodes, oracle_edges, score_true = decoder.oracle(
                    instance)
                curr_loss = score_pred - score_true  # @UnusedVariable
                gradient -= plus_feats
                decoder.weights -= eta * gradient

            # ramp loss + cost-augmented decoding
            if loss_func.startswith('ramp'):
                node_cost, edge_cost = node_cost_scaling, edge_cost_scaling
                gradient, _, _, score_plus_cost = decoder.decode(
                    instance, oracle_len, node_cost, edge_cost)
                plus_feats, selected_nodes, selected_edges, score_minus_cost = decoder.decode(
                    instance, oracle_len, -1.0 * node_cost, -1.0 * edge_cost)
                _, oracle_nodes, oracle_edges, score_true = decoder.oracle(instance)
                curr_loss = score_plus_cost - score_minus_cost  # @UnusedVariable
                gradient -= plus_feats

                # AdaGrad update
                for k, v in gradient.iteritems():
                    if v == 0.0:
                        continue
                    sumSq[k] = sumSq.get(k, 0.0) + v * v
                    decoder.weights[k] = decoder.weights.get(k, 0.0) - eta * v / sqrt(sumSq[k])

                # optional L2 regularization, also with AdaGrad step sizes
                if l2reg != 0.0:
                    for k, v in decoder.weights.iteritems():
                        if v == 0.0:
                            continue
                        value = l2reg * v
                        sumSq[k] = sumSq.get(k, 0.0) + value * value
                        decoder.weights[k] = v - eta * value / sqrt(sumSq[k])

            # hinge loss + cost-augmented decoding
            if loss_func.startswith('hinge'):
                node_cost, edge_cost = node_cost_scaling, edge_cost_scaling
                gradient, selected_nodes, selected_edges, score_plus_cost = decoder.decode(
                    instance, oracle_len, node_cost, edge_cost)
                plus_feats, oracle_nodes, oracle_edges, score_true = decoder.oracle(
                    instance)
                curr_loss = score_plus_cost - score_true  # @UnusedVariable
                gradient -= plus_feats

                # AdaGrad update
                for k, v in gradient.iteritems():
                    if v == 0.0:
                        continue
                    sumSq[k] = sumSq.get(k, 0.0) + v * v
                    decoder.weights[k] = decoder.weights.get(k, 0.0) - eta * v / sqrt(sumSq[k])

                # optional L2 regularization, also with AdaGrad step sizes
                if l2reg != 0.0:
                    for k, v in decoder.weights.iteritems():
                        if v == 0.0:
                            continue
                        value = l2reg * v
                        sumSq[k] = sumSq.get(k, 0.0) + value * value
                        decoder.weights[k] = v - eta * value / sqrt(sumSq[k])

            # use gold nodes and edges to calculate P/R/F
            num_gold_nodes, num_gold_edges = instance.gold

            # P/R/F scores of nodes and edges for the current instance.
            # Edge recall cannot reach 100% since decoding produces only
            # a tree structure.
            intersect_nodes = set(selected_nodes) & set(oracle_nodes)
            curr_node_perf = getPRFScores(len(intersect_nodes),
                                          len(selected_nodes), num_gold_nodes)
            logPRFScores('train_node', curr_node_perf)

            intersect_edges = set(selected_edges) & set(oracle_edges)
            curr_edge_perf = getPRFScores(len(intersect_edges),
                                          len(selected_edges), num_gold_edges)
            logPRFScores('train_edge', curr_edge_perf)

            # P/R/F scores of nodes and edges, averaged across all curr_instances
            node_perf = PerfScore(*[sum(x) for x in zip(node_perf, curr_node_perf)])
            edge_perf = PerfScore(*[sum(x) for x in zip(edge_perf, curr_edge_perf)])
            logPRFScores('train_node_avg',
                         PerfScore(node_perf.prec / curr_instances,
                                   node_perf.rec / curr_instances,
                                   node_perf.fscore / curr_instances))
            logPRFScores('train_edge_avg',
                         PerfScore(edge_perf.prec / curr_instances,
                                   edge_perf.rec / curr_instances,
                                   edge_perf.fscore / curr_instances))

        # averaging weight vectors (once per pass)
        avg_weights += decoder.weights

        # output the averaged weight vector to file
        # (1.0 / ... : plain 1 / curr_num_passes truncates to 0 in Python 2)
        curr_weights = FeatureVector()
        curr_weights += avg_weights * (1.0 / curr_num_passes)
        if param_file:
            with codecs.open(param_file, 'w', 'utf-8') as outfile:
                outfile.write('#curr_num_passes#: %d\n' % curr_num_passes)
                outfile.write('%s\n' % curr_weights.toString())

    final_weights = FeatureVector()
    final_weights += avg_weights * (1.0 / num_passes)
    return final_weights
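
# Illustration (not part of the original module): the sparse AdaGrad update
# used in the ramp and hinge branches above, in isolation. Each feature keeps
# a running sum of squared gradients, and its effective step size decays as
# eta / sqrt(sum of squared gradients). A minimal sketch over plain dicts
# (weights, sum_sq, and gradient each map a feature key to a float):
def _demo_adagrad_step(weights, sum_sq, gradient, eta=1.0):
    from math import sqrt  # local import keeps the sketch self-contained
    for k, g in gradient.items():
        if g == 0.0:
            continue
        sum_sq[k] = sum_sq.get(k, 0.0) + g * g
        weights[k] = weights.get(k, 0.0) - eta * g / sqrt(sum_sq[k])
    return weights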
def __init__(self):
    self.gilp = GurobiILP()
    self.weights = FeatureVector()
    self.feat_extr = FeatureExtractor()
def decode(self, instance, oracle_len='nolen', node_cost=None, edge_cost=None):
    """
    an instance includes:
    my_nodes: (1,) -> AmrNode1, (2,) -> AmrNode2, ...
    my_edges: (1,2) -> AmrEdge1, (2,1) -> AmrEdge2, ...
    selected_nodes: (1,), (3,), ... nodes contained in the summary graph
    selected_edges: (1,2), (3,1), ... edges contained in the summary graph
    """
    logger.debug('start feature extraction...')

    curr_filename = instance.filename
    my_nodes, oracle_nodes, root_nodes = instance.nodes  # nodes and selected nodes
    my_edges, oracle_edges = instance.edges              # edges and selected edges
    num_gold_nodes, num_gold_edges = instance.gold       # number of gold nodes and edges

    node_weights = {}
    edge_weights = {}

    # get edge weights
    num_nonnegative_edges = 0
    for k_edge, v_edge in my_edges.iteritems():
        for tag in [0, 1]:
            edge_feats = self.feat_extr.getEdgeFeats(
                k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)
            edge_weights[k_edge + (tag,)] = self.weights.dot(edge_feats)

            # cost-augmented decoding
            if edge_cost is not None:
                if tag == 0 and k_edge not in oracle_edges:  # true negative
                    curr_edge_cost = 0
                if tag == 1 and k_edge in oracle_edges:      # true positive
                    curr_edge_cost = 0
                if tag == 1 and k_edge not in oracle_edges:  # false positive
                    curr_edge_cost = edge_cost
                if tag == 0 and k_edge in oracle_edges:      # false negative
                    curr_edge_cost = edge_cost
                edge_weights[k_edge + (tag,)] += curr_edge_cost

            if tag == 1 and edge_weights[k_edge + (tag,)] > 0.0:
                num_nonnegative_edges += 1  # count edges with positive weight

    logger.debug('[num_nonnegative_edges]: %d' % num_nonnegative_edges)

    # get node weights
    num_nonnegative_nodes = 0
    for k_node, v_node in my_nodes.iteritems():
        for tag in [0, 1]:
            node_feats = self.feat_extr.getNodeFeats(
                k_node, v_node, tag, curr_filename, my_nodes, my_edges)
            node_weights[k_node + (tag,)] = self.weights.dot(node_feats)

            # cost-augmented decoding
            if node_cost is not None:
                if tag == 0 and k_node not in oracle_nodes:  # true negative
                    curr_node_cost = 0
                if tag == 1 and k_node in oracle_nodes:      # true positive
                    curr_node_cost = 0
                if tag == 1 and k_node not in oracle_nodes:  # false positive
                    curr_node_cost = node_cost
                if tag == 0 and k_node in oracle_nodes:      # false negative
                    curr_node_cost = node_cost
                node_weights[k_node + (tag,)] += curr_node_cost

            if tag == 1 and node_weights[k_node + (tag,)] > 0.0:
                num_nonnegative_nodes += 1  # count nodes with positive weight

    logger.debug('[num_nonnegative_nodes]: %d' % num_nonnegative_nodes)

    # run the Gurobi ILP decoder using node and edge weights;
    # optionally fix the decoded summary length (#nodes or #edges)
    logger.debug('start ILP decoding...')
    num_selected_nodes = num_gold_nodes if oracle_len == 'nodes' else 0
    num_selected_edges = num_gold_edges if oracle_len == 'edges' else 0
    selected_nodes, selected_edges, score_pred = self.gilp.decode(
        node_weights, edge_weights, root_nodes,
        num_selected_nodes=num_selected_nodes,
        num_selected_edges=num_selected_edges)

    logger.debug('[num_gold_nodes]: %d' % num_gold_nodes)
    logger.debug('[num_selected_nodes]: %d' % len(selected_nodes))
    logger.debug('[num_gold_edges]: %d' % num_gold_edges)
    logger.debug('[num_selected_edges]: %d' % len(selected_edges))

    # features that are associated with the decoded graph
    feat_vec = FeatureVector()

    for k_edge, v_edge in my_edges.iteritems():
        tag = 1 if k_edge in selected_edges else 0  # use decoded tag
        feat_vec += self.feat_extr.getEdgeFeats(k_edge, v_edge, tag,
                                                curr_filename, my_nodes, my_edges)

    for k_node, v_node in my_nodes.iteritems():
        tag = 1 if k_node in selected_nodes else 0  # use decoded tag
        feat_vec += self.feat_extr.getNodeFeats(k_node, v_node, tag,
                                                curr_filename, my_nodes, my_edges)

    # return features associated with the decoded graph
    return feat_vec, selected_nodes, selected_edges, score_pred
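
# Illustration (not part of the original module): the cost-augmented scoring
# in decode() adds a fixed Hamming-style penalty whenever a candidate tag
# disagrees with the oracle, steering the decoder toward high-loss outputs
# during ramp/hinge training. A hypothetical standalone version of that
# per-item cost:
def _demo_hamming_cost(key, tag, oracle_set, cost):
    if (tag == 1) == (key in oracle_set):  # true positive / true negative
        return 0.0
    return cost  # false positive / false negative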
if __name__ == '__main__':
    # gold proxy-report split (superseded by the assignments below)
    input_dir = '/Users/user/Data/SemanticSumm/Proxy/gold/split/dev/'
    body_file = 'aligned-amr-release-1.0-dev-proxy-body.txt'
    summ_file = 'aligned-amr-release-1.0-dev-proxy-summary.txt'

    input_dir = '/Users/amit/Desktop/Thesis/jamr/biocorpus/amr_parsing/data/amr-release-1.0-dev-proxy/'
    body_file = 'test.txt'
    summ_file = 'test.txt'

    corpus = buildCorpus(os.path.join(input_dir, body_file),
                         os.path.join(input_dir, summ_file))

    feat_extr = FeatureExtractor()
    feat_vec = FeatureVector()

    for inst in corpus:
        curr_filename = inst.filename
        my_nodes, s_nodes, r_nodes = inst.nodes
        my_edges, s_edges = inst.edges

        logger.debug('extracting features for file: %s' % curr_filename)
        for k_edge, v_edge in my_edges.iteritems():
            for tag in [0, 1]:
                feat_vec += feat_extr.getEdgeFeats(k_edge, v_edge, tag,
                                                   curr_filename, my_nodes, my_edges)