def getDocStats(input_dir, body_file, summ_file):
    """
    """
    corpus = buildCorpus(os.path.join(input_dir, body_file), 
                         os.path.join(input_dir, summ_file),
                         w_exp=True)
    
    num_docs = 0
    total_nodes = 0
    total_edges = 0
    selected_nodes = 0
    selected_edges = 0
    
    for inst in corpus:
        num_docs += 1
        my_nodes, oracle_nodes, _ = inst.nodes
        my_edges, oracle_edges = inst.edges
        
        total_nodes += len(my_nodes)
        total_edges += len(my_edges)
        
        selected_nodes += len(oracle_nodes)
        selected_edges += len(oracle_edges)
    
    print 'avg nodes: %.1f' % (total_nodes/num_docs)
    print 'avg edges: %.1f' % (total_edges/num_docs)
    
    print 'selected nodes: %.1f' % (selected_nodes/num_docs)
    print 'selected edges: %.1f' % (selected_edges/num_docs)
def test(body_file, summ_file, param_file, oracle_len, w_exp):
    """
    Run the summarizer in evaluation mode: load trained weights from
    param_file and perform structured prediction over the corpus built
    from body_file / summ_file.
    """
    logger.debug('start testing...')
    logger.debug('[settings]: len_%s_exp_%d' % (oracle_len, w_exp))

    corpus = buildCorpus(body_file, summ_file, w_exp)

    # restore the learned weight vector into a fresh decoder
    trained_decoder = Decoder()
    trained_decoder.weights.load(param_file)

    # decode every instance with the loaded parameters
    ParamEstimator().predict(trained_decoder, corpus, oracle_len)
    return
# NOTE(review): byte-for-byte duplicate of the `test` defined earlier in
# this file (likely a scraper/merge artifact).  Python binds names at
# execution time, so this later definition is the one callers get; one
# of the two copies should be removed.
def test(body_file, summ_file, param_file, oracle_len, w_exp):
    """
    run summarizer, perform structured prediction
    """
    logger.debug('start testing...')
    logger.debug('[settings]: len_%s_exp_%d' % (oracle_len, w_exp))
    # build the evaluation corpus from the body/summary files
    corpus = buildCorpus(body_file, summ_file, w_exp)
    
    # load parameters from file
    decoder = Decoder()
    decoder.weights.load(param_file)
    
    # perform structured prediction
    estimator = ParamEstimator()
    estimator.predict(decoder, corpus, oracle_len)
    
    return
def train(body_file, summ_file, param_file, loss_func, num_passes, oracle_len, w_exp):
    """
    Run the summarizer in training mode: learn structured-prediction
    weights with AdaGrad and write the final weight vector to param_file.
    """
    logger.debug('start training...')
    logger.debug('[settings]: %s_%d_passes_len_%s_exp_%d' % (loss_func, num_passes, oracle_len, w_exp))

    corpus = buildCorpus(body_file, summ_file, w_exp)

    # fit the weight vector on the training corpus
    final_weights = ParamEstimator().learnParamsAdaGrad(
        Decoder(), corpus, param_file, loss_func, num_passes, oracle_len)

    # persist the learned parameters (UTF-8 text)
    with codecs.open(param_file, 'w', 'utf-8') as outfile:
        outfile.write('#num_passes#: %d\n' % num_passes)
        outfile.write('%s\n' % final_weights.toString())
    return
# NOTE(review): byte-for-byte duplicate of the `train` defined earlier in
# this file (likely a scraper/merge artifact).  This later definition is
# the one callers actually get; one copy should be removed.
def train(body_file, summ_file, param_file, loss_func, num_passes, oracle_len, w_exp):
    """
    run summarizer, learn structured prediction parameters
    """    
    logger.debug('start training...')
    logger.debug('[settings]: %s_%d_passes_len_%s_exp_%d' % (loss_func, num_passes, oracle_len, w_exp))
    # build the training corpus from the body/summary files
    corpus = buildCorpus(body_file, summ_file, w_exp)
    
    # learn parameters
    decoder = Decoder()
    estimator = ParamEstimator()
    final_weights = estimator.learnParamsAdaGrad(decoder, corpus, param_file, loss_func, num_passes, oracle_len)
    
    # output parameters to file
    with codecs.open(param_file, 'w', 'utf-8') as outfile:
        outfile.write('#num_passes#: %d\n' % num_passes)
        outfile.write('%s\n' % final_weights.toString())
    return
def summ(body_file, summ_file, param_file, oracle_len, w_exp, jamr=False):
    """
    Run the summarizer end-to-end: load trained weights and write the
    predicted summaries to an output folder derived from param_file.

    body_file  -- file of document bodies
    summ_file  -- file of gold summaries
    param_file -- trained parameter file; also determines the output path
    oracle_len -- length setting passed through to the estimator
    w_exp      -- whether the corpus is built with expansion
    jamr       -- write under 'jamr_summ' instead of 'summ' when True
    """
    logger.debug('start testing...')
    logger.debug('[settings]: len_%s_exp_%d' % (oracle_len, w_exp))
    corpus = buildCorpus(body_file, summ_file, w_exp)

    # load parameters from file
    decoder = Decoder()
    decoder.weights.load(param_file)

    # Derive the output folder from the parameter filename once, instead
    # of computing it twice and discarding the first result; also drop
    # the unidiomatic `jamr == True` comparison.
    suffix = 'jamr_summ' if jamr else 'summ'
    output_folder = param_file.replace('params', suffix)

    # perform structured prediction and write summaries
    estimator = ParamEstimator()
    estimator.summarize(decoder, corpus, oracle_len, output_folder)

    return
# NOTE(review): byte-for-byte duplicate of the `summ` defined earlier in
# this file (likely a scraper/merge artifact).  This later definition is
# the one callers actually get; one copy should be removed.
def summ(body_file, summ_file, param_file, oracle_len, w_exp, jamr=False):
    """
    run summarizer, perform structured prediction
    """
    logger.debug('start testing...')
    logger.debug('[settings]: len_%s_exp_%d' % (oracle_len, w_exp))
    # build the corpus from the body/summary files
    corpus = buildCorpus(body_file, summ_file, w_exp)
    
    # load parameters from file
    decoder = Decoder()
    decoder.weights.load(param_file)
    
    # perform structured prediction
    estimator = ParamEstimator()
    # output folder is derived from the parameter filename
    output_folder = param_file.replace('params', 'summ')
    if jamr == True: output_folder = param_file.replace('params', 'jamr_summ')
    estimator.summarize(decoder, corpus, oracle_len, output_folder)
    
    return
# Exemple #8  -- scraper artifact: example-boundary marker and its vote
# count ("0") carried over from the code-example page this file was
# assembled from; commented out so the module can parse.
        # named entity or not
        feat_vec[('n', 'nam-ent')] = 1.0 if '_' in node.concept else 0.0
        feat_vec[('n', 'date-ent')] = 1.0 if (
            node.concept).startswith('date-entity') else 0.0

        return feat_vec


# NOTE(review): demo/driver block.  As captured here it does NOT parse:
# the `for k_node, v_node ...` loop at the bottom has no properly
# indented body, and the lines that follow it (down to the stray
# `return feat_vec`, which sits outside any function) look like the tail
# of a node-feature-extraction method spliced in by the scraper that
# assembled this file.  Code kept byte-identical pending manual
# reconstruction against the original project.
if __name__ == '__main__':

    input_dir = '/Users/user/Data/SemanticSumm/Proxy/gold/split/dev/'
    body_file = 'aligned-amr-release-1.0-dev-proxy-body.txt'
    summ_file = 'aligned-amr-release-1.0-dev-proxy-summary.txt'

    corpus = buildCorpus(os.path.join(input_dir, body_file),
                         os.path.join(input_dir, summ_file))
    feat_extr = FeatureExtractor()
    feat_vec = FeatureVector()

    for inst in corpus:
        curr_filename = inst.filename
        my_nodes, s_nodes = inst.nodes
        my_edges, s_edges = inst.edges

        #         logger.debug('extracting features for file: %s' % curr_filename)
        #         for k_edge, v_edge in my_edges.iteritems():
        #             for tag in [0,1]:
        #                 feat_vec += feat_extr.getEdgeFeats(k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)

        logger.debug('extracting features for file: %s' % curr_filename)
        # NOTE(review): loop below is missing its body; the lines after it
        # are at the wrong indentation level and reference an undefined
        # `node` -- presumably fragments of a getNodeFeats method.
        for k_node, v_node in my_nodes.iteritems():
        feat_vec = FeatureVector()
        
        # named entity or not
        feat_vec[('n', 'nam-ent')] = 1.0 if '_' in node.concept else 0.0
        feat_vec[('n', 'date-ent')] = 1.0 if (node.concept).startswith('date-entity') else 0.0
            
        return feat_vec   
    

if __name__ == '__main__':
    
    input_dir = '/Users/user/Data/SemanticSumm/Proxy/gold/split/dev/'
    body_file = 'aligned-amr-release-1.0-dev-proxy-body.txt'
    summ_file = 'aligned-amr-release-1.0-dev-proxy-summary.txt'
    
    corpus = buildCorpus(os.path.join(input_dir, body_file),
                         os.path.join(input_dir, summ_file))
    feat_extr = FeatureExtractor()
    feat_vec = FeatureVector()

    for inst in corpus:
        curr_filename = inst.filename
        my_nodes, s_nodes = inst.nodes
        my_edges, s_edges = inst.edges
        
#         logger.debug('extracting features for file: %s' % curr_filename)
#         for k_edge, v_edge in my_edges.iteritems():
#             for tag in [0,1]:
#                 feat_vec += feat_extr.getEdgeFeats(k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)
                
        logger.debug('extracting features for file: %s' % curr_filename)
        for k_node, v_node in my_nodes.iteritems():