Ejemplo n.º 1
0
def weight_files(inst_fns, out_fns, weight_func=entropy_weight,
                 descriptor=None, n=None, binary=False):
    """
    Weight corpus instance files
    
    Instance files and Timbl output files are paired by position; surplus
    filenames in the longer of the two lists are ignored. For every pair the
    corpus instances are loaded, passed to weight_corpus together with the
    parsed Timbl output, and then saved.
    
    @param inst_fns: list of corpus instance filenames
    
    @param out_fns: list of filenames containing Timbl output, in the same
    order as inst_fns
    
    @keyword weight_func: weighting function, passed on to weight_corpus
    
    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format (its dtype attribute is handed to loadtxt)
    
    @keyword n: limit weighting to the first n files
    
    @keyword binary: corpus instances in binary rather than text format
    """
    # zip() returns a list on Python 2 but a non-subscriptable iterator on
    # Python 3, so materialise it before slicing. A slice with n=None keeps
    # every pair.
    file_pairs = list(zip(inst_fns, out_fns))[:n]

    for inst_fname, out_fname in file_pairs:
        corpus_inst = CorpusInst()
        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)

        # Keep the Timbl output file open until weighting is done: the parser
        # may read lazily from the handle (not verifiable from here), and the
        # with-block guarantees the handle is closed afterwards.
        with open(out_fname) as timbl_file:
            timbl_out = parse_timbl_output(timbl_file)
            weight_corpus(corpus_inst, timbl_out, weight_func)

        log.info("saving weighted corpus instances to {0}".format(inst_fname))
        corpus_inst.save()
Ejemplo n.º 2
0
def match_files(inst_fns, matcher, descriptor=None, n=None, binary=False):
    """
    Run matching over a series of corpus instance files
    
    Every selected file is loaded into a fresh CorpusInst, handed to
    match_corpus together with the matcher, and then saved.
    
    @param inst_fns: list of corpus instance filenames
    
    @param matcher: a Matcher instance for matching source to target instances
    
    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format (its dtype attribute is handed to loadtxt)
    
    @keyword n: limit matching to the first n files
    
    @keyword binary: corpus instances in binary rather than text format
    """
    # A slice with n=None selects every filename.
    selected_fns = inst_fns[:n]

    for path in selected_fns:
        instances = CorpusInst()
        if not binary:
            # Text format needs the descriptor's dtype to parse the file.
            instances.loadtxt(path, descriptor.dtype)
        else:
            instances.loadbin(path)

        match_corpus(instances, matcher)

        message = "saving matched corpus instances to {0}".format(path)
        log.info(message)
        instances.save()