Example 1
def print_alignments(align_count,
                     title,
                     graph_pair1,
                     graph_pair2,
                     graphs,
                     nodes_list,
                     out=stdout):
    if nodes_list:
        header(title, out, char="-")

        for nodes in nodes_list:
            align_count += 1
            rel1 = str(graph_pair1.get_align(nodes))
            rel2 = str(graph_pair2.get_align(nodes))

            # tricky because of implicit coercions,
            # see "Formatting Markers" http://www.python.org/dev/peps/pep-0100/
            print >> out, "#%d:" % align_count
            s = '(%s) %s [%s:%s]: "%s"' % (
                nodes.source,
                graphs.source.node[nodes.source]["label"].encode("utf-8"),
                graphs.source.node[nodes.source]["begin"],
                graphs.source.node[nodes.source]["end"],
                graphs.source.get_node_token_string(nodes.source))
            print >> out, s.encode("utf-8")
            print >> out, "<<<", rel1.upper(), "/", rel2.upper(), ">>>"
            s = '(%s) %s [%s:%s]: "%s"\n' % (
                nodes.target, graphs.target.node[nodes.target]["label"],
                graphs.target.node[nodes.target]["begin"],
                graphs.target.node[nodes.target]["end"],
                graphs.target.get_node_token_string(nodes.target))
            print >> out, s.encode("utf-8")

    return align_count
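The "implicit coercions" comment refers to Python 2's mixed str/unicode "%" formatting: if any operand is unicode, the result is silently promoted to unicode, and a later .encode() or concatenation with raw UTF-8 bytes triggers an implicit ASCII decode. A minimal sketch of the pitfall, with hypothetical values rather than data from the source:

# -*- coding: utf-8 -*-
label = u"caf\xe9".encode("utf-8")   # str of UTF-8 bytes, as encoded above
token = u"caf\xe9"                   # the same text left as unicode

s = '"%s"' % label                   # all-str operands -> s stays str
print type(s)                        # <type 'str'>

s = '"%s"' % token                   # one unicode operand -> s is unicode
print type(s)                        # <type 'unicode'>

try:
    ('"%s"' % label) + token         # str bytes + unicode forces ASCII decode
except UnicodeDecodeError, e:
    print "implicit coercion failed:", e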
Example 2
def print_alignments(align_count, title, graph_pair1, graph_pair2, graphs,
                     nodes_list, out=stdout):
    if nodes_list:
        header(title, out, char="-")
        
        for nodes in nodes_list:        
            align_count += 1
            rel1 = str(graph_pair1.get_align(nodes))
            rel2 = str(graph_pair2.get_align(nodes))
            
            # tricky because of implicit coercions, 
            # see "Formatting Markers" http://www.python.org/dev/peps/pep-0100/
            print >>out, "#%d:" % align_count 
            s = '(%s) %s [%s:%s]: "%s"' % (                
                nodes.source,
                graphs.source.node[nodes.source]["label"].encode("utf-8"),
                graphs.source.node[nodes.source]["begin"],
                graphs.source.node[nodes.source]["end"],
                graphs.source.get_node_token_string(nodes.source))
            print >>out, s.encode("utf-8")
            print >>out, "<<<", rel1.upper(), "/", rel2.upper(), ">>>"
            s = '(%s) %s [%s:%s]: "%s"\n' % (
                nodes.target,
                graphs.target.node[nodes.target]["label"],
                graphs.target.node[nodes.target]["begin"],
                graphs.target.node[nodes.target]["end"],
                graphs.target.get_node_token_string(nodes.target))
            print >>out, s.encode("utf-8")
            
    return align_count
Example 3
    def write_alignment_per_relation(self, out=stdout):
        """
        write evaluation of alignment for each relation separately
        """
        header("Alignment per relation", out)

        for rel in self.relations:
            self[rel].write(self.names, out=out, heading=rel.upper())
Example 4
    def write_alignment_per_relation(self, out=stdout):
        """
        write evaluation of alignment for each relation separately
        """
        header("Alignment per relation", out)

        for rel in self.relations:
            self[rel].write(self.names, out=out, heading=rel.upper())
Example 5
def print_comments(graph_pair, annot, out, encoding="utf-8"):
    try:
        comment = graph_pair.get_meta_data().find("comment").text
    except AttributeError:
        return

    if comment.strip():
        header("Comments by " + annot, out, char="-")
        print >> out, comment.encode(encoding), "\n"
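The try/except above relies on ElementTree-style behavior: find() returns None when no <comment> child exists, so the .text attribute access raises AttributeError. A minimal sketch, assuming get_meta_data() returns such an element:

from xml.etree import ElementTree

meta = ElementTree.fromstring("<meta_data/>")   # no <comment> child
print meta.find("comment")                      # None
try:
    meta.find("comment").text                   # None has no .text
except AttributeError:
    print "no comment element, nothing to print"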
Example 6
def print_comments(graph_pair, annot, out, encoding="utf-8"):
    try:
        comment = graph_pair.get_meta_data().find("comment").text
    except AttributeError:
        return
    
    if comment.strip():
        header("Comments by " + annot, out, char="-")
        print >>out, comment.encode(encoding), "\n"
Example 7
def weight(setting):
    """
    Weight predictions
    
    @param setting: Setting instance specifying the experimental setting
    """
    if setting.weight:
        log.info("\n" + header("WEIGHT STEP"))
        
        if setting.develop:
            weight_files(
                setting.dev_inst_fns,
                setting.dev_clas_fns,
                setting.weight_func, 
                descriptor=setting.descriptor,
                n=setting.n,
                binary=setting.binary)
        if setting.validate:
            weight_files(
                setting.val_inst_fns,
                setting.val_clas_fns,
                setting.weight_func, 
                descriptor=setting.descriptor,
                n=setting.n,
                binary=setting.binary)
Example 8
def extract(setting):
    """
    Extract features from corpus files, producing instance files and true
    corpus files.
    
    @param setting: Setting instance specifying the experimental setting
    """
    if setting.extract:
        log.info("\n" + header("EXTRACT STEP"))

        makedirs(setting.inst_dir)
        makedirs(setting.true_dir)

        if setting.develop:
            inst_fns = setting.make_inst_fns(setting.dev_part_fns)
            true_fns = setting.make_true_fns(setting.dev_part_fns)

            extract_files(setting.extractor,
                          setting.graph_selector,
                          setting.dev_part_fns,
                          inst_fns,
                          true_fns,
                          binary=setting.binary)
        if setting.validate:
            inst_fns = setting.make_inst_fns(setting.val_part_fns)
            true_fns = setting.make_true_fns(setting.val_part_fns)

            extract_files(setting.extractor,
                          setting.graph_selector,
                          setting.val_part_fns,
                          inst_fns,
                          true_fns,
                          binary=setting.binary)
Example 9
def exp_init(setting):
    log.info("\n" + header("INIT"))
    log.info("Setting at start:\n" + str(setting) + "\n")
    buf = StringIO.StringIO()
    buf.write("feature description:\n")
    setting.descriptor.pprint(buf)
    log.info(buf.getvalue())
Example 10
def evaluate(setting):
    """
    Evaluate development data
    
    @param setting: Setting instance specifying the experimental setting
    """
    if setting.evaluate:
        log.info("\n" + header("EVALUATE STEP"))
        makedirs(setting.eval_dir)
            
        if setting.develop:
            setting.dev_eval = eval_files(
                setting.dev_true_fns,
                setting.dev_pred_fns,
                setting.dev_eval_fname,
                align_eval=setting.evaluator,
                n=setting.n)
             
        if setting.validate:
            setting.val_eval = eval_files(
                setting.val_true_fns,
                setting.val_pred_fns,
                setting.val_eval_fname,
                align_eval=setting.evaluator,
                n=setting.n)
Example 11
def extract(setting):
    """
    Extract features from corpus files, producing instance files and true
    corpus files.
    
    @param setting: Setting instance specifying the experimental setting
    """
    if setting.extract:
        log.info("\n" + header("EXTRACT STEP"))
        
        makedirs(setting.inst_dir)
        makedirs(setting.true_dir)
        
        if setting.develop:
            inst_fns = setting.make_inst_fns(setting.dev_part_fns)
            true_fns = setting.make_true_fns(setting.dev_part_fns)
        
            extract_files(
                setting.extractor,
                setting.graph_selector,
                setting.dev_part_fns,
                inst_fns,
                true_fns,
                binary=setting.binary)
        if setting.validate:
            inst_fns = setting.make_inst_fns(setting.val_part_fns)
            true_fns = setting.make_true_fns(setting.val_part_fns)
            
            extract_files(
                setting.extractor,
                setting.graph_selector,
                setting.val_part_fns,
                inst_fns,
                true_fns,
                binary=setting.binary)
Example 12
def merge(setting):
    """
    Merge data
    
    @param setting: Setting instance specifying the experimental setting
    """
    if setting.merge:
        log.info("\n" + header("MERGE STEP"))        
        makedirs(setting.pred_dir)
        
        if setting.develop:
            pred_fns = setting.make_pred_fns(setting.dev_true_fns)
            
            merge_files(
                setting.dev_inst_fns, 
                setting.dev_true_fns,
                pred_fns,
                merger=setting.merger, 
                descriptor=setting.descriptor, 
                n=setting.n, 
                binary=setting.binary)
        if setting.validate:
            pred_fns = setting.make_pred_fns(setting.val_true_fns)
            
            merge_files(
                setting.val_inst_fns, 
                setting.val_true_fns,
                pred_fns,
                merger=setting.merger, 
                descriptor=setting.descriptor, 
                n=setting.n, 
                binary=setting.binary)
Example 13
def sample(setting):
    """
    Sample training data
    
    @param setting: Setting instance specifying the experimental setting
    """
    if setting.sample:
        log.info("\n" + header("SAMPLE STEP"))
        makedirs(setting.samp_dir)

        if setting.develop:
            samp_fns = setting.make_samp_fns(setting.dev_inst_fns)

            sample_file(setting.class_fracts, setting.dev_inst_fns, samp_fns)
        if setting.validate:
            samp_fns = setting.make_samp_fns(setting.val_inst_fns)

            sample_file(setting.class_fracts, setting.val_inst_fns, samp_fns)
Example 14
def create_parts(setting):
    """
    Create the parallel graph corpora constituting the data parts for
    development and validation
    
    @param setting: Setting instance specifying the experimental setting
    """
    if setting.part:
        log.info("\n" + header("PARTING STEP"))
        
        if setting.develop:
            create_part_files(
                setting.dev_parts,
                base_dir=setting.corpus_dir,
                part_dir=setting.part_dir,
                max_size=setting.part_max_size)
        if setting.validate:
            create_part_files(
                setting.val_parts,
                base_dir=setting.corpus_dir,
                part_dir=setting.part_dir,
                max_size=setting.part_max_size)
Example 15
def match(setting):
    """
    Match data
    
    @param setting: Setting instance specifying the experimental setting
    """    
    if setting.match:
        log.info("\n" + header("MATCH STEP"))
        
        if setting.develop:
            match_files(
                setting.dev_inst_fns,
                setting.matcher,
                descriptor=setting.descriptor,
                n=setting.n,
                binary=setting.binary)
        if setting.validate:
            match_files(
                setting.val_inst_fns,
                setting.matcher,
                descriptor=setting.descriptor,
                n=setting.n,
                binary=setting.binary)
Example 16
def classify(setting):
    """
    Classify corpus instances

    @param setting: Setting instance specifying the experimental setting
    """    
    if setting.classify:
        log.info("\n" + header("CLASSIFY STEP"))
        
        makedirs(setting.clas_dir)
        
        if setting.train_sample:
            train_inst_fns = setting.dev_samp_fns
        else:
            train_inst_fns = setting.dev_inst_fns
        
        if setting.develop:
            classify_file_cv(
                train_inst_fns,
                test_inst_fns=setting.dev_inst_fns,
                out_fns=setting.make_out_fns(setting.dev_inst_fns),
                log_fns=setting.make_log_fns(setting.dev_inst_fns),
                descriptor=setting.descriptor,
                timbl=setting.classifier,
                options=setting.timbl_opts,
                n=setting.n,
                log=setting.timbl_log)
        if setting.validate:
            classify_file(
                train_inst_fns,
                setting.val_inst_fns,
                out_fns=setting.make_out_fns(setting.val_inst_fns),
                log_fn=setting.make_log_fname(setting.val_inst_fns[0]),
                descriptor=setting.descriptor,
                timbl=setting.classifier,
                options=setting.timbl_opts,
                log=setting.timbl_log)
Example 17
def classify(setting):
    """
    Classify corpus instances

    @param setting: Setting instance specifying the experimental setting
    """
    if setting.classify:
        log.info("\n" + header("CLASSIFY STEP"))

        makedirs(setting.clas_dir)

        if setting.train_sample:
            train_inst_fns = setting.dev_samp_fns
        else:
            train_inst_fns = setting.dev_inst_fns

        if setting.develop:
            classify_file_cv(
                train_inst_fns,
                test_inst_fns=setting.dev_inst_fns,
                out_fns=setting.make_out_fns(setting.dev_inst_fns),
                log_fns=setting.make_log_fns(setting.dev_inst_fns),
                descriptor=setting.descriptor,
                timbl=setting.classifier,
                options=setting.timbl_opts,
                n=setting.n,
                log=setting.timbl_log)
        if setting.validate:
            classify_file(train_inst_fns,
                          setting.val_inst_fns,
                          out_fns=setting.make_out_fns(setting.val_inst_fns),
                          log_fn=setting.make_log_fname(
                              setting.val_inst_fns[0]),
                          descriptor=setting.descriptor,
                          timbl=setting.classifier,
                          options=setting.timbl_opts,
                          log=setting.timbl_log)
Example 18
    def write_alignment_overall(self, out=stdout, percent=True):  
        """
        write evaluation summary of alignment over all relations
        """
        width = 14
        separator = 4 * width * "-" + "\n"
        
        header("Alignment over all relations", out)
        
        # write counts
        
        out.write("Relation:".ljust(width))
        
        for c in AlignCounts.count_keys:
            c = c.capitalize() + ":"
            out.write(c.rjust(width))
            
        out.write('\n' + separator)
        
        for rel, align_counts in zip(self.relations, self):
            out.write(rel.ljust(width))
            
            for c in AlignCounts.count_keys:
                s = str(self[rel].count_stats[c]["sum"])
                out.write(s.rjust(width))            
            out.write('\n')

        out.write(separator)
        
        out.write("Sum:".ljust(width))
        
        for k in AlignCounts.count_keys:
            s = str(self.count_stats[k]["sum"])
            out.write(s.rjust(width))           
            
        out.write('\n\n\n')
        
        # write measures
        
        out.write("Relation:".ljust(width))
        
        for c in AlignCounts.measure_keys:
            c = c.capitalize() + ":"
            out.write(c.rjust(width))
            
        out.write('\n' + separator)
        
        for rel, align_counts in zip(self.relations, self):
            out.write(rel.ljust(width))
            
            for m in AlignCounts.measure_keys:
                # report the relation's micro mean here
                value = self[rel].measure_stats[m]["micro"]["mean"]
                if percent: value *= 100
                s = "%.2f" % value
                out.write(s.rjust(width))            
            out.write('\n')

        out.write(separator)

        for method in self.measure_stat_methods:
            for stat in self.measure_stat_keys:
                s = method.capitalize() + " " + stat.capitalize() + ":"
                out.write(s.ljust(width))
                
                for m in AlignCounts.measure_keys:
                    value = self.measure_stats[m][method][stat]
                    if percent: value *= 100
                    s = "%.2f" % value
                    out.write(s.rjust(width))           
                out.write('\n')

        out.write('\n\n')
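The table layout above needs nothing beyond str.ljust() for the label column and str.rjust() for each value column, all sharing one width. A tiny self-contained illustration with made-up count keys and numbers:

width = 14
separator = 4 * width * "-"

print "Relation:".ljust(width) + "".join(
    (k.capitalize() + ":").rjust(width) for k in ("match", "miss", "spurious"))
print separator
print "equals".ljust(width) + "".join(str(n).rjust(width) for n in (120, 7, 13))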
Example 19
    def write_alignment_only(self, out=stdout):
        """
        write evaluation of alignment only, irrespective of relation labels
        """
        header("Alignment only (regardless of relation)", out)
        self[AlignEval.any_rel].write(self.names, out=out)
Example 20
    def write_alignment_overall(self, out=stdout, percent=True):
        """
        write evaluation summary of alignment over all relations
        """
        width = 14
        separator = 4 * width * "-" + "\n"

        header("Alignment over all relations", out)

        # write counts

        out.write("Relation:".ljust(width))

        for c in AlignCounts.count_keys:
            c = c.capitalize() + ":"
            out.write(c.rjust(width))

        out.write('\n' + separator)

        for rel, align_counts in zip(self.relations, self):
            out.write(rel.ljust(width))

            for c in AlignCounts.count_keys:
                s = str(self[rel].count_stats[c]["sum"])
                out.write(s.rjust(width))
            out.write('\n')

        out.write(separator)

        out.write("Sum:".ljust(width))

        for k in AlignCounts.count_keys:
            s = str(self.count_stats[k]["sum"])
            out.write(s.rjust(width))

        out.write('\n\n\n')

        # write measures

        out.write("Relation:".ljust(width))

        for c in AlignCounts.measure_keys:
            c = c.capitalize() + ":"
            out.write(c.rjust(width))

        out.write('\n' + separator)

        for rel, align_counts in zip(self.relations, self):
            out.write(rel.ljust(width))

            for m in AlignCounts.measure_keys:
                # report the relation's micro mean here
                value = self[rel].measure_stats[m]["micro"]["mean"]
                if percent: value *= 100
                s = "%.2f" % value
                out.write(s.rjust(width))
            out.write('\n')

        out.write(separator)

        for method in self.measure_stat_methods:
            for stat in self.measure_stat_keys:
                s = method.capitalize() + " " + stat.capitalize() + ":"
                out.write(s.ljust(width))

                for m in AlignCounts.measure_keys:
                    value = self.measure_stats[m][method][stat]
                    if percent: value *= 100
                    s = "%.2f" % value
                    out.write(s.rjust(width))
                out.write('\n')

        out.write('\n\n')
Example 21
def pgc_diff(corpus1, corpus2, 
             corpus_name1="Corpus1", corpus_name2="Corpus2", 
             annot1="Annot1", annot2="Annot2", 
             words_only=False,
             show_comments=False,
             show_ident=False,
             relations=None,
             out=stdout):
    """
    reports the differences (and optionally the similarities) between 
    the labeled alignments from two parallel graph corpora
    """
    assert len(corpus1) == len(corpus2)
    
    if not relations:
        relations = corpus1.get_relations()
    
    # counter for numbering the alignments when printing;
    # may be less than the actual number of alignments when identical alignments
    # are not printed (cf. show_ident option)
    align_count = 0
    
    # counter for numbering the graph pairs when printing
    pair_count = 0
    
    header("%s corpus: %s\n%s corpus: %s" % (annot1, corpus_name1, annot2,
                                             corpus_name2), width=120, char="#")
    
    for graph_pair1, graph_pair2 in zip(corpus1, corpus2):
        # assume that the corpora have the same graph pairs in the same order,
        # so the only difference is in the aligned nodes
        assert graph_pair1._banks == graph_pair2._banks
        assert graph_pair1._graphs_equal(graph_pair2)
        
        pair_count += 1
        ident = []
        rel_diff = [] 
        uniq1 = []
        uniq2 = []
        # recall that graphs are identical
        graphs = graph_pair1.get_graphs()
        
        for nodes, rel1 in graph_pair1.alignments_iter(relations=relations):
            if ( words_only and
                 graphs.source.node_is_non_terminal(nodes.source) and
                 graphs.target.node_is_non_terminal(nodes.target) ):
                continue
            
            rel2 = graph_pair2.get_align(nodes)
                        
            if not rel2:
                uniq1.append(nodes)
            elif rel1 == rel2:
                ident.append(nodes)
            else:                        
                rel_diff.append(nodes)
            
        for nodes, rel2 in graph_pair2.alignments_iter(relations=relations):
            if ( words_only and
                 ( graphs.source.node_is_terminal(nodes.source) or
                   graphs.target.node_is_terminal(nodes.target) )):
                continue
            
            if not graph_pair1.get_align(nodes):
                uniq2.append(nodes)
                    
        
        #if not ( ident and rel_diff and uniq1 and uniq2 and show_comments ):
        #    continue
            
        header("Graph pair %d" % pair_count, width=120, char="=")
        
        print >>out, graphs.source.get_graph_token_string().encode("utf-8"), "\n"
        print >>out, graphs.target.get_graph_token_string().encode("utf-8"), "\n"
        
        if show_comments:
            print_comments(graph_pair1, annot1, out)
            print_comments(graph_pair2, annot2, out)
            
        if show_ident:
            ident.sort(cmp=cmp_nodes)
            align_count = print_alignments(align_count, "Identical",
                                           graph_pair1, graph_pair2, graphs, ident, out)
        
        rel_diff.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, "Relation different",
                                       graph_pair1, graph_pair2, graphs, rel_diff, out)
        
        uniq1.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, annot1 + " only",
                                       graph_pair1, graph_pair2, graphs, uniq1, out)
        
        uniq2.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, annot2 + " only",
                                       graph_pair1, graph_pair2, graphs, uniq2, out)
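cmp_nodes is not defined in these examples. A plausible stand-in, assumed here purely for illustration, orders alignments by source node and then target node so they print in source order:

def cmp_nodes(nodes1, nodes2):
    # hypothetical comparator for sort(cmp=...); cmp() is a Python 2 builtin
    return cmp((nodes1.source, nodes1.target),
               (nodes2.source, nodes2.target))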
Example 22
def pgc_diff(corpus1,
             corpus2,
             corpus_name1="Corpus1",
             corpus_name2="Corpus2",
             annot1="Annot1",
             annot2="Annot2",
             words_only=False,
             show_comments=False,
             show_ident=False,
             relations=None,
             out=stdout):
    """
    reports the differences (and optionally the similarities) between 
    the labeled alignments from two parallel graph corpora
    """
    assert len(corpus1) == len(corpus2)

    if not relations:
        relations = corpus1.get_relations()

    # counter for numbering the alignments when printing;
    # may be less than the actual number of alignments when identical alignments
    # are not printed (cf. show_ident option)
    align_count = 0

    # counter for numbering the graph pairs when printing
    pair_count = 0

    header("%s corpus: %s\n%s corpus: %s" %
           (annot1, corpus_name1, annot2, corpus_name2),
           width=120,
           char="#")

    for graph_pair1, graph_pair2 in zip(corpus1, corpus2):
        # assume that the corpora have the same graph pairs in the same order,
        # so the only difference is in the aligned nodes
        assert graph_pair1._banks == graph_pair2._banks
        assert graph_pair1._graphs_equal(graph_pair2)

        pair_count += 1
        ident = []
        rel_diff = []
        uniq1 = []
        uniq2 = []
        # recall that graphs are identical
        graphs = graph_pair1.get_graphs()

        for nodes, rel1 in graph_pair1.alignments_iter(relations=relations):
            if (words_only
                    and graphs.source.node_is_non_terminal(nodes.source)
                    and graphs.target.node_is_non_terminal(nodes.target)):
                continue

            rel2 = graph_pair2.get_align(nodes)

            if not rel2:
                uniq1.append(nodes)
            elif rel1 == rel2:
                ident.append(nodes)
            else:
                rel_diff.append(nodes)

        for nodes, rel2 in graph_pair2.alignments_iter(relations=relations):
            if (words_only
                    and (graphs.source.node_is_terminal(nodes.source)
                         or graphs.target.node_is_terminal(nodes.target))):
                continue

            if not graph_pair1.get_align(nodes):
                uniq2.append(nodes)

        #if not ( ident and rel_diff and uniq1 and uniq2 and show_comments ):
        #    continue

        header("Graph pair %d" % pair_count, width=120, char="=")

        print >> out, graphs.source.get_graph_token_string().encode(
            "utf-8"), "\n"
        print >> out, graphs.target.get_graph_token_string().encode(
            "utf-8"), "\n"

        if show_comments:
            print_comments(graph_pair1, annot1, out)
            print_comments(graph_pair2, annot2, out)

        if show_ident:
            ident.sort(cmp=cmp_nodes)
            align_count = print_alignments(align_count, "Identical",
                                           graph_pair1, graph_pair2, graphs,
                                           ident, out)

        rel_diff.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, "Relation different",
                                       graph_pair1, graph_pair2, graphs,
                                       rel_diff, out)

        uniq1.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, annot1 + " only",
                                       graph_pair1, graph_pair2, graphs, uniq1,
                                       out)

        uniq2.sort(cmp=cmp_nodes)
        align_count = print_alignments(align_count, annot2 + " only",
                                       graph_pair1, graph_pair2, graphs, uniq2,
                                       out)
Example 23
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data
    
    Weighting, matching and merging take place per test corpus without
    writing intermediate results to a file.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # Extraction can't be done one corpus at a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus at a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate: setting.evaluator.__init__()

    scope = zip(setting.dev_inst_fns, setting.dev_clas_fns,
                setting.dev_true_fns)[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            timbl_out = parse_timbl_output(open(out_fname))
            log.info("weighting...")
            weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
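For contrast with the fast variant above, a hypothetical file-based driver (not part of the source) would simply chain the step functions shown earlier; each step reads the previous step's files and is gated by its own flag on the Setting instance:

def exp_dev(setting):
    # hypothetical end-to-end pipeline composed of the steps defined above
    assert setting.develop and not setting.validate
    exp_init(setting)
    create_parts(setting)   # partition corpora into development parts
    extract(setting)        # parts -> instance files + true corpora
    sample(setting)         # optionally subsample training instances
    classify(setting)       # file-based Timbl classification
    weight(setting)         # classifier output -> instance weights
    match(setting)          # weighted instances -> matched nodes
    merge(setting)          # matched instances + true corpora -> predictions
    evaluate(setting)       # score predictions against the true alignments
    exp_exit(setting)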
Example 24
def exp_exit(setting):
    log.info("\n" + header("EXIT"))
    pickle(setting)
    feat_weight_graphs(setting)
    log.info("Setting at end:\n" + str(setting) + "\n")
Example 25
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data
    
    Weighting, matching and merging take place per test corpus without
    writing intermediate results to a file.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # Extraction can't be done one corpus at a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus at a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    scope = zip(setting.dev_inst_fns, setting.dev_clas_fns,
                setting.dev_true_fns)[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            timbl_out = parse_timbl_output(open(out_fname))
            log.info("weighting...")
            weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname, graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus, setting.merger)

        if setting.evaluate:
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)