Esempio n. 1
0
def read_corpus(corpus):
    """
    Load a parallel text corpus together with its two document trees

    @param corpus: a HitaextDoc instance, or a value that is handed to
    HitaextDoc as its C{file} argument in order to create one

    @return: tuple of the corpus, its "from" document tree and its "to"
    document tree; both trees have been updated and then passed to the
    corpus' inject_alignments method
    """
    doc = corpus if isinstance(corpus, HitaextDoc) else HitaextDoc(file=corpus)

    # fetch and update one side at a time: first "from", then "to"
    trees = []
    for side in ("from", "to"):
        tree = doc.get_doc_tree(side)
        tree.update()
        trees.append(tree)

    source_tree, target_tree = trees
    doc.inject_alignments(source_tree, target_tree)

    return doc, source_tree, target_tree
        
          
    
        
    
    
Esempio n. 2
0
def eval_alignment(filename_pairs, tag, labels=None):
    """
    Print an evaluation of the alignment w.r.t. tag

    One row with the alignment counts and the scores computed from them is
    printed for every pair of files, followed by a row with the totals over
    all pairs.

    @param filename_pairs: list of filename pairs consisting of a true and a
    predicted parallel text corpus

    @param tag: only alignments involving this tag are considered

    @keyword labels: list of string labels, one per filename pair; when
    empty or omitted, the basename of the predicted corpus file of each pair
    is used. Pairs and labels are combined with zip, so surplus items in the
    longer of the two are ignored.
    """
    overall_true = overall_pred = overall_common = 0

    separator = 128 * "-"
    # seven numeric columns of width 10, shared by the per-pair rows and the
    # totals row
    row_format = "%10d%10d%10d%10.2f%10.2f%10.2f%10.2f"

    # Every print below takes exactly one parenthesised argument, which
    # produces the same output as a Python 2 print statement and as the
    # Python 3 print function.
    print(
        "    #true:    #pred:  #common:    ratio:"
        "     prec:      rec:  f-score:   label:")
    print(separator)

    if not labels:
        labels = [os.path.basename(pair[1]) for pair in filename_pairs]

    for (true_fn, pred_fn), label in zip(filename_pairs, labels):
        true_corpus = HitaextDoc(file=true_fn)
        pred_corpus = HitaextDoc(file=pred_fn)
        n_true, n_pred, n_common = count_alignment(true_corpus, pred_corpus,
                                                   tag)
        overall_true += n_true
        overall_pred += n_pred
        overall_common += n_common
        ratio, prec, rec, f = compute_scores(n_true, n_pred, n_common)
        print((row_format + "   %s") % (
            n_true, n_pred, n_common, ratio, prec, rec, f, label))

    # scores over the accumulated counts of all pairs
    ratio, prec, rec, f = compute_scores(overall_true, overall_pred,
                                         overall_common)
    print(separator)
    print(row_format % (
        overall_true, overall_pred, overall_common, ratio, prec, rec, f))
Esempio n. 3
0
def read_corpus(corpus):
    """
    Return a parallel text corpus and the document trees of both its sides

    @param corpus: a HitaextDoc instance, or a value that is handed to
    HitaextDoc as its C{file} argument in order to create one

    @return: tuple of the corpus, its "from" document tree and its "to"
    document tree; both trees have been updated and then passed to the
    corpus' inject_alignments method
    """
    if isinstance(corpus, HitaextDoc):
        doc = corpus
    else:
        doc = HitaextDoc(file=corpus)

    source_tree = doc.get_doc_tree("from")
    source_tree.update()

    target_tree = doc.get_doc_tree("to")
    target_tree.update()

    doc.inject_alignments(source_tree, target_tree)

    result = (doc, source_tree, target_tree)
    return result
Esempio n. 4
0
    )


# optional flag that switches on the progress messages written to stderr below
parser.add_argument(
    "-V", "--verbose",
    action="store_true",
    help="verbose output"
    )


args = parser.parse_args()

if args.verbose:
    print >>stderr, "Reading corpus from", args.corpus
    
# load the parallel text corpus given on the command line
corpus = HitaextDoc(file=args.corpus)

# document trees for the "from" and the "to" side of the corpus
from_tree = corpus.get_doc_tree("from")
to_tree = corpus.get_doc_tree("to")

from_tree.update()
to_tree.update()

# NOTE(review): presumably this round trip -- inject the alignments into the
# trees, mark the alignment method as "id", extract the alignments again --
# rewrites the alignments in id-based form; confirm against HitaextDoc
corpus.inject_alignments(from_tree, to_tree)

corpus.alignment.set("method", "id")

corpus.extract_alignments(from_tree, to_tree)

# NOTE(review): only the message is emitted here; the statement that actually
# writes the corpus presumably follows
if args.verbose:
    print >>stderr, "Writing corpus to", args.corpus
Esempio n. 5
0
def pgc_from_ptc(text_corpus_file,
                 source_graphbank_file, 
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Build a parallel graph corpus out of a parallel text corpus and a pair
    of graphbanks

    Each alignment in the text corpus whose "from_tag" and "to_tag" equal
    the focus tags yields one graph pair, unless it is filtered out on token
    length or token difference. The graphs are looked up in the graphbanks
    under the "from_id" and "to_id" of the alignment.

    @param text_corpus_file: parallel text corpus filename
    @param source_graphbank_file: source graphbank filename
    @param target_graphbank_file: target graphbank filename

    @keyword focus_tags: pair of focus tags
    @keyword graph_formats: pair of graphbank formats
    @keyword relations: list of alignment relations
    @keyword min_token_diff: minimum number of different tokens; a false
    value (such as the default 0) disables this filter
    @keyword max_token_len: maximum number of tokens per focus element

    @return: ParallelGraphCorpus object
    """
    # parallel text corpus and its document trees
    ptc = HitaextDoc(file=text_corpus_file)
    trees = ptc.get_doc_trees(search=True)

    # graphbanks: source first, then target
    src_bank = GraphBank(source_graphbank_file, graph_formats.source)
    src_bank.load()
    tgt_bank = GraphBank(target_graphbank_file, graph_formats.target)
    tgt_bank.load()
    bank_pair = Pair(src_bank, tgt_bank)

    # the corpus starts out empty and is filled below
    pgc = ParallelGraphCorpus(relations=relations)

    for link in ptc.alignment:
        is_focus_link = (link.get("from_tag") == focus_tags.source and
                         link.get("to_tag") == focus_tags.target)
        if not is_focus_link:
            continue

        from_id = link.get("from_id")
        src_tokens = _get_elem_tokens(trees.source, focus_tags.source,
                                      from_id)
        to_id = link.get("to_id")
        tgt_tokens = _get_elem_tokens(trees.target, focus_tags.target,
                                      to_id)

        # drop alignments where either side is too long
        if len(src_tokens) > max_token_len:
            continue
        if len(tgt_tokens) > max_token_len:
            continue

        # drop alignments whose sides differ too little
        if min_token_diff:
            if _token_diff(src_tokens, tgt_tokens) < min_token_diff:
                continue

        # the crucial assumption is that the ids of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graphbanks
        graph_pair = Pair(src_bank.get_graph(from_id),
                          tgt_bank.get_graph(to_id))
        pgc.append(GraphPair(bank_pair, graph_pair))

    return pgc
Esempio n. 6
0
# optional flag that switches on the progress messages written to stderr below
parser.add_argument(
    "-V", "--verbose",
    action="store_true",
    help="verbose output"
    )


args = parser.parse_args()


# process every Hitaext document given on the command line
for fn in args.corpus:
    # messages are emitted in verbose mode as well as in test mode
    if args.verbose or args.test:
        print >>stderr, "Reading Hitaext document", fn
        
    htdoc = HitaextDoc(file=fn)
    
    # handle the filename stored for each of the two sides in turn
    for side in ("from", "to"):
        path = htdoc.get_filename(side)
        
        if args.verbose or args.test:
            print >>stderr, "Current %s path is %s" %  (side, path)
        
        # a heuristic to deal with Windows paths: drop a leading drive
        # letter and turn backslashes into forward slashes
        # NOTE(review): path[0] and path[1] are indexed directly, so a path
        # shorter than two characters raises IndexError; a lowercase drive
        # letter is not stripped (uppercase is presumably string.uppercase)
        if path[0] in uppercase and path[1] == ":":
            # strip drive letter
            path = path[2:]
        path = path.replace("\\", "/")
        
        # keep only the last path component and put it under args.dir
        # (join and basename are presumably those of os.path -- confirm)
        path = join(args.dir, basename(path))
        
Esempio n. 7
0
# The description of the positional argument belongs in help=. A default is
# never used for a required nargs="+" positional, so none is given.
parser.add_argument("corpus",
                    nargs="+",
                    help="parallel text corpus")

# optional flag that switches on the progress messages written to stderr below
parser.add_argument("-V",
                    "--verbose",
                    action="store_true",
                    help="verbose output")

args = parser.parse_args()

if args.verbose:
    print >> stderr, "Reading corpus from", args.corpus

# NOTE(review): with nargs="+" args.corpus is a list of strings, yet it is
# passed as a single file argument here -- confirm that HitaextDoc accepts
# this, or iterate over args.corpus
corpus = HitaextDoc(file=args.corpus)

# document trees for the "from" and the "to" side of the corpus
from_tree = corpus.get_doc_tree("from")
to_tree = corpus.get_doc_tree("to")

from_tree.update()
to_tree.update()

# NOTE(review): presumably this round trip -- inject the alignments into the
# trees, mark the alignment method as "id", extract the alignments again --
# rewrites the alignments in id-based form; confirm against HitaextDoc
corpus.inject_alignments(from_tree, to_tree)

corpus.alignment.set("method", "id")

corpus.extract_alignments(from_tree, to_tree)

# NOTE(review): only the message is emitted here; the statement that actually
# writes the corpus presumably follows
if args.verbose:
    print >> stderr, "Writing corpus to", args.corpus
Esempio n. 8
0
def pgc_from_ptc(text_corpus_file,
                 source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=Pair("s", "s"),
                 graph_formats=Pair("alpino", "alpino"),
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Derive a parallel graph corpus from a parallel text corpus and a pair of
    graphbanks

    Each alignment in the text corpus whose "from_tag" and "to_tag" equal
    the focus tags yields one graph pair, unless it is filtered out on token
    length or token difference. The graphs are looked up in the graphbanks
    under the "from_id" and "to_id" of the alignment.

    @param text_corpus_file: parallel text corpus filename
    @param source_graphbank_file: source graphbank filename
    @param target_graphbank_file: target graphbank filename

    @keyword focus_tags: pair of focus tags
    @keyword graph_formats: pair of graphbank formats
    @keyword relations: list of alignment relations
    @keyword min_token_diff: minimum number of different tokens; a false
    value (such as the default 0) disables this filter
    @keyword max_token_len: maximum number of tokens per focus element

    @return: ParallelGraphCorpus object
    """
    # parallel text corpus and its document trees
    parallel_text = HitaextDoc(file=text_corpus_file)
    text_trees = parallel_text.get_doc_trees(search=True)

    # graphbanks: source first, then target
    bank_from = GraphBank(source_graphbank_file, graph_formats.source)
    bank_from.load()
    bank_to = GraphBank(target_graphbank_file, graph_formats.target)
    bank_to.load()
    both_banks = Pair(bank_from, bank_to)

    # the result starts out empty and is filled below
    result = ParallelGraphCorpus(relations=relations)

    for link in parallel_text.alignment:
        wanted = (link.get("from_tag") == focus_tags.source and
                  link.get("to_tag") == focus_tags.target)
        if not wanted:
            continue

        id_from = link.get("from_id")
        tokens_from = _get_elem_tokens(text_trees.source, focus_tags.source,
                                       id_from)
        id_to = link.get("to_id")
        tokens_to = _get_elem_tokens(text_trees.target, focus_tags.target,
                                     id_to)

        # skip alignments where either side exceeds the length limit
        if len(tokens_from) > max_token_len:
            continue
        if len(tokens_to) > max_token_len:
            continue

        # skip alignments whose sides differ in too few tokens
        if min_token_diff:
            if _token_diff(tokens_from, tokens_to) < min_token_diff:
                continue

        # the crucial assumption is that the ids of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graphbanks
        both_graphs = Pair(bank_from.get_graph(id_from),
                           bank_to.get_graph(id_to))
        result.append(GraphPair(both_banks, both_graphs))

    return result
Esempio n. 9
0
    "--test",
    action="store_true",
    help="perform a dry run without actually changing the files (implies -v)")

# optional flag that switches on the progress messages written to stderr below
parser.add_argument("-V",
                    "--verbose",
                    action="store_true",
                    help="verbose output")

args = parser.parse_args()

# process every Hitaext document given on the command line
for fn in args.corpus:
    # messages are emitted in verbose mode as well as in test mode
    if args.verbose or args.test:
        print >> stderr, "Reading Hitaext document", fn

    htdoc = HitaextDoc(file=fn)

    # handle the filename stored for each of the two sides in turn
    for side in ("from", "to"):
        path = htdoc.get_filename(side)

        if args.verbose or args.test:
            print >> stderr, "Current %s path is %s" % (side, path)

        # a heuristic to deal with Windows paths: drop a leading drive
        # letter and turn backslashes into forward slashes
        # NOTE(review): path[0] and path[1] are indexed directly, so a path
        # shorter than two characters raises IndexError; a lowercase drive
        # letter is not stripped (uppercase is presumably string.uppercase)
        if path[0] in uppercase and path[1] == ":":
            # strip drive letter
            path = path[2:]
        path = path.replace("\\", "/")

        # keep only the last path component and put it under args.dir
        # (join and basename are presumably those of os.path -- confirm)
        path = join(args.dir, basename(path))