Example #1
def read_from_ptb(filename):
    # Assumes module-level names: deepcopy (from copy), Tree (from nltk),
    # chomsky_normal_form (from nltk.treetransforms), HORZMARKOV, VERTMARKOV.
    ptb_file = open(filename, 'r')
    for l in ptb_file:
        # One bracketed PTB tree per line.
        tree = Tree.fromstring(l, remove_empty_top_bracketing=False)
        # Keep an untouched copy; chomsky_normal_form rewrites the tree in place.
        original = deepcopy(tree)
        chomsky_normal_form(tree, horzMarkov=HORZMARKOV, vertMarkov=VERTMARKOV,
                            childChar='|', parentChar='#')

    ptb_file.close()
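
The snippet above depends on names defined elsewhere in its module. A minimal self-contained sketch of the same call against NLTK's public API, with assumed values for HORZMARKOV and VERTMARKOV:

from copy import deepcopy

from nltk import Tree
from nltk.treetransforms import chomsky_normal_form

HORZMARKOV = 2  # assumed value; the original module defines this elsewhere
VERTMARKOV = 1  # assumed value

tree = Tree.fromstring("(S (NP (DT the) (JJ big) (NN cat)) (VP (VBD sat)))")
original = deepcopy(tree)  # keep a copy; the transform rewrites tree in place
chomsky_normal_form(tree, horzMarkov=HORZMARKOV, vertMarkov=VERTMARKOV,
                    childChar='|', parentChar='#')
print(tree)  # new binarization nodes carry '|'; parent annotations carry '#'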
Example #2
def generate_rule(treebank_file):
    # if you use unicode here, there is a bug...
    # Python 2 snippet: it relies on the print statement, xrange, and unicode.
    # Assumes module-level names: sys, deepcopy (from copy), Tree (from nltk),
    # NewTree, language_setting, unary_collapse, use_back_off_rule.
    f = open(treebank_file, "r")
    pos_set = set([])
    full_rule_set = set([])
    s_ind = 0
    for sentence in f:
        if language_setting == "chn":
            sentence = sentence.decode('utf-8')
        s_ind += 1
        if s_ind % 100 == 0:
            sys.stderr.write(str(s_ind) + "..")
        tree = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
        preterminals = [
            t.label() for t in tree.subtrees(lambda t: t.height() == 2)
        ]
        pos_set.update(preterminals)

        # First, collapse unary chains; note that the POS tags should not be affected
        if unary_collapse:
            NewTree.collapse_unary(tree)
        bt = NewTree.get_binarize_lex(tree)
        # Extract rules from the tree
        rule_set = NewTree.generate_rules(bt)

        # Add them to the full set
        for sr in rule_set:
            full_rule_set.add(sr)
    sys.stderr.write("\n")
    f.close()

    # print core_pos_set
    # Generate the back-off rules
    backoff_rule_set = set([])
    for r in full_rule_set:
        args = r.split(" ")

        for i in xrange(1, len(args) - 1):
            if args[i] in pos_set:
                args_copy = deepcopy(args)
                args_copy[i] = "BPOS|"
                backoff_rule_set.add(" ".join(args_copy))

    ind = 0
    for r in full_rule_set:
        print str(ind) + " " + r
        ind += 1
    if use_back_off_rule:
        for r in backoff_rule_set:
            print str(ind) + " " + r
            ind += 1
        for pos in pos_set:
            print str(ind) + " 1 BPOS| " + pos
            ind += 1
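
The back-off loop above is the interesting step: in every rule, each position between the first and the last token whose symbol is a POS tag is generalized to the placeholder "BPOS|". A hand-run sketch with a toy rule set (Python 3 range standing in for xrange):

from copy import deepcopy

pos_set = {"DT", "NN", "VBD"}
full_rule_set = {"NP DT NN", "VP VBD NP"}

backoff_rule_set = set()
for r in full_rule_set:
    args = r.split(" ")
    # The first and last tokens of a rule are never generalized.
    for i in range(1, len(args) - 1):
        if args[i] in pos_set:
            args_copy = deepcopy(args)
            args_copy[i] = "BPOS|"
            backoff_rule_set.add(" ".join(args_copy))

print(sorted(backoff_rule_set))
# ['NP BPOS| NN', 'VP BPOS| NP']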
Example #3
def generate_conll(inputf):
    # Assumes module-level names: sys, Tree (from nltk), NewTree, language_setting.
    f = open(inputf, "r")
    s_ind = 0
    for sentence in f:
        if language_setting == "chn":
            sentence = sentence.decode('utf-8')
        s_ind += 1
        if s_ind % 100 == 0:
            sys.stderr.write(str(s_ind) + "..")
        t = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
        deps = NewTree.generateDep(t)
        NewTree.print_conll_lines(deps, sys.stdout)
        sys.stdout.write("\n")
    f.close()
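
The language_setting == "chn" branch exists only because the treebank file is opened in byte mode under Python 2. Opening it with an explicit encoding makes the per-line decode unnecessary; a sketch of that approach, using a hypothetical iter_trees helper:

import io
import sys

from nltk import Tree

def iter_trees(inputf, encoding="utf-8"):
    # io.open works on Python 2 and 3 alike and yields decoded (unicode)
    # lines directly, so no per-line decode is needed for UTF-8 treebanks.
    with io.open(inputf, "r", encoding=encoding) as f:
        for s_ind, line in enumerate(f, start=1):
            if s_ind % 100 == 0:
                sys.stderr.write(str(s_ind) + "..")
            yield Tree.fromstring(line, remove_empty_top_bracketing=False)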
Example #4
def generate_part(treebank_file, rule_file):
    # This generates the gold parts file consumed by the C++ code.
    # Assumes module-level names: sys, deepcopy (from copy), Tree (from nltk),
    # NewTree, read_rule_file, language_setting, unary_collapse.
    rule_dic = read_rule_file(rule_file)
    f = open(treebank_file, "r")
    s_ind = 0

    for sentence in f:
        if language_setting == "chn":
            sentence = sentence.decode('utf-8')
        s_ind += 1
        if s_ind % 100 == 0:
            sys.stderr.write(str(s_ind) + "..")
        parts = []
        t = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
        if unary_collapse:
            NewTree.collapse_unary(t)
        bt = NewTree.get_binarize_lex(t)
        for pos in bt.treepositions(order='postorder'):
            nt = bt[pos]
            if isinstance(nt, str) or isinstance(nt, unicode):
                # Leaf tokens are plain strings; skip them.
                continue
            elif nt.height() == 2:
                # Skip preterminals (a POS tag over a single word).
                continue
            else:
                info = NewTree.get_span_info(nt, rule_dic)
                parts.append(info)

        work_tree = deepcopy(t)
        NewTree.lexLabel(work_tree)
        parent_dic, dep_label_set = NewTree.getParentDic(work_tree)

        print len([item for item in parts if item is not None])
        # parent_list and label_list are built but not used in this snippet.
        parent_list = []
        label_list = []
        for ind in xrange(1, (len(t.leaves()) + 1)):
            p = str(int(parent_dic[str(ind)]) - 1)
            parent_list.append(p)
        for ind in xrange(1, (len(t.leaves()) + 1)):
            l = dep_label_set[str(ind)]
            label_list.append(l)
        for p in parts:
            if p is not None:
                print " ".join(p)
    sys.stderr.write("\n")
    f.close()
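
The post-order traversal above, with its two skip cases (leaf strings, then preterminals), is a reusable pattern. A runnable illustration on a toy tree using plain NLTK (Python 3, so the unicode check is dropped):

from nltk import Tree

bt = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
for pos in bt.treepositions(order='postorder'):
    nt = bt[pos]
    if isinstance(nt, str):   # leaf tokens are plain strings
        continue
    if nt.height() == 2:      # preterminals: a POS tag over a single word
        continue
    print("%s -> %s" % (nt.label(), " ".join(nt.leaves())))
# NP -> the cat
# VP -> sat
# S -> the cat sat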
Example #5
def data_to_tok_ids(data_path, target_path, vocabulary_path, model="seq2seq"):
    # Note: Tree here is a project-specific class with a get_sequences()
    # method; it is not nltk.Tree. Also assumes initialize_vocabulary,
    # sentence_to_tok_ids, and UNK_ID from the surrounding module.
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with open(data_path, "r") as data_file:
        with open(target_path, "w") as tokens_file:
            counter = 0
            for line in data_file:
                counter += 1
                if counter % 100000 == 0:
                    print("  tokenizing line %d" % counter)
                if model == 'seq2tree':
                    # seq2tree: one tab-separated field of ids per tree sequence.
                    tree = Tree.fromstring(line.strip())
                    seqlist = tree.get_sequences()
                    tok_seqlist = [[vocab.get(w, UNK_ID) for w in seq]
                                   for seq in seqlist]
                    tokens_file.write("\t".join(
                        [" ".join([str(tok) for tok in seq_ids])
                         for seq_ids in tok_seqlist]) + "\n")
                else:
                    # seq2seq: one space-separated id sequence per input line.
                    tokdst_ids = sentence_to_tok_ids(line, vocab)
                    tokens_file.write(
                        " ".join([str(tok) for tok in tokdst_ids]) + "\n")
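
For context, a guess at the shape of the helper the else branch calls; the real sentence_to_tok_ids and UNK_ID live elsewhere in this project, and the constant below merely follows the common TensorFlow seq2seq convention (_PAD=0, _GO=1, _EOS=2, _UNK=3):

UNK_ID = 3  # assumed value, not taken from this project

def sentence_to_tok_ids(sentence, vocab):
    # Map each whitespace-separated token to its vocabulary id,
    # falling back to UNK_ID for out-of-vocabulary words.
    return [vocab.get(w, UNK_ID) for w in sentence.split()]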