def write_to_file(orig_file,
                  revised_file,
                  outfile=None,
                  no_heuristic_mismatch_fix=False):
    reader = BracketParseCorpusReader('.', [])
    orig_trees = reader.parsed_sents(orig_file)
    revised_trees = reader.parsed_sents(revised_file)

    # The revised PTB parses have one less tree in the training split.
    # This attempts to patch the mismatch by dropping the extra tree
    # (index 4906) from the original parses.
    if not no_heuristic_mismatch_fix:
        orig_trees = list(orig_trees)
        revised_trees = list(revised_trees)
        if len(orig_trees) == 39832 and len(revised_trees) == 39831:
            del orig_trees[4906]

    converted_trees = convert_to_revised_tokenization(orig_trees,
                                                      revised_trees)

    if outfile is None:
        for tree in converted_trees:
            print(tree.pformat(margin=1e100))
    else:
        with open(outfile, 'w') as f:
            for tree in converted_trees:
                tree_rep = tree.pformat(margin=1e100)
                assert ('\n' not in tree_rep)
                f.write(tree_rep)
                f.write("\n")
Code Example #2
File: negra.py Project: francolq/lq-nlp-commons
class Negra(treebank.SavedTreebank):
    default_basedir = 'negra-corpus'
    trees = []
    filename = 'negra.treebank'

    def __init__(self, basedir=None):
        if basedir is None:
            basedir = self.default_basedir
        self.basedir = basedir
        self.reader = BracketParseCorpusReader(basedir,
                                               'negra-corpus2.penn',
                                               comment_char='%')

    def parsed(self, files=None):
        # for t in treebank.SavedTreebank.parsed(self, files):
        # enumerate() replaces the Python 2 itertools.izip(itertools.count(), ...)
        for i, t in enumerate(self.reader.parsed_sents()):
            yield NegraTree(t, labels=i)

    def get_tree(self, offset=0):
        t = self.get_trees2(offset, offset + 1)[0]
        return t

    # Returns the trees at positions i with start <= i < end.
    def get_trees2(self, start=0, end=None):
        return list(itertools.islice(self.parsed(), start, end))

    def is_ellipsis(self, s):
        return is_ellipsis(s)

    def is_punctuation(self, s):
        return is_punctuation(s)
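A hedged usage sketch of the class above, assuming the NEGRA export file negra-corpus2.penn is available under negra-corpus/ and that treebank, NegraTree, is_ellipsis and is_punctuation come from the surrounding lq-nlp-commons modules:

negra = Negra()                      # reads negra-corpus/negra-corpus2.penn
first = negra.get_tree(0)            # first sentence, wrapped as a NegraTree
first_five = negra.get_trees2(0, 5)  # trees at positions 0 <= i < 5
print(len(first_five))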
Code Example #3
File: treesift.py Project: pombredanne/TreeSift
class Corpus(object):

    def __init__(self, root, fids='.*', tag_taxonomy=None, headship=None):
        self.corpus = BracketParseCorpusReader(root=root, fileids=fids)

        self._corpus_iter = (sent for sent in self.corpus.parsed_sents())

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol (the original defined a Python 2 ``next``
        # method). When the generator is exhausted, rebuild it so the corpus
        # can be iterated over again, then re-raise StopIteration.
        try:
            return next(self._corpus_iter)
        except StopIteration:
            self._corpus_iter = (sent for sent in self.corpus.parsed_sents())
            raise
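A usage sketch of the reset-on-exhaustion behaviour; the corpus root and file-id pattern below are hypothetical:

corpus = Corpus("treebank/wsj", fids=r"wsj_.*\.mrg")  # hypothetical location
first_pass = sum(1 for _ in corpus)
# The StopIteration handler rebuilt the generator, so a second full pass over
# the same object works.
second_pass = sum(1 for _ in corpus)
assert first_pass == second_pass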
Code Example #5
def write_corpus_file(dirname):
    """Takes all .mrg PTB files in a directory and puts them in a single file.
    This allows for faster retrieval from disk."""
    fileids = r"wsj_.*\.mrg"
    reader = BracketParseCorpusReader(dirname, fileids)
    text = "\n\n".join(str(tree) for tree in reader.parsed_sents())

    filename = dirname + ".txt"
    with open(filename, "w") as fh:
        fh.write(text)
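Hypothetical usage of the helper above, assuming a directory such as wsj/02 containing wsj_*.mrg files:

# Writes every tree found under wsj/02/ into a single file wsj/02.txt.
write_corpus_file("wsj/02")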
Code Example #6
def _get_trees(path: str) -> Iterator[list]:
    """Takes either a directory of .mrg files or a single .txt file."""
    if os.path.isdir(path):
        fileids = r"wsj_.*\.mrg"
        reader = BracketParseCorpusReader(path, fileids)
        yield from reader.parsed_sents()
    else:
        with open(path) as fh:
            for line in fh.read().split("\n\n"):
                yield Tree.fromstring(line)
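A usage sketch tying this back to write_corpus_file above (paths are hypothetical); both calls should yield the same sequence of nltk.Tree objects:

n_from_dir = sum(1 for _ in _get_trees("wsj/02"))      # directory of .mrg files
n_from_txt = sum(1 for _ in _get_trees("wsj/02.txt"))  # flattened single file
assert n_from_dir == n_from_txt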
Code Example #7
def write_to_file(data_root, splits, outfile, add_top=False):
    reader = BracketParseCorpusReader('.', glob_files(data_root, splits))
    with open(outfile, 'w') as f:
        for tree in reader.parsed_sents():
            tree_rep = tree.pformat(margin=1e100)
            if add_top:
                tree_rep = "(TOP %s)" % tree_rep
            assert ('\n' not in tree_rep)
            f.write(tree_rep)
            f.write("\n")
Code Example #8
def get_id_list(target_root, splits):
    res = []
    for fname in glob_tree_files(target_root, splits):
        reader = BracketParseCorpusReader('.', [fname])
        num_sents = len(reader.parsed_sents())
        doc_id = os.path.splitext(os.path.split(fname)[-1])[0]
        for sent_id in range(num_sents):
            sent_id = "{}_{:03}".format(doc_id, sent_id)
            res.append((doc_id, sent_id))
    return res
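For illustration, the (doc_id, sent_id) pairs produced for a hypothetical file wsj_0201.mrg containing three trees would look like this:

# [('wsj_0201', 'wsj_0201_000'),
#  ('wsj_0201', 'wsj_0201_001'),
#  ('wsj_0201', 'wsj_0201_002')]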
Code Example #9
f_dev = codecs.open(TB_DEV, 'w', 'UTF-8')
for s in corpus_dev:
    f_dev.write(u"{0}\n".format(s))
f_dev.close()

corpus_test = data[int(s_total * (p_train + p_dev)):]
f_test = codecs.open(TB_TEST, 'w', 'UTF-8')
for s in corpus_test:
    f_test.write(u"{0}\n".format(s))
f_test.close()

corpus_root = r"./corpus/"

dev_file_pattern = r".*_dev\.tb"
ptb_dev = BracketParseCorpusReader(corpus_root, dev_file_pattern)
trees = ptb_dev.parsed_sents()
f_out = codecs.open(TXT_DEV, 'w', 'UTF-8')
for tree in trees:
    f_out.write(u"{0}\n".format(u" ".join(tree.leaves())))
f_out.close()

test_file_pattern = r".*_test\.tb"
ptb_test = BracketParseCorpusReader(corpus_root, test_file_pattern)
trees = ptb_test.parsed_sents()
f_out = codecs.open(TXT_TEST, 'w', 'UTF-8')
for tree in trees:
    f_out.write(u"{0}\n".format(u" ".join(tree.leaves())))
f_out.close()

print("Corpus size: %d" % s_total)
print("Train set size: %d" % len(corpus_train))
Code Example #10
def load_trees(const_path, text_path=None, text_processing="default"):
    """Load a treebank.

    The standard tree format presents an abstracted view of the raw text, with the
    assumption that a tokenizer and other early stages of the NLP pipeline have already
    been run. These can include formatting changes like escaping certain characters
    (e.g. -LRB-) or transliteration (see e.g. the Arabic and Hebrew SPMRL datasets).
    Tokens are not always delimited by whitespace, and the raw whitespace in the source
    text is thrown away in the PTB tree format. Moreover, in some treebanks the leaves
    of the trees are lemmas/stems rather than word forms.

    All of this is a mismatch for pre-trained transformer models, which typically do
    their own tokenization starting with raw unicode strings. A mismatch compared to
    pre-training often doesn't affect performance if you just want to report F1 scores
    within the same treebank, but it raises some questions when it comes to releasing a
    parser for general use: (1) Must the parser be integrated with a tokenizer that
    matches the treebank convention? In fact, many modern NLP libraries like spaCy train
    on dependency data that doesn't necessarily use the same tokenization convention as
    constituency treebanks. (2) Can the parser's pre-trained model be merged with other
    pre-trained system components (via methods like multi-task learning or adapters), or
    must it remain its own system because of tokenization mismatches?

    This tree-loading function aims to build a path towards parsing from raw text by
    using the `text_path` argument to specify an auxiliary file that can be used to
    recover the original unicode string for the text. Parser layers above the
    pre-trained model may still use gold tokenization during training, but this will
    possibly help make the parser more robust to tokenization mismatches.

    On the other hand, some benchmarks involve evaluating with gold tokenization, and
    naively switching to using raw text degrades performance substantially. This can
    hopefully be addressed by making the parser layers on top of the pre-trained
    transformers handle tokenization more intelligently, but this is still a work in
    progress and the option remains to use the data from the tree files with minimal
    processing controlled by the `text_processing` argument to clean up some escaping or
    transliteration.

    Args:
        const_path: Path to the file with one tree per line.
        text_path: (optional) Path to a file that provides the correct spelling for all
            tokens (without any escaping, transliteration, or other mangling) and
            information about whether there is whitespace after each token. Files in the
            CoNLL-U format (https://universaldependencies.org/format.html) are accepted,
            but the parser also accepts similarly-formatted files with just three fields
            (ID, FORM, MISC) instead of the usual ten. Text is recovered from the FORM
            field and any "SpaceAfter=No" annotations in the MISC field.
        text_processing: Text processing to use if no text_path is specified:
            - 'default': undo PTB-style escape sequences and attempt to guess whitespace
                surrounding punctuation
            - 'arabic': guess that all tokens are separated by spaces
            - 'arabic-translit': undo Buckwalter transliteration and guess that all
                tokens are separated by spaces
            - 'chinese': keep all tokens unchanged (i.e. do not attempt to find any
                escape sequences), and assume no whitespace between tokens
            - 'hebrew': guess that all tokens are separated by spaces
            - 'hebrew-translit': undo transliteration (see Sima'an et al. 2002) and
                guess that all tokens are separated by spaces

    Returns:
        A list of ParsingExample objects, which have the following attributes:
            - `tree` is an instance of nltk.Tree
            - `words` is a list of strings
            - `space_after` is a list of booleans
    """
    reader = BracketParseCorpusReader("", [const_path])
    trees = reader.parsed_sents()

    if text_path is not None:
        sents = read_text(text_path)
    elif text_processing in ("arabic-translit", "hebrew-translit"):
        translit = transliterate.TRANSLITERATIONS[
            text_processing.replace("-translit", "")
        ]
        sents = []
        for tree in trees:
            words = [translit(word) for word in tree.leaves()]
            sp_after = [True for _ in words]
            sents.append((words, sp_after))
    elif text_processing in ("arabic", "hebrew"):
        sents = []
        for tree in trees:
            words = tree.leaves()
            sp_after = [True for _ in words]
            sents.append((words, sp_after))
    elif text_processing == "chinese":
        sents = []
        for tree in trees:
            words = tree.leaves()
            sp_after = [False for _ in words]
            sents.append((words, sp_after))
    elif text_processing == "default":
        sents = []
        for tree in trees:
            words = ptb_unescape.ptb_unescape(tree.leaves())
            sp_after = ptb_unescape.guess_space_after(tree.leaves())
            sents.append((words, sp_after))
    else:
        raise ValueError(f"Bad value for text_processing: {text_processing}")

    assert len(trees) == len(sents)
    treebank = Treebank(
        [
            ParsingExample(tree=tree, words=words, space_after=space_after)
            for tree, (words, space_after) in zip(trees, sents)
        ]
    )
    for example in treebank:
        assert len(example.words) == len(example.leaves()), (
            "Constituency tree has a different number of tokens than the CONLL-U or "
            "other file used to specify reversible tokenization."
        )
    return treebank
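A hedged usage sketch of load_trees with an auxiliary text file; the file names are hypothetical, and the three-field (ID, FORM, MISC) layout follows the description in the docstring above, with blank lines between sentences as in CoNLL-U:

# trees.txt: one bracketed tree per line, e.g.
#   (S (NP (PRP It)) (VP (VBZ is) (ADJP (JJ fine))) (. .))
# trees.text: ID, FORM, MISC fields per token
#   1   It      _
#   2   is      _
#   3   fine    SpaceAfter=No
#   4   .       _
for example in load_trees("trees.txt", text_path="trees.text"):
    print(example.words)        # e.g. ['It', 'is', 'fine', '.']
    print(example.space_after)  # 'fine' carries SpaceAfter=No: no space before '.'

Without text_path, load_trees("trees.txt") falls back to text_processing="default", i.e. PTB un-escaping plus guessed whitespace.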
Code Example #11
# Objects for binary rules (A -> BC)
binary_rules_freq = defaultdict(float)
binary_rules_cnt_by_lhs = defaultdict(int)
binary_rules_occur_cnt = 0
binary_lhs_set = set()
binary_rhs_set = set()

# Objects for terminal rules (POS -> <word>)
postags_freq = defaultdict(float)
postags_cnt_by_pos = defaultdict(int)
postags_occur_cnt = 0
words_occur_cnt = defaultdict(int)
postags_set = set()
words_set = set()

trees = ptb_train.parsed_sents()
for tree in trees:
    t = tree.copy()
    t.chomsky_normal_form(horzMarkov=2)
    #t.collapse_unary(collapsePOS=True, collapseRoot=False)
    prods = t.productions()
    for prod in prods:
        lhs = prod.lhs().symbol()
        rh = prod.rhs()
        #rhs = ' '.join([r.symbol() if isinstance(r, nltk.grammar.Nonterminal) else r for r in rh])
        # `unicode` in the original is Python 2; str is the Python 3 equivalent.
        if isinstance(rh[0], str):  # Terminal production (POS -> <word>)
            rhs = rh[0]
            postags_freq[(lhs, rhs)] += 1
            postags_cnt_by_pos[lhs] += 1
            postags_occur_cnt += 1
            words_occur_cnt[rhs] += 1
Code Example #12
File: extract_pcfg.py Project: rengotj/NLP_TD2
def pcfg_extraction():
    #####################################################################
    #                      Load Corpus treebanks                        #
    #####################################################################
    treebank_train = BracketParseCorpusReader("", "sequoia_train.tb")

    #####################################################################
    #    Initialisation for unary, binary and terminal rules            #
    #####################################################################
    #Unary rules
    unary_freq = defaultdict(float)  # How frequent is the rule A->B ?
    unary_cnt_by_lhs = defaultdict(
        int)  # How many times A is the left part of a unary rule ?
    unary_occur_cnt = 0  # How many unary rules are there ?
    unary_lhs_set = set()  # Set of left part symbols
    unary_rhs_set = set()  # Set of right part symbols

    #binary rules
    binary_freq = defaultdict(float)
    binary_cnt_by_lhs = defaultdict(int)
    binary_occur_cnt = 0
    binary_lhs_set = set()
    binary_rhs_set = set()

    #terminal rules
    postags_freq = defaultdict(float)
    postags_cnt_by_pos = defaultdict(int)
    postags_occur_cnt = 0
    words_occur_cnt = defaultdict(int)
    postags_set = set()
    words_set = set()

    #####################################################################
    #           Parsing collection of rules and words                   #
    #####################################################################
    for tree in treebank_train.parsed_sents():
        t = tree.copy()
        t.chomsky_normal_form(
            horzMarkov=2
        )  # Convert a tree into its Chomsky Normal Form equivalent
        prods = t.productions()  # Get the recursive productions

        for prod in prods:
            left_symbol = prod.lhs().symbol()  # Left hand side
            right_part = prod.rhs()  # Right hand side

            if isinstance(right_part[0], str):
                # Terminal rule found: the left-hand side is a part-of-speech tag
                right_symbol = right_part[0]
                #save it in terminal rules
                postags_freq[(left_symbol, right_symbol)] += 1
                postags_cnt_by_pos[left_symbol] += 1
                postags_occur_cnt += 1
                words_occur_cnt[right_symbol] += 1
                postags_set.add(left_symbol)
                words_set.add(right_symbol)

            else:
                if len(right_part) == 1:  # Unary found
                    right_symbol = right_part[0].symbol()
                    #save it in unary rules
                    unary_freq[(left_symbol, right_symbol)] += 1
                    unary_cnt_by_lhs[left_symbol] += 1
                    unary_occur_cnt += 1
                    unary_lhs_set.add(left_symbol)
                    unary_rhs_set.add(right_symbol)

                elif len(right_part) == 2:  # Binary found
                    right_symbol = tuple([nt.symbol() for nt in right_part])
                    #save it in binary rules
                    binary_freq[(left_symbol, right_symbol)] += 1
                    binary_cnt_by_lhs[left_symbol] += 1
                    binary_occur_cnt += 1
                    binary_lhs_set.add(left_symbol)
                    binary_rhs_set.add(right_symbol)

    #####################################################################
    #                Look at the occurrences of words                   #
    #####################################################################
    n_words = len(words_occur_cnt)
    print('There are ' + str(n_words) +
          ' different words in the training set')
    plt.scatter(range(len(words_occur_cnt)),
                list(words_occur_cnt.values()))
    plt.title('Word occurrences')
    plt.xlabel('word index')
    plt.ylabel('occurrences')
    plt.show()
    #####################################################################
    #             Group rare words into a new tag UNK                   #
    #####################################################################
    # Replace rare words with '<UNKNOWN>' tag
    unfrequent = set([w for w in words_set if words_occur_cnt[w] < 2])
    T_set = words_set.copy()
    T_set.difference_update(unfrequent)
    T_set.add(u"<UNKNOWN>")
    pw_pairs = list(postags_freq.keys())
    for (pos, w) in pw_pairs:
        if w in unfrequent:
            postags_freq[(pos, u"<UNKNOWN>")] += postags_freq[(pos, w)]
            postags_freq.pop((pos, w))

    #####################################################################
    #                          Normalisation                            #
    #####################################################################
    for (pos, w) in postags_freq:
        postags_freq[(pos, w)] /= postags_cnt_by_pos[pos]

    for (lhs, rhs) in unary_freq:
        unary_freq[(lhs, rhs)] /= (unary_cnt_by_lhs[lhs] + binary_cnt_by_lhs[lhs])

    for (lhs, rhs) in binary_freq:
        binary_freq[(lhs, rhs)] /= (unary_cnt_by_lhs[lhs] + binary_cnt_by_lhs[lhs])

    #####################################################################
    #                   Save the results in files                       #
    #####################################################################
    with codecs.open("PCFG_unary_freq.pkl", 'wb') as file:
        pickle.dump(unary_freq, file)
    file.close()

    with codecs.open("PCFG_binary_freq.pkl", 'wb') as file:
        pickle.dump(binary_freq, file)
    file.close()

    with codecs.open("PCFG_postags_freq.pkl", 'wb') as file:
        pickle.dump(postags_freq, file)
    file.close()

    #####################################################################
    #                       rhs -> lhs dictionary                       #
    #####################################################################
    unary_dict = {}
    binary_dict = {}
    postags_dict = {}

    for rhs in unary_rhs_set:
        unary_dict[rhs] = {}
    for (lhs, rhs) in unary_freq:
        unary_dict[rhs][lhs] = unary_freq[(lhs, rhs)]

    for rhs in binary_rhs_set:
        binary_dict[rhs] = {}
    for (lhs, rhs) in binary_freq:
        binary_dict[rhs][lhs] = binary_freq[(lhs, rhs)]

    for w in T_set:
        postags_dict[w] = {}
    for (pos, w) in postags_freq:
        postags_dict[w][pos] = postags_freq[(pos, w)]

    #####################################################################
    #                   Save the results in files                       #
    #####################################################################
    with codecs.open("PCFG_unary_dict.pkl", 'wb') as file:
        pickle.dump(unary_dict, file)
    file.close()

    with codecs.open("PCFG_binary_dict.pkl", 'wb') as file:
        pickle.dump(binary_dict, file)
    file.close()

    with codecs.open("PCFG_postags_dict.pkl", 'wb') as file:
        pickle.dump(postags_dict, file)
    file.close()

    #####################################################################
    #            the set of non-terminals and terminals                 #
    #####################################################################
    # Store the set of non-terminals and terminals
    NT_set = unary_lhs_set.union(binary_lhs_set)

    with codecs.open("NT_set.pkl", 'wb') as file:
        pickle.dump(NT_set, file)
    file.close()

    with codecs.open("T_set.pkl", 'wb') as file:
        pickle.dump(T_set, file)
    file.close()

    with codecs.open("postags_set.pkl", 'wb') as file:
        pickle.dump(postags_set, file)
    file.close()

    with codecs.open("words_set.pkl", 'wb') as file:
        pickle.dump(words_set, file)
    file.close()

    return ()
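A hedged sketch of consuming the pickled tables later, for example inside a CKY parser; the file names match the dumps above, but the loop is purely illustrative:

import pickle

with open("PCFG_binary_dict.pkl", 'rb') as f:
    binary_dict = pickle.load(f)

# binary_dict maps a right-hand side (B, C) to {A: P(A -> B C)}, so a CKY
# parser can look up every candidate parent for a pair of adjacent spans.
for rhs, parents in list(binary_dict.items())[:3]:
    print(rhs, parents)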