コード例 #1
0
def loadCorpora():
    """Open a plain-text corpus and a bracketed treebank and query each once.

    Every query result is discarded and nothing is returned; the function
    only exercises the two reader classes against hard-coded directories.
    """
    # Plain-text reader over every file in the dictionary directory.
    dict_dir = '/usr/share/dict'
    plain_reader = PlaintextCorpusReader(dict_dir, '.*')
    plain_reader.fileids()
    plain_reader.words('connectives')

    # Bracketed-parse reader over files matching "<dir>/wsj_*.mrg".
    wsj_dir = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    mrg_pattern = r".*/wsj_.*\.mrg"
    treebank = BracketParseCorpusReader(wsj_dir, mrg_pattern)
    treebank.fileids()
    len(treebank.sents())
    treebank.sents(fileids='20/wsj_2013.mrg')[19]
コード例 #2
0
ファイル: handle.py プロジェクト: JOI-2019-276/algorithm
 def open_flod(self, root_path, file_type ):
     """List the corpus files under ``root_path`` selected by ``file_type``.

     Args:
         root_path: Corpus root directory handed to the reader.
         file_type: File id list or pattern handed to the reader.

     Returns:
         A ``(paths, ids)`` tuple: ``ids`` is what the reader reports from
         ``fileids()`` and ``paths`` holds each id joined onto ``root_path``.
     """
     reader = BracketParseCorpusReader(root_path, file_type)
     relative_ids = reader.fileids()
     joined_paths = [os.path.join(root_path, fid) for fid in relative_ids]
     return (joined_paths, relative_ids)
コード例 #3
0
def tree_reader():
    """Read every file under ``parsed_sentences/`` into a dict of parse trees.

    Returns:
        A dict mapping each file id, with every ".tree" occurrence removed,
        to the list of parsed sentences read from that file.
    """
    reader = BracketParseCorpusReader("parsed_sentences/", ".*")
    return {
        re.sub(r"\.tree", "", fileid): list(reader.parsed_sents(fileid))
        for fileid in reader.fileids()
    }
コード例 #4
0
def tree_reader():
    """Load all parse trees found under ``parsed_sentences/``.

    Returns:
        A dict keyed by file id (with any ".tree" occurrence stripped out)
        whose values are lists of the parsed sentences of that file.
    """
    corpus = BracketParseCorpusReader("parsed_sentences/", ".*")
    trees_by_name = {}
    for fileid in corpus.fileids():
        key = re.sub(r"\.tree", "", fileid)
        trees_by_name[key] = list(corpus.parsed_sents(fileid))
    return trees_by_name
コード例 #5
0
def extracting_cfg(corpus_root, file_pattern):
    """Extract grammar rules with at most two right-hand symbols.

    Only sentences with at most 8 leaves are used. Each file id is printed
    as it is processed.

    Args:
        corpus_root: Root directory handed to ``BracketParseCorpusReader``.
        file_pattern: File id pattern handed to ``BracketParseCorpusReader``.

    Returns:
        A ``(cfg_dict, lexicon, unite_productions)`` tuple of dicts, each
        mapping a left-hand label to a set:
        ``cfg_dict`` holds every rule as a tuple of right-hand symbols,
        ``lexicon`` holds the leaf strings seen directly under a label, and
        ``unite_productions`` holds the rules with exactly one right-hand
        symbol (these are also present in ``cfg_dict``).
    """
    reader = BracketParseCorpusReader(corpus_root, file_pattern)
    cfg_dict = {}
    unite_productions = {}
    lexicon = {}
    for fileid in reader.fileids():
        print(fileid)
        for tree in reader.parsed_sents(fileid):
            # Longer sentences contribute nothing.
            if len(tree.leaves()) > 8:
                continue
            for node in tree.subtrees():
                head = node.label()
                rhs = []
                for child in node:
                    if isinstance(child, str):
                        # Leaf: the word is both a right-hand symbol and a
                        # lexicon entry for the current label.
                        rhs.append(child)
                        lexicon.setdefault(head, set()).add(child)
                    else:
                        rhs.append(child.label())
                # Peel symbols off the front until two remain: each step
                # emits head -> (first, joined-rest) and continues with the
                # joined rest as the new head.
                while len(rhs) > 2:
                    merged = '_'.join(rhs[1:])
                    cfg_dict.setdefault(head, set()).add((rhs[0], merged))
                    head = merged
                    rhs = rhs[1:]
                if len(rhs) == 1:
                    unite_productions.setdefault(head, set()).add(tuple(rhs))
                cfg_dict.setdefault(head, set()).add(tuple(rhs))
    return cfg_dict, lexicon, unite_productions
コード例 #6
0
ファイル: grammar_extraction.py プロジェクト: maxikfu/NLP-HW2
def extracting_cnf(corpus_root, file_pattern):
    """Collect rules from the first treebank file into a dict.

    ``return_rule`` is called once per subtree and is expected to fill
    ``cnf_dict`` in place (it is defined outside this block).

    Args:
        corpus_root: Root directory handed to ``BracketParseCorpusReader``.
        file_pattern: File id pattern handed to ``BracketParseCorpusReader``.

    Returns:
        The dict passed to ``return_rule``; it starts with a single
        ``'lexicon'`` key holding an empty set.
    """
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cnf_dict = {}
    cnf_dict['lexicon'] = set()
    # Only the first file reported by the reader is processed.
    file = ptb.fileids()[0]
    print(file)
    # Request the sentence view once and iterate it, instead of asking the
    # reader for a fresh view (and indexing into it) for every sentence.
    sentences = ptb.parsed_sents(file)
    for index, tree in enumerate(sentences):
        if index == 0:
            # NOTE(review): the original loop started at index 1, so the
            # first sentence has always been skipped; kept as-is. Confirm
            # whether that is intentional.
            continue
        for sub in tree.subtrees():
            return_rule(sub, cnf_dict, file)
    return cnf_dict
コード例 #7
0
ファイル: Corpus.py プロジェクト: pbamotra/cgrnnlm
def print_corpus_metrics(corpus_dir='data'):
    """Print word, vocabulary and document counts for the treebank corpus.

    Args:
        corpus_dir: Currently unused; the reader is built from the
            module-level ``DATA_DIR`` and ``FILE_PATTERN``.
            NOTE(review): confirm whether this should replace ``DATA_DIR``.
    """
    ptb = BracketParseCorpusReader(DATA_DIR, FILE_PATTERN)
    words = ptb.words()
    # Single pre-formatted argument: prints identically under Python 2 and 3,
    # unlike the Python 2-only ``print a, b`` statement.
    print('Total number of words %d' % len(words))
    print('Total number of unique words %d' % len(set(words)))
    print('Total number of documents %d' % len(ptb.fileids()))
コード例 #8
0
def load_reader_and_filedids(lang,data_type):
    """Load the filtered parse trees of one data split for a language.

    Args:
        lang: Language code; must be one of
            'en', 'zh', 'fr', 'il', 'jp', 'sp', 'ca', 'sw', 'de'.
        data_type: 'train', 'val' or 'test' to load that split, or 'wsj10'
            to load every file and keep only sentences of at most 10 leaves.

    Returns:
        A list of parse trees that pass the length and character filter.
        For 'train'/'val'/'test' the number of kept trees is also printed.
        If the requested split has no files, the list is empty.

    Raises:
        AssertionError: If ``lang`` or ``data_type`` is not an accepted value.
    """
    # 'wsj10' is accepted here because the body has a dedicated branch for it.
    assert data_type in ('train','val','test','wsj10')

    def _is_control(char):
        """Return True if ``char`` is in a Unicode "C*" category.

        Tab, newline and carriage return are treated as whitespace and
        therefore reported as non-control.
        """
        if char in ("\t", "\n", "\r"):
            return False
        return unicodedata.category(char).startswith("C")

    def filter_trees(tree, data_type):
        """Return True if the tree's sentence is short and clean enough."""
        sent = tree.leaves()
        # 'wsj10' keeps sentences of up to 10 leaves, every other split 128.
        max_len = 10 if data_type == 'wsj10' else 128
        if len(sent) > max_len:
            return False
        try:
            text = ' '.join(sent)
        except TypeError:
            # Leaves that are not strings cannot be checked; reject the tree.
            return False
        for c in text:
            cp = ord(c)
            # Reject NUL, the replacement character and control characters.
            if cp == 0 or cp == 0xfffd or _is_control(c):
                return False
        return True

    def filt_id(fileids,lang):
        """Split file ids into (train, valid, test) lists for en/zh/fr."""
        assert lang in ('en','fr','zh')
        train_file_ids,valid_file_ids,test_file_ids=[],[],[]
        for fid in fileids:
            prefix=fid.split('.')[0]
            if lang=='en':
                # Files WSJ_22xx are validation, WSJ_23xx are test.
                if 'WSJ/22/WSJ_2200' <= prefix <= 'WSJ/22/WSJ_2299':
                    valid_file_ids.append(fid)
                elif 'WSJ/23/WSJ_2300' <= prefix <= 'WSJ/23/WSJ_2399':
                    test_file_ids.append(fid)
                else:
                    train_file_ids.append(fid)
            elif lang=='zh':
                # Splits are given as string ranges over the file id prefix.
                if '0886' <= prefix <= '0931' or '1148' <= prefix <= '1151':
                    valid_file_ids.append(fid)
                elif '0816' <= prefix <= '0885' or '1137' <= prefix <='1147':
                    test_file_ids.append(fid)
                else:
                    train_file_ids.append(fid)
            else:
                # 'fr': splits are given as explicit file names.
                if prefix in ('flmf3_12500_12999co','flmf7ab2ep','flmf7ad1co','flmf7ae1ep'):
                    valid_file_ids.append(fid)
                elif prefix in ('flmf3_12000_12499ep','flmf7aa1ep','flmf7aa2ep','flmf7ab1co'):
                    test_file_ids.append(fid)
                else:
                    train_file_ids.append(fid)
        return train_file_ids,valid_file_ids,test_file_ids

    assert lang in ('en','zh','fr','il','jp','sp','ca','sw','de')
    lang_dir=treebank_dir+'/'+lang
    reader=BracketParseCorpusReader(lang_dir, '.*')
    fileids=reader.fileids()
    if data_type=='wsj10':
        return [t for t in reader.parsed_sents(fileids) if filter_trees(t,data_type)]

    # Defaults so a language directory that lacks a split file yields an
    # empty result instead of an unbound local variable.
    train_trees, val_trees, test_trees = [], [], []
    if lang in ('en','zh','fr'):
        train_file_ids,valid_file_ids,test_file_ids=filt_id(fileids,lang)
        train_trees=reader.parsed_sents(train_file_ids)
        val_trees=reader.parsed_sents(valid_file_ids)
        test_trees=reader.parsed_sents(test_file_ids)
    else:
        # Other languages: the split is taken from the file name; when
        # several files match the same split, the last one wins.
        for fid in fileids:
            if 'train' in fid:
                train_trees=reader.parsed_sents(fid)
            elif 'val' in fid:
                val_trees=reader.parsed_sents(fid)
            elif 'test' in fid:
                test_trees=reader.parsed_sents(fid)

    split_trees = {'train': train_trees, 'val': val_trees, 'test': test_trees}
    kept = [t for t in split_trees[data_type] if filter_trees(t,data_type)]
    print(f'{data_type}:{len(kept)}')
    return kept
コード例 #9
0
ファイル: PennToPCFG.py プロジェクト: jgontrum/PennToPCFG
def revertPOS(symbol):
    """Return ``symbol`` without its first and last element.

    Inputs of length two or less yield an empty sequence of the same type.
    """
    inner = symbol[1:-1]
    return inner

###### Main #########################################################################
if __name__ == '__main__':
    clArgs = createArgParser().parse_args()
    #Check if any arguments are given. If not, display help
    active = False

    if clArgs.penn != None and clArgs.grammar != None:
        active = True
        ## Set up the treebank reader
        ptb = BracketParseCorpusReader(path.dirname(clArgs.penn), [path.basename(clArgs.penn)])

        ## Collect all terminal and nonterminals
        for tree in ptb.parsed_sents(ptb.fileids()[0]):
            # Also set the start symbol to the root of the first tree
            if len(start_symbol) == 0:
                start_symbol = tree.node
            findSymbolsInTree(tree)


        ## Find ambiguous symbols and map them to a unique alternative
        for symbol in nonterminals.intersection(pos):
            replacement = "_" + symbol + "_"
            symbolMap[symbol] = replacement
            if replacement in pos or replacement in nonterminals:
                print "Cannot make nonterminal unambiguous: ", symbol
                sys.exit(-1)

        ## Iterate over all trees and replace ambigous nonterminals with their unique alternative
コード例 #10
0
# Sentences of one Gutenberg text. `gutenberg` is not imported in the visible
# part of this script and must already be in scope.
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
# The second argument ('.*' here) can be a list of fileids, like
# ['a.txt', 'test/b.txt'], or a pattern that matches all fileids,
# like '[abc]/.*\.txt'.
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

# Same idea for a bracketed (parsed) treebank: pattern selects "<dir>/wsj_*.mrg".
from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# Conditional frequency distributions:
# a collection of frequency distributions, each one for a different "condition".
# The condition will often be the category of the text.

# A frequency distribution counts observable events,
# such as the appearance of words in a text.
# A conditional frequency distribution needs to pair each event with a condition.
# So instead of processing a sequence of words,
# we have to process a sequence of pairs.
# The trailing "..." string in each list is only a placeholder for
# "and so on"; it is a real list element at runtime.
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', """..."""]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), '''...''']
# Each pair has the form (condition, event).
コード例 #11
0
ファイル: top_words.py プロジェクト: jsay-api/nltk-test
from nltk.corpus import stopwords #new
from nltk.probability import FreqDist #new
from nltk.tokenize import word_tokenize #new
from nltk.tokenize import RegexpTokenizer #new
from nltk.corpus import BracketParseCorpusReader #new
from itertools import zip_longest


# Stop words: NLTK's English list plus corpus-specific noise tokens.
default_stopwords = set(stopwords.words('english'))
custom_stopwords = set(('mln', 'reuter', 'dlrs', 'pct', 'the', 'bc', 'reute', 'cts', 'shr', 'feb', 'vs', 'would', 'will', 'inc', 'corp', 'ltd', 'net', 'billion'))
stops = default_stopwords | custom_stopwords
corpus_root = r"articles/"
# The dot is escaped so that only a literal ".sgm" extension matches.
file_pattern = r"[A-Za-z0-9-]+\.sgm"
# The corpus reader is used here only to list the matching files.
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
filtered_words = []
onlyfiles = [join(corpus_root, f) for f in ptb.fileids()]

# Built once: the tokenizer does not depend on the file being read.
tokenizer = RegexpTokenizer(r'\w+')

for path in onlyfiles:
    # Distinct names for the path and the handle, so the loop variable is
    # not rebound by the ``with`` statement.
    with open(path) as handle:
        soup = BeautifulSoup(handle, 'html.parser')

        words = tokenizer.tokenize(soup.getText())
        for word in words:
            word = word.lower()
            # Keep lower-cased tokens that are not stop words, not purely
            # numeric, and longer than one character.
            if word not in stops and not word.isnumeric() and len(word) > 1:
                filtered_words.append(word)
fdist = FreqDist(filtered_words)
コード例 #12
0
ファイル: NLP.py プロジェクト: Toma-L/NLP
# Words and sentences of one Gutenberg text. `gutenberg` is not imported in
# the visible part of this script and must already be in scope.
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

# Loading your own plain-text corpus.
from nltk.corpus import PlaintextCorpusReader
corpus_root = ''  # path to your own corpus directory
wordlists = PlaintextCorpusReader(corpus_root, '.*')
# Fixed: this line previously referenced the misspelled name `wlrdlists`,
# which raised NameError.
wordlists.fileids()
wordlists.words('connectives')

# Loading a bracketed (parsed) treebank.
from nltk.corpus import BracketParseCorpusReader
corpus_root = r""
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids = '20/wsj_2013.mrg')[19]


#2.2====================

# The trailing `...` (Ellipsis) in each list is only a placeholder for
# "and so on"; it is a real list element at runtime.
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]

import nltk
from nltk.corpus import brown
# One frequency distribution per Brown genre, built from (genre, word) pairs.
cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories = genre))
コード例 #13
0
    vocab = [i[0] for i in vocab]
    return vocab


if __name__ == '__main__':
    TRAIN_FILE = 'data/wsj_2-21'
    TEST_FILE = 'data/wsj_23'
    DEV_FILE = 'data/wsj_24'
    SECTIONS = [(2, 21), (23, 23), (24, 24)]
    MAXLEN = 50

    wsj = '/data/penn_tb_3.0/TREEBANK_3/PARSED/MRG/WSJ/'
    file_pattern = r".*/WSJ_.*\.MRG"
    ptb = BracketParseCorpusReader(wsj, file_pattern)
    print('Gathered %d files...' % len(ptb.fileids()))

    print('Generating vocabulary...')
    vocab = get_vocab()
    print('Done.')

    print('Preprocessing all sections...')
    for fn, sections in zip([TRAIN_FILE, TEST_FILE, DEV_FILE], SECTIONS):
        print('Preprocessing %s...' % fn)
        h = open(fn, 'wt')
        for section in range(sections[0], sections[1] + 1):
            fileids = [
                i for i in ptb.fileids()
                if i.startswith(str(section).zfill(2))
            ]
            for sent, tree in zip(ptb.sents(fileids),
コード例 #14
0
ファイル: propbank.py プロジェクト: edvisees/DDSemantics
class PropBank(DataLoader):
    """Loader that attaches PropBank predicate/argument annotations to documents."""

    def __init__(self, params, corpus, with_doc=False):
        """Create the corpus readers and index PropBank instances by document.

        Args:
            params: Configuration object; this constructor reads ``root``,
                ``propfile``, ``frame_files`` and ``verbs_file`` from it, plus
                ``wsj_path`` and ``wsj_file_pattern`` when ``with_doc`` is set.
            corpus: Forwarded to the base class constructor.
            with_doc: When true, a WSJ treebank reader is also created and
                stored as ``self.wsj_treebank``.
        """
        super().__init__(params, corpus)
        logging.info('Initialize PropBank reader.')

        if with_doc:
            self.wsj_treebank = BracketParseCorpusReader(
                root=params.wsj_path,
                fileids=params.wsj_file_pattern,
                tagset='wsj',
                encoding='ascii')

            treebank_file_count = len(self.wsj_treebank.fileids())
            logging.info(
                'Found {} treebank files.'.format(treebank_file_count))

        self.propbank = PropbankCorpusReader(
            root=FileSystemPathPointer(params.root),
            propfile=params.propfile,
            framefiles=params.frame_files,
            verbsfile=params.verbs_file,
        )

        # Instances grouped by the last path component of their file id.
        self.propbank_annos = defaultdict(list)
        logging.info("Loading PropBank Data.")
        for instance in self.propbank.instances():
            doc_key = instance.fileid.split('/')[-1]
            self.propbank_annos[doc_key].append(instance)

        self.stats = dict(predicate_count=0, argument_count=0)

    def add_all_annotations(self, doc):
        """Add every PropBank predicate and argument of ``doc`` to the document.

        Arguments whose span comes back empty are skipped. The predicate and
        the entity mention are added for each remaining argument, and the
        argument mention is attached only when both were created.
        """
        logging.info("Adding propbank annotations for " + doc.docid)

        for instance in self.propbank_annos[doc.docid]:
            sent_tree = doc.get_parsed_sents()[instance.sentnum]

            pred_words = utils.make_words_from_pointer(
                sent_tree, instance.predicate)
            pred_span = utils.get_nltk_span(
                doc.get_token_spans(), instance.sentnum, pred_words)

            pred_node_repr = "%s:%d:%s" % (
                doc.docid, instance.sentnum, instance.predicate)

            self.stats['predicate_count'] += 1

            for arg_pointer, slot in instance.arguments:
                arg_words = utils.make_words_from_pointer(
                    sent_tree, arg_pointer)
                arg_span = utils.get_nltk_span(
                    doc.get_token_spans(), instance.sentnum, arg_words)

                if len(arg_span) == 0:
                    continue

                self.stats['argument_count'] += 1

                predicate = doc.add_predicate(
                    None, pred_span, frame_type='PROPBANK')
                entity = doc.add_entity_mention(None, arg_span)
                arg_node_repr = "%s:%d:%s" % (
                    doc.docid, instance.sentnum, arg_pointer)

                if not (predicate and entity):
                    continue

                predicate.add_meta('node', pred_node_repr)
                mention = doc.add_argument_mention(
                    predicate, entity.aid, slot.lower())
                mention.add_meta('node', arg_node_repr)

    def print_stats(self):
        """Log each collected counter as ``name : value``."""
        logging.info("Corpus statistics from Propbank")

        for name in self.stats:
            logging.info(f"{name} : {self.stats[name]}")
コード例 #15
0
from nltk.corpus import BracketParseCorpusReader;
corpus_root = r"xenopedia"
file_pattern = r".*\.txt"

# Read every .txt file under the corpus root with the bracketed-parse reader.
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

# Parenthesised single-argument form: prints the same under Python 2 and 3,
# unlike the Python 2-only ``print x`` statement.
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents())
コード例 #16
0
# 1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
print(wordlists.words('blake-poems.txt'))

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
# File ids are used without a directory component below ('wsj_0199.mrg'),
# so the pattern has no "<dir>/" prefix. The dot before "mrg" is escaped so
# that it matches only a literal dot. (An earlier assignment of
# r'.*/wsj_.*\.mrg' was immediately overwritten and has been removed.)
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='wsj_0199.mrg')[1])

# 2. Conditional frequency distribution: a collection of frequency
#    distributions, each one for a different "condition". Given
#    (condition, word) pairs, it counts how often each word occurs under
#    each condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
# `nltk.ConditionalFreqDist` is referenced below, so the package itself must
# be importable by name; a repeated import is harmless if it already is.
import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word) for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word)
print(len(genre_word))
print(genre_word[:4])
コード例 #17
0
ファイル: nombank.py プロジェクト: edvisees/DDSemantics
class NomBank(DataLoader):
    """Loading Nombank data and implicit argument annotations."""

    def __init__(self, params, corpus, with_doc=False):
        """Create the corpus readers and index all annotations by document.

        Args:
            params: Configuration object; this constructor reads
                ``wsj_path``, ``wsj_file_pattern``, ``nombank_path``,
                ``nomfile``, ``frame_file_pattern``, ``nombank_nouns_file``
                and ``stat_dir`` from it.
            corpus: Forwarded to the base class constructor.
            with_doc: Forwarded to the base class constructor.
        """
        super().__init__(params, corpus, with_doc)

        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')

        logging.info('Found {} treebank files.'.format(
            len(self.wsj_treebank.fileids())))

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            # Drops the first four characters of each parse file id.
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=self.wsj_treebank)

        logging.info("Loading G&C annotations.")
        self.gc_annos = self.load_gc_annotations()
        num_gc_preds = sum(len(preds) for preds in self.gc_annos.values())
        logging.info(f"Loaded {num_gc_preds} predicates")

        logging.info("Loading Nombank annotations")
        # Instances grouped by the last path component of their file id.
        self.nombank_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            docid = nb_instance.fileid.split('/')[-1]
            self.nombank_annos[docid].append(nb_instance)

        self.stats = {
            'target_pred_count': Counter(),
            'predicates_with_implicit': Counter(),
            'implicit_slots': Counter(),
        }

        self.stat_dir = params.stat_dir

    class NomElement:
        """A node in a parse tree, identified by article, sentence and pointer."""

        def __init__(self, article_id, sent_num, tree_pointer):
            self.article_id = article_id
            # Sentence numbers may arrive as strings; store them as int.
            self.sent_num = int(sent_num)
            self.pointer = tree_pointer

        @staticmethod
        def from_text(pointer_text):
            """Build an element from a four-field, colon-separated pointer.

            The first field is the file stem; the two characters following
            its first underscore become the directory of the resulting
            ``<dir>/<stem>.mrg`` article id. The remaining fields are the
            sentence number and the two integers of the tree pointer.

            Raises:
                ValueError: If the text does not have exactly four fields.
            """
            parts = pointer_text.split(':')
            if len(parts) != 4:
                raise ValueError("Invalid pointer text.")

            read_id = parts[0]
            full_id = read_id.split('_')[1][:2] + '/' + read_id + '.mrg'

            return NomBank.NomElement(
                full_id, int(parts[1]),
                NombankTreePointer(int(parts[2]), int(parts[3])))

        def __str__(self):
            return 'Node-%s-%s:%s' % (self.article_id, self.sent_num,
                                      self.pointer.__repr__())

        def __hash__(self):
            return hash(
                (self.article_id, self.sent_num, self.pointer.__repr__()))

        def __eq__(self, other):
            # Equality is defined through the string form.
            return other and other.__str__() == self.__str__()

        __repr__ = __str__

    def load_gc_annotations(self):
        """Parse the implicit-argument XML file into per-document annotations.

        Reads ``self.params.implicit_path``.

        Returns:
            A dict mapping document name (last path component of the article
            id) to a dict of ``predicate NomElement -> {arg_type: [NomElement]}``.
            Only roles that have no explicit annotation for the predicate are
            kept, and only arguments located at or before the predicate's
            sentence.
        """
        tree = ET.parse(self.params.implicit_path)
        root = tree.getroot()

        gc_annotations = defaultdict(dict)

        def merge_split_pointers(pointers):
            """Combine pointers flagged as split into one chain pointer.

            ``pointers`` is a list of ``(pointer, is_split)`` pairs. Unsplit
            pointers are returned as they are; all split ones are ordered by
            ``wordnum`` and wrapped in a single ``NombankChainTreePointer``.
            """
            all_pointers = []
            split_pointers = []

            for pointer, is_split in pointers:
                if is_split:
                    split_pointers.append(pointer)
                else:
                    all_pointers.append(pointer)

            if len(split_pointers) > 0:
                # Sort in place: the previous ``sorted(...)`` call threw its
                # result away, so the pieces were never actually ordered.
                split_pointers.sort(key=lambda t: t.wordnum)
                all_pointers.append(NombankChainTreePointer(split_pointers))

            return all_pointers

        total_implicit_count = 0
        total_preds = 0

        for annotations in root:
            pred_node_pos = annotations.attrib['for_node']
            predicate = NomBank.NomElement.from_text(pred_node_pos)

            article_id = predicate.article_id

            total_preds += 1

            explicit_roles = set()

            arg_annos = defaultdict(list)

            for annotation in annotations:
                arg_type = annotation.attrib['value']
                arg_node_pos = annotation.attrib['node']

                (arg_article_id, arg_sent_id, arg_terminal_id,
                 arg_height) = arg_node_pos.split(':')

                is_split = False
                is_explicit = False

                # Flags are carried as the text of the first child's children.
                for attribute in annotation[0]:
                    if attribute.text == 'Split':
                        is_split = True
                    elif attribute.text == 'Explicit':
                        is_explicit = True

                if pred_node_pos == arg_node_pos:
                    # Incorporated nodes are explicit.
                    is_explicit = True

                if is_explicit:
                    explicit_roles.add(arg_type)
                else:
                    p = NombankTreePointer(int(arg_terminal_id),
                                           int(arg_height))
                    # Arguments are grouped by their sentences.
                    arg_annos[(arg_sent_id, arg_type)].append((p, is_split))

            all_args = defaultdict(list)
            implicit_role_here = set()
            for (arg_sent_id, arg_type), l_pointers in arg_annos.items():
                if int(arg_sent_id) > predicate.sent_num:
                    # Ignoring annotations after the sentence.
                    continue

                if arg_type not in explicit_roles:
                    for p in merge_split_pointers(l_pointers):
                        arg_element = NomBank.NomElement(
                            article_id, arg_sent_id, p)

                        if not predicate.pointer == arg_element.pointer:
                            # Ignoring incorporated ones.
                            all_args[arg_type].append(arg_element)
                            implicit_role_here.add(arg_type)

            gc_annotations[article_id.split('/')[-1]][predicate] = all_args

            total_implicit_count += len(implicit_role_here)

        logging.info(f"Loaded {total_preds} predicates, "
                     f"{total_implicit_count} implicit arguments.")

        return gc_annotations

    def add_predicate(self, doc, parsed_sents, predicate_node):
        """Add the predicate at ``predicate_node`` to ``doc``.

        Returns:
            Whatever ``doc.add_predicate`` returns, or ``None`` when the
            predicate span is empty.
        """
        pred_node_repr = "%s:%d:%s" % (doc.docid, predicate_node.sent_num,
                                       predicate_node.pointer)
        p_tree = parsed_sents[predicate_node.sent_num]
        p_word_idx = utils.make_words_from_pointer(p_tree,
                                                   predicate_node.pointer)
        predicate_span = utils.get_nltk_span(doc.token_spans,
                                             predicate_node.sent_num,
                                             p_word_idx)

        if len(predicate_span) == 0:
            logging.warning("Zero length predicate found")
            return

        p = doc.add_predicate(None, predicate_span, frame_type='NOMBANK')

        if p:
            p.add_meta('node', pred_node_repr)

        return p

    def add_nombank_arg(self,
                        doc,
                        parsed_sents,
                        wsj_spans,
                        arg_type,
                        predicate,
                        arg_node,
                        implicit=False):
        """Add one argument of ``predicate`` to ``doc``.

        Args:
            doc: Document receiving the entity and argument mentions.
            parsed_sents: Parse trees indexed by sentence number.
            wsj_spans: Token spans passed to ``utils.get_nltk_span``.
            arg_type: Role name; lower-cased, and prefixed with ``i_`` when
                ``implicit`` is true.
            predicate: Predicate the argument is attached to.
            arg_node: ``NomElement`` locating the argument.
            implicit: Marks the mention as implicit and records its sentence
                number and text as metadata.

        Returns:
            The created argument mention, or ``None`` when the argument span
            is empty or no entity mention could be created.
        """
        arg_type = arg_type.lower()

        a_tree = parsed_sents[arg_node.sent_num]
        a_word_idx = utils.make_words_from_pointer(a_tree, arg_node.pointer)

        arg_node_repr = "%s:%d:%s" % (doc.docid, arg_node.sent_num,
                                      arg_node.pointer)
        argument_span = utils.get_nltk_span(wsj_spans, arg_node.sent_num,
                                            a_word_idx)

        if len(argument_span) == 0:
            # Some arguments are empty nodes, they will be ignored.
            return

        em = doc.add_entity_mention(None, argument_span)

        if em:
            if implicit:
                arg_type = 'i_' + arg_type

            arg_mention = doc.add_argument_mention(predicate, em.aid, arg_type)
            arg_mention.add_meta('node', arg_node_repr)

            if implicit:
                arg_mention.add_meta('implicit', True)
                arg_mention.add_meta('sent_num', arg_node.sent_num)
                arg_mention.add_meta('text', em.text)

            return arg_mention

    def get_predicate_text(self, p):
        """Return a normalized form of the predicate's text.

        Lower-cases the text, maps the "loss" variants to ``loss``, strips
        trailing ``s`` characters otherwise, maps ``savings-and-loan`` to
        ``loan``, and for any remaining hyphenated text keeps the second
        hyphen-separated part.
        """
        p_text = p.text.lower()
        if p_text == 'losses' or p_text == 'loss' or p_text == 'tax-loss':
            p_text = 'loss'
        else:
            p_text = p_text.rstrip('s')

        if p_text == 'savings-and-loan':
            p_text = 'loan'

        if '-' in p_text:
            p_text = p_text.split('-')[1]
        return p_text

    def add_all_annotations(self, doc, parsed_sents):
        """Add NomBank and, unless disabled, implicit G&C annotations to ``doc``.

        Explicit NomBank instances are always added. The implicit-argument
        annotations are added when ``self.params.explicit_only`` is false and
        the document has entries in ``self.gc_annos``; in that case
        ``self.stats`` is updated per normalized predicate text.
        """
        logging.info("Adding Nombank annotation for " + doc.docid)
        nb_instances = self.nombank_annos[doc.docid]

        for nb_instance in nb_instances:
            predicate_node = NomBank.NomElement(doc.docid, nb_instance.sentnum,
                                                nb_instance.predicate)

            # NOTE(review): ``p`` may be None here (zero-length predicate) and
            # is still passed on as the predicate of each argument; confirm
            # that ``doc.add_argument_mention`` tolerates that.
            p = self.add_predicate(doc, parsed_sents, predicate_node)

            for argloc, argid in nb_instance.arguments:
                arg_node = NomBank.NomElement(doc.docid, nb_instance.sentnum,
                                              argloc)
                arg = self.add_nombank_arg(doc, parsed_sents, doc.token_spans,
                                           argid, p, arg_node)

                # ``add_nombank_arg`` returns None for empty arguments, so
                # only tag mentions that were actually created.
                if (arg is not None
                        and arg_node.pointer == predicate_node.pointer):
                    arg.add_meta('incorporated', True)

        if not self.params.explicit_only and doc.docid in self.gc_annos:
            for predicate_node, gc_args in self.gc_annos[doc.docid].items():
                added_args = defaultdict(list)

                p = self.add_predicate(doc, parsed_sents, predicate_node)
                if p is None:
                    # No predicate was created, so there is nothing to
                    # attach implicit arguments or statistics to.
                    logging.warning(
                        "Skipping implicit arguments of a missing predicate")
                    continue

                p_text = utils.normalize_pred_text(p.text)

                p.add_meta('from_gc', True)

                self.stats['target_pred_count'][p_text] += 1

                for arg_type, arg_nodes in gc_args.items():
                    for arg_node in arg_nodes:
                        arg = self.add_nombank_arg(doc, parsed_sents,
                                                   doc.token_spans, arg_type,
                                                   p, arg_node, True)
                        added_args[arg_type].append(arg)

                        if arg is None:
                            # Empty argument: no mention to tag.
                            continue

                        # The following should be useless already.
                        if arg_node.pointer == predicate_node.pointer:
                            arg.add_meta('incorporated', True)

                        if arg_node.sent_num > predicate_node.sent_num:
                            arg.add_meta('succeeding', True)

                if len(added_args) > 0:
                    self.stats['predicates_with_implicit'][p_text] += 1
                    self.stats['implicit_slots'][p_text] += len(added_args)

    def set_wsj_text(self, doc, fileid):
        """Rebuild the document text from the treebank and set it on ``doc``.

        Words are joined by single spaces and sentences end with a newline.
        Tokens tagged ``-NONE-`` are left out of the text.

        Returns:
            One list per sentence containing a ``(start, end)`` character
            span for each kept word and ``None`` for each ``-NONE-`` token.
        """
        text = ''
        w_start = 0

        spans = []
        for tagged_sent in self.wsj_treebank.tagged_sents(fileid):
            word_spans = []

            for word, tag in tagged_sent:
                if not tag == '-NONE-':
                    text += word + ' '
                    word_spans.append((w_start, w_start + len(word)))
                    # +1 accounts for the separating space.
                    w_start += len(word) + 1
                else:
                    # Ignoring these words.
                    word_spans.append(None)

            text += '\n'
            # +1 accounts for the newline.
            w_start += 1

            spans.append(word_spans)

        doc.set_text(text)

        return spans

    def load_nombank(self):
        """Return all NomBank instances grouped by their full file id."""
        all_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            all_annos[nb_instance.fileid].append(nb_instance)
        return all_annos

    def get_doc(self):
        """Yield one fully annotated document per NomBank document id.

        Documents without G&C annotations are skipped when
        ``self.params.gc_only`` is set.
        """
        for docid, instances in self.nombank_annos.items():
            if self.params.gc_only and docid not in self.gc_annos:
                continue

            doc = DEDocument(self.corpus)
            doc.set_id(docid)

            # Treebank file id: the first two characters after the last
            # underscore of the doc id form the directory.
            fileid = docid.split('_')[-1][:2] + '/' + docid

            parsed_sents = self.wsj_treebank.parsed_sents(fileids=fileid)
            doc.set_parsed_sents(parsed_sents)

            token_spans = self.set_wsj_text(doc, fileid)
            doc.set_token_spans(token_spans)

            self.add_all_annotations(doc, parsed_sents)

            yield doc

    def print_stats(self):
        """Print the per-predicate counters and write them to ``counts.txt``.

        The file is written into ``self.stat_dir``, which is created if it
        does not exist. Rows cover the predicates that have implicit
        arguments, followed by a totals row.
        """
        logging.info("Corpus statistics from Nombank")

        keys = self.stats.keys()
        headline = 'predicate\t' + '\t'.join(keys)
        sums = Counter()

        if not os.path.exists(self.stat_dir):
            os.makedirs(self.stat_dir)

        preds = sorted(self.stats['predicates_with_implicit'].keys())

        with open(os.path.join(self.stat_dir, 'counts.txt'), 'w') as out:
            print(headline)
            out.write(f'{headline}\n')

            for pred in preds:
                line = f"{pred}:"
                for key in keys:
                    line += f"\t{self.stats[key][pred]}"
                    sums[key] += self.stats[key][pred]
                print(line)
                out.write(f'{line}\n')

            sum_line = 'Total\t' + '\t'.join([str(sums[k]) for k in keys])
            print(sum_line)
            out.write(f'{sum_line}\n')
コード例 #18
0
# Chinese text is character-based, so the word-reading function words()
# cannot be used on it.
# chinese_mandarin_words=udhr.words('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_words[:13])

# Chinese text is character-based, so the sentence-reading function sents()
# cannot be used on it.
# chinese_mandarin_sents=udhr.sents('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_sents[:13])

# 3.1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

# This path is under the root of the C: drive; the subdirectory needs to
# contain some files.
corpus_root = '/nltk_data/tokenizers/punkt'
word_lists = PlaintextCorpusReader(corpus_root, '.*')
# Prints the file list of the custom corpus.
print("自己语料库的文件列表= ", word_lists.fileids())

from nltk.corpus import BracketParseCorpusReader

# Bracketed treebank files named "wsj_*.mrg" directly under the corpus root.
corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

# `show_subtitle` is defined outside the visible part of this script.
# Subtitle: "file list"
show_subtitle("文件列表")
print(ptb.fileids()[:13])

# Subtitle: "sentence list"
show_subtitle("句子列表")
print(ptb.sents()[:3])

# Subtitle: "sentences in a specific file"
show_subtitle("指定文件中的句子")
print(ptb.sents(fileids='wsj_0003.mrg')[19])
コード例 #19
0
import nltk
import random

from nltk.corpus import BracketParseCorpusReader
from nltk import induce_pcfg

# Reader over the single bracketed treebank file in resources/.
treebank = BracketParseCorpusReader(
    "resources/",
    "skladnica_with_heads.txt",
)

# Collect the productions of every tree from (at most) the first two files.
productions = []
for item in treebank.fileids()[:2]:
    for tree in treebank.parsed_sents(item):
        #tree.draw()
        productions += tree.productions()

# Induce a PCFG from the collected productions, with 'wypowiedzenie:|' as
# the start symbol.
grammar = induce_pcfg(nltk.Nonterminal('wypowiedzenie:|'), productions)
print(grammar.start())
# Debugging aids, left disabled:
#print(grammar.productions())
#print(grammar._lhs_index)
#print(grammar.productions(lhs=grammar.start()))

#print(grammar.productions(lhs=nltk.Nonterminal("wypowiedzenie:|mogę")))
#print(grammar.productions(lhs=nltk.Nonterminal("znakkonca:|.")))

# Starts empty; nothing in the visible part of the script fills it.
used_symbols = []


def generate_symbols(symbol):
コード例 #20
0
from nltk.corpus import BracketParseCorpusReader
corpus_root = r"xenopedia"
file_pattern = r".*\.txt"

# Read every .txt file under the corpus root with the bracketed-parse reader.
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

# Parenthesised single-argument form: prints the same under Python 2 and 3,
# unlike the Python 2-only ``print x`` statement.
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents())