Beispiel #1
0
def extracting_cnf(corpus_root, file_pattern):
    """Collect grammar rules from the first file of a bracketed treebank.

    The corpus is opened with ``BracketParseCorpusReader``; every subtree of
    the processed sentences is handed to ``return_rule`` together with the
    shared ``cnf_dict`` accumulator and the file id.

    Args:
        corpus_root: Root directory passed to ``BracketParseCorpusReader``.
        file_pattern: File id pattern passed to ``BracketParseCorpusReader``.

    Returns:
        The ``cnf_dict`` dictionary. It is created here with an empty
        ``'lexicon'`` set and passed to ``return_rule`` for every subtree.

    Raises:
        IndexError: If the reader finds no file ids.
    """
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cnf_dict = {'lexicon': set()}

    # Only the first file of the corpus is processed.
    fileid = ptb.fileids()[0]
    print(fileid)

    # Build the sentence view once; the previous version re-created it (and
    # re-computed its length) for every single sentence.
    parsed = ptb.parsed_sents(fileid)
    for index, tree in enumerate(parsed):
        if index == 0:
            # NOTE(review): the original loop started at index 1, so the
            # first sentence is skipped. Kept as-is -- confirm it is intended.
            continue
        for sub in tree.subtrees():
            return_rule(sub, cnf_dict, fileid)
    return cnf_dict
Beispiel #2
0
def read_ptb():
    """Read every file in ``PTB_DATA_DIR`` and build ``Sentence`` objects.

    For each parse the space-joined leaves are written as one line to
    ``ptb.sents`` (UTF-8), the tokens and POS tags are registered in
    ``VOCDICT`` / ``POSDICT``, and a ``Sentence`` numbered consecutively from
    0 is created and filled via ``get_all_parts_of_ctree``.

    Returns:
        list: The ``Sentence`` objects in the order they were read.
    """
    sys.stderr.write("\nReading PTB data from " + PTB_DATA_DIR + " ...\n")
    sentences = []
    senno = 0
    # The ``with`` statement closes the file; no explicit close() is needed.
    with codecs.open("ptb.sents", "w", "utf-8") as ptbsf:
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sentence numbering and the output file reproducible across runs.
        for constitfile in sorted(os.listdir(PTB_DATA_DIR)):
            reader = BracketParseCorpusReader(PTB_DATA_DIR, constitfile)
            # TODO: map from parses to sentences
            for p in reader.parsed_sents():
                ptbsf.write(" ".join(p.leaves()) + "\n")
                tokpos = p.pos()
                tokens = [VOCDICT.addstr(tok) for tok, pos in tokpos]
                postags = [POSDICT.addstr(pos) for tok, pos in tokpos]
                s = Sentence(
                    "constit",
                    sentnum=senno,
                    tokens=tokens,
                    postags=postags,
                )
                s.get_all_parts_of_ctree(p, CLABELDICT, False)
                sentences.append(s)
                senno += 1
    sys.stderr.write("# PTB sentences: %d\n" % len(sentences))
    return sentences
def tree_reader():
    """Load every file under ``parsed_sentences/`` into a dictionary.

    Returns:
        dict: Maps each file id, with every ``.tree`` occurrence removed, to
        the list of parsed sentences read from that file.
    """
    reader = BracketParseCorpusReader("parsed_sentences/", ".*")
    return {
        re.sub(r"\.tree", "", fileid): list(reader.parsed_sents(fileid))
        for fileid in reader.fileids()
    }
def tree_reader():
    """Read all parse files below ``parsed_sentences/``.

    Returns:
        dict: Key is the file id stripped of every ``.tree`` occurrence,
        value is the list of parsed sentences of that file.
    """
    corpus = BracketParseCorpusReader("parsed_sentences/", ".*")
    suffix = re.compile(r"\.tree")
    parses_by_name = {}
    for fileid in corpus.fileids():
        key = suffix.sub("", fileid)
        parses_by_name[key] = list(corpus.parsed_sents(fileid))
    return parses_by_name
Beispiel #5
0
def extracting_cfg(
        corpus_root,
        file_pattern):
    """Extract a CFG with at most two symbols on each right-hand side.

    Only sentences with at most 8 leaves are used. Every subtree yields one
    rule; rules with more than two children are split by introducing new
    left-hand sides named after the ``'_'``-joined remaining children.

    Args:
        corpus_root: Root directory passed to ``BracketParseCorpusReader``.
        file_pattern: File id pattern passed to ``BracketParseCorpusReader``.

    Returns:
        tuple: ``(cfg_dict, lexicon, unite_productions)`` where
        ``cfg_dict`` maps a left-hand side to a set of right-hand-side tuples,
        ``lexicon`` maps a label to the set of leaf strings found directly
        below it, and ``unite_productions`` holds the rules whose right-hand
        side has exactly one symbol.
    """
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    cfg_dict = {}
    unite_productions = {}
    lexicon = {}
    for fileid in ptb.fileids():
        print(fileid)
        for sentence in ptb.parsed_sents(fileid):
            if len(sentence.leaves()) > 8:
                continue
            for subtree in sentence.subtrees():
                head = subtree.label()
                body = []
                for child in subtree:
                    if isinstance(child, str):
                        # Leaf: record the word for this label.
                        body.append(child)
                        lexicon.setdefault(head, set()).add(child)
                    else:
                        body.append(child.label())

                # Peel off the first symbol until only two remain.
                while len(body) > 2:
                    rest_head = '_'.join(body[1:])
                    cfg_dict.setdefault(head, set()).add((body[0], rest_head))
                    head = rest_head
                    body = body[1:]

                rule = tuple(body)
                if len(body) == 1:
                    # Unit production: also tracked separately.
                    unite_productions.setdefault(head, set()).add(rule)
                cfg_dict.setdefault(head, set()).add(rule)
    return cfg_dict, lexicon, unite_productions
Beispiel #6
0
def seg_pos_ctb(ctb_dir, fileids):
    """Extract (word, POS) sequences from bracketed parse files.

    Args:
        ctb_dir: Root directory passed to ``BracketParseCorpusReader``.
        fileids: File ids / pattern passed to ``BracketParseCorpusReader``.

    Returns:
        tuple: ``(seg_pos_sentences, broken_parses)``. The first list holds
        one list of ``(word, tag)`` pairs per sentence, where every pair
        tagged ``'-NONE-'`` is replaced by ``(' NONE ', 'NONE')``. The second
        list collects the ``pos()`` results that were empty or whose first
        element is not a tuple.
    """
    reader = BracketParseCorpusReader(ctb_dir, fileids)
    # Tree structure of every sentence. Some of the data cannot be parsed
    # correctly (for example one sentence in 40.nw).
    tree = reader.parsed_sents()
    print('tree len: {}'.format(len(tree)))

    seg_pos_sentences, broken_parses = [], []
    for parsed in tree:
        pairs = parsed.pos()
        if not pairs or type(pairs[0]) != tuple:
            broken_parses.append(pairs)
            continue
        seg_pos_sentences.append([
            (' NONE ', 'NONE') if pair[1] == '-NONE-' else pair
            for pair in pairs
        ])

    return seg_pos_sentences, broken_parses
Beispiel #7
0
def parse_trees(dir, fileid):
    """Return the parsed sentences of ``fileid`` found under ``dir``.

    Both arguments are forwarded unchanged to ``BracketParseCorpusReader``.
    """
    return BracketParseCorpusReader(dir, fileid).parsed_sents()
def load_reader_and_filedids(lang, data_type):
    """Load the filtered parse trees of one split of a treebank.

    The treebank is read from ``treebank_dir + '/' + lang``.

    Args:
        lang: Language code; one of ``'en'``, ``'zh'``, ``'fr'``, ``'il'``,
            ``'jp'``, ``'sp'``, ``'ca'``, ``'sw'``, ``'de'``.
        data_type: ``'train'``, ``'val'`` or ``'test'`` to get that split, or
            ``'wsj10'`` to get all trees of the corpus with at most 10 leaves.

    Returns:
        list: The trees of the requested split that pass the length and
        character filter (at most 10 leaves for ``'wsj10'``, at most 128
        otherwise; no NUL, U+FFFD or control characters in the leaves).

    Raises:
        AssertionError: If ``lang`` or ``data_type`` is not supported.
    """
    # 'wsj10' is accepted here because the code below has a dedicated branch
    # for it; the previous assertion made that branch unreachable.
    assert data_type in ('train', 'val', 'test', 'wsj10')

    def filter_trees(tree, data_type):
        """Return True if ``tree`` is short enough and has clean text."""
        def _is_control(char):
            """Checks whether `chars` is a control character."""
            # These are technically control characters but we count them as
            # whitespace characters.
            if char == "\t" or char == "\n" or char == "\r":
                return False
            return unicodedata.category(char).startswith("C")

        sent = tree.leaves()
        # The caller passes 'wsj10'; 'wsj' is kept for backward compatibility.
        max_len = 10 if data_type in ('wsj', 'wsj10') else 128
        if len(sent) > max_len:
            return False
        try:
            for c in ' '.join(sent):
                cp = ord(c)
                if cp == 0 or cp == 0xfffd or _is_control(c):
                    return False
        except Exception:
            # Best effort: a sentence that cannot be inspected is dropped.
            # (No bare ``except`` so KeyboardInterrupt still propagates.)
            return False
        return True

    def filt_id(fileids, lang):
        """Split ``fileids`` into (train, valid, test) lists for ``lang``."""
        assert lang in ('en', 'fr', 'zh')
        train_file_ids, valid_file_ids, test_file_ids = [], [], []
        for id in fileids:
            prefix = id.split('.')[0]
            if lang == 'en':
                if 'WSJ/22/WSJ_2200' <= prefix <= 'WSJ/22/WSJ_2299':
                    valid_file_ids.append(id)
                elif 'WSJ/23/WSJ_2300' <= prefix <= 'WSJ/23/WSJ_2399':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
            elif lang == 'zh':
                if '0886' <= prefix <= '0931' or '1148' <= prefix <= '1151':
                    valid_file_ids.append(id)
                elif '0816' <= prefix <= '0885' or '1137' <= prefix <= '1147':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
            else:
                if prefix in ('flmf3_12500_12999co', 'flmf7ab2ep',
                              'flmf7ad1co', 'flmf7ae1ep'):
                    valid_file_ids.append(id)
                elif prefix in ('flmf3_12000_12499ep', 'flmf7aa1ep',
                                'flmf7aa2ep', 'flmf7ab1co'):
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
        return train_file_ids, valid_file_ids, test_file_ids

    assert lang in ('en', 'zh', 'fr', 'il', 'jp', 'sp', 'ca', 'sw', 'de')
    lang_dir = treebank_dir + '/' + lang
    reader = BracketParseCorpusReader(lang_dir, '.*')
    fileids = reader.fileids()
    if data_type == 'wsj10':
        return [t for t in reader.parsed_sents(fileids)
                if filter_trees(t, data_type)]

    # Default to empty splits so that a corpus without a matching file id
    # yields an empty result instead of an unbound-variable error.
    train_trees, val_trees, test_trees = [], [], []
    if lang in ('en', 'zh', 'fr'):
        train_file_ids, valid_file_ids, test_file_ids = filt_id(fileids, lang)
        train_trees = reader.parsed_sents(train_file_ids)
        val_trees = reader.parsed_sents(valid_file_ids)
        test_trees = reader.parsed_sents(test_file_ids)
    else:
        # For the other languages the split is taken from the file name; if
        # several files match one split, the last one wins.
        for fid in fileids:
            if 'train' in fid:
                train_trees = reader.parsed_sents(fid)
            elif 'val' in fid:
                val_trees = reader.parsed_sents(fid)
            elif 'test' in fid:
                test_trees = reader.parsed_sents(fid)

    selected = {
        'train': train_trees,
        'val': val_trees,
        'test': test_trees,
    }[data_type]
    selected = [t for t in selected if filter_trees(t, data_type)]
    print(f'{data_type}:{len(selected)}')
    return selected
Beispiel #9
0
def revertPOS(symbol):
    """Return ``symbol`` without its first and its last character."""
    inner_end = len(symbol) - 1
    return symbol[1:inner_end]

###### Main #########################################################################
if __name__ == '__main__':
    clArgs = createArgParser().parse_args()
    #Check if any arguments are given. If not, display help
    active = False

    if clArgs.penn != None and clArgs.grammar != None:
        active = True
        ## Set up the treebank reader
        ptb = BracketParseCorpusReader(path.dirname(clArgs.penn), [path.basename(clArgs.penn)])

        ## Collect all terminal and nonterminals
        for tree in ptb.parsed_sents(ptb.fileids()[0]):
            # Also set the start symbol to the root of the first tree
            if len(start_symbol) == 0:
                start_symbol = tree.node
            findSymbolsInTree(tree)


        ## Find ambiguous symbols and map them to a unique alternative
        for symbol in nonterminals.intersection(pos):
            replacement = "_" + symbol + "_"
            symbolMap[symbol] = replacement
            if replacement in pos or replacement in nonterminals:
                print "Cannot make nonterminal unambiguous: ", symbol
                sys.exit(-1)

        ## Iterate over all trees and replace ambigous nonterminals with their unique alternative
Beispiel #10
0
class PTBReader(object):
    """Look up tree positions and surface strings of nodes in a treebank.

    The parsed, tagged and plain sentences of the most recently requested
    file are cached on the instance and only reloaded when a node from a
    different file is processed.
    """

    def __init__(self, corpus_root, file_pattern):
        """Create the underlying ``BracketParseCorpusReader``.

        Args:
            corpus_root: Root directory passed to the corpus reader.
            file_pattern: File id pattern passed to the corpus reader.
        """
        self.ptb = BracketParseCorpusReader(corpus_root, file_pattern)

        self.all_sents = []
        self.all_tagged_sents = []
        self.all_parsed_sents = []
        # File id of the currently cached file ('' means nothing is cached).
        self.ptb_file_id = ''

    def read_ptb_file(self, node):
        """Load the file that ``node`` belongs to unless it is cached.

        The file path is built as ``<node.directory>/<node.file_id>.mrg``.
        """
        if node.file_id != self.ptb_file_id:
            path = '{0}/{1}.mrg'.format(node.directory, node.file_id)
            self.all_sents = self.ptb.sents(fileids=path)
            self.all_tagged_sents = self.ptb.tagged_sents(fileids=path)
            self.all_parsed_sents = self.ptb.parsed_sents(fileids=path)
            self.ptb_file_id = node.file_id

    def get_subtree_pos(self, node):
        """Return the tree position of the subtree that ``node`` points to.

        The position of leaf ``node.token_id`` in sentence ``node.sent_id``
        is truncated by ``node.phrase_level + 1`` trailing steps.
        """
        parsed_sent = self.all_parsed_sents[node.sent_id]
        token_pos = parsed_sent.leaf_treeposition(node.token_id)
        subtree_pos = token_pos[:-(node.phrase_level + 1)]
        return subtree_pos

    def is_child_node(self, parent, child):
        """Return True if ``child``'s subtree lies at or below ``parent``'s.

        Both arguments must be ``Node`` instances from the same file and
        sentence; otherwise False is returned.
        """
        if not (isinstance(parent, Node) and isinstance(child, Node)):
            return False
        if not (parent.file_id == child.file_id
                and parent.sent_id == child.sent_id):
            return False

        self.read_ptb_file(parent)
        parent_subtree_pos = self.get_subtree_pos(parent)
        child_subtree_pos = self.get_subtree_pos(child)
        # The child is below the parent iff the parent's tree position is a
        # prefix of the child's tree position.
        return child_subtree_pos[:len(parent_subtree_pos)] == \
            parent_subtree_pos

    @staticmethod
    def _is_contiguous(indices):
        """Return True if ``indices`` is a non-empty run of consecutive ints."""
        if not indices:
            return False
        # ``range`` must be materialised: on Python 3 a list never compares
        # equal to a range object.
        return list(indices) == list(range(indices[0], indices[-1] + 1))

    def parse_node(self, node):
        """Fill in ``ptb_idx_list`` and ``ptb_surface`` of ``node``.

        A ``SplitNode`` is handled by parsing each of its member nodes and
        concatenating their results. Any other node additionally gets its
        ``subtree_pos`` set.

        Raises:
            AssertionError: If the leaves found below the subtree are not a
                contiguous index range.
        """
        if node.__class__ == SplitNode:
            # parse each node in the split node
            for n in node.node_list:
                self.parse_node(n)

            # combine the ptb_surface of each node
            node.ptb_idx_list = [
                idx for n in node.node_list for idx in n.ptb_idx_list
            ]
            node.ptb_surface = ' '.join(
                [n.ptb_surface for n in node.node_list])

        else:
            self.read_ptb_file(node)

            node.subtree_pos = self.get_subtree_pos(node)

            parsed_sent = self.all_parsed_sents[node.sent_id]
            prefix_len = len(node.subtree_pos)
            # A leaf belongs to the subtree iff the subtree position is a
            # prefix of the leaf's tree position.
            node.ptb_idx_list = [
                idx for idx in range(len(parsed_sent.leaves()))
                if parsed_sent.leaf_treeposition(idx)[:prefix_len]
                == node.subtree_pos
            ]

            assert self._is_contiguous(node.ptb_idx_list), \
                'Error in matching indices for subtree leaves: {0}'.format(node)

            tagged_sent = self.all_tagged_sents[node.sent_id]
            node.ptb_surface = ' '.join([
                word[0]
                for word in [tagged_sent[i] for i in node.ptb_idx_list]
            ])
Beispiel #11
0
def parse_trees(fileid, corpus_root='/home/lnn/Downloads/ctb_test'):
    """Return the parsed sentences of ``fileid`` from a bracketed corpus.

    Args:
        fileid: File id / pattern passed to ``BracketParseCorpusReader``.
        corpus_root: Directory that contains the corpus. Defaults to the
            path that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        The result of ``BracketParseCorpusReader.parsed_sents()``.
    """
    reader = BracketParseCorpusReader(corpus_root, fileid)
    return reader.parsed_sents()
Beispiel #12
0
    "sound": "os-sound",
    "sounded": "os-sound",
    "sounds": "os-sound",
    "sounding": "os-sound",
    "ask": "os-ask",
    "asked": "os-ask",
    "asks": "os-ask",
    "asking": "os-ask",
}

# Open the treebank. Files are decoded as ISO-8859-1; the UTF-8 setting is
# kept below (commented out) as the alternative.
ptb = BracketParseCorpusReader(
    corpus_root,
    file_pattern,
    #encoding='utf-8'
    encoding='iso-8859-1')
# Parsed sentences of all files matched by ``file_pattern``.
ptbS = ptb.parsed_sents()


def getClauseHead(st):
    clauseHead = ''
    # delete disfluencies at the S level
    for daughter in st:
        if daughter.label() in [
                'EDITED', 'RS', 'PRN', '-DFL-', 'CONJP', 'ADVP'
        ]:  # including "not only ..."
            del daughter
    #print(joinLeaves(st), st.label())
    if st[0].label()[:2] == 'WH' and joinLeaves(st[0]).lower() == '0':
        clauseHead = 'whNull'
    i = 0
    while clauseHead == '' and i < min(2, len(st)):
Beispiel #13
0
    return ToString(tree), ToPlainString(tree)


import io

# Command line: <corpus_root> <output_prefix> [file_pattern]
# e.g. corpus_root="/home/jihuni/ptb/treebank_3/parsed/mrg/wsj/train/"
#      output="/home/jihuni/ptb/treebank_3/wsj.train"
corpus_root = sys.argv[1]
output = sys.argv[2]
file_pattern = r".*/wsj_.*\.mrg"
# The optional third argument overrides the pattern. The argument COUNT has
# to be tested; comparing the argv list itself with 3 raises TypeError on
# Python 3 and is always true on Python 2.
if len(sys.argv) > 3:
    file_pattern = sys.argv[3]

ptb = BracketParseCorpusReader(corpus_root, file_pattern)

# Plain sentences; not written anywhere by this script but kept because it
# is a module-level name.
sents = [' '.join(words) for words in ptb.sents()]
parsed_sents = ptb.parsed_sents()
# Each entry is a (tree string, plain string) pair.
trimmed_binary_trees = [ToTrimmedBinaryTreeStr(tree) for tree in parsed_sents]

# Write text through UTF-8 file objects instead of concatenating encoded
# bytes with a str newline (a TypeError on Python 3). io.open behaves the
# same on Python 2 and Python 3.
with io.open(output + '.trim', 'w', encoding='utf-8') as f, \
        io.open(output + '.trim.tree', 'w', encoding='utf-8') as f2:
    for sent, plain_sent in trimmed_binary_trees:
        f.write(plain_sent + u'\n')
        f2.write(sent + u'\n')
Beispiel #14
0
class NomBank(DataLoader):
    """Loading Nombank data and implicit argument annotations.

    Combines the WSJ treebank, the NomBank corpus and the G&C implicit
    argument annotations (read from ``params.implicit_path``) and turns them
    into ``DEDocument`` objects with predicates and arguments attached.
    """

    def __init__(self, params, corpus, with_doc=False):
        """Open the corpora and load all annotations into memory.

        Args:
            params: Configuration object; this class reads ``wsj_path``,
                ``wsj_file_pattern``, ``nombank_path``, ``nomfile``,
                ``frame_file_pattern``, ``nombank_nouns_file``,
                ``implicit_path``, ``explicit_only``, ``gc_only`` and
                ``stat_dir`` from it.
            corpus: Passed to the base class and to every ``DEDocument``.
            with_doc: Passed to the base class.
        """
        super().__init__(params, corpus, with_doc)

        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')

        logging.info('Found {} treebank files.'.format(
            len(self.wsj_treebank.fileids())))

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            # Drop the first four characters of the parse file id.
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=self.wsj_treebank)

        logging.info("Loading G&C annotations.")
        # NOTE(review): load_gc_annotations() reads self.params, which is
        # presumably set by the base-class constructor -- confirm.
        self.gc_annos = self.load_gc_annotations()
        num_gc_preds = sum(len(preds) for preds in self.gc_annos.values())
        logging.info(f"Loaded {num_gc_preds} predicates")

        logging.info("Loading Nombank annotations")
        # Instances grouped by document id (last path component of fileid).
        self.nombank_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            docid = nb_instance.fileid.split('/')[-1]
            self.nombank_annos[docid].append(nb_instance)

        self.stats = {
            'target_pred_count': Counter(),
            'predicates_with_implicit': Counter(),
            'implicit_slots': Counter(),
        }

        self.stat_dir = params.stat_dir

    class NomElement:
        """A tree node identified by article id, sentence number and pointer."""

        def __init__(self, article_id, sent_num, tree_pointer):
            self.article_id = article_id
            self.sent_num = int(sent_num)
            self.pointer = tree_pointer

        @staticmethod
        def from_text(pointer_text):
            """Build an element from ``<file>:<sent>:<terminal>:<height>``.

            Raises:
                ValueError: If the text does not have exactly four
                    colon-separated parts.
            """
            parts = pointer_text.split(':')
            if len(parts) != 4:
                raise ValueError("Invalid pointer text.")

            read_id = parts[0]
            # e.g. 'wsj_0123' -> '01/wsj_0123.mrg'
            full_id = read_id.split('_')[1][:2] + '/' + read_id + '.mrg'

            return NomBank.NomElement(
                full_id, int(parts[1]),
                NombankTreePointer(int(parts[2]), int(parts[3])))

        def __str__(self):
            return 'Node-%s-%s:%s' % (self.article_id, self.sent_num,
                                      repr(self.pointer))

        def __hash__(self):
            return hash(
                (self.article_id, self.sent_num, repr(self.pointer)))

        def __eq__(self, other):
            # Always return a real bool (the previous ``other and ...`` form
            # could return ``None`` or another falsy object).
            return bool(other) and str(other) == str(self)

        __repr__ = __str__

    def load_gc_annotations(self):
        """Parse the implicit argument XML file at ``params.implicit_path``.

        Returns:
            defaultdict: ``doc id -> {predicate NomElement -> {argument type
            -> [argument NomElement, ...]}}``. Only arguments that are not
            explicit, not located after the predicate's sentence and not
            identical to the predicate node are kept.
        """
        tree = ET.parse(self.params.implicit_path)
        root = tree.getroot()

        gc_annotations = defaultdict(dict)

        def merge_split_pointers(pointers):
            """Merge all split pointers into one chain pointer."""
            all_pointers = []
            split_pointers = []

            for pointer, is_split in pointers:
                if is_split:
                    split_pointers.append(pointer)
                else:
                    all_pointers.append(pointer)

            if len(split_pointers) > 0:
                # Sort in place; the previous ``sorted(...)`` call discarded
                # its result, so the chain was built from unsorted pointers.
                split_pointers.sort(key=lambda t: t.wordnum)
                all_pointers.append(NombankChainTreePointer(split_pointers))

            return all_pointers

        total_implicit_count = 0
        total_preds = 0

        for annotations in root:
            pred_node_pos = annotations.attrib['for_node']
            predicate = NomBank.NomElement.from_text(pred_node_pos)

            article_id = predicate.article_id

            total_preds += 1

            explicit_roles = set()

            arg_annos = defaultdict(list)

            for annotation in annotations:
                arg_type = annotation.attrib['value']
                arg_node_pos = annotation.attrib['node']

                (arg_article_id, arg_sent_id, arg_terminal_id,
                 arg_height) = arg_node_pos.split(':')

                is_split = False
                is_explicit = False

                for attribute in annotation[0]:
                    if attribute.text == 'Split':
                        is_split = True
                    elif attribute.text == 'Explicit':
                        is_explicit = True

                if pred_node_pos == arg_node_pos:
                    # Incorporated nodes are explicit.
                    is_explicit = True

                if is_explicit:
                    explicit_roles.add(arg_type)
                else:
                    p = NombankTreePointer(int(arg_terminal_id),
                                           int(arg_height))
                    # Arguments are grouped by their sentences.
                    arg_annos[(arg_sent_id, arg_type)].append((p, is_split))

            all_args = defaultdict(list)
            implicit_role_here = set()
            for (arg_sent_id, arg_type), l_pointers in arg_annos.items():
                if int(arg_sent_id) > predicate.sent_num:
                    # Ignoring annotations after the sentence.
                    continue

                if arg_type not in explicit_roles:
                    for p in merge_split_pointers(l_pointers):
                        arg_element = NomBank.NomElement(
                            article_id, arg_sent_id, p)

                        if not predicate.pointer == arg_element.pointer:
                            # Ignoring incorporated ones.
                            all_args[arg_type].append(arg_element)
                            implicit_role_here.add(arg_type)

            gc_annotations[article_id.split('/')[-1]][predicate] = all_args

            total_implicit_count += len(implicit_role_here)

        logging.info(f"Loaded {total_preds} predicates, "
                     f"{total_implicit_count} implicit arguments.")

        return gc_annotations

    def add_predicate(self, doc, parsed_sents, predicate_node):
        """Add the predicate for ``predicate_node`` to ``doc``.

        Returns:
            The object returned by ``doc.add_predicate``, or ``None`` when
            the predicate covers no text.
        """
        pred_node_repr = "%s:%d:%s" % (doc.docid, predicate_node.sent_num,
                                       predicate_node.pointer)
        p_tree = parsed_sents[predicate_node.sent_num]
        p_word_idx = utils.make_words_from_pointer(p_tree,
                                                   predicate_node.pointer)
        predicate_span = utils.get_nltk_span(doc.token_spans,
                                             predicate_node.sent_num,
                                             p_word_idx)

        if len(predicate_span) == 0:
            logging.warning("Zero length predicate found")
            return

        p = doc.add_predicate(None, predicate_span, frame_type='NOMBANK')

        if p:
            p.add_meta('node', pred_node_repr)

        return p

    def add_nombank_arg(self,
                        doc,
                        parsed_sents,
                        wsj_spans,
                        arg_type,
                        predicate,
                        arg_node,
                        implicit=False):
        """Add one argument of ``predicate`` to ``doc``.

        Implicit arguments get the type prefix ``i_`` and extra meta data.

        Returns:
            The argument mention, or ``None`` when the argument covers no
            text or no entity mention could be created.
        """
        arg_type = arg_type.lower()

        a_tree = parsed_sents[arg_node.sent_num]
        a_word_idx = utils.make_words_from_pointer(a_tree, arg_node.pointer)

        arg_node_repr = "%s:%d:%s" % (doc.docid, arg_node.sent_num,
                                      arg_node.pointer)
        argument_span = utils.get_nltk_span(wsj_spans, arg_node.sent_num,
                                            a_word_idx)

        if len(argument_span) == 0:
            # Some arguments are empty nodes, they will be ignored.
            return

        em = doc.add_entity_mention(None, argument_span)

        if em:
            if implicit:
                arg_type = 'i_' + arg_type

            arg_mention = doc.add_argument_mention(predicate, em.aid, arg_type)
            arg_mention.add_meta('node', arg_node_repr)

            if implicit:
                arg_mention.add_meta('implicit', True)
                arg_mention.add_meta('sent_num', arg_node.sent_num)
                arg_mention.add_meta('text', em.text)

            return arg_mention

    def get_predicate_text(self, p):
        """Return a normalized, lower-cased form of the predicate text."""
        p_text = p.text.lower()
        if p_text == 'losses' or p_text == 'loss' or p_text == 'tax-loss':
            p_text = 'loss'
        else:
            p_text = p_text.rstrip('s')

        if p_text == 'savings-and-loan':
            p_text = 'loan'

        if '-' in p_text:
            p_text = p_text.split('-')[1]
        return p_text

    def add_all_annotations(self, doc, parsed_sents):
        """Attach the NomBank and (optionally) G&C annotations to ``doc``."""
        logging.info("Adding Nombank annotation for " + doc.docid)
        nb_instances = self.nombank_annos[doc.docid]

        for nb_instance in nb_instances:
            predicate_node = NomBank.NomElement(doc.docid, nb_instance.sentnum,
                                                nb_instance.predicate)

            p = self.add_predicate(doc, parsed_sents, predicate_node)

            for argloc, argid in nb_instance.arguments:
                arg_node = NomBank.NomElement(doc.docid, nb_instance.sentnum,
                                              argloc)
                arg = self.add_nombank_arg(doc, parsed_sents, doc.token_spans,
                                           argid, p, arg_node)

                # add_nombank_arg() returns None for empty arguments.
                if (arg is not None
                        and arg_node.pointer == predicate_node.pointer):
                    arg.add_meta('incorporated', True)

        if not self.params.explicit_only and doc.docid in self.gc_annos:
            for predicate_node, gc_args in self.gc_annos[doc.docid].items():
                added_args = defaultdict(list)

                p = self.add_predicate(doc, parsed_sents, predicate_node)
                if not p:
                    # No predicate could be created (e.g. zero-length span,
                    # already logged by add_predicate); nothing to attach.
                    continue

                p_text = utils.normalize_pred_text(p.text)

                p.add_meta('from_gc', True)

                self.stats['target_pred_count'][p_text] += 1

                for arg_type, arg_nodes in gc_args.items():
                    for arg_node in arg_nodes:
                        arg = self.add_nombank_arg(doc, parsed_sents,
                                                   doc.token_spans, arg_type,
                                                   p, arg_node, True)
                        # Kept as before: the slot is counted even when the
                        # argument turned out to be empty (arg is None).
                        added_args[arg_type].append(arg)

                        if arg is None:
                            continue

                        # The following should be useless already.
                        if arg_node.pointer == predicate_node.pointer:
                            arg.add_meta('incorporated', True)

                        if arg_node.sent_num > predicate_node.sent_num:
                            arg.add_meta('succeeding', True)

                if len(added_args) > 0:
                    self.stats['predicates_with_implicit'][p_text] += 1
                    self.stats['implicit_slots'][p_text] += len(added_args)

    def set_wsj_text(self, doc, fileid):
        """Set the document text from the treebank and return token spans.

        Words are separated by one space and every sentence ends with a
        newline. Tokens tagged ``-NONE-`` are left out of the text.

        Returns:
            list: One list per sentence holding a ``(start, end)`` character
            span for every token, or ``None`` for ``-NONE-`` tokens.
        """
        pieces = []
        w_start = 0

        spans = []
        for tagged_sent in self.wsj_treebank.tagged_sents(fileid):
            word_spans = []

            for word, tag in tagged_sent:
                if tag != '-NONE-':
                    pieces.append(word + ' ')
                    word_spans.append((w_start, w_start + len(word)))
                    w_start += len(word) + 1
                else:
                    # Ignoring these words.
                    word_spans.append(None)

            pieces.append('\n')
            w_start += 1

            spans.append(word_spans)

        # Joined once instead of growing a string inside the loop.
        doc.set_text(''.join(pieces))

        return spans

    def load_nombank(self):
        """Return all NomBank instances grouped by their full file id."""
        all_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            all_annos[nb_instance.fileid].append(nb_instance)
        return all_annos

    def get_doc(self):
        """Yield one annotated ``DEDocument`` per NomBank document.

        Documents without G&C annotations are skipped when
        ``params.gc_only`` is set.
        """
        # Iterate over a snapshot of the keys: ``nombank_annos`` is a
        # defaultdict that is indexed while documents are being built.
        for docid in list(self.nombank_annos):
            if self.params.gc_only and docid not in self.gc_annos:
                continue

            doc = DEDocument(self.corpus)
            doc.set_id(docid)

            # e.g. 'wsj_0123.mrg' -> '01/wsj_0123.mrg'
            fileid = docid.split('_')[-1][:2] + '/' + docid

            parsed_sents = self.wsj_treebank.parsed_sents(fileids=fileid)
            doc.set_parsed_sents(parsed_sents)

            token_spans = self.set_wsj_text(doc, fileid)
            doc.set_token_spans(token_spans)

            self.add_all_annotations(doc, parsed_sents)

            yield doc

    def print_stats(self):
        """Print the collected counts and write them to ``counts.txt``.

        The file is created inside ``stat_dir``, which is created if needed.
        """
        logging.info("Corpus statistics from Nombank")

        keys = list(self.stats)
        headline = 'predicate\t' + '\t'.join(keys)
        sums = Counter()

        os.makedirs(self.stat_dir, exist_ok=True)

        preds = sorted(self.stats['predicates_with_implicit'].keys())

        with open(os.path.join(self.stat_dir, 'counts.txt'), 'w') as out:
            print(headline)
            out.write(f'{headline}\n')

            for pred in preds:
                line = f"{pred}:"
                for key in keys:
                    line += f"\t{self.stats[key][pred]}"
                    sums[key] += self.stats[key][pred]
                print(line)
                out.write(f'{line}\n')

            sum_line = 'Total\t' + '\t'.join([str(sums[k]) for k in keys])
            print(sum_line)
            out.write(f'{sum_line}\n')
from nltk.corpus import BracketParseCorpusReader
import numpy as np
import scipy
from scipy import spatial
import matplotlib.pyplot as plt
import math
import re
import sys
import csv

# Corpus location (relative to the working directory) and the pattern that
# selects its ``.mrg`` files.
corpus_root = r"all/"
file_pattern = r".*\.mrg"

sw = BracketParseCorpusReader(corpus_root, file_pattern)

# Parsed sentences of all matched files.
trees = sw.parsed_sents()


def give(t):
    """Tell whether the label of tree ``t`` is exactly ``'VP'``."""
    label = t.label()
    return label == 'VP'


all_vp = []

for tree in trees:
    for vp in tree.subtrees(give):
        children = []
        pps = []
        pp = []
        for child in vp:
            if 'PP' in child.label():
Beispiel #16
0
import nltk
import random

from nltk.corpus import BracketParseCorpusReader
from nltk import induce_pcfg

# The treebank consists of the single file resources/skladnica_with_heads.txt.
treebank = BracketParseCorpusReader("resources/", "skladnica_with_heads.txt")

# Gather the productions of every tree in (at most) the first two files.
productions = []
for item in treebank.fileids()[:2]:
    for tree in treebank.parsed_sents(item):
        productions.extend(tree.productions())

# Induce a PCFG whose start symbol is 'wypowiedzenie:|' and show that symbol.
grammar = induce_pcfg(nltk.Nonterminal('wypowiedzenie:|'), productions)
print(grammar.start())

used_symbols = []


def generate_symbols(symbol):
    """Write tab-separated "sentence<TAB>linearized tree" files per split.

    For each of ``TRAIN_FILE``, ``TEST_FILE`` and ``DEV_FILE`` the sections
    given by the matching entry of ``SECTIONS`` (an inclusive
    ``(first, last)`` pair) are read from the corpus under ``wsj``. Words
    whose normalized form is not in the vocabulary are replaced by
    ``'<unk>'``; a pair is written only when both the sentence and the
    linearized tree have fewer than ``MAXLEN`` tokens.

    NOTE(review): ``symbol`` is never used and the body does not match the
    function name -- this looks like two snippets merged together. Confirm
    against the original source.

    Args:
        symbol: Unused.
    """
    file_pattern = r".*/WSJ_.*\.MRG"
    ptb = BracketParseCorpusReader(wsj, file_pattern)
    all_fileids = ptb.fileids()
    print('Gathered %d files...' % len(all_fileids))

    print('Generating vocabulary...')
    vocab = get_vocab()
    print('Done.')

    print('Preprocessing all sections...')
    for fn, sections in zip([TRAIN_FILE, TEST_FILE, DEV_FILE], SECTIONS):
        print('Preprocessing %s...' % fn)
        # ``with`` closes the output file even if preprocessing fails.
        with open(fn, 'wt') as h:
            for section in range(sections[0], sections[1] + 1):
                # File ids start with the zero-padded section number.
                prefix = str(section).zfill(2)
                fileids = [i for i in all_fileids if i.startswith(prefix)]
                for sent, tree in zip(ptb.sents(fileids),
                                      ptb.parsed_sents(fileids)):
                    # Normalize each word once instead of twice.
                    normalized = [normalize(word) for word in sent]
                    sent = [
                        word if word in vocab else '<unk>'
                        for word in normalized
                    ]
                    lin = linearize(tree, token=True, label=False)

                    if len(sent) < MAXLEN and len(lin.split()) < MAXLEN:
                        h.write('%s\t%s\n' % (' '.join(sent), lin))
        print('Done.')
    print('Done.')
Beispiel #18
0
def read_brackets(constitfile):
    """Return the parsed sentences of ``constitfile``.

    The file is looked up in the ``rnng/`` directory below
    ``PARSER_DATA_DIR``; a progress message is written to stderr first.
    """
    sys.stderr.write("\nReading constituents from " + constitfile + " ...\n")
    rnng_dir = PARSER_DATA_DIR + "rnng/"
    return BracketParseCorpusReader(rnng_dir, constitfile).parsed_sents()
Beispiel #19
0
                nodeCopy = node.copy()
                node[0:] = []  # delete the children

                curNode = node
                numChildren = len(nodeCopy)
                for i in range(1, numChildren - 1):
                    if factor == "right":
                        newHead = "%s%s<%s>%s" % (
                            originalNode, childChar, "-".join(
                                childNodes[i:min([i +
                                                  horzMarkov, numChildren])]),
                            parentString)  # create new head
                        newNode = Tree(newHead, [])
                        curNode[0:] = [nodeCopy.pop(0), newNode]
                    curNode = newNode

                curNode[0:] = [child for child in nodeCopy]


# NOTE: machine-specific absolute path; adjust before running elsewhere.
corpus_root = r"C:\Users\maksi\Documents\Python\NLP class\HW2\wsj\wsj"
file_pattern = r".*/wsj_.*\.mrg"

ptb = BracketParseCorpusReader(corpus_root, file_pattern)

# Second parsed sentence (index 1) of file 00/wsj_0001.mrg.
tree = ptb.parsed_sents('00/wsj_0001.mrg')[1]
#s= "(S (NP (DT the) (NNS kids)) (VP (VBD opened) (NP (DT the) (NN box)) (PP (IN on)(NP (DT the) (NN floor)))))"
#tree = Tree.fromstring(tree1)
# The return value is ignored and ``tree`` is used afterwards, i.e. the
# conversion is expected to modify the tree in place.
tree.chomsky_normal_form()
# Print one production per line.
for p in tree.productions():
    print(p)