Example #1
def retrieve_data():
    train_data = BracketParseCorpusReader("data", "02-21.10way.clean")
    val_data = BracketParseCorpusReader("data", "22.auto.clean")
    test_data = BracketParseCorpusReader("data", "23.auto.clean")

    train_words = [x.lower() for x in train_data.words()]
    val_words = [x.lower() for x in val_data.words()]
    test_words = [x.lower() for x in test_data.words()]

    all_words = train_words + val_words + test_words

    word_counter = Counter(all_words)

    vocab = ['PAD', 'SOS', 'EOS'] + list(word_counter.keys())
    vocab_size = len(vocab)

    word2idx = {ch: i for i, ch in enumerate(vocab)}
    idx2word = {i: ch for i, ch in enumerate(vocab)}

    train_sents = [[w.lower() for w in sent] for sent in train_data.sents()]
    val_sents = [[w.lower() for w in sent] for sent in val_data.sents()]
    test_sents = [[w.lower() for w in sent] for sent in test_data.sents()]

    train_dataset = TextData(train_sents, word2idx, idx2word, vocab_size)
    val_dataset = TextData(val_sents, word2idx, idx2word, vocab_size)
    test_dataset = TextData(test_sents, word2idx, idx2word, vocab_size)

    return train_dataset, val_dataset, test_dataset
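# A hedged usage sketch (not part of the original snippet): it assumes the three
# Penn Treebank split files referenced above exist under "data/" and that TextData
# behaves like a map-style dataset supporting len() and indexing.
train_dataset, val_dataset, test_dataset = retrieve_data()
print(len(train_dataset))   # number of training sentences
print(train_dataset[0])     # first encoded sentence (exact format depends on TextData)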
Example #2
def loadCorpora():

    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg" 
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
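    # A small hedged follow-up: besides sents(), the same reader exposes tagged and
    # parsed views of the corpus; the fileid reuses the illustrative one above.
    ptb.tagged_sents(fileids='20/wsj_2013.mrg')[19]   # list of (word, POS-tag) pairs
    ptb.parsed_sents(fileids='20/wsj_2013.mrg')[19]   # full bracketed parse tree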
Example #3
def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]     # sections '00'-'19'
        train_folders += [str(i) + str(j) for i in range(2, 3) for j in range(5)]  # sections '20'-'24'

        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()

        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')

        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)

        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')

        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')

        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)

    return lda, dictionary
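# A minimal usage sketch, assuming the gensim LdaModel / Dictionary used in train();
# the printed topics depend entirely on the trained model.
lda, dictionary = train(refresh=False)   # load the previously saved model and dictionary
for topic_id, topic in lda.show_topics(num_topics=5, num_words=8):
    print(topic_id, topic)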
Example #4
from nltk.corpus import gutenberg
words = gutenberg.words("burgess-busterbrown.txt")
words[1:20]
sents = gutenberg.sents("burgess-busterbrown.txt")
sents[1:20]

from nltk.corpus import PlaintextCorpusReader
corpus_root = ''  # set this to your own corpus directory
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r""
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]


#2.2====================

text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]

import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre))
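# A short hedged follow-up: tabulate compares a few conditions over chosen samples;
# the genres and modal verbs below are only illustrative choices.
cfd.tabulate(conditions=['news', 'religion', 'hobbies'],
             samples=['can', 'could', 'may', 'might', 'must', 'will'])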
Example #5
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
print(wordlists.words('blake-poems.txt'))

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'.*/wsj_.*\.mrg'   # pattern for a sectioned wsj/ directory layout
file_pattern = r'wsj_.*\.mrg'      # overrides the above: files sit directly in 'combined'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='wsj_0199.mrg')[1])

# 2. Conditional frequency distributions: a collection of frequency distributions,
#    one per "condition"; each (condition, word) pair counts word under that condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word) for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word)
print(len(genre_word))
print(genre_word[:4])
print(genre_word[-4:])
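# A brief hedged follow-up: each condition in cfd maps to a FreqDist, so individual
# counts can be queried directly (exact numbers depend on the Brown corpus).
print(cfd['news']['could'])        # frequency of 'could' in the news genre
print(cfd['news'].most_common(10)) # ten most frequent words in news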
Example #6
def get_sents_by_field_ids(field_ids):
    if not isinstance(field_ids, list):
        field_ids = [field_ids]
    ptb = BracketParseCorpusReader(DATA_DIR, FILE_PATTERN)
    return ptb.sents(fileids=field_ids)
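# Hedged usage sketch: DATA_DIR and FILE_PATTERN are module-level constants defined
# elsewhere, and the fileid below is only an illustrative Penn Treebank path.
example_sents = get_sents_by_field_ids('20/wsj_2013.mrg')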

#Loading your own Corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
# '.*' can be a list of fileids, like ['a.txt', 'test/b.txt'], or a pattern that matches all fileids, like '[abc]/.*\.txt'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

from nltk.corpus import BracketParseCorpusReader
corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
file_pattern = r".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='20/wsj_2013.mrg')[19]

# Conditional Frequency Distributions: 
# is a collection of frequency distributions, each one for a different "condition".
# The condition will often be the category of the text. 

# A frequency distribution counts observable events,
# such as the appearance of words in a text.
# A conditional frequency distribution needs to pair each event with a condition.
# So instead of processing a sequence of words,
# we have to process a sequence of pairs:
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]
# Each pair has the form (condition, event).
# If we were processing the entire Brown Corpus by genre, there would be 15
# conditions, one per genre.
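# A minimal sketch of the pair-based construction described above (assumes nltk and
# the Brown corpus are installed); the CFD is built directly from (condition, event) pairs.
import nltk
from nltk.corpus import brown
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd.conditions())          # e.g. ['news', 'romance']
print(cfd['romance']['could'])   # count of 'could' under the 'romance' condition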
Example #8
# Chinese text is character-based, so the word-level reader words() cannot be used
# chinese_mandarin_words = udhr.words('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_words[:13])

# Chinese text is character-based, so the sentence-level reader sents() cannot be used either
# chinese_mandarin_sents = udhr.sents('Chinese_Mandarin-UTF8')
# print(chinese_mandarin_sents[:13])

# 3.1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

# This directory sits under the C: drive root; put a few files into it first
corpus_root = '/nltk_data/tokenizers/punkt'
word_lists = PlaintextCorpusReader(corpus_root, '.*')
print("Files in my own corpus =", word_lists.fileids())

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

show_subtitle("文件列表")
print(ptb.fileids()[:13])

show_subtitle("句子列表")
print(ptb.sents()[:3])

show_subtitle("指定文件中的句子")
print(ptb.sents(fileids='wsj_0003.mrg')[19])
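# A small hedged follow-up: the same reader also yields full parse trees; printing
# one shows the bracketed structure (file and index reuse the example above).
show_subtitle("Parse tree from a specified file")
print(ptb.parsed_sents(fileids='wsj_0003.mrg')[19])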
Example #9
class PTBReader(object):
    def __init__(self, corpus_root, file_pattern):
        self.ptb = BracketParseCorpusReader(corpus_root, file_pattern)

        self.all_sents = []
        self.all_tagged_sents = []
        self.all_parsed_sents = []
        self.ptb_file_id = ''

    def read_ptb_file(self, node):
        if node.file_id != self.ptb_file_id:
            path = '{0}/{1}.mrg'.format(node.directory, node.file_id)
            self.all_sents = self.ptb.sents(fileids=path)
            self.all_tagged_sents = self.ptb.tagged_sents(fileids=path)
            self.all_parsed_sents = self.ptb.parsed_sents(fileids=path)
            self.ptb_file_id = node.file_id

    def get_subtree_pos(self, node):
        # leaf_treeposition gives the tree position of the token's leaf; trimming
        # the last (phrase_level + 1) steps moves up to the phrase-level subtree.
        parsed_sent = self.all_parsed_sents[node.sent_id]
        token_pos = parsed_sent.leaf_treeposition(node.token_id)
        subtree_pos = token_pos[:-(node.phrase_level + 1)]
        return subtree_pos

    def is_child_node(self, parent, child):
        if not (isinstance(parent, Node) and isinstance(child, Node)):
            return False
        if not (parent.file_id == child.file_id
                and parent.sent_id == child.sent_id):
            return False

        self.read_ptb_file(parent)
        parent_subtree_pos = self.get_subtree_pos(parent)
        child_subtree_pos = self.get_subtree_pos(child)
        if child_subtree_pos[:len(parent_subtree_pos)] == parent_subtree_pos:
            return True
        else:
            return False

    def parse_node(self, node):
        if node.__class__ == SplitNode:
            # parse each node in the split node
            for n in node.node_list:
                self.parse_node(n)

            # combine the ptb_surface of each node
            node.ptb_idx_list = [
                idx for n in node.node_list for idx in n.ptb_idx_list
            ]
            node.ptb_surface = ' '.join(
                [n.ptb_surface for n in node.node_list])

        else:
            self.read_ptb_file(node)

            node.subtree_pos = self.get_subtree_pos(node)

            parsed_sent = self.all_parsed_sents[node.sent_id]
            node.ptb_idx_list = []
            for idx in range(len(parsed_sent.leaves())):
                if parsed_sent.leaf_treeposition(idx)[:len(node.subtree_pos)] \
                        == node.subtree_pos:
                    node.ptb_idx_list.append(idx)

            assert node.ptb_idx_list == \
                list(range(node.ptb_idx_list[0], node.ptb_idx_list[-1] + 1)), \
                'Error in matching indices for subtree leaves: {0}'.format(node)

            tagged_sent = self.all_tagged_sents[node.sent_id]
            node.ptb_surface = ' '.join([
                word[0]
                for word in [tagged_sent[i] for i in node.ptb_idx_list]
            ])
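    # --- The lines below are a fragment of a separate preprocessing routine; its
    # enclosing function and names such as wsj, TRAIN_FILE, TEST_FILE, DEV_FILE,
    # SECTIONS, MAXLEN, get_vocab, normalize and linearize are defined elsewhere. ---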
    file_pattern = r".*/WSJ_.*\.MRG"
    ptb = BracketParseCorpusReader(wsj, file_pattern)
    print('Gathered %d files...' % len(ptb.fileids()))

    print('Generating vocabulary...')
    vocab = get_vocab()
    print('Done.')

    print('Preprocessing all sections...')
    for fn, sections in zip([TRAIN_FILE, TEST_FILE, DEV_FILE], SECTIONS):
        print('Preprocessing %s...' % fn)
        h = open(fn, 'wt')
        for section in range(sections[0], sections[1] + 1):
            fileids = [
                i for i in ptb.fileids()
                if i.startswith(str(section).zfill(2))
            ]
            for sent, tree in zip(ptb.sents(fileids),
                                  ptb.parsed_sents(fileids)):
                sent = [
                    normalize(word) if normalize(word) in vocab else '<unk>'
                    for word in sent
                ]
                lin = linearize(tree, token=True, label=False)

                if len(sent) < MAXLEN and len(lin.split()) < MAXLEN:
                    h.write('%s\t%s\n' % (' '.join(sent), lin))
        h.close()
        print('Done.')
    print('Done.')
Example #11
# 1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('blake-poems.txt')

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
file_pattern = r'.*/wsj_.*\.mrg'   # pattern for a sectioned wsj/ directory layout
file_pattern = r'wsj_.*\.mrg'      # overrides the above: files sit directly in 'combined'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
ptb.fileids()
len(ptb.sents())
ptb.sents(fileids='wsj_0199.mrg')[1]

# 2. Conditional frequency distributions: a collection of frequency distributions,
#    one per "condition"; each (condition, word) pair counts word under that condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word) for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
len(genre_word)
genre_word[:4]
genre_word[-4:]
cfd = nltk.ConditionalFreqDist(genre_word)
# Fragment of a sliding-window helper glued in from a separate script; the def line
# below is inferred from the call window(sent, 5) further down, so treat it as a sketch.
from itertools import tee

def window(iterable, size, left_nulls=False):
    # Pad left with None's so that the first iteration is [None, ..., None, iterable[0]]
    if left_nulls:
        iterable = [None] * (size - 1) + iterable

    iters = tee(iterable, size)
    for i in range(1, size):
        for each in iters[i:]:
            next(each, None)
    return zip(*iters)


corpus_root = "wsj"
file_pattern = ".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
for sent in ptb.sents():
    for word1, word2, word3, word4, word5 in window(sent, 5):
        counts[-2][word3][word1] += 1
        counts[-1][word3][word2] += 1
        counts[1][word3][word4] += 1
        counts[2][word3][word5] += 1
counts = dict(counts)

for index, outer_dict in counts.items():
    for word, inner_dict in outer_dict.items():
        counts[index][word] = dict(inner_dict)
    counts[index] = dict(outer_dict)

with open('semantic_counts.pickle', 'wb') as f:
    pickle.dump(counts, f)
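# Hedged follow-up sketch: reload the pickled nested dict and look up one count.
# counts[offset][target][context] records how often `context` occurs `offset`
# positions away from `target` (offsets -2, -1, 1, 2 were collected above);
# the words queried below are purely illustrative.
with open('semantic_counts.pickle', 'rb') as f:
    loaded_counts = pickle.load(f)
print(loaded_counts[1].get('the', {}).get('company', 0))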

Example #13
    # Tail of a tree-conversion helper whose definition is elided here (presumably
    # ToTrimmedBinaryTreeStr, judging from the call further down).
    tree.chomsky_normal_form()
    return ToString(tree), ToPlainString(tree)


#corpus_root="/home/jihuni/ptb/treebank_3/parsed/mrg/wsj/train/"
#output="/home/jihuni/ptb/treebank_3/wsj.train"
corpus_root = sys.argv[1]
output = sys.argv[2]
file_pattern = r".*/wsj_.*\.mrg"
if len(sys.argv) > 3:
    file_pattern = sys.argv[3]

ptb = BracketParseCorpusReader(corpus_root, file_pattern)
#ptb.fileids()

sents = [' '.join(words) for words in ptb.sents()]
parsed_sents = ptb.parsed_sents()
#binary_trees=[ToBinaryTreeStr(tree) for tree in parsed_sents]
trimmed_binary_trees = [ToTrimmedBinaryTreeStr(tree) for tree in parsed_sents]

#with open(output, 'w') as f:
#    for sent in sents:
#        f.write(sent+'\n')
#with open(output+'.tree', 'w') as f:
#    for sent in binary_trees:
#        f.write(sent+'\n')
with open(output + '.trim', 'w', encoding='utf-8') as f, \
        open(output + '.trim.tree', 'w', encoding='utf-8') as f2:
    for sent, plain_sent in trimmed_binary_trees:
        f.write(plain_sent + '\n')
        f2.write(sent + '\n')
from nltk.corpus import BracketParseCorpusReader

corpus_root = r"xenopedia"
file_pattern = r".*\.txt"

ptb = BracketParseCorpusReader(corpus_root, file_pattern)

print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents())
# Fragment of the same sliding-window helper as above; the def line is again inferred
# from the call window(sent, 5) further down.
from itertools import tee

def window(iterable, size, left_nulls=False):
    # Pad left with None's so that the first iteration is [None, ..., None, iterable[0]]
    if left_nulls:
        iterable = [None] * (size - 1) + iterable

    iters = tee(iterable, size)
    for i in range(1, size):
        for each in iters[i:]:
            next(each, None)
    return zip(*iters)


corpus_root = "wsj"
file_pattern = ".*/wsj_.*\.mrg"
ptb = BracketParseCorpusReader(corpus_root, file_pattern)

counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
for sent in ptb.sents():
    for word1, word2, word3, word4, word5 in window(sent, 5):
        counts[-2][word3][word1] += 1
        counts[-1][word3][word2] += 1
        counts[1][word3][word4] += 1
        counts[2][word3][word5] += 1
counts = dict(counts)

for index, outer_dict in counts.items():
    for word, inner_dict in outer_dict.items():
        counts[index][word] = dict(inner_dict)
    counts[index] = dict(outer_dict)

with open('semantic_counts.pickle', 'wb') as f:
    pickle.dump(counts, f)
Example #16
from nltk.corpus import BracketParseCorpusReader
corpus_root = r"xenopedia"
file_pattern = r".*\.txt"

ptb = BracketParseCorpusReader(corpus_root, file_pattern)

print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents())