Example no. 1
def test():

    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")

    assert isinstance(jeita.tagged_words()[0][1], str)
Example no. 2
def make_classifier():
    positive_file = 'positive_tweets.json'
    negative_file = 'negative_tweets.json'
    files = [positive_file, negative_file]

    twitter_samples = LazyCorpusLoader('twitter_samples',
                                       TwitterCorpusReader,
                                       files,
                                       word_tokenizer=CustomTokenizer())

    #this returns a list of lists
    twitter_tokens = twitter_samples.tokenized()

    #need to unpack our list of lists, using a nested list comprehension
    frequency_dist = nltk.FreqDist(x for sub in twitter_tokens for x in sub)
    frequency_dist.pprint(100)
    master_list_of_words = tuple(frequency_dist.keys())
    extraction_function = make_extract_features_func(master_list_of_words)

    positive_tokens = twitter_samples.tokenized(positive_file)
    negative_tokens = twitter_samples.tokenized(negative_file)

    positive_tokens = [(token, 'positive') for token in positive_tokens]
    negative_tokens = [(token, 'negative') for token in negative_tokens]

    all_tokens = positive_tokens + negative_tokens
    random.shuffle(all_tokens)

    training_set = nltk.classify.apply_features(extraction_function,
                                                all_tokens)

    classifier = NaiveBayesClassifier.train(training_set)

    return classifier, master_list_of_words
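Example no. 2 calls make_extract_features_func, which is not shown. A minimal sketch of such a helper, assuming it builds a bag-of-words feature extractor over the master word list (the name and behaviour here are assumptions, not the original code):

def make_extract_features_func(master_list_of_words):
    """Return a feature extractor usable with nltk.classify.apply_features."""
    word_set = set(master_list_of_words)

    def extract_features(tokens):
        token_set = set(tokens)
        # one boolean feature per known word: does it occur in this tweet?
        return {'contains({})'.format(word): (word in token_set) for word in word_set}

    return extract_features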
Example no. 3
def ClassifierModel():
    positive_file = 'positive_tweets.json'
    negative_file = 'negative_tweets.json'
    files = [positive_file, negative_file]
    twitter_samples = LazyCorpusLoader('twitter_samples',
                                       TwitterCorpusReader,
                                       files,
                                       word_tokenizer=CustomTokenizer())

    #this returns a list of lists
    twitter_tokens = twitter_samples.tokenized()

    #need to unpack the list of lists using a nested list comprehension
    frequency_dist = nltk.FreqDist(x for sub in twitter_tokens for x in sub)
    frequency_dist.pprint(200)

    master_list_of_words = tuple(frequency_dist.keys())
    extraction_function = feature_extraction(master_list_of_words)
    positive_tokens = twitter_samples.tokenized(positive_file)
    negative_tokens = twitter_samples.tokenized(negative_file)
    positive_tokens = [(token, 'positive') for token in positive_tokens]
    negative_tokens = [(token, 'negative') for token in negative_tokens]
    all_tokens = positive_tokens + negative_tokens
    random.shuffle(all_tokens)
    #creating training set
    training_set = nltk.classify.apply_features(extraction_function,
                                                all_tokens)

    #creating a classifier by calling the train method
    classifier = NaiveBayesClassifier.train(training_set)

    return classifier, master_list_of_words
Example no. 4
def test():

    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")

    assert isinstance(jeita.tagged_words()[0][1], basestring)
Example no. 5
def test():

    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    assert isinstance(jeita.tagged_words()[0][1], basestring)
Example no. 6
def test():

    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    assert isinstance(jeita.tagged_words()[0][1], compat.string_types)
Example no. 7
File: knbc.py Project: DrDub/nltk
def test():

    from nltk.corpus.util import LazyCorpusLoader
    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Example no. 8
def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp")
    assert isinstance(knbc.words()[0], basestring)
    assert isinstance(knbc.sents()[0][0], basestring)
    assert type(knbc.tagged_words()[0]) == tuple
    assert type(knbc.tagged_sents()[0][0]) == tuple
Example no. 9
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [['start0'] + [
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] + ['end0'] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.choice(sentence[1:-2])
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(
            lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
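Example no. 9 (like Examples no. 19 and no. 48 below) relies on an ngrams_sentences helper that is not shown. A minimal sketch built on nltk.util.ngrams, assuming the helper simply returns the n-grams of each tokenized sentence (an assumption, not the original implementation):

from nltk.util import ngrams

def ngrams_sentences(sentences, n):
    # for each tokenized sentence, return the list of its n-gram tuples
    return [list(ngrams(sentence, n)) for sentence in sentences]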
Example no. 10
 def loaddiff(self):
     corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
     ## decode so that Chinese directory names are not garbled
     corpus_root = unicode(corpus_root, "GB2312")
     self.logger.info(corpus_root)
     pattern_1 = r".*/diff1/.*\.txt"
     self.logger.info("Loading corpus (lazy load)")
     self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                   pattern_1)
     self.logger.info("Corpus loading complete")
Example no. 11
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print knbc.fileids()[:10]
    print ''.join( knbc.words()[:100] )

    print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] )

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)"%(m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
        ).encode('utf-8')

    print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] )

    print '\n'.join( ' '.join("%s/%s"%(w[0], w[1].split(' ')[2]) for w in sent)
                     for sent in knbc.tagged_sents()[0:2] )
Example no. 12
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print "/".join(jeita.words()[22100:22140])

    print "\nEOS\n".join(
        ["\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent) for sent in jeita.tagged_sents()[2170:2173]]
    )
Example no. 13
def print_corpus_information(corpus: LazyCorpusLoader,
                             corpus_name: str) -> None:
    """
    Prints information about an NLTK corpus e.g. the Brown corpus.
    :param corpus_name:
    :param corpus: the NLTK corpus in use.
    :return: None.
    """
    print("Number of words in {} corpus = {}".format(corpus_name,
                                                     len(corpus.words())))
    print("Number of sentences in {} corpus = {}".format(
        corpus_name, len(corpus.tagged_sents(tagset='universal'))))
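A hypothetical call to the function above, using the Brown corpus (this assumes the 'brown' and 'universal_tagset' NLTK data packages are installed):

from nltk.corpus import brown
print_corpus_information(brown, "Brown")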
Example no. 14
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    print('/'.join( jeita.words()[22100:22140] ))


    print('\nEOS\n'.join('\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent)
                          for sent in jeita.tagged_sents()[2170:2173]))
Example no. 15
def demo():
    
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader(
        'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    print '/'.join( jeita.words()[22100:22140] ) 


    print '\nEOS\n'.join(['\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent)
                          for sent in jeita.tagged_sents()[2170:2173]])
Example no. 16
def new_wordnet_instance():
    """
    Create a new wordnet instance. This is useful for parallel workflows.
    Multiple processes cannot access the same wordnet instance (as when imported
    globally with `from nltk.corpus import wordnet`). This is because nltk is not
    thread-safe.
    """
    return LazyCorpusLoader(
        'wordnet', WordNetCorpusReader,
        LazyCorpusLoader('omw', CorpusReader,
                         r'.*/wn-data-.*\.tab', encoding='utf8')
    )
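A minimal usage sketch for the function above, assuming the 'wordnet' and 'omw' corpora are installed: each worker process builds its own reader instead of sharing a global one.

from multiprocessing import Pool

def count_synsets(word):
    wn = new_wordnet_instance()  # a fresh per-call reader, kept simple for the sketch
    return word, len(wn.synsets(word))

if __name__ == '__main__':
    with Pool(2) as pool:
        print(pool.map(count_synsets, ['dog', 'cat', 'run']))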
Example no. 17
		def __init__(self, languages=LangIDDict().keys()):
		
			self.language_trigrams = {}
			self.langid = LazyCorpusLoader('langid', LangIdReader, r'(?!\.).*\.txt')
			
			for lang in languages:
				self.language_trigrams[lang] = FreqDist()
				for f in self.langid.freqs(fileids=lang+"-3grams.txt"):
					self.language_trigrams[lang].inc(f[0], f[1])
				self.language_dicts = dict([
					(id, dict([(trigram, float(value)/float(fdist.N())) for trigram, value in fdist.items()]))
					for id, fdist in self.language_trigrams.items()
				])
Example no. 18
def read_knbc(train_file, test_file, reference_file):

	root = nltk.data.find('corpora/knbc/corpus1')
	fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
              if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

	knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
           sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

	sentences = knbc.sents()

	write_train(sentences[0:4000], train_file)
	write_test(sentences[4000:-1], test_file)
	write_reference(sentences[4000:-1], reference_file)
Example no. 19
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [
        ['start0'] + [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in
                      sentence] + ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence)-1)
        sentence[word] = random.choice(vocabulary)
        word = random.choice(sentence[1:-2])
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter+1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
Example no. 20
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print knbc.fileids()[:10]
    print ''.join( knbc.words()[:100] )

    print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] )

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)"%(m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
        ).encode('utf-8')

    print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] )

    print '\n'.join( ' '.join("%s/%s"%(w[0], w[1].split(' ')[2]) for w in sent)
                     for sent in knbc.tagged_sents()[0:2] )
Example no. 21
def parse_wsj(processes=8):
    ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ portions
        'ptb', CategorizedBracketParseCorpusReader, r'wsj/\d\d/wsj_\d\d\d\d.mrg',
        cat_file='allcats.txt', tagset='wsj')

    fileids = ptb.fileids()
    params = []
    for f in fileids:
        corpus = zip(ptb.parsed_sents(f), ptb.tagged_sents(f))
        for i, (parsed, tagged) in enumerate(corpus):
            params.append((f, i, parsed, tagged))

    p = Pool(processes)
    p.starmap(get_best_parse, sorted(params, key=lambda x: (x[0], x[1])))
Example no. 22
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita",
                             ChasenCorpusReader,
                             r".*chasen",
                             encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    print("\nEOS\n".join("\n".join("{}/{}".format(w[0], w[1].split("\t")[2])
                                   for w in sent)
                         for sent in jeita.tagged_sents()[2170:2173]))
Example no. 23
def demo(**kwargs):
    import nltk
    from nltk_contrib.coref import NLTK_COREF_DATA
    from nltk_contrib.coref.muc import muc6_documents, muc7_documents
    from nltk_contrib.coref.muc import MUCCorpusReader
    nltk.data.path.insert(0, NLTK_COREF_DATA)
    muc6 = LazyCorpusLoader('muc6/', MUCCorpusReader, muc6_documents)
    for sent in muc6.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc6.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print
    muc7 = LazyCorpusLoader('muc7/', MUCCorpusReader, muc7_documents)
    for sent in muc7.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc7.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print
Example no. 24
def load_data():
    abc = LazyCorpusLoader(
        "abc",
        PlaintextCorpusReader,
        r"(?!\.).*\.txt",
        encoding=[("science", "latin_1"), ("rural", "utf8")],
    )

    raw = abc.sents()
    sentences = []

    stopwords_ = list(stopwords.words('english'))
    final_stopwords = {w: 1 for w in stopwords_}

    for s in raw:
        words = []
        for w in s:
            if w.isalpha() and w not in final_stopwords:
                words.append(w.lower())
        sentences.append(words)

    word_counts = defaultdict(int)
    for sentence in sentences:
        for word in sentence:
            word_counts[word] += 1

    vocabulary = list(word_counts.keys())
    vocabulary.extend(["<START>", "<END>"])
    vocab_size = len(vocabulary)
    word_to_num = {word: n for n, word in enumerate(vocabulary)}
    num_to_word = {n: word for n, word in enumerate(vocabulary)}

    sums = [-2, -1, 1, 2]
    training_data = []
    for sentence in tqdm(sentences):
        length = len(sentence)
        for cur_index in range(length):
            cur_word = sentence[cur_index]
            context_vector = []
            for diff in sums:
                index = cur_index + diff
                if index >= 0 and index < length:
                    context_word = sentence[index]
                    context_vector.append(context_word)
            if len(context_vector) == 4:
                training_data.append([context_vector, cur_word])

    return vocab_size, vocabulary, word_to_num, num_to_word, training_data
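A hypothetical follow-up to load_data() above, mapping one (context, target) training pair to vocabulary indices (this assumes the 'abc' corpus, the stopwords list and tqdm are available):

vocab_size, vocabulary, word_to_num, num_to_word, training_data = load_data()
context, target = training_data[0]
print([word_to_num[w] for w in context], '->', word_to_num[target])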
Example no. 25
def ham_corpus_maker(outpath, word):
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                              r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    count = 0
    instancenum = 0
    targetwordnum = 0
    for file in corpus.fileids():
        #print file

        for doc in corpus.xml(file).getchildren():

            # print doc.getchildren()
            cat = doc.getchildren()[3].text  #
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')
            textlines = newtext.split('.')
            if word in newtext.split():
                print newtext
                outfile.write(newtext)
                outfile.write('\n')
                print
                print

    print str(instancenum) + " seeds found "
    print str(targetwordnum) + " target word found "

    outfile.close()
Example no. 26
class LangDetect(object):
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def detect(self, text):
        '''
        Detect the text's language
        '''
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])
        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1
        total = sum(trigrams.values())
        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(
                    frequencies.N())) * (float(count) / float(total))
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]

    def get_word_trigrams(self, match):
        return [
            ''.join(trigram) for trigram in nltk_trigrams(match)
            if trigram != None
        ]
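A hypothetical use of the LangDetect class above (this assumes the 'langid' trigram corpus is available under nltk_data and an NLTK version old enough that FreqDist.inc still exists, since the class is written against that API):

detector = LangDetect(languages=['nl', 'en', 'fr'])
print(detector.detect('Dit is een korte Nederlandse zin.'))  # expected: 'nl'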
Example no. 27
def load_treebank(sections):
    treebank_path = os.environ.get('NLTK_TREEBANK', 'treebank/combined')
    treebank = LazyCorpusLoader(
        treebank_path,
        BracketParseCorpusReader, 
        r'(%s\/)?wsj_%s.*\.mrg' % (sections, sections))
    return treebank
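A hypothetical call to load_treebank above for section 00 (this assumes a treebank laid out under the default 'treebank/combined' path, e.g. NLTK's treebank sample):

treebank = load_treebank('00')
print(treebank.fileids()[:3])
print(treebank.parsed_sents()[0])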
Example no. 28
    def build_terms(self, terms):
        # save the original corpus
        corpus_temp = terms.kwargs["corpus"]
        groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)',
                          corpus_temp.root.path)
        terms.kwargs["corpus"] = LazyCorpusLoader(
            "c50_tags/" + groups.group(1),
            CategorizedPlaintextCorpusReader,
            r'.+/.+',
            cat_pattern=r'(.+)/.+')

        print "In ModeWeightClassCollocationPOS"
        cache_file = "%s.dat" % terms.name
        terms.tokens = []
        shelf = shelve.open(cache_file, protocol=2)

        for author in terms.kwargs["corpus"].categories():

            author_files = set(terms.kwargs["corpus"].fileids([author])) & set(
                terms.kwargs["source"])
            author_files = list(author_files)
            if len(author_files) == 0:
                continue

            author_files.sort()
            #print "str(author_files): " + str(author_files)
            #print "str(terms.kwargs["corpus"]): " + str(terms.kwargs["corpus"]) + " str(terms.kwargs["corpus"].fileids([author])): " + str(terms.kwargs["corpus"].fileids([author])) + " str(terms.kwargs[\"source\"]): " + str(terms.kwargs["source"])
            f_srcs = "|".join(author_files)

            terms.kwargs["string"] = \
            terms.kwargs["corpus"].raw(fileids=author_files).lower()

            if f_srcs in shelf and terms.kwargs["lazy"]:
                terms.tokens += shelf[f_srcs]
                #print(str(f_src))
                #print("%s ... Found in \"%s\"" % (f_src, cache_file))
            else:
                terms.kwargs["string"] = \
                terms.kwargs["corpus"].raw(fileids=author_files).lower()

                temp_tokens = terms.calc_terms()

                # because the latter function calc:terms get off this option,
                # but we still needed
                terms.kwargs["boolBuildSetGlobal"] = True
                terms.kwargs["mode"] = EnumModes.MODE_CORPUS_POS_GLOBAL_A
                ###############################################################

                terms.tokens += temp_tokens

                if terms.kwargs["lazy"]:
                    shelf[f_srcs] = temp_tokens

                #print ("%s ... Recalculated in \"%s\"" % (f_src, cache_file))
        terms.kwargs["boolBuildSetGlobal"] = False
        terms.kwargs["mode"] = EnumModes.MODE_CORPUS
        shelf.close()

        # restore the original corpus
        terms.kwargs["corpus"] = corpus_temp
Example no. 29
def load_corpus_reader(corpus, reader=None, fileids=None, **kwargs):
    if corpus == 'timit':
        return LazyCorpusLoader('timit',
                                NumberedTaggedSentCorpusReader,
                                '.+\.tags',
                                tag_mapping_function=simplify_wsj_tag)

    real_corpus = getattr(nltk.corpus, corpus, None)

    if not real_corpus:
        if not reader:
            raise ValueError('you must specify a corpus reader')

        if not fileids:
            fileids = '.*'

        root = os.path.expanduser(corpus)

        if not os.path.isdir(root):
            if not corpus.startswith('corpora/'):
                path = 'corpora/%s' % corpus
            else:
                path = corpus

            try:
                root = nltk.data.find(path)
            except LookupError:
                raise ValueError('cannot find corpus path for %s' % corpus)

        reader_cls = import_attr(reader)
        real_corpus = reader_cls(root, fileids, **kwargs)

    return real_corpus
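A hypothetical call to load_corpus_reader above with an explicit reader class (the dotted class path is a real NLTK reader, but the corpus directory is a placeholder, and import_attr is assumed to resolve dotted names as in the original project):

corpus = load_corpus_reader('~/my_corpus',
                            reader='nltk.corpus.reader.plaintext.PlaintextCorpusReader',
                            fileids=r'.*\.txt')
print(corpus.fileids()[:5])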
Example no. 30
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader("knbc/corpus1", KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding="euc-jp")

    print knbc.fileids()[:10]
    print "".join(knbc.words()[:100])

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print "\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])

    print "\n".join(" ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent) for sent in knbc.tagged_sents()[0:2])
Example no. 31
def read_knbc(train_file, test_file, reference_file):

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    knbc = LazyCorpusLoader('knbc/corpus1',
                            KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort),
                            encoding='euc-jp')

    sentences = knbc.sents()

    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
Example no. 32
 def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es', 'th', 'pt', 'pl', "id", "ru", "it", "ru", "tr"]):
     logger.info("Build " + self.__class__.__name__ + " ... ")
     self.language_trigrams = {}
     self.langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')
     self.__mutex = threading.Semaphore()
     for lang in languages:
         self.language_trigrams[lang] = FreqDist()
         for f in self.langid.freqs(fileids=lang+"-3grams.txt"):
             self.language_trigrams[lang].inc(f[0], f[1])
     logger.info("Build " + self.__class__.__name__ + ": done!")
Example no. 33
def treebank_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader    
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankTaggerCorpusReader
    
    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankTaggerCorpusReader(state_union)
    
    print 'Treebank tagger demo...'
    print 'Tagged sentences:'
    for sent in state_union.tagged_sents()[500:505]:
        print sent
        print
    print
    print 'Tagged words:'
    for word in state_union.tagged_words()[500:505]:
        print word
    print
Example no. 34
    def from_nltk(cls):
        """Returns a fully populated Propbank with the help of NLTK's interface"""
        ptb = LazyCorpusLoader(
            'ptb',
            CategorizedBracketParseCorpusReader,
            r'wsj/\d\d/wsj_\d\d\d\d.mrg',
            cat_file='allcats.txt'
        )

        propbank_ptb = LazyCorpusLoader(
            'propbank', PropbankCorpusReader,
            'prop.txt', 'frames/.*\.xml', 'verbs.txt',
            lambda filename: filename.upper(),
            ptb
        ) # Must be defined *after* ptb corpus.

        role_dict = {}
        for roleset_xml in propbank_ptb.rolesets():
            role = Role.fromxml(roleset_xml)
            role_dict[role.roleset_id] = role

        instance_dict = defaultdict(dict)
        pb_instances = propbank_ptb.instances()
        for instance in pb_instances:
            instance.fileid = instance.fileid.lower()
            file_num = instance.fileid.split("/")[-1].split(".")[0].replace("wsj_", "")
            sentnum = str(instance.sentnum)
            predicate = instance.predicate
            tree = instance.tree

            if isinstance(predicate, nltk.corpus.reader.propbank.PropbankTreePointer):
                key = Propbank.pointer_to_word(predicate, tree)
            elif isinstance(predicate, nltk.corpus.reader.propbank.PropbankSplitTreePointer):
                key = tuple([Propbank.pointer_to_word(p, tree) for p in predicate.pieces])
            else:
                ### TODO: Investigate when this is the case ###
                #assert False
                continue

            pb_instance = PropbankInstance(instance.fileid, file_num, sentnum, key, instance.roleset, instance.arguments)
            instance_dict[(file_num, sentnum)][key] = pb_instance

        return Propbank(role_dict, instance_dict)
Example no. 35
    def loadcorpus(self):
        corpus_root = str(os.path.join(GetFilePathRoot.get_root_dir(), "data"))
        ## decode so that Chinese directory names are not garbled
        corpus_root = unicode(corpus_root, "GB2312")
        self.logger.info(corpus_root)

        pattern_1 = r".*/diff1/.*\.txt"
        pattern_2 = r".*/diff2/.*\.txt"
        pattern_3 = r".*/diff3/.*\.txt"
        from nltk.corpus.util import LazyCorpusLoader
        from nltk.corpus import PlaintextCorpusReader
        self.logger.info("Loading corpus")
        self.diff1 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                      pattern_1)
        self.diff2 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                      pattern_2)
        self.diff3 = LazyCorpusLoader(corpus_root, PlaintextCorpusReader,
                                      pattern_3)
        self.logger.info("Loading complete")
Example no. 36
def treebank_chunk_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader    
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankChunkTaggerCorpusReader
    
    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankChunkTaggerCorpusReader(state_union)

    print 'Treebank chunker demo...'
    print 'Chunked sentences:'
    for sent in state_union.chunked_sents()[500:505]:
        print sent
        print
    print
    print 'Parsed sentences:'
    for tree in state_union.parsed_sents()[500:505]:
        print tree
        print
    print
Example no. 37
    def calc_terms(self, kwargs, f_src):
        # save the original corpus
        corpus_temp = kwargs["corpus"]

        groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)', corpus_temp.root.path)
        kwargs["corpus"] = LazyCorpusLoader("c50_term_SFM_23/" + groups.group(1), CategorizedPlaintextCorpusReader, r'.+/.+', cat_pattern=r'(.+)/.+')

        sfm_terms = Util.calc_SFM(kwargs["corpus"].raw(fileids=[f_src]))

        # restore the original corpus
        kwargs["corpus"] = corpus_temp
        return sfm_terms
Example no. 38
def demo(**kwargs):
    import nltk
    from nltk_contrib.coref import NLTK_COREF_DATA    
    from nltk_contrib.coref.muc import muc6_documents, muc7_documents
    from nltk_contrib.coref.muc import MUCCorpusReader
    nltk.data.path.insert(0, NLTK_COREF_DATA)   
    muc6 = LazyCorpusLoader('muc6/', MUCCorpusReader, muc6_documents)
    for sent in muc6.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc6.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print
    muc7 = LazyCorpusLoader('muc7/', MUCCorpusReader, muc7_documents)
    for sent in muc7.iob_sents()[:]:
        for word in sent:
            print word
        print
    print
    for sent in muc7.mentions(depth=None):
        for mention in sent:
            print mention
        if sent: print
    print
Example no. 39
class LangDetectTwitter(ModifiedMRJob):
    DEFAULT_INPUT_PROTOCOL = 'raw_value'
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')

    def configure_options(self):
        super(LangDetectTwitter, self).configure_options()
        #self.add_file_option('--langs', default='languages.txt')

        #def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
    def __init__(self, *args, **kwargs):
        super(LangDetectTwitter, self).__init__(*args, **kwargs)
        #languages = [x.strip() for x in open(self.options.langs, 'r').readlines()]
        languages = [
            'fr', 'en', 'ar', 'es', 'de', 'it', 'id', 'pt', 'tr', 'ru', 'nl',
            'hi', 'sv', 'fi', 'da', 'pl', 'hu', 'fa', 'he', 'ur', 'th'
        ]
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang + "-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def mapper(self, key, tweet):
        '''
		Detect the text's language
		'''
        obj = cjson.decode(tweet)
        text = obj['tx']
        words = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1
        total = sum(trigrams.values())
        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) / float(
                    frequencies.N())) * (float(count) / float(total))
        obj['lang'] = sorted(scores.items(), key=lambda x: x[1],
                             reverse=True)[0][0]
        yield key, obj

    def get_word_trigrams(self, match):
        return [
            ''.join(trigram) for trigram in nltk_trigrams(match)
            if trigram != None
        ]
Example no. 40
    def corpus(self):
        """
            This method initializes the corpus object if it has not been initialized before
        """
        if self._corpus is None:
            # The patterns r'(?!\.).*\.txt' and r'(neg|pos)/.*' make it possible to find the files labeled neg and pos
            self._corpus = LazyCorpusLoader(self._corpusName,
                                            CategorizedPlaintextCorpusReader,
                                            r'(?!\.).*\.txt',
                                            cat_pattern=r'(neg|pos)/.*',
                                            encoding='ascii')

        return self._corpus
Example no. 41
def test():

    from nltk.corpus.util import LazyCorpusLoader
    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
    assert isinstance(knbc.words()[0], string_types)
    assert isinstance(knbc.sents()[0][0], string_types)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Example no. 42
def test():

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader("knbc/corpus1",
                            KNBCorpusReader,
                            r".*/KN.*",
                            encoding="euc-jp")
    assert isinstance(knbc.words()[0], str)
    assert isinstance(knbc.sents()[0][0], str)
    assert isinstance(knbc.tagged_words()[0], tuple)
    assert isinstance(knbc.tagged_sents()[0][0], tuple)
Example no. 43
def dictionary_backoff(option_tone, backoff):
    '''Creates a dictionary according to the option: tonal/nontonal'''
    if option_tone == "tonal":
        bambara_dict_toolbox = BambaraTagging("cookbook/bambara", ["bamadaba.txt"], option_tone, "POS")
        bambara_dict_toolbox.copy_files()
        reader = LazyCorpusLoader("cookbook/bambara/", ToolboxCorpusReader, ["bamadaba.txt"])
        entries = reader.entries("bamadaba.txt") #tonal
        words = reader.words("bamadaba.txt")#tonal
        pos = reader.words("bamadaba.txt", key="ps")#tonal
    else:
        bambara_dict_toolbox = BambaraTagging("cookbook/bambara", ["bamadaba_non_tonal.txt"], option_tone, "POS")
        bambara_dict_toolbox.copy_files()
        reader = LazyCorpusLoader("cookbook/bambara/", ToolboxCorpusReader, ["bamadaba_non_tonal.txt"])
        entries = reader.entries("bamadaba_non_tonal.txt") #non-tonal
        words = reader.words("bamadaba_non_tonal.txt")#non-tonal
        pos = reader.words("bamadaba_non_tonal.txt", key="ps")#non-tonal
        
    own_model = get_alt_pos(entries, pos, reader, option_tone)#tonal
    print("Dictionary created")
    dic = UnigramTagger(model=own_model, backoff=backoff)
    return dic
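A hypothetical call to dictionary_backoff above (this assumes the Bambara toolbox files referenced in the function are installed; nltk's DefaultTagger serves as the final backoff, and the sample word is purely illustrative):

from nltk.tag import DefaultTagger

dic = dictionary_backoff("tonal", backoff=DefaultTagger("n"))
print(dic.tag(["musow"]))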
Example no. 44
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
Example no. 45
def hamshahri_targetword_corpus_maker(match, outpath):
    print 'loading hamshahri corpus'
    print
    corpus = LazyCorpusLoader('hamshahricorpus', XMLCorpusReader,
                              r'(?!\.).*\.xml')
    outfile = codecs.open(outpath, 'w', 'utf-8')
    punclist = [u'،', u'؛', u':', u'؟', u'#']

    matchnum = 0
    count = 0
    print 'creating target corpus'
    for file in corpus.fileids():
        #print file

        for doc in corpus.xml(file).getchildren():

            #    print doc.getchildren()
            #          cat=doc.getchildren()[3].text#
            text = doc.getchildren()[5].text
            newtext = correctPersianString(text)
            newtext = newtext.replace('\n', ' ')

            for item in punclist:
                if item in newtext:
                    newtext = newtext.replace(item, '')
    #
    #        #  print newtext
    #
    #
            if match in newtext.split():
                #
                matchnum += 1
                print newtext
                print '#'
                count += 1
                #
                outfile.write(newtext)
                outfile.write('ALI')

    outfile.close()
    print count
Example no. 46
	class LangDetector(object):
		
		def __init__(self, languages=LangIDDict().keys()):
		
			self.language_trigrams = {}
			self.langid = LazyCorpusLoader('langid', LangIdReader, r'(?!\.).*\.txt')
			
			for lang in languages:
				self.language_trigrams[lang] = FreqDist()
				for f in self.langid.freqs(fileids=lang+"-3grams.txt"):
					self.language_trigrams[lang].inc(f[0], f[1])
				self.language_dicts = dict([
					(id, dict([(trigram, float(value)/float(fdist.N())) for trigram, value in fdist.items()]))
					for id, fdist in self.language_trigrams.items()
				])
				
		def detect(self, text):
		
			words = nltk_word_tokenize(text.lower())
			trigrams = {}
			scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])

			trigcount = [(trigram, 1.0) for match in words for trigram in self.get_word_trigrams(match)]
			if len(trigcount) > 0:
				trigdf = pandas.DataFrame(trigcount, columns = ["key", "value"])
				trigrams = trigdf.groupby("key")["value"].sum().to_dict()
			else:
				trigrams = {}

			total = sum(trigrams.values())
			maxscore, maxid = 0, ""
			for trigram, count in trigrams.items():
				trishare = (float(count) / float(total))
				for lang, frequencies in filter(lambda (l, f): trigram in f, self.language_dicts.iteritems()):
					scores[lang] += frequencies[trigram] * trishare
					if scores[lang] > maxscore:
						maxid, maxscore = lang, scores[lang]
						
			return sorted(scores.items(), key=lambda x: x[1], reverse=True)
Example no. 47
        treebank_train_sequence = treebank_train.tagged_sents()
        treebank_test = load_treebank('24')
        treebank_test_sequence = treebank_test.tagged_sents()
        treebank_estimator = LidstoneProbDistFactory
        model = train_model(HiddenMarkovModelTagger, 
                            treebank_train_sequence, 
                            treebank_test_sequence,
                            options.model_file, 
                            options.num_train_sents, 
                            options.num_test_sents,
                            estimator=treebank_estimator,
                            verbose=options.verbose)

    elif options.train_chunker:
        conll2k_train = LazyCorpusLoader(
            'conll2000', ConllChunkCorpusReader, 
            ['train.txt'], ('NP','VP','PP'))
        conll2k_train_sequence = conll2k_train.iob_sents()
        conll2k_test = LazyCorpusLoader(
            'conll2000', ConllChunkCorpusReader,
            ['test.txt'], ('NP','VP','PP'))
        conll2k_test_sequence = conll2k_test.iob_sents()
        conll2k_estimator = LidstoneProbDistFactory
        conll2k_transform = ClosedCategoryChunkTransform(TREEBANK_CLOSED_CATS)
        model = train_model(HiddenMarkovModelChunkTagger, 
                            conll2k_train_sequence, 
                            conll2k_test_sequence,
                            options.model_file, 
                            options.num_train_sents, 
                            options.num_test_sents,
                            estimator=conll2k_estimator,
Example no. 48
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in sentence]
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) / float(len(bigram_length_probabilities[length])) for length in
        bigram_length_probabilities.keys()}
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) / float(len(trigram_length_probabilities[length])) for length
        in
        trigram_length_probabilities.keys()}

    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)] for key in
                        bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(ymin=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword != None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the bigram model produced this text of length 30: {}'.format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword != None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the trigram model produced this text of length 30: {}'.format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(cpd_bigram, test_bigrams)
    print('Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'.format(bigram_entropy,
                                                                                               bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(cpd_trigram, test_trigrams)
    print('Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'.format(trigram_entropy,
                                                                                                trigram_perplexity))
Example no. 49
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import WordListCorpusReader

reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist.txt'])
print(isinstance(reader, LazyCorpusLoader))

print(reader.fileids())
print(isinstance(reader, LazyCorpusLoader))
print(isinstance(reader, WordListCorpusReader))
Example no. 50
File: knbc.py Project: amumu/nokuno
#!/usr/bin/env python
# encoding: utf-8
# Sample code for reading the KNBC corpus with NLTK

from nltk_jp import *
from nltk.corpus.reader import *
from nltk.corpus.util import LazyCorpusLoader

def _knbc_fileids_sort(x):
    cells = x.split('-')
    return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

# Load the corpus
root = nltk.data.find('corpora/knbc/corpus1')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]
knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')
#print "fileids :", knbc.fileids()
print "words :", pp(knbc.words()[:10])
print "parsed_sents :", str(knbc.parsed_sents()[0])
print "tagged_words :", pp(knbc.tagged_words()[:5])

Example no. 51
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

import sys, os
import cPickle
from feats import words_in_sentence

pathname = os.path.dirname(sys.argv[0])        

nltk.data.path.append(os.path.abspath(pathname)+'/data'); 
movie_reviews = LazyCorpusLoader(
	sys.argv[1], CategorizedPlaintextCorpusReader,
	r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
	encoding='utf-8')

train_test_ratio = 2.0/3



def pickleObject():
	obj = classifier
	savefile = open('classifier.pickle', 'w')
	cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)

def pickleFeats():
	obj = words_in_sentence
	savefile = open('feats.pickle', 'w')
	cPickle.dump(obj, savefile, cPickle.HIGHEST_PROTOCOL)
Example no. 52
if args.cat_file:
	reader_kwargs['cat_file'] = args.cat_file
	
	if args.delimiter and args.delimiter != ' ':
		reader_kwargs['delimiter'] = args.delimiter
	
	if args.cat_pattern:
		reader_args.append(args.cat_pattern)
	else:
		reader_args.append('.+/.+')
elif args.cat_pattern:
	reader_args.append(args.cat_pattern)
	reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

categorized_corpus = LazyCorpusLoader(args.corpus, reader_class[args.reader],
	*reader_args, **reader_kwargs)
labels = categorized_corpus.categories()
nlabels = len(labels)

if args.trace:
	print '%d labels: %s' % (nlabels, labels)

if not nlabels:
	raise ValueError('corpus does not have any categories')
elif nlabels == 1:
	raise ValueError('corpus must have more than 1 category')
elif nlabels == 2 and args.multi:
	raise ValueError('corpus must have more than 2 categories if --multi is specified')

########################
## text normalization ##
Example no. 53
def document_features(doc_words, top_words):  # the def line is missing in the source; name and signature assumed from the body
    """
    For all words in top_words (the most frequent words in the whole corpus),
    set True/False depending on whether the word is in the document's word set.
    """
    # transform the list of words into a set to optimize search
    doc_words_set = set(doc_words)

    # build features dictionary
    features = {}
    for word in top_words:
        features['contains(%s)' % word] = (word in doc_words_set)
    return features


interrogazioni = LazyCorpusLoader(
    'opp_interrogazioni_macro',
    CategorizedPlaintextCorpusReader,
    r'\d*', cat_file='cats.txt', cat_delimiter=','
)

print "computing FreqDist over all words"
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
top_words = all_words.keys()[:2000]


print "generating list of documents for each category"
documents = [
    (list(interrogazioni.words(fileid)), category)
    for category in interrogazioni.categories()
    for fileid in interrogazioni.fileids(category)
]
random.shuffle(documents)
Example no. 54
from __future__ import division
import sys
import os.path
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import poetry
import re
from nltk.corpus import cmudict
d = cmudict.dict()
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *
suffdict = LazyCorpusLoader(
    'cmusuffdict', CMUDictCorpusReader, ['cmusuffdict'])
suffdict = suffdict.dict()


def suffdict_phonemes(word):
    # Use my cmu-based last syllable dictionary
    if re.search("((?i)[bcdfghjklmnpqrstvwxz]{1,2}[aeiouy]+[bcdfghjklmnpqrstvwxz]*(e|ed)?('[a-z]{1,2})?)(?![a-zA-Z]+)", word.lower()):
        last_syl = re.search("((?i)[bcdfghjklmnpqrstvwxz]{1,2}[aeiouy]+[bcdfghjklmnpqrstvwxz]*(e|ed)?('[a-z]{1,2})?)(?![a-zA-Z]+)", word.lower()).group()
        if last_syl in suffdict:
            return suffdict[last_syl][0]
        # else try without the first letter
        elif last_syl[1 - len(last_syl):] in suffdict:
            return suffdict[last_syl[1 - len(last_syl):]][0]
        # else try without the first 2 letters
        elif last_syl[2 - len(last_syl):] in suffdict:
            return suffdict[last_syl[2 - len(last_syl):]][0]
        # else try without the last 2 letters, if it ends in 's
        elif last_syl[-2:] == "'s":
            if last_syl[:-2] in suffdict:
Example no. 55
#! /usr/bin/python
# -*- coding: utf-8 -*-

import nltk
import util

from knbc import *
from nltk.corpus.util import LazyCorpusLoader

root = nltk.data.find('corpora/KNBC_v1.0_090925/corpus1')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
           if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

def _knbc_fileids_sort(x):
    cells = x.split('-')
    return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

knbc = LazyCorpusLoader('KNBC_v1.0_090925/corpus1', KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

# print knbc.fileids()

# print '\n'.join( ''.join(sent) for sent in knbc.words() )

print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[0:2] )
print type(knbc.parsed_sents()[0])

# print '\n'.join( ' '.join("%s/%s"%(w[0], w[1][2]) for w in sent) for sent in knbc.tagged_words()[0:20] )
Example no. 56
def loadClassifier(outputdir):
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle") 
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        word_features = pickleLoad("word_features.pickle")
#        classifier = pickleLoad("originalnaivebayes.pickle")
#        MNB_classifier = pickleLoad("MNB_classifier.pickle")
#        BernoulliNB_classifier = pickleLoad("BernoulliNB_classifier.pickle")
#        LogisticRegression_classifier = pickleLoad("LogisticRegression_classifier.pickle")
#        SGDClassifier_classifier = pickleLoad("SGDClassifier_classifier.pickle")
#        LinearSVC_classifier = pickleLoad("LinearSVC_classifier.pickle")
#        
#        voted_classifier = VoteClassifier(classifier,
##                                  NuSVC_classifier,
#                                  LinearSVC_classifier,
#                                  SGDClassifier_classifier,
#                                  MNB_classifier,
#                                  BernoulliNB_classifier,
#                                  LogisticRegression_classifier)
        voted_classifier= pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        criticas_cine = LazyCorpusLoader(
                'criticas_cine', CategorizedPlaintextCorpusReader,
                r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
                encoding='utf-8')
#        criticas_cine = LazyCorpusLoader(
#                'criticas_cine_neu', CategorizedPlaintextCorpusReader,
#                r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
#                encoding='utf-8')
            
        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
#            
#        document_pos = [(list(criticas_cine.words(fileid)), "pos")
#                        for fileid in criticas_cine.fileids("pos")]
#        document_neg = [(list(criticas_cine.words(fileid)), "neg")
#                        for fileid in criticas_cine.fileids("neg")]
#        document_neu = [(list(criticas_cine.words(fileid)), "neu")
#                        for fileid in criticas_cine.fileids("neu")]
        
        random.shuffle(documents)
        
#        random.shuffle(document_pos)
#        random.shuffle(document_neg)
#        random.shuffle(document_neu)
        
        all_words = []
        
        for w in criticas_cine.words():
            all_words.append(w.lower())
        
#        for w in criticas_cine.words():
#            if not is_filtered(w.lower()):
#                all_words.append(w.lower())
#        
        all_words = nltk.FreqDist(all_words)
        
        #print (all_words.most_common(50))
        
        # Filtering by type of word
        
#        for sample in all_words:
                    
        
        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")
        
        featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
        
#        featuresetpos = [(find_features(rev, word_features), category) for (rev, category) in document_pos]
#        featuresetneg = [(find_features(rev, word_features), category) for (rev, category) in document_neg]
#        featuresetneu = [(find_features(rev, word_features), category) for (rev, category) in document_neu]
        
#        training_set = featuresetpos[:1000]
#        training_set.extend(featuresetneg[:1000])
#        training_set.extend(featuresetneu[:1000])
#        testing_set = featuresetpos[1000:1273]
#        testing_set.extend(featuresetneg[1000:])
#        testing_set.extend(featuresetneu[1000:])

#        pos_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "pos"]
#        neu_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neu"]
#        neg_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neg"]
                
        training_set = featuresets[:2000]
        testing_set = featuresets[2000:]
        classifier = nltk.NaiveBayesClassifier.train(training_set)
#        pickleDump(classifier, "originalnaivebayes.pickle")
    
        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        
        print("Original Naive Bayes Algo accuracy percent:", (NaiveBayesClassifierAccuracy)*100)
        
        accuracy = Accuracy(classifier,testing_set)
        print(accuracy)
        # order: neu, neg, pos
#        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/3)
#        print("Discarded: ", (accuracy["neu"][0]+accuracy["neg"][1]+accuracy["pos"][0])/3)
#        print("Failed: ", (accuracy["neu"][1]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][1])/3)
#        print ("Pos:", nltk.classify.accuracy(classifier, pos_feat)*100)
#        print ("Neu:", nltk.classify.accuracy(classifier, neu_feat)*100)
#        print ("Neg:", nltk.classify.accuracy(classifier, neg_feat)*100)
        classifier.show_most_informative_features(15)
        
        MNB_classifier = SklearnClassifier(MultinomialNB())
        MNB_classifier.train(training_set)
        MNB_classifierAccuracy = nltk.classify.accuracy(MNB_classifier, testing_set)
        print("MNB_classifier accuracy percent:", (MNB_classifierAccuracy)*100)
#        pickleDump(MNB_classifier, "MNB_classifier.pickle")
        
        BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
        BernoulliNB_classifier.train(training_set)
        BernoulliNB_classifierAccuracy = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
        print("BernoulliNB_classifier accuracy percent:", (BernoulliNB_classifierAccuracy)*100)
#        pickleDump(BernoulliNB_classifier, "BernoulliNB_classifier.pickle")
        
        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(training_set)
        LogisticRegression_classifierAccuracy = nltk.classify.accuracy(LogisticRegression_classifier, testing_set)
        print("LogisticRegression_classifier accuracy percent:", (LogisticRegression_classifierAccuracy)*100)
#        pickleDump(LogisticRegression_classifier, "LogisticRegression_classifier.pickle")
        
        SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
        SGDClassifier_classifier.train(training_set)
        SGDClassifier_classifierAccuracy = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
        print("SGDClassifier_classifier accuracy percent:", (SGDClassifier_classifierAccuracy)*100)
#        pickleDump(SGDClassifier_classifier, "SGDClassifier_classifier.pickle")
        
        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(training_set)
        LinearSVC_classifierAccuracy = nltk.classify.accuracy(LinearSVC_classifier, testing_set)
        print("LinearSVC_classifier accuracy percent:", (LinearSVC_classifierAccuracy)*100)
#        pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")
        
#        SVC_classifier = SklearnClassifier(SVC())
#        SVC_classifier.train(training_set)
#        print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)
        
        NuSVC_classifier = SklearnClassifier(NuSVC())
        NuSVC_classifier.train(training_set)
        NuSVC_classifierAccuracy = nltk.classify.accuracy(NuSVC_classifier, testing_set)
        print("NuSVC_classifier accuracy percent:", (NuSVC_classifierAccuracy)*100)
        #        pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")
        
        
#        pickleDump([NaiveBayesClassifierAccuracy, 
#                    LinearSVC_classifierAccuracy,
#                    SGDClassifier_classifierAccuracy,
#                    MNB_classifierAccuracy,
#                    BernoulliNB_classifierAccuracy,
#                    LogisticRegression_classifierAccuracy], "accuracies.pickle")
        
        voted_classifier = VoteClassifier([classifier,NaiveBayesClassifierAccuracy],
                                          [NuSVC_classifier,NuSVC_classifierAccuracy],
                                          [LinearSVC_classifier,LinearSVC_classifierAccuracy],
                                          [SGDClassifier_classifier,SGDClassifier_classifierAccuracy],
                                          [MNB_classifier,MNB_classifierAccuracy],
                                          [BernoulliNB_classifier,BernoulliNB_classifierAccuracy],
                                          [LogisticRegression_classifier,LogisticRegression_classifierAccuracy])

        accuracy = Accuracy(voted_classifier,testing_set)
        print(accuracy)
        VoteClassifierAccuracy = nltk.classify.accuracy(voted_classifier, testing_set)
        print("VoteClassifier accuracy percent:", (VoteClassifierAccuracy)*100)
#        print ("Pos:", nltk.classify.accuracy(voted_classifier, pos_feat)*100)
#        print ("Neu:", nltk.classify.accuracy(voted_classifier, neu_feat)*100)
#        print ("Neg:", nltk.classify.accuracy(voted_classifier, neg_feat)*100)
        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/2)
        print("Discarded: ", (accuracy["neu"][1]+accuracy["neg"][1]+accuracy["pos"][1])/2)
        print("Failed: ", (accuracy["neu"][0]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][0])/2)
        print("------------------------------------------");
                                          
        pickleDump(voted_classifier, "voted_classifier.pickle")

        return voted_classifier, word_features
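find_features, pickleDump, pickleLoad, VoteClassifier and Accuracy are project helpers that are not included in this snippet. A minimal sketch of what find_features presumably looks like, matching the call find_features(rev, word_features) above; the body is an assumption, modeled on the usual NLTK bag-of-words recipe:

def find_features(document, word_features):
    # Hypothetical reconstruction: for each word in the 3000-word vocabulary,
    # record whether it occurs in the given document (a list of tokens).
    words = set(document)
    return {w: (w in words) for w in word_features}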
Esempio n. 57
0
from nltk.corpus.reader import PlaintextCorpusReader, ToolboxCorpusReader
from nltk.corpus.util import LazyCorpusLoader
from nltk.tokenize import RegexpTokenizer, BlanklineTokenizer
from xml.etree.ElementTree import ElementTree, Element
from orthograph import detone

orthographic_word = RegexpTokenizer(r"(\w+([-]\w+)*[']?|[.:;!?,])")

test = LazyCorpusLoader(
        'bamana/test', PlaintextCorpusReader, r'source.txt', word_tokenizer=orthographic_word, encoding='utf-8')

wordlist = LazyCorpusLoader(
        'bamana/wordlist', PlaintextCorpusReader, r'bailleul.clean.wordlist', word_tokenizer=orthographic_word, encoding='utf-8')

properlist = LazyCorpusLoader(
        'bamana/propernames', PlaintextCorpusReader, r'.*\.clean\.wordlist', word_tokenizer=orthographic_word, encoding='utf-8')

propernames = LazyCorpusLoader(
        'bamana/propernames', ToolboxCorpusReader, r'.*\.txt', encoding='utf-8')

bailleul = LazyCorpusLoader(
        'bamana/bailleul', ToolboxCorpusReader, r'bailleul.txt', encoding='utf-8')

lexicon = ElementTree(bailleul.xml('bailleul.txt'))

for fileid in propernames.fileids():
    for e in ElementTree(propernames.xml(fileid)).findall('record'):
        ge = Element('ge')
        ge.text = e.find('lx').text
        e.append(ge)
        ps = Element('ps')
        ps.text = 'n.prop'
        e.append(ps)
        lexicon.getroot().append(e)
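At this point the merged lexicon only lives in memory. If it needs to be persisted, the standard ElementTree API can serialize it; the output filename below is hypothetical:

# write the lexicon, now including the proper-name records, back to disk
lexicon.write('bailleul_with_propernames.xml', encoding='utf-8')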