Code example #1
def training_downloads():
    # NLTK:
    info("Downloading (if necessary) NLTK resources:")
    download('punkt')
    download('stopwords')

    # Glove:
    info('Downloading Glove Embeddings:')
    if not os.path.exists(VEC_DIR):
        os.makedirs(VEC_DIR)
    download_and_extract(GLOVE_EMBEDDINGS_URL, VEC_DIR)

    # Squad:
    info('Downloading Squad:')
    if not os.path.exists(SQUAD_SOURCE_DIR):
        os.makedirs(SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/train-v1.1.json', SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/dev-v1.1.json', SQUAD_SOURCE_DIR)

    # TriviaQA:
    info('Downloading TriviaQA:')
    if not os.path.exists(TRIVIA_QA):
        os.makedirs(TRIVIA_QA)
    download_and_extract(TRIVIAQA_SERVER + 'triviaqa-rc.tar.gz', TRIVIA_QA)

    # LM:
    info('Downloading LM:')
    if not os.path.exists(LM_DIR):
        os.makedirs(LM_DIR)
    download_and_extract(LM_URL, LM_DIR)
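
Most of the snippets in this listing call download unconditionally, so NLTK re-checks its package index on every run. A common variation (shown here as a sketch, not part of any of the projects above) is to probe for the resource with nltk.data.find and only download on a LookupError:

import nltk


def ensure_nltk_resource(resource_path, package_name):
    # resource_path is the path inside nltk_data, e.g. 'tokenizers/punkt';
    # package_name is the downloader package id, e.g. 'punkt'.
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


ensure_nltk_resource('tokenizers/punkt', 'punkt')
ensure_nltk_resource('corpora/stopwords', 'stopwords')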
Code example #2
def clean_sw():
    try:
        sw = stopwords.words('english')
    except LookupError:
        downloader.download('stopwords')
        sw = stopwords.words('english')
    return set([english_stemmer(w) for w in sw])
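
english_stemmer is not defined in this snippet; one plausible definition (an assumption, not the project's actual code) is the stem method of NLTK's Snowball stemmer:

from nltk.stem.snowball import SnowballStemmer

# Hypothetical stand-in for the english_stemmer used in clean_sw above.
english_stemmer = SnowballStemmer('english').stem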
Code example #3
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [['start0'] + [
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] + ['end0'] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(
            lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
Code example #4
File: ex3.py Project: BabisK/M36209P
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [
        ['start0'] + [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in
                      sentence] + ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        word = random.randrange(1, len(sentence)-1)
        sentence[word] = random.choice(vocabulary)
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter+1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
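
Examples #3 and #4 also rely on ngrams_sentences and viterbi, which are not shown. A minimal sketch of what ngrams_sentences could look like, inferred from how its return value is consumed (one list of n-gram tuples per sentence); this is an assumption, not the project's implementation:

from nltk.util import ngrams


def ngrams_sentences(sentences, n):
    # For each tokenized sentence, return the list of its n-gram tuples.
    return [list(ngrams(sentence, n)) for sentence in sentences]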
Code example #5
def generate_documents(data_file):
    # Read data from csv export file
    data = pandas.read_csv(data_file, sep='\t', header=None,
                           names=[TEXT_IDENTIFIER_COLUMN, TEXT_COLUMN], skiprows=[0])

    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")

    if os.path.exists(DOCUMENTS_DIRECTORY):
        shutil.rmtree(DOCUMENTS_DIRECTORY)
    os.makedirs(DOCUMENTS_DIRECTORY)

    data = data.apply(tokenize, axis=1)
    data = data.apply(remove_stopwords, axis=1)
    data = data.apply(stem, axis=1)
    data.apply(save_to_document, axis=1)
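
The row-wise helpers tokenize, remove_stopwords, stem and save_to_document are defined elsewhere in that project. Hypothetical versions of the first two, matching the apply(..., axis=1) calls above (assumptions, for illustration only):

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


def tokenize(row):
    # Lowercase and tokenize the text column of a single DataFrame row.
    row[TEXT_COLUMN] = word_tokenize(str(row[TEXT_COLUMN]).lower())
    return row


def remove_stopwords(row):
    # Drop English stopwords from the token list produced by tokenize().
    english_stopwords = set(stopwords.words('english'))
    row[TEXT_COLUMN] = [w for w in row[TEXT_COLUMN] if w not in english_stopwords]
    return row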
Code example #6
def get_query_likelihood_score(documents_directory, query_text):
    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")
    query_document = generate_query_document(query_text)
    if len(query_document) == 0:
        print("Query not precise enough. Please refine your query")
        return

    collection_bag_of_words = load_collection_bag_of_words(documents_directory)
    document_bags_of_words = load_document_bags_of_words(documents_directory)

    scores = calculate_query_likelihood(query_document,
                                        collection_bag_of_words,
                                        document_bags_of_words)
    for document_name, score in scores.items():
        print(document_name + "\t" + str(score))
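
calculate_query_likelihood is likewise not shown. A common way to implement it is a smoothed unigram query-likelihood model; the sketch below assumes the bag-of-words structures map terms to counts and uses Jelinek-Mercer smoothing against the collection model (an illustration, not the repository's code):

import math


def calculate_query_likelihood(query_terms, collection_bag_of_words,
                               document_bags_of_words, lam=0.5):
    # Score each document by the log-probability of generating the query terms,
    # mixing the document model with the collection model.
    collection_total = sum(collection_bag_of_words.values())
    scores = {}
    for name, bag in document_bags_of_words.items():
        document_total = sum(bag.values()) or 1
        score = 0.0
        for term in query_terms:
            p_document = bag.get(term, 0) / document_total
            p_collection = collection_bag_of_words.get(term, 0) / collection_total
            p = lam * p_document + (1 - lam) * p_collection
            if p > 0:
                score += math.log(p)
        scores[name] = score
    return scores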
Code example #7
    def get_stopwords():
        """
        this will test to see if the stopwords from the nltk module have already been
        downloaded if they have not they will be download this function is needed for both
        word embedding and topic modeling and is just overall useful
        """
        from nltk.downloader import download
        from nltk.corpus import stopwords

        try:
            return stopwords.words("english")
        except:
            print(
                f"NLTK needs to download the stopwords. This will take a while."
            )
            download("stopwords")
            print(f"NLTK has finished downloading stopwords.")
            return stopwords.words("english")
Code example #8
    def handle(self):
        """
        Process corpus documents indexation.

        """

        download('stopwords')
        indexdb = IndexDB()
        self.connection = indexdb.handler()
        data_dir = '/Users/pablocc/harvard_data/'
        counter = 0

        for filename in os.listdir(data_dir):
            if os.path.isdir(data_dir + filename) or filename[0] == '.':
                continue

            with open(data_dir + filename, 'rb') as fh:
                reader = MARCReader(fh)
                for record in reader:
                    document = self.prepare_record(record)
                    counter += 1
                    print("%s - processing document %s." %
                          (counter, document['id']))
                    self.index_document(document)
Code example #9
def downloadNLTKData():
    """
    """
    logger = logging.getLogger('collective.classification')    
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
Code example #10
def downloadNLTKTokenizerData():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
Code example #11
File: stopwords.py Project: dangur/ud120-projects
#!/usr/bin/python

from nltk.corpus import stopwords
from nltk.downloader import download

download('all', halt_on_error=False)
sw = stopwords.words("english")
count = len(sw)
# print(sw)
Code example #12
def downloadNLTKAlpinoCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Alpino corpus")
    download('alpino')
Code example #13
def downloadNLTKEurParlRaw():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Sample European Parliament Proceedings "
                "Parallel Corpus")
    download('europarl_raw')
Code example #14
def downloadNLTKAlpinoCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Alpino corpus")
    download('alpino')
Code example #15
def downloadNLTKPenTreeBank():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
Code example #16
File: ex4.py Project: BabisK/M36209P
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in sentence]
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) / float(len(bigram_length_probabilities[length])) for length in
        bigram_length_probabilities.keys()}
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) / float(len(trigram_length_probabilities[length])) for length
        in
        trigram_length_probabilities.keys()}

    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)] for key in
                        bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(bottom=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the bigram model produced this text of length 30: {}'.format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the trigram model produced this text of length 30: {}'.format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(cpd_bigram, test_bigrams)
    print('Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'.format(bigram_entropy,
                                                                                               bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(cpd_trigram, test_trigrams)
    print('Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'.format(trigram_entropy,
                                                                                                trigram_perplexity))
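
centropy_perplexity is not part of the snippet. Under the assumption that it receives a flat list of n-gram tuples and a ConditionalProbDist whose conditions are the (n-1)-word contexts, it could look like this (logprob in NLTK is base 2, so perplexity is 2 raised to the cross-entropy):

def centropy_perplexity(cpd, ngram_list):
    # Cross-entropy: average negative log2-probability of each n-gram's last word
    # given its context; perplexity is 2 ** cross-entropy.
    logprobs = [cpd[ngram[:-1]].logprob(ngram[-1]) for ngram in ngram_list]
    cross_entropy = -sum(logprobs) / len(logprobs)
    return cross_entropy, 2 ** cross_entropy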
Code example #17
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t)
                                       for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist,
                                      vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1, )].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [
            cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence
        ]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) /
        float(len(bigram_length_probabilities[length]))
        for length in bigram_length_probabilities.keys()
    }
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) /
        float(len(trigram_length_probabilities[length]))
        for length in trigram_length_probabilities.keys()
    }

    random_sentences = [[
        words[random.randint(0,
                             len(words) - 1)].lower() for i in range(key)
    ] for key in bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1, )].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [
            cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence
        ]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()),
                         color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()),
                          color='blue')
    random_bigram = plt.scatter(
        list(random_bigram_length_probabilities.values()),
        list(random_bigram_length_probabilities.keys()),
        color='green')
    random_trigram = plt.scatter(
        list(random_trigram_length_probabilities.values()),
        list(random_trigram_length_probabilities.keys()),
        color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(bottom=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print(
        'Given the seed word "this", the bigram model produced this text of length 30: {}'
        .format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print(
        'Given the seed word "this", the trigram model produced this text of length 30: {}'
        .format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(
        cpd_bigram, test_bigrams)
    print(
        'Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'
        .format(bigram_entropy, bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(
        cpd_trigram, test_trigrams)
    print(
        'Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'
        .format(trigram_entropy, trigram_perplexity))
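
predict_word is also defined elsewhere; a hedged sketch consistent with how it is called above (return the most likely next word, or None when the context was never seen in training):

def predict_word(cpd, seed, model='bigram'):
    # Use the last one or two words of the seed as the conditioning context.
    tokens = seed.split()
    context = tuple(tokens[-1:]) if model == 'bigram' else tuple(tokens[-2:])
    if context not in cpd:
        return None
    # ProbDistI.max() returns the sample with the highest probability.
    return cpd[context].max()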
Code example #18
def _post_setup():
    from nltk.downloader import download
    download('punkt')
Code example #19
import pickle
import re
import time
from SAR_utils import *
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import downloader

downloader.download('stopwords')


def compareT(a, b):
    (d1, p1) = a
    (d2, p2) = b
    if d1 == d2:
        return p1 - p2
    else:
        return d1 - d2


# Obtain the posting list of a word.
# If an index i is not specified, it handles ':' field prefixes and applies stemming.
def getPList(word, i=None, stemming=False):
    if i is not None:
        return i.get(word, [])

    if ":" in word:
        [where, word] = word.split(":")
        if where == "headline" or where == "h":
            i = titleIndex
        elif where == "date" or where == "d":
Code example #20
def downloadNLTKBrownCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
Code example #21
def downloadNLTKTokenizerData():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
Code example #22
def downloadNLTKEurParlRaw():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Sample European Parliament Proceedings "
                "Parallel Corpus")
    download('europarl_raw')
Code example #23
File: install_reuters.py Project: wedavey/atnlp
def main():
    if not os.path.exists(NLTK_DIR):
        os.makedirs(NLTK_DIR)
        
    download('reuters', download_dir=NLTK_DIR)
Code example #24
def downloadNLTKConll2000Corpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's conll2000 corpus")
    download('conll2000')
Code example #25
# -*- coding: utf-8 -*-

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import unicodedata
import operator
from nltk import downloader
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
downloader.download("stopwords")
np.set_printoptions(threshold=np.inf)

stopwords_list = set(stopwords.words("spanish"))
stemmer = SnowballStemmer("spanish")


def tratamiento1(documentos):
    # Tratamiento de datos básico
    new_documentos = []
    for d in range(len(documentos)):
        unaccented_text = ''.join(
            c for c in unicodedata.normalize('NFD', documentos[d])
            if unicodedata.category(c) != 'Mn')
        lower_words = [str.lower(word) for word in unaccented_text.split(" ")]
        new_documentos.append(" ".join(lower_words))
    return new_documentos


def tratamiento2(documentos):
Code example #26
def downloadNLTKBrownCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
Code example #27
import config
from nltk import downloader

# Async, can't be run in main process :/
# for wordnet stemmer
downloader.download(info_or_id='wordnet', download_dir=config.NLTK_DATA_DIR)
# for snowball and porter stemmer
# downloader.download(info_or_id='punkt', download_dir=config.NLTK_DATA_DIR)
# stop words <- used in snowball
downloader.download(info_or_id='stopwords', download_dir=config.NLTK_DATA_DIR)
Code example #28
def downloadNLTKPenTreeBank():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
Code example #29
def download_corpus():
    downloader = nltk.downloader.Downloader(download_dir=NLTK_DIR)
    downloader.download('wordnet', download_dir=NLTK_DIR)
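
When resources are downloaded to a custom directory as above (or as in several earlier examples that pass download_dir), NLTK also has to be told where to look before the corpus is used. A short usage sketch, assuming NLTK_DIR is the same constant as in the example:

import nltk

nltk.data.path.append(NLTK_DIR)  # make the custom download directory discoverable
from nltk.corpus import wordnet
print(wordnet.synsets('language')[:3])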
Code example #30
from nltk import downloader

downloader.download()
Code example #31
File: get_data.py Project: hxsebastien/topicmod
from nltk import downloader

if __name__ == "__main__":
    for ii in ["punkt", "stopwords", "wordnet"]:
        downloader.download(ii)
Code example #32
def downloadNLTKConll2000Corpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's conll2000 corpus")
    download('conll2000')
Code example #33
File: __init__.py Project: trunghlt/cape-document-qa
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from nltk.downloader import download
from logging import info
from cape_document_qa.download_and_extract import download_and_extract
from cape_document_qa.cape_document_qa_settings import MODEL_FOLDER, MODEL_URL, MODELS_FOLDER, MODEL_MB_SIZE, \
    GLOVE_EMBEDDINGS_URL, DOWNLOAD_ALL_GLOVE_EMBEDDINGS

glove_filepath = os.path.join(MODEL_FOLDER, 'glove.840B.300d.txt')
if not os.path.isfile(os.path.join(MODEL_FOLDER, 'model.pkl')) or \
        not os.path.isfile(glove_filepath) or \
        (
                DOWNLOAD_ALL_GLOVE_EMBEDDINGS and os.path.getsize(glove_filepath) / 1e6 < 2e3
                # less than 2 GBs-> we only have the top X embeddings
        ):
    # Downloading NLTK dependencies
    info("Downloading (if necessary) NLTK ressources:")
    download('punkt')
    download('stopwords')
    info('Downloading default model with top X Glove embeddings:')
    download_and_extract(MODEL_URL, MODELS_FOLDER, total_mb_size=MODEL_MB_SIZE)
    if DOWNLOAD_ALL_GLOVE_EMBEDDINGS:
        info('Downloading complete Glove Embeddings:')
        download_and_extract(GLOVE_EMBEDDINGS_URL, MODEL_FOLDER)
Code example #34
from config import *
from textblob import TextBlob
from nltk import downloader
import tweepy


class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print('A TWEET!')
        print(status.text)
        print('AND THE SENTIMENT PER SENTENCE IS:')
        blob = TextBlob(status.text)
        for sentence in blob.sentences:
            print(sentence.sentiment.polarity)


auth = tweepy.OAuthHandler(consumerkey, consumerkeysecret)
auth.set_access_token(accesstoken, accesstokensecret)

downloader.download('punkt')

myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth=auth, listener=myStreamListener)

myStream.filter(track=['coca cola'], languages=['en'])