Example no. 1
def training_downloads():
    # NLTK:
    info("Downloading (if necessary) NLTK resources:")
    download('punkt')
    download('stopwords')

    # Glove:
    info('Downloading Glove Embeddings:')
    if not os.path.exists(VEC_DIR):
        os.makedirs(VEC_DIR)
    download_and_extract(GLOVE_EMBEDDINGS_URL, VEC_DIR)

    # Squad:
    info('Downloading Squad:')
    if not os.path.exists(SQUAD_SOURCE_DIR):
        os.makedirs(SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/train-v1.1.json', SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/dev-v1.1.json', SQUAD_SOURCE_DIR)

    # TriviaQA:
    info('Downloading TriviaQA:')
    if not os.path.exists(TRIVIA_QA):
        os.makedirs(TRIVIA_QA)
    download_and_extract(TRIVIAQA_SERVER + 'triviaqa-rc.tar.gz', TRIVIA_QA)

    # LM:
    info('Downloading LM:')
    if not os.path.exists(LM_DIR):
        os.makedirs(LM_DIR)
    download_and_extract(LM_URL, LM_DIR)
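# Note: download_file and download_and_extract above are project helpers, not part
# of NLTK. A minimal sketch of what they might look like, assuming plain HTTP(S)
# sources and tar/zip archives (names, signatures and archive handling here are
# assumptions, not the original implementation):
import os
import shutil
import tarfile
import urllib.request


def download_file(url, target_dir):
    # Stream the remote file into target_dir, keeping its basename.
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, os.path.basename(url))
    with urllib.request.urlopen(url) as response, open(target_path, 'wb') as out:
        shutil.copyfileobj(response, out)
    return target_path


def download_and_extract(url, target_dir):
    # Download an archive and unpack it into target_dir.
    archive_path = download_file(url, target_dir)
    if tarfile.is_tarfile(archive_path):
        with tarfile.open(archive_path) as archive:
            archive.extractall(target_dir)
    else:
        # e.g. zip archives such as the GloVe embeddings
        shutil.unpack_archive(archive_path, target_dir)
    return target_dir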
Example no. 2
def clean_sw():
    try:
        sw = stopwords.words('english')
    except LookupError:
        downloader.download('stopwords')
        sw = stopwords.words('english')
    return set([english_stemmer(w) for w in sw])
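# english_stemmer is not shown in this snippet; a reasonable stand-in is the stem
# method of NLTK's SnowballStemmer for English (an assumption, not the original
# code), which makes clean_sw usable as-is:
from nltk import downloader
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

english_stemmer = SnowballStemmer('english').stem

# clean_sw() then returns the set of stemmed English stopwords.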
Example no. 3
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [['start0'] + [
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] + ['end0'] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        # Replace a random word with a random vocabulary word.
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        # Corrupt one letter of another randomly chosen word.
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(
            lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
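# ngrams_sentences and viterbi are helpers that are not part of this snippet.
# A plausible ngrams_sentences, given that the sentences already carry the
# 'start0'/'end0' boundary tokens, is simply the following (an assumption about
# the original code, shown here as a sketch):
from nltk.util import ngrams


def ngrams_sentences(sentences, n):
    # Return, for each tokenized sentence, the list of its n-grams.
    return [list(ngrams(sentence, n)) for sentence in sentences]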
Example no. 4
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [
        ['start0'] + [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in
                      sentence] + ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        # Replace a random word with a random vocabulary word.
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        # Corrupt one letter of another randomly chosen word.
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
Example no. 5
def generate_documents(data_file):
    # Read data from csv export file
    data = pandas.read_csv(data_file, sep='\t', header=None,
                           names=[TEXT_IDENTIFIER_COLUMN, TEXT_COLUMN], skiprows=[0])

    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")

    if os.path.exists(DOCUMENTS_DIRECTORY):
        shutil.rmtree(DOCUMENTS_DIRECTORY)
    os.makedirs(DOCUMENTS_DIRECTORY)

    data = data.apply(tokenize, axis=1)
    data = data.apply(remove_stopwords, axis=1)
    data = data.apply(stem, axis=1)
    data.apply(save_to_document, axis=1)
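# tokenize, remove_stopwords and stem are applied row-wise with DataFrame.apply(axis=1)
# above but are not shown. A minimal sketch of such helpers, assuming TEXT_COLUMN holds
# the raw text and the stopwords corpus is already available (tokenizer, stemmer and
# filtering choices are assumptions, not the original implementation):
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

_stopwords = set(stopwords.words('english'))
_stemmer = PorterStemmer()


def tokenize(row):
    # Lower-case and split the raw text into tokens.
    row[TEXT_COLUMN] = word_tokenize(str(row[TEXT_COLUMN]).lower())
    return row


def remove_stopwords(row):
    # Keep alphabetic, non-stopword tokens only.
    row[TEXT_COLUMN] = [t for t in row[TEXT_COLUMN] if t.isalpha() and t not in _stopwords]
    return row


def stem(row):
    # Reduce each token to its stem.
    row[TEXT_COLUMN] = [_stemmer.stem(t) for t in row[TEXT_COLUMN]]
    return row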
def get_query_likelihood_score(documents_directory, query_text):
    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")
    query_document = generate_query_document(query_text)
    if len(query_document) == 0:
        print("Query not precise enough. Please refine your query")
        return

    collection_bag_of_words = load_collection_bag_of_words(documents_directory)
    document_bags_of_words = load_document_bags_of_words(documents_directory)

    scores = calculate_query_likelihood(query_document,
                                        collection_bag_of_words,
                                        document_bags_of_words)
    for document_name, score in scores.items():
        print(document_name + "\t" + str(score))
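# calculate_query_likelihood is not shown in the snippet. The usual query likelihood
# model scores each document by the smoothed probability of the query terms; a minimal
# sketch using Jelinek-Mercer smoothing (the function name matches the call above, but
# the bag-of-words format -- dicts of term counts -- and the lambda value are assumptions):
import math


def calculate_query_likelihood(query_terms, collection_bag_of_words,
                               document_bags_of_words, lam=0.5):
    collection_total = sum(collection_bag_of_words.values())
    scores = {}
    for name, doc_bag in document_bags_of_words.items():
        doc_total = sum(doc_bag.values())
        log_score = 0.0
        for term in query_terms:
            # Mix the document model with the collection model.
            p_doc = doc_bag.get(term, 0) / doc_total if doc_total else 0.0
            p_coll = (collection_bag_of_words.get(term, 0) / collection_total
                      if collection_total else 0.0)
            p = lam * p_doc + (1 - lam) * p_coll
            log_score += math.log(p) if p > 0 else float('-inf')
        scores[name] = log_score
    return scores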
    def get_stopwords():
        """
        Check whether the NLTK stopwords have already been downloaded, and
        download them if not. This function is needed for both word embedding
        and topic modeling, and is just overall useful.
        """
        from nltk.downloader import download
        from nltk.corpus import stopwords

        try:
            return stopwords.words("english")
        except LookupError:
            print(
                "NLTK needs to download the stopwords. This will take a while."
            )
            download("stopwords")
            print("NLTK has finished downloading stopwords.")
            return stopwords.words("english")
    def handle(self):
        """
        Index the corpus documents.

        """

        download('stopwords')
        indexdb = IndexDB()
        self.connection = indexdb.handler()
        data_dir = '/Users/pablocc/harvard_data/'
        counter = 0

        for filename in os.listdir(data_dir):
            if os.path.isdir(data_dir + filename) or filename[0] == '.':
                continue

            with open(data_dir + filename, 'rb') as fh:
                reader = MARCReader(fh)
                for record in reader:
                    document = self.prepare_record(record)
                    counter += 1
                    print("%s - processing document %s." %
                          (counter, document['id']))
                    self.index_document(document)
def downloadNLTKData():
    """
    """
    logger = logging.getLogger('collective.classification')    
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
def downloadNLTKTokenizerData():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
Example no. 11
#!/usr/bin/python

from nltk.corpus import stopwords
from nltk.downloader import download

download('all', halt_on_error=False)
sw = stopwords.words("english")
count = len(sw)
# print(sw)
def downloadNLTKAlpinoCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Alpino corpus")
    download('alpino')
def downloadNLTKEurParlRaw():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Sample European Parliament Proceedings "
                "Parallel Corpus")
    download('europarl_raw')
def downloadNLTKAlpinoCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Alpino corpus")
    download('alpino')
def downloadNLTKPenTreeBank():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
Example no. 16
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in sentence]
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) / float(len(bigram_length_probabilities[length])) for length in
        bigram_length_probabilities.keys()}
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) / float(len(trigram_length_probabilities[length])) for length
        in
        trigram_length_probabilities.keys()}

    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)] for key in
                        bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(bottom=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the bigram model produced this text of length 30: {}'.format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the trigram model produced this text of length 30: {}'.format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(cpd_bigram, test_bigrams)
    print('Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'.format(bigram_entropy,
                                                                                               bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(cpd_trigram, test_trigrams)
    print('Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'.format(trigram_entropy,
                                                                                                trigram_perplexity))
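# ngrams_sentences, predict_word and centropy_perplexity are helpers defined elsewhere.
# The standard definition of the last one computes the average negative log2-probability
# of the test n-grams and the corresponding perplexity 2**H; a minimal sketch matching
# the (context..., word) tuple layout used above (the implementation is an assumption):
def centropy_perplexity(cpd, test_ngrams):
    # ProbDistI.logprob is base 2 in NLTK, so 2**H gives the perplexity.
    logprobs = [cpd[ngram[:-1]].logprob(ngram[-1]) for ngram in test_ngrams]
    cross_entropy = -sum(logprobs) / len(logprobs)
    return cross_entropy, 2 ** cross_entropy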
Example no. 17
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t)
                                       for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist,
                                      vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1, )].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [
            cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence
        ]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) /
        float(len(bigram_length_probabilities[length]))
        for length in bigram_length_probabilities.keys()
    }
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) /
        float(len(trigram_length_probabilities[length]))
        for length in trigram_length_probabilities.keys()
    }

    random_sentences = [[
        words[random.randint(0,
                             len(words) - 1)].lower() for i in range(key)
    ] for key in bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1, )].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [
            cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence
        ]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()),
                         color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()),
                          color='blue')
    random_bigram = plt.scatter(
        list(random_bigram_length_probabilities.values()),
        list(random_bigram_length_probabilities.keys()),
        color='green')
    random_trigram = plt.scatter(
        list(random_trigram_length_probabilities.values()),
        list(random_trigram_length_probabilities.keys()),
        color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(bottom=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print(
        'Given the seed word "this", the bigram model produced this text of length 30: {}'
        .format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print(
        'Given the seed word "this", the trigram model produced this text of length 30: {}'
        .format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(
        cpd_bigram, test_bigrams)
    print(
        'Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'
        .format(bigram_entropy, bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(
        cpd_trigram, test_trigrams)
    print(
        'Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'
        .format(trigram_entropy, trigram_perplexity))
def _post_setup():
    from nltk.downloader import download
    download('punkt')
Example no. 19
import pickle
import re
import time
from SAR_utils import *
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import downloader

downloader.download('stopwords')


def compareT(a, b):
    (d1, p1) = a
    (d2, p2) = b
    if d1 == d2:
        return p1 - p2
    else:
        return d1 - d2


# Obtain the posting list of a word.
# If an index i is not specified, it will match ":" patterns and apply stemming.
def getPList(word, i=None, stemming=False):
    if i is not None:
        return i.get(word, [])

    if ":" in word:
        [where, word] = word.split(":")
        if where == "headline" or where == "h":
            i = titleIndex
        elif where == "date" or where == "d":
def downloadNLTKBrownCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
def downloadNLTKTokenizerData():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
def downloadNLTKEurParlRaw():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Sample European Parliament Proceedings "
                "Parallel Corpus")
    download('europarl_raw')
Example no. 23
def main():
    if not os.path.exists(NLTK_DIR):
        os.makedirs(NLTK_DIR)
        
    download('reuters', download_dir=NLTK_DIR)
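# When downloading into a custom directory like NLTK_DIR, NLTK also has to be told
# where to look at load time; a minimal sketch (NLTK_DIR itself is defined elsewhere
# in the original module):
import nltk
from nltk.corpus import reuters

nltk.data.path.append(NLTK_DIR)
print(len(reuters.fileids()))  # the corpus now resolves against NLTK_DIR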
def downloadNLTKConll2000Corpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's conll2000 corpus")
    download('conll2000')
# -*- coding: utf-8 -*-

import sys

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import unicodedata
import operator
from nltk import downloader
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
downloader.download("stopwords")
np.set_printoptions(threshold=sys.maxsize)  # np.nan is no longer accepted as a threshold

stopwords_list = set(stopwords.words("spanish"))
stemmer = SnowballStemmer("spanish")


def tratamiento1(documentos):
    # Basic data preprocessing
    new_documentos = []
    for d in range(len(documentos)):
        unaccented_text = ''.join(
            c for c in unicodedata.normalize('NFD', documentos[d])
            if unicodedata.category(c) != 'Mn')
        lower_words = [str.lower(word) for word in unaccented_text.split(" ")]
        new_documentos.append(" ".join(lower_words))
    return new_documentos


def tratamiento2(documentos):
def downloadNLTKBrownCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
import config
from nltk import downloader

# Async, can't be run in main process :/
# for wordnet stemmer
downloader.download(info_or_id='wordnet', download_dir=config.NLTK_DATA_DIR)
# for snowball and porter stemmer
# downloader.download(info_or_id='punkt', download_dir=config.NLTK_DATA_DIR)
# stop words <- used in snowball
downloader.download(info_or_id='stopwords', download_dir=config.NLTK_DATA_DIR)
def downloadNLTKPenTreeBank():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
def download_corpus():
    downloader = nltk.downloader.Downloader(download_dir=NLTK_DIR)
    downloader.download('wordnet', download_dir=NLTK_DIR)
Example no. 30
from nltk import downloader

downloader.download()
Example no. 31
from nltk import downloader

if __name__ == "__main__":
    for ii in ["punkt", "stopwords", "wordnet"]:
        downloader.download(ii)
def downloadNLTKConll2000Corpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's conll2000 corpus")
    download('conll2000')
Example no. 33
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from nltk.downloader import download
from logging import info
from cape_document_qa.download_and_extract import download_and_extract
from cape_document_qa.cape_document_qa_settings import MODEL_FOLDER, MODEL_URL, MODELS_FOLDER, MODEL_MB_SIZE, \
    GLOVE_EMBEDDINGS_URL, DOWNLOAD_ALL_GLOVE_EMBEDDINGS

glove_filepath = os.path.join(MODEL_FOLDER, 'glove.840B.300d.txt')
if not os.path.isfile(os.path.join(MODEL_FOLDER, 'model.pkl')) or \
        not os.path.isfile(glove_filepath) or \
        (
                DOWNLOAD_ALL_GLOVE_EMBEDDINGS and os.path.getsize(glove_filepath) / 1e6 < 2e3
                # less than 2 GB -> we only have the top X embeddings
        ):
    # Downloading NLTK dependencies
    info("Downloading (if necessary) NLTK ressources:")
    download('punkt')
    download('stopwords')
    info('Downloading default model with top X Glove embeddings:')
    download_and_extract(MODEL_URL, MODELS_FOLDER, total_mb_size=MODEL_MB_SIZE)
    if DOWNLOAD_ALL_GLOVE_EMBEDDINGS:
        info('Downloading complete Glove Embeddings:')
        download_and_extract(GLOVE_EMBEDDINGS_URL, MODEL_FOLDER)
from config import *
from textblob import TextBlob
from nltk import downloader
import tweepy


class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print('A TWEET!')
        print(status.text)
        print('AND THE SENTIMENT PER SENTENCE IS:')
        blob = TextBlob(status.text)
        for sentence in blob.sentences:
            print(sentence.sentiment.polarity)


auth = tweepy.OAuthHandler(consumerkey, consumerkeysecret)
auth.set_access_token(accesstoken, accesstokensecret)

downloader.download('punkt')

myStreamListener = MyStreamListener()
stream = tweepy.Stream(auth=auth, listener=myStreamListener)
stream.filter(track=['coca cola'], languages=['en'])