Example no. 1
def training_downloads():
    # NLTK:
    info("Downloading (if necessary) NLTK resources:")
    download('punkt')
    download('stopwords')

    # Glove:
    info('Downloading Glove Embeddings:')
    if not os.path.exists(VEC_DIR):
        os.makedirs(VEC_DIR)
    download_and_extract(GLOVE_EMBEDDINGS_URL, VEC_DIR)

    # Squad:
    info('Downloading Squad:')
    if not os.path.exists(SQUAD_SOURCE_DIR):
        os.makedirs(SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/train-v1.1.json', SQUAD_SOURCE_DIR)
    download_file(SQUAD_SERVER + '/dev-v1.1.json', SQUAD_SOURCE_DIR)

    # TriviaQA:
    info('Downloading TriviaQA:')
    if not os.path.exists(TRIVIA_QA):
        os.makedirs(TRIVIA_QA)
    download_and_extract(TRIVIAQA_SERVER + 'triviaqa-rc.tar.gz', TRIVIA_QA)

    # LM:
    info('Downloading LM:')
    if not os.path.exists(LM_DIR):
        os.makedirs(LM_DIR)
    download_and_extract(LM_URL, LM_DIR)
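# Note: download_file and download_and_extract above are project helpers, not part
# of NLTK. A minimal sketch of what they might look like, assuming plain HTTP(S)
# sources and tar/zip archives (names, signatures and archive handling here are
# assumptions, not the original implementation):
import os
import shutil
import tarfile
import urllib.request


def download_file(url, target_dir):
    # Stream the remote file into target_dir, keeping its basename.
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, os.path.basename(url))
    with urllib.request.urlopen(url) as response, open(target_path, 'wb') as out:
        shutil.copyfileobj(response, out)
    return target_path


def download_and_extract(url, target_dir):
    # Download an archive and unpack it into target_dir.
    archive_path = download_file(url, target_dir)
    if tarfile.is_tarfile(archive_path):
        with tarfile.open(archive_path) as archive:
            archive.extractall(target_dir)
    else:
        # e.g. zip archives such as the GloVe embeddings
        shutil.unpack_archive(archive_path, target_dir)
    return target_dir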
Example no. 2
def clean_sw():
    try:
        sw = stopwords.words('english')
    except LookupError:
        downloader.download('stopwords')
        sw = stopwords.words('english')
    return set([english_stemmer(w) for w in sw])
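# english_stemmer is not shown in this snippet; a reasonable stand-in is the stem
# method of NLTK's SnowballStemmer for English (an assumption, not the original
# code), which makes clean_sw usable as-is:
from nltk import downloader
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

english_stemmer = SnowballStemmer('english').stem

# clean_sw() then returns the set of stemmed English stopwords.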
Example no. 3
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [['start0'] + [
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] + ['end0'] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        # Replace a random word with a random vocabulary word.
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        # Corrupt one letter of another randomly chosen word.
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(
            lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
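# ngrams_sentences and viterbi are helpers that are not part of this snippet.
# A plausible ngrams_sentences, given that the sentences already carry the
# 'start0'/'end0' boundary tokens, is simply the following (an assumption about
# the original code, shown here as a sketch):
from nltk.util import ngrams


def ngrams_sentences(sentences, n):
    # Return, for each tokenized sentence, the list of its n-grams.
    return [list(ngrams(sentence, n)) for sentence in sentences]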
Example no. 4
def main():
    # matplotlib.use('Qt5Agg')
    # import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [
        ['start0'] + [word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in
                      sentence] + ['end0']
        for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary = list(word_frequency_distribution)
    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))

    # Calculate the conditional frequency distribution for bigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)

    # Calculate the conditional probability distribution for bigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)

    lower_case_letters = string.ascii_lowercase
    error_test = copy.deepcopy(test)
    for sentence in error_test:
        # Replace a random word with a random vocabulary word.
        word = random.randrange(1, len(sentence) - 1)
        sentence[word] = random.choice(vocabulary)
        # Corrupt one letter of another randomly chosen word.
        word = random.randrange(1, len(sentence) - 1)
        letter = random.randrange(0, len(sentence[word]))
        sentence[word] = sentence[word][0:letter] + random.choice(lower_case_letters) + sentence[word][letter + 1:]

    corrected = viterbi(error_test[25][:-1], vocabulary, cpd_bigram)

    print('Corrected:{}'.format(corrected))
    print('Original:{}'.format(test[25]))
Example no. 5
def generate_documents(data_file):
    # Read data from csv export file
    data = pandas.read_csv(data_file, sep='\t', header=None,
                           names=[TEXT_IDENTIFIER_COLUMN, TEXT_COLUMN], skiprows=[0])

    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")

    if os.path.exists(DOCUMENTS_DIRECTORY):
        shutil.rmtree(DOCUMENTS_DIRECTORY)
    os.makedirs(DOCUMENTS_DIRECTORY)

    data = data.apply(tokenize, axis=1)
    data = data.apply(remove_stopwords, axis=1)
    data = data.apply(stem, axis=1)
    data.apply(save_to_document, axis=1)
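# tokenize, remove_stopwords and stem are applied row-wise with DataFrame.apply(axis=1)
# above but are not shown. A minimal sketch of such helpers, assuming TEXT_COLUMN holds
# the raw text and the stopwords corpus is already available (tokenizer, stemmer and
# filtering choices are assumptions, not the original implementation):
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

_stopwords = set(stopwords.words('english'))
_stemmer = PorterStemmer()


def tokenize(row):
    # Lower-case and split the raw text into tokens.
    row[TEXT_COLUMN] = word_tokenize(str(row[TEXT_COLUMN]).lower())
    return row


def remove_stopwords(row):
    # Keep alphabetic, non-stopword tokens only.
    row[TEXT_COLUMN] = [t for t in row[TEXT_COLUMN] if t.isalpha() and t not in _stopwords]
    return row


def stem(row):
    # Reduce each token to its stem.
    row[TEXT_COLUMN] = [_stemmer.stem(t) for t in row[TEXT_COLUMN]]
    return row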
def get_query_likelihood_score(documents_directory, query_text):
    download('punkt', download_dir="nltk_data")
    download('stopwords', download_dir="nltk_data")
    query_document = generate_query_document(query_text)
    if len(query_document) == 0:
        print("Query not precise enough. Please refine your query")
        return

    collection_bag_of_words = load_collection_bag_of_words(documents_directory)
    document_bags_of_words = load_document_bags_of_words(documents_directory)

    scores = calculate_query_likelihood(query_document,
                                        collection_bag_of_words,
                                        document_bags_of_words)
    for document_name, score in scores.items():
        print(document_name + "\t" + str(score))
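# calculate_query_likelihood is not shown in the snippet. The usual query likelihood
# model scores each document by the smoothed probability of the query terms; a minimal
# sketch using Jelinek-Mercer smoothing (the function name matches the call above, but
# the bag-of-words format -- dicts of term counts -- and the lambda value are assumptions):
import math


def calculate_query_likelihood(query_terms, collection_bag_of_words,
                               document_bags_of_words, lam=0.5):
    collection_total = sum(collection_bag_of_words.values())
    scores = {}
    for name, doc_bag in document_bags_of_words.items():
        doc_total = sum(doc_bag.values())
        log_score = 0.0
        for term in query_terms:
            # Mix the document model with the collection model.
            p_doc = doc_bag.get(term, 0) / doc_total if doc_total else 0.0
            p_coll = (collection_bag_of_words.get(term, 0) / collection_total
                      if collection_total else 0.0)
            p = lam * p_doc + (1 - lam) * p_coll
            log_score += math.log(p) if p > 0 else float('-inf')
        scores[name] = log_score
    return scores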
    def get_stopwords():
        """
        Check whether the NLTK stopwords have already been downloaded, and
        download them if not. This function is needed for both word embedding
        and topic modeling, and is just overall useful.
        """
        from nltk.downloader import download
        from nltk.corpus import stopwords

        try:
            return stopwords.words("english")
        except LookupError:
            print(
                "NLTK needs to download the stopwords. This will take a while."
            )
            download("stopwords")
            print("NLTK has finished downloading stopwords.")
            return stopwords.words("english")
    def handle(self):
        """
        Index the corpus documents.

        """

        download('stopwords')
        indexdb = IndexDB()
        self.connection = indexdb.handler()
        data_dir = '/Users/pablocc/harvard_data/'
        counter = 0

        for filename in os.listdir(data_dir):
            if os.path.isdir(data_dir + filename) or filename[0] == '.':
                continue

            with open(data_dir + filename, 'rb') as fh:
                reader = MARCReader(fh)
                for record in reader:
                    document = self.prepare_record(record)
                    counter += 1
                    print("%s - processing document %s." %
                          (counter, document['id']))
                    self.index_document(document)
def downloadNLTKData():
    """
    """
    logger = logging.getLogger('collective.classification')    
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
def downloadNLTKTokenizerData():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
Example no. 11
#!/usr/bin/python

from nltk.corpus import stopwords
from nltk.downloader import download

download('all', halt_on_error=False)
sw = stopwords.words("english")
count = len(sw)
# print(sw)
def downloadNLTKAlpinoCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Alpino corpus")
    download('alpino')
def downloadNLTKEurParlRaw():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Sample European Parliament Proceedings "
                "Parallel Corpus")
    download('europarl_raw')
def downloadNLTKAlpinoCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Alpino corpus")
    download('alpino')
def downloadNLTKPenTreeBank():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
Example no. 16
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[word.lower() if word_frequency_distribution[word.lower()] >= 10 else '<unknown>' for word in sentence]
                 for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f,), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t) for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist, vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist, vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) / float(len(bigram_length_probabilities[length])) for length in
        bigram_length_probabilities.keys()}
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) / float(len(trigram_length_probabilities[length])) for length
        in
        trigram_length_probabilities.keys()}

    random_sentences = [[words[random.randint(0, len(words) - 1)].lower() for i in range(key)] for key in
                        bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1,)].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()), color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()), color='blue')
    random_bigram = plt.scatter(list(random_bigram_length_probabilities.values()),
                                list(random_bigram_length_probabilities.keys()), color='green')
    random_trigram = plt.scatter(list(random_trigram_length_probabilities.values()),
                                 list(random_trigram_length_probabilities.keys()), color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(bottom=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the bigram model produced this text of length 30: {}'.format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print('Given the seed word "this", the trigram model produced this text of length 30: {}'.format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(cpd_bigram, test_bigrams)
    print('Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'.format(bigram_entropy,
                                                                                               bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(cpd_trigram, test_trigrams)
    print('Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'.format(trigram_entropy,
                                                                                                trigram_perplexity))
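# ngrams_sentences, predict_word and centropy_perplexity are helpers defined elsewhere.
# The standard definition of the last one computes the average negative log2-probability
# of the test n-grams and the corresponding perplexity 2**H; a minimal sketch matching
# the (context..., word) tuple layout used above (the implementation is an assumption):
def centropy_perplexity(cpd, test_ngrams):
    # ProbDistI.logprob is base 2 in NLTK, so 2**H gives the perplexity.
    logprobs = [cpd[ngram[:-1]].logprob(ngram[-1]) for ngram in test_ngrams]
    cross_entropy = -sum(logprobs) / len(logprobs)
    return cross_entropy, 2 ** cross_entropy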
Example no. 17
def main():
    matplotlib.use('Qt5Agg')
    import matplotlib.pyplot as plt

    download('punkt')
    # Download and load the english europarl corpus
    downloader.download('europarl_raw')
    english = LazyCorpusLoader('europarl_raw/english',
                               EuroparlCorpusReader,
                               r'ep-.*\.en',
                               encoding='utf-8')

    words = english.words()

    # Calculate the frequency distribution of the words in the corpus
    word_frequency_distribution = FreqDist([word.lower() for word in words])

    # Get the sentences of the corpus, all in lower case, with infrequent words replaced by the token "<unknown>"
    sentences = [[
        word.lower()
        if word_frequency_distribution[word.lower()] >= 10 else '<unknown>'
        for word in sentence
    ] for sentence in english.sents()]

    # create train and test dataset
    train = sentences[0:int(len(sentences) * 0.8)]
    test = sentences[int(len(sentences) * 0.8):]

    vocabulary_length = word_frequency_distribution.B()

    # Calculate bigrams and trigrams
    bigrams_train = list(chain.from_iterable(ngrams_sentences(train, 2)))
    trigrams_train = list(chain.from_iterable(ngrams_sentences(train, 3)))

    # Calculate the conditional frequency distributions for bigrams and trigrams
    bigrams_fd = ConditionalFreqDist(((f, ), s) for f, s in bigrams_train)
    trigrams_fd = ConditionalFreqDist([((f, s), t)
                                       for f, s, t in trigrams_train])

    # Calculate the conditional probability distributions for bigrams and trigrams
    cpd_bigram = ConditionalProbDist(bigrams_fd, LaplaceProbDist,
                                     vocabulary_length)
    cpd_trigram = ConditionalProbDist(trigrams_fd, LaplaceProbDist,
                                      vocabulary_length)

    bigrams_test = ngrams_sentences(test, 2)
    bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_test:
        logprob = [cpd_bigram[(w1, )].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        bigram_length_probabilities[len(sentence)].append(logprob)

    x = 0
    s = None
    for sentence in bigrams_test:
        if (len(sentence) > x):
            x = len(sentence)
            s = sentence

    trigrams_test = ngrams_sentences(test, 3)
    trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_test:
        logprob = [
            cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence
        ]
        logprob = sum(logprob)
        trigram_length_probabilities[len(sentence)].append(logprob)

    average_bigram_length_probabilities = {
        length: sum(bigram_length_probabilities[length]) /
        float(len(bigram_length_probabilities[length]))
        for length in bigram_length_probabilities.keys()
    }
    average_trigram_length_probabilities = {
        length: sum(trigram_length_probabilities[length]) /
        float(len(trigram_length_probabilities[length]))
        for length in trigram_length_probabilities.keys()
    }

    random_sentences = [[
        words[random.randint(0,
                             len(words) - 1)].lower() for i in range(key)
    ] for key in bigram_length_probabilities.keys()]

    bigrams_random = ngrams_sentences(random_sentences, 2)
    random_bigram_length_probabilities = defaultdict(list)
    for sentence in bigrams_random:
        logprob = [cpd_bigram[(w1, )].logprob(w2) for w1, w2 in sentence]
        logprob = sum(logprob)
        random_bigram_length_probabilities[len(sentence)].append(logprob)

    trigrams_random = ngrams_sentences(random_sentences, 3)
    random_trigram_length_probabilities = defaultdict(list)
    for sentence in trigrams_random:
        logprob = [
            cpd_trigram[(w1, w2)].logprob(w3) for w1, w2, w3 in sentence
        ]
        logprob = sum(logprob)
        random_trigram_length_probabilities[len(sentence)].append(logprob)

    bigram = plt.scatter(list(average_bigram_length_probabilities.values()),
                         list(average_bigram_length_probabilities.keys()),
                         color='red')
    trigram = plt.scatter(list(average_trigram_length_probabilities.values()),
                          list(average_trigram_length_probabilities.keys()),
                          color='blue')
    random_bigram = plt.scatter(
        list(random_bigram_length_probabilities.values()),
        list(random_bigram_length_probabilities.keys()),
        color='green')
    random_trigram = plt.scatter(
        list(random_trigram_length_probabilities.values()),
        list(random_trigram_length_probabilities.keys()),
        color='black')
    plt.xlabel('$log_2(P(W_1^k))$')
    plt.ylabel('$k$')
    plt.legend((bigram, trigram, random_bigram, random_trigram),
               ('Bigram', 'Trigram', 'Random bigram', 'Random trigram'))
    plt.ylim(bottom=0)
    # plt.show()
    plt.savefig('logprob')

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_bigram, seed, 'bigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print(
        'Given the seed word "this", the bigram model produced this text of length 30: {}'
        .format(seed))

    seed = 'this'
    for i in range(30):
        newword = predict_word(cpd_trigram, seed, 'trigram')
        if newword is not None:
            seed += ' ' + newword
        else:
            break
    print(
        'Given the seed word "this", the trigram model produced this text of length 30: {}'
        .format(seed))

    test_bigrams = []
    for sentence in bigrams_test:
        test_bigrams += sentence
    bigram_entropy, bigram_perplexity = centropy_perplexity(
        cpd_bigram, test_bigrams)
    print(
        'Cross-entropy of the bigram model is {}. The corresponding perplexity is {}'
        .format(bigram_entropy, bigram_perplexity))

    test_trigrams = []
    for sentence in trigrams_test:
        test_trigrams += sentence
    trigram_entropy, trigram_perplexity = centropy_perplexity(
        cpd_trigram, test_trigrams)
    print(
        'Cross-entropy of the trigram model is {}. The corresponding perplexity is {}'
        .format(trigram_entropy, trigram_perplexity))
def _post_setup():
    from nltk.downloader import download
    download('punkt')
Example no. 19
import pickle
import re
import time
from SAR_utils import *
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import downloader

downloader.download('stopwords')


def compareT(a, b):
    (d1, p1) = a
    (d2, p2) = b
    if d1 == d2:
        return p1 - p2
    else:
        return d1 - d2


# Obtain the posting list of a word.
# If an index i is not specified, it will match ":" patterns and apply stemming.
def getPList(word, i=None, stemming=False):
    if i is not None:
        return i.get(word, [])

    if ":" in word:
        [where, word] = word.split(":")
        if where == "headline" or where == "h":
            i = titleIndex
        elif where == "date" or where == "d":
def downloadNLTKBrownCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
def downloadNLTKTokenizerData():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Punkt Tokenizer Models")
    download('punkt')
def downloadNLTKEurParlRaw():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Sample European Parliament Proceedings "
                "Parallel Corpus")
    download('europarl_raw')
Example no. 23
def main():
    if not os.path.exists(NLTK_DIR):
        os.makedirs(NLTK_DIR)
        
    download('reuters', download_dir=NLTK_DIR)
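# When downloading into a custom directory like NLTK_DIR, NLTK also has to be told
# where to look at load time; a minimal sketch (NLTK_DIR itself is defined elsewhere
# in the original module):
import nltk
from nltk.corpus import reuters

nltk.data.path.append(NLTK_DIR)
print(len(reuters.fileids()))  # the corpus now resolves against NLTK_DIR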
def downloadNLTKConll2000Corpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's conll2000 corpus")
    download('conll2000')
# -*- coding: utf-8 -*-

import sys

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import unicodedata
import operator
from nltk import downloader
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
downloader.download("stopwords")
np.set_printoptions(threshold=sys.maxsize)  # np.nan is no longer accepted as a threshold

stopwords_list = set(stopwords.words("spanish"))
stemmer = SnowballStemmer("spanish")


def tratamiento1(documentos):
    # Basic data preprocessing
    new_documentos = []
    for d in range(len(documentos)):
        unaccented_text = ''.join(
            c for c in unicodedata.normalize('NFD', documentos[d])
            if unicodedata.category(c) != 'Mn')
        lower_words = [str.lower(word) for word in unaccented_text.split(" ")]
        new_documentos.append(" ".join(lower_words))
    return new_documentos


def tratamiento2(documentos):
def downloadNLTKBrownCorpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Brown corpus")
    download('brown')
import config
from nltk import downloader

# Async, can't be run in main process :/
# for wordnet stemmer
downloader.download(info_or_id='wordnet', download_dir=config.NLTK_DATA_DIR)
# for snowball and porter stemmer
# downloader.download(info_or_id='punkt', download_dir=config.NLTK_DATA_DIR)
# stop words <- used in snowball
downloader.download(info_or_id='stopwords', download_dir=config.NLTK_DATA_DIR)
def downloadNLTKPenTreeBank():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's Treebank POS Tagger (Max entropy)")
    download('maxent_treebank_pos_tagger')
def download_corpus():
    downloader = nltk.downloader.Downloader(download_dir=NLTK_DIR)
    downloader.download('wordnet', download_dir=NLTK_DIR)
Example no. 30
from nltk import downloader

downloader.download()
Example no. 31
from nltk import downloader

if __name__ == "__main__":
    for ii in ["punkt", "stopwords", "wordnet"]:
        downloader.download(ii)
def downloadNLTKConll2000Corpus():
    logger = logging.getLogger('collective.classification')
    logger.info("Downloading NLTK's conll2000 corpus")
    download('conll2000')
Example no. 33
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from nltk.downloader import download
from logging import info
from cape_document_qa.download_and_extract import download_and_extract
from cape_document_qa.cape_document_qa_settings import MODEL_FOLDER, MODEL_URL, MODELS_FOLDER, MODEL_MB_SIZE, \
    GLOVE_EMBEDDINGS_URL, DOWNLOAD_ALL_GLOVE_EMBEDDINGS

glove_filepath = os.path.join(MODEL_FOLDER, 'glove.840B.300d.txt')
if not os.path.isfile(os.path.join(MODEL_FOLDER, 'model.pkl')) or \
        not os.path.isfile(glove_filepath) or \
        (
                DOWNLOAD_ALL_GLOVE_EMBEDDINGS and os.path.getsize(glove_filepath) / 1e6 < 2e3
                # less than 2 GB -> we only have the top X embeddings
        ):
    # Downloading NLTK dependencies
    info("Downloading (if necessary) NLTK ressources:")
    download('punkt')
    download('stopwords')
    info('Downloading default model with top X Glove embeddings:')
    download_and_extract(MODEL_URL, MODELS_FOLDER, total_mb_size=MODEL_MB_SIZE)
    if DOWNLOAD_ALL_GLOVE_EMBEDDINGS:
        info('Downloading complete Glove Embeddings:')
        download_and_extract(GLOVE_EMBEDDINGS_URL, MODEL_FOLDER)
from config import *
from textblob import TextBlob
from nltk import downloader
import tweepy


class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print('A TWEET!')
        print(status.text)
        print('AND THE SENTIMENT PER SENTENCE IS:')
        blob = TextBlob(status.text)
        for sentence in blob.sentences:
            print(sentence.sentiment.polarity)


auth = tweepy.OAuthHandler(consumerkey, consumerkeysecret)
auth.set_access_token(accesstoken, accesstokensecret)

downloader.download('punkt')

myStreamListener = MyStreamListener()
stream = tweepy.Stream(auth=auth, listener=myStreamListener)
stream.filter(track=['coca cola'], languages=['en'])