def install_nltk(download_dir=None):
    """Download specific collection identifiers."""
    if not download_dir:
        download_dir = settings.NLTK_DATA_PATH
    downloader = Downloader(download_dir=download_dir)
    downloader.download('punkt')
    downloader.download('maxent_treebank_pos_tagger')
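A minimal usage sketch for the helper above, assuming Downloader comes from nltk.downloader and that settings.NLTK_DATA_PATH is a project-specific configuration value rather than part of NLTK; the path below is hypothetical.

from nltk.downloader import Downloader  # import assumed by the snippet above

install_nltk(download_dir='/tmp/nltk_data')  # explicit, hypothetical target directory
install_nltk()                               # falls back to settings.NLTK_DATA_PATH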
def __init__(self):
    """Download the AfricanWordNet data when the object is initialized."""
    # The default URL for the NLTK data server's index.
    DEFAULT_URL = 'https://raw.githubusercontent.com/JosephSefara/AfricanWordNet/master/data/index.xml'
    downloader = Downloader(server_index_url=DEFAULT_URL)
    downloader.download('africanwordnet')
def __init__(self):
    super(RssSkill, self).__init__('RssSkill')
    self._is_reading_headlines = False
    self.feeds = {}
    self.cached_items = {}
    self.cache_time = {}
    try:
        # Probe the tagger; a LookupError means the model is not installed.
        pos_tag('advance')
    except LookupError:
        logger.debug('Tagger not installed... Trying to download')
        dler = Downloader()
        if not dler.download('averaged_perceptron_tagger'):
            logger.debug('Trying alternative source...')
            dler = Downloader(ALT_NLTK_DATA)
            dler.download('averaged_perceptron_tagger',
                          raise_on_error=True)
def dl_nltk():
    TO_DL = ['stopwords', 'punkt']
    # Alternative index to fall back on if the default NLTK server fails.
    dler = Downloader('https://pastebin.com/raw/D3TBY4Mj')
    for to_dl in TO_DL:
        if not nltk.download(to_dl):
            print('Downloading NLTK data from alternative source...')
            if not dler.download(to_dl):
                print('Failed to download NLTK data...')
def build_list_from_nltk(self, lang):
    downloader = Downloader()
    # Check if the NLTK data directory exists.
    if StopwordRemover.nltk_dir is None:
        # Create a temporary directory for the download.
        StopwordRemover.nltk_dir = tempfile.mkdtemp(prefix='cherami')
        nltk.data.path = [StopwordRemover.nltk_dir]
        logger.info('NLTK data directory is "{0}"'
                    .format(StopwordRemover.nltk_dir))
    # Check if the NLTK data has already been downloaded.
    if not downloader.is_installed('stopwords'):
        logger.info('Downloading NLTK stopword data...')
        downloader.download('stopwords', StopwordRemover.nltk_dir, True)
        logger.info('NLTK stopword data downloaded.')
    for word in stopwords.words(lang):
        self.stopword_list.add(word)
def build_list_from_nltk(self, lang):
    downloader = Downloader()
    tempdir = None
    # Check if the NLTK data has already been downloaded.
    if not downloader.is_installed('stopwords'):
        # Create a temporary directory for the download.
        tempdir = tempfile.mkdtemp(prefix='cherami')
        logger.info('Downloading NLTK stopword data into "{0}"'
                    '...'.format(tempdir))
        downloader.download('stopwords', tempdir, True)
        logger.info('NLTK stopword data downloaded.')
        nltk.data.path = [tempdir]
    for word in stopwords.words(lang):
        self.stopword_list.add(word)
    # Clean up after we're done.
    if tempdir is not None:
        shutil.rmtree(tempdir)
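A brief sketch of how the stopword list built above is typically applied; remover stands for an instance of the surrounding StopwordRemover class, and tokens is a hypothetical token list, both used only for illustration.

tokens = ['The', 'quick', 'brown', 'fox']  # hypothetical input tokens
filtered = [tok for tok in tokens
            if tok.lower() not in remover.stopword_list]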
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet_ic as wn_ic
from nltk.corpus.reader.wordnet import Synset as WordNetSynset

# Make sure the necessary corpora are downloaded to the local drive.
for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
    try:
        nltk.data.find("corpora/" + token)
    except LookupError:
        try:
            nltk.download(token, quiet=True, raise_on_error=True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL.
            # Then we will try this mirror instead.
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet=True, raise_on_error=True)

# Use the Brown corpus for calculating information content (IC).
brown_ic = wn_ic.ic('ic-brown.dat')
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version.
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------
DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
    "e": ("é", "ë", "ê", "è"),
import os

import nltk
from nltk.downloader import Downloader
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag, map_tag

nltk_packages = [
    'punkt',
    'maxent_treebank_pos_tagger',
    'universal_tagset',
    'wordnet'
]

# Keep the NLTK data next to this module and download any missing packages.
nltk_path = os.path.dirname(os.path.realpath(__file__)) + '/nltk'
nltk.data.path.append(nltk_path)
nltk_dl = Downloader(download_dir=nltk_path)
for package in nltk_packages:
    nltk_dl.download(package)

primary_tags = set(['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON'])
processes = [
    'parts_all', 'tokens_adj', 'tokens_adv', 'tokens_all', 'tokens_dense',
    'tokens_noun', 'tokens_other', 'tokens_pron', 'tokens_verb'
]
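A short sketch of how the packages downloaded above are typically combined: tokenize, tag, then map the Penn Treebank tags onto the universal tagset checked against primary_tags. The sample sentence is illustrative only, and depending on the NLTK version pos_tag may also require the averaged_perceptron_tagger model rather than the maxent treebank tagger.

tokens = word_tokenize("The quick brown fox jumps over the lazy dog")
tagged = pos_tag(tokens)
# Map Penn Treebank tags (e.g. 'NN') to universal tags (e.g. 'NOUN').
universal = [(tok, map_tag('en-ptb', 'universal', tag)) for tok, tag in tagged]
content_words = [(tok, tag) for tok, tag in universal if tag in primary_tags]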
import csv
import re

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.downloader import Downloader
from nltk.probability import FreqDist
from nltk.corpus import stopwords as stopwords_corpus
from nltk import pos_tag
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from sklearn.metrics import f1_score

downloader = Downloader()
downloader.download("stopwords")
nltk.download('averaged_perceptron_tagger')

with open("../data/full.csv", "r") as csvfile:
    reader = csv.reader(csvfile, quoting=csv.QUOTE_ALL)
    documents = [t[0] for t in reader]

matplotlib.rcParams.update({'font.size': 22})

stop_words = set(stopwords_corpus.words("english"))
_non_alpha = re.compile("[^a-zA-Z ]")

def normalize(text):
    """Map a token to a canonical form, e.g. lower case it, remove non-alpha characters, etc.