def main(childes_xml_remote_root='http://childes.psy.cmu.edu/data-xml/', verbose=False):
    """Download the CHILDES XML corpus into the user's NLTK data directory.

    Args:
        childes_xml_remote_root: Base URL of the CHILDES XML distribution.
        verbose: Passed through to ``CHILDESDir.download()``.
    """
    # Instance-method call instead of the unbound-method form
    # Downloader.default_download_dir(Downloader()).
    user_data_path = Downloader().default_download_dir()
    childes_corpus_path = os.path.join(user_data_path, 'corpora/CHILDES/')
    # exist_ok=True only tolerates a pre-existing directory; the old
    # `except OSError: pass` also swallowed permission errors etc.
    os.makedirs(childes_corpus_path, exist_ok=True)
    CHILDESDir(childes_xml_remote_root, childes_corpus_path).download(verbose)
def dl_nltk():
    """Fetch required NLTK data packages, falling back to a mirror index."""
    fallback = Downloader('https://pastebin.com/raw/D3TBY4Mj')
    for package in ('stopwords', 'punkt'):
        if nltk.download(package):
            continue
        # Primary download failed; retry against the alternative index.
        print('Downloading NLTK data from alternative source...')
        if not fallback.download(package):
            print('Failed download NLTK data...')
def __init__(self):
    """Initialize and download the African WordNet data.

    A custom server index is used because the africanwordnet package is
    hosted on GitHub rather than on the default NLTK data server.
    """
    # Index listing the africanwordnet package for the NLTK downloader.
    DEFAULT_URL = ('https://raw.githubusercontent.com/JosephSefara/'
                   'AfricanWordNet/master/data/index.xml')
    # The original `try: ... except: raise` was a no-op bare-except that
    # only re-raised; any download error still propagates to the caller.
    downloader = Downloader(server_index_url=DEFAULT_URL)
    downloader.download('africanwordnet')
def nltk_updater():
    """Download any NLTK packages from the required set that are missing
    from the default download directory."""
    # Builtin set literal replaces the long-removed sets.Set class;
    # print() replaces the Python-2-only print statement.
    packages = {'brown', 'wordnet', 'wordnet_ic',
                'maxent_treebank_pos_tagger', 'universal_tagset'}
    d = Downloader()
    # Any directory under the data dir whose basename matches a package
    # name counts as already installed.
    for dirpath, dirnames, filenames in os.walk(d.default_download_dir()):
        module = os.path.basename(dirpath)
        if module in packages:
            packages.remove(module)
    for module in packages:
        print('Missing', module)
        download(module)
def build_list_from_nltk(self, lang):
    """Populate self.stopword_list with NLTK stopwords for *lang*,
    downloading the stopword corpus into a shared temp directory if needed.

    Args:
        lang: Language name accepted by ``nltk.corpus.stopwords.words``.
    """
    downloader = Downloader()
    # Check if NLTK data directory exists. Use identity comparison with
    # None ('is None'), not '== None'.
    if StopwordRemover.nltk_dir is None:
        # Create temporary directory for download; shared class-wide so
        # later instances reuse the same data dir.
        StopwordRemover.nltk_dir = tempfile.mkdtemp(prefix='cherami')
        nltk.data.path = [StopwordRemover.nltk_dir]
        logger.info('NLTK data directory is "{0}"'
                    .format(StopwordRemover.nltk_dir))
    # Check if the NLTK data has already been downloaded.
    if not downloader.is_installed('stopwords'):
        logger.info('Downloading NLTK stopword data...')
        downloader.download('stopwords', StopwordRemover.nltk_dir, True)
        logger.info('NLTK stopword data downloaded.')
    for word in stopwords.words(lang):
        self.stopword_list.add(word)
def __init__(self):
    """Load Word2Vec embeddings and English stopwords for WMD scoring."""
    self.logger = logging.getLogger('vert')
    super(WordMoversDistance, self).__init__()
    self.logger.info("Loading Word2Vec embeddings.")
    # Make sure the stopword corpus is present before reading it.
    if not Downloader().is_installed('stopwords'):
        download('stopwords')
    self.stopwords = stopwords.words('english')
    # NOTE: the uncompressed .bin loads ~2.5x faster than the .gz archive.
    self.model = KeyedVectors.load_word2vec_format(
        "./data/GoogleNews-vectors-negative300.bin",
        binary=True)
    self.logger.info("Done: loading Word2Vec embeddings.")
def install_nltk(download_dir=None):
    """Download specific collection identifiers.

    Args:
        download_dir: Target data directory; defaults to
            ``settings.NLTK_DATA_PATH`` when falsy.
    """
    target = download_dir or settings.NLTK_DATA_PATH
    dl = Downloader(download_dir=target)
    for package in ('punkt', 'maxent_treebank_pos_tagger'):
        dl.download(package)
def build_list_from_nltk(self, lang):
    """Populate self.stopword_list with NLTK stopwords for *lang*.

    Downloads the stopword corpus into a temporary directory when it is
    not installed, and always removes that directory afterwards.

    Args:
        lang: Language name accepted by ``nltk.corpus.stopwords.words``.
    """
    downloader = Downloader()
    tempdir = None
    try:
        # Check if the NLTK data has already been downloaded.
        if not downloader.is_installed('stopwords'):
            # Create temporary directory for download
            tempdir = tempfile.mkdtemp(prefix='cherami')
            logger.info('Downloading NLTK stopword data into "{0}"'
                        '...'.format(tempdir))
            downloader.download('stopwords', tempdir, True)
            logger.info('NLTK stopword data downloaded.')
            nltk.data.path = [tempdir]
        for word in stopwords.words(lang):
            self.stopword_list.add(word)
    finally:
        # Clean up even if the download or the stopword read raised;
        # previously an exception leaked the temp directory.
        if tempdir is not None:
            shutil.rmtree(tempdir)
def __init__(self):
    """Load the Word2Vec model and stopwords per the 'wmd' config section."""
    # Ensure the stopword corpus is installed before anything else.
    if not Downloader().is_installed('stopwords'):
        download('stopwords')
    vectors_path = config['wmd']['word2vec']
    if config['wmd']['save_memory']:
        # 25% slower but 50% memory savings by reducing datatype size.
        self.model = KeyedVectors.load_word2vec_format(
            vectors_path, binary=True, datatype=np.float16)
    else:
        self.model = KeyedVectors.load_word2vec_format(
            vectors_path, binary=True)
    if config['wmd']['normalize']:
        # computes L2-norms of word weight vectors
        self.model.init_sims(replace=True)
    self.stopwords = stopwords.words('english')
def __init__(self):
    """Initialize feed caches and ensure the POS tagger data is available."""
    super(RssSkill, self).__init__('RssSkill')
    self._is_reading_headlines = False
    self.feeds = {}
    self.cached_items = {}
    self.cache_time = {}
    try:
        # A throwaway tagging call proves the tagger data is installed.
        pos_tag('advance')
    except LookupError:
        logger.debug('Tagger not installed... Trying to download')
        if not Downloader().download('averaged_perceptron_tagger'):
            logger.debug('Trying alternative source...')
            Downloader(ALT_NLTK_DATA).download(
                'averaged_perceptron_tagger', raise_on_error=True)
def nltk_download_corpus(
    resource_path,
    local_data=LOCAL_DATA,
    nltk_dir=NLTK_DIR,
):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from os.path import split, sep
    from zipfile import BadZipfile
    from nltk.data import find

    # Choose between the global nltk.download and a Downloader bound to a
    # local data directory.  (Leftover debug prints removed.)
    if not local_data:
        from nltk import download
    else:
        from nltk.downloader import Downloader
        download = Downloader(local_data=local_data, nltk_dir=nltk_dir).download

    # Download the NLTK data only if it is not already downloaded
    _, corpus_name = split(resource_path)
    # find() needs a trailing separator to treat the path as a directory.
    if not resource_path.endswith(sep):
        resource_path = resource_path + sep
    downloaded = False
    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True
    except BadZipfile:
        raise BadZipfile(
            'The NLTK corpus file being opened is not a zipfile, '
            'or it has been corrupted and needs to be manually deleted.')
    return downloaded
def setup_nltk_on_lambda():
    """ Getting nltk to run on lambda is tricky. We need to pregenerate the
    nltk libraries we are going to use and make them available to lambda """
    # we have to force the home variable in order to download
    # nltk files to the correct place for packaging
    os.environ['HOME'] = __here__
    from nltk.downloader import Downloader
    import nltk

    # Put the deployed nltk_data directory first on the search path.
    nltk.data.path = [os.path.join(__here__, 'nltk_data')] + nltk.data.path
    log.info("NLTK Path: %s", nltk.data.path)
    log.info("Default NLTK dir: %s", Downloader().default_download_dir())
    for package in ('stopwords', 'punkt', 'wordnet'):
        nltk.download(package)
def _get_duc_sentences_2004():
    """
    Create a DucArticle for each article in the docs folder of Duc2004.
    Complete fields 'ID', 'folder', and 'sentence'.

    Returns: list<DucArticle>
    """
    if not Downloader().is_installed('punkt'):
        download('punkt')

    sentence_folder = config["duc4_sentences_folder"]
    filenames = list()
    for root, _, files in os.walk(sentence_folder, topdown=False):
        for name in files:
            filenames.append(os.path.join(root, name))

    articles = list()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for filename in filenames:
        with open(filename, 'r') as f:
            parsed_html = BeautifulSoup(f.read(), "lxml")
            corpus = parsed_html.find_all('text')[0].string
            tokenized = tokenizer.tokenize(corpus)
            # Include a second sentence when the first ends in one of the
            # listed abbreviation-like exceptions.
            if tokenized[0].split()[-1] not in config["duc_ending_exceptions"]:
                sentence = tokenized[0].encode('ascii', 'ignore')
            else:
                sentence = (tokenized[0] + ' ' + tokenized[1]).encode(
                    'ascii', 'ignore')

            article = DucArticle()
            article.id = parsed_html.docno.string.rstrip().lstrip().replace(
                '\n', ' ').encode('ascii', 'ignore')
            # BUG FIX: str.lstrip(prefix) strips a *character set*, not a
            # prefix, so it could also eat leading letters of the folder
            # name. Remove the directory prefix explicitly instead.
            if filename.startswith(sentence_folder):
                relative = filename[len(sentence_folder):]
            else:
                relative = filename
            article.folder = relative[:5]
            article.sentence = _tokenize_sentence_generic(sentence)
            articles.append(article)
    return articles
import os, nltk
from nltk.downloader import Downloader
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag, map_tag

# NLTK data packages this module needs at runtime.
nltk_packages = [
    'punkt', 'maxent_treebank_pos_tagger', 'universal_tagset', 'wordnet'
]

# Keep NLTK data next to this file and register it on the search path.
nltk_path = os.path.dirname(os.path.realpath(__file__)) + '/nltk'
nltk.data.path.append(nltk_path)

nltk_dl = Downloader(download_dir=nltk_path)
for package in nltk_packages:
    nltk_dl.download(package)

# Universal-tagset categories treated as content-bearing below.
primary_tags = {'NOUN', 'VERB', 'ADJ', 'ADV', 'PRON'}

processes = [
    'parts_all', 'tokens_adj', 'tokens_adv', 'tokens_all', 'tokens_dense',
    'tokens_noun', 'tokens_other', 'tokens_pron', 'tokens_verb'
]
import re import pandas as pd import matplotlib import matplotlib.pyplot as plt import nltk from nltk.downloader import Downloader from nltk.probability import FreqDist from nltk.probability import FreqDist from nltk.corpus import stopwords as stopwords_corpus from nltk import pos_tag from nltk.util import ngrams from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures from sklearn.metrics import f1_score downloader = Downloader() downloader.download("stopwords") nltk.download('averaged_perceptron_tagger') with open("../data/full.csv", "r") as csvfile: reader = csv.reader(csvfile, quoting=csv.QUOTE_ALL) documents = [t[0] for t in reader] matplotlib.rcParams.update({'font.size': 22}) stop_words = set(stopwords_corpus.words("english")) _non_alpha = re.compile("[^a-zA-Z ]") def normalize(text): """Map a token to a canonical form, e.g. lower case it, remove non-alpha characters, etc.
import os
from nltk.downloader import Downloader
from childespy import CHILDESDir

childes_xml_remote_root = 'http://childes.psy.cmu.edu/data-xml/'

# Mirror the CHILDES XML corpus into the user's NLTK data directory.
# Instance-method call replaces Downloader.default_download_dir(Downloader()).
user_data_path = Downloader().default_download_dir()
childes_corpus_path = os.path.join(user_data_path, 'corpora/CHILDES/')
# exist_ok=True only tolerates an already-existing directory; the old
# `except OSError: pass` also hid permission errors.
os.makedirs(childes_corpus_path, exist_ok=True)
CHILDESDir(childes_xml_remote_root, childes_corpus_path).download()
import os
from nltk.downloader import Downloader
from childespy import CHILDESDir

childes_xml_remote_root = 'http://childes.psy.cmu.edu/data-xml/'

# Download the CHILDES XML corpus under the default NLTK data directory.
# Call default_download_dir on an instance rather than via the class.
user_data_path = Downloader().default_download_dir()
childes_corpus_path = os.path.join(user_data_path, 'corpora/CHILDES/')
# Only a pre-existing directory is ignored; other OSErrors now surface.
os.makedirs(childes_corpus_path, exist_ok=True)
CHILDESDir(childes_xml_remote_root, childes_corpus_path).download()
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet_ic as wn_ic
from nltk.corpus.reader.wordnet import Synset as WordNetSynset

# Make sure the necessary corpora are downloaded to the local drive
for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
    try:
        nltk.data.find("corpora/" + token)
    except LookupError:
        try:
            nltk.download(token, quiet = True, raise_on_error = True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL. Then we will try this...
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet = True, raise_on_error = True)

# Use the Brown corpus for calculating information content (IC)
brown_ic = wn_ic.ic('ic-brown.dat')
# IC_MAX caches, per key of the IC table, the maximum IC value seen —
# presumably used later as a normalization constant (TODO confirm with callers).
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------
# Accented variants per base letter; the literal is truncated at this
# chunk boundary and continues beyond the visible source.
DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
def __init__(self, data_id=default_data_id):
    """Initialize the base Downloader and record which data package
    this instance manages.

    Args:
        data_id: Identifier of the NLTK data package to handle;
            defaults to the module-level ``default_data_id``.
    """
    # Initialize base Downloader state before storing our own attribute.
    Downloader.__init__(self)
    self.data_id = data_id
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet_ic as wn_ic
from nltk.corpus.reader.wordnet import Synset as WordNetSynset

# Make sure the necessary corpora are downloaded to the local drive
for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
    try:
        nltk.data.find("corpora/" + token)
    except LookupError:
        try:
            nltk.download(token, quiet = True, raise_on_error = True)
        except ValueError:
            # Sometimes there are problems with the default index.xml URL. Then we will try this...
            from nltk.downloader import Downloader as NLTKDownloader
            d = NLTKDownloader("http://nltk.github.com/nltk_data/")
            d.download(token, quiet = True, raise_on_error = True)

# Use the Brown corpus for calculating information content (IC)
brown_ic = wn_ic.ic('ic-brown.dat')
# Per key of the IC table, cache the maximum IC value — presumably a
# normalization constant for similarity measures (TODO confirm with callers).
IC_CORPUS, IC_MAX = brown_ic, {}
for key in IC_CORPUS:
    IC_MAX[key] = max(IC_CORPUS[key].values())

# This will hold the WordNet version
VERSION = wn.get_version() or "3.0"

#---------------------------------------------------------------------------------------------------
# Accented variants per base letter; the literal is truncated at this
# chunk boundary and continues beyond the visible source.
DIACRITICS = {
    "a": ("á", "ä", "â", "à", "å"),
from qanom.candidate_extraction import cand_utils
from qanom.annotations.common import read_csv
from qanom.candidate_extraction.verb_to_nom import SuffixBasedNominalizationCandidates as VTN

""" Define which resources should be used (by default) for filtering nouns as candidate nominalizations. """
default_resources = { "wordnet": True,
                      "catvar": True,
                      "affixes_heuristic": True }

# Lazily-constructed SuffixBasedNominalizationCandidates instance.
vtn = None # init this global VTN object only if required

# by default, use nltk's default pos_tagger ('averaged_perceptron_tagger'):
tagger_package = 'averaged_perceptron_tagger'
nltk_downloader = Downloader()
# Fetch the tagger data once if it is not already installed.
if (not nltk_downloader.is_installed(tagger_package)):
    nltk.download(tagger_package)
pos_tag = nltk.pos_tag

""" Alternatively, when extracting candidates for crowdsourcing QANom annotations through the qasrl-crowdsourcing project, one should use the same POS model as inside qasrl-crowdsourcing for consistency. qasrl-crowdsourcing uses the CoreNLPParser model in Java. We will use here nltk's CoreNLPParser wrapper. To run the CoreNLPParser model as a server on your machine (port 9000), pre-run the following command from the unzipped directory of the stanford-core-nlp project (see https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/ for instructions): ```bash java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000 ``` Then, replace the above two-lines in the python script with the following block:
class OWPreprocess(OWWidget):
    """Orange widget that builds and applies a text pre-processing
    pipeline to an input Corpus."""

    # Orange widget metadata.
    name = 'Preprocess Text'
    description = 'Construct a text pre-processing pipeline.'
    icon = 'icons/TextPreprocess.svg'
    priority = 30

    inputs = [(Input.CORPUS, Corpus, 'set_data')]
    outputs = [(Output.PP_CORPUS, Corpus)]

    autocommit = settings.Setting(True)

    # Persistent data for each module is stored here.
    persistent_data_tokenizer = settings.Setting({})
    persistent_data_casing = settings.Setting({})
    persistent_data_stemmer = settings.Setting({})
    persistent_data_filter = settings.Setting({})

    preprocessors = []  # Pre-processing modules for the current run.

    UserAdviceMessages = [
        widget.Message(
            "Some preprocessing methods require data (like word relationships, stop words, "
            "punctuation rules etc.) from the NLTK package. This data, if you didn't have it "
            "already, was downloaded to: {}".format(
                Downloader().default_download_dir()),
            "nltk_data")
    ]

    def __init__(self, parent=None):
        super().__init__(parent)
        self.corpus = None

        # -- INFO --
        info_box = gui.widgetBox(self.controlArea, 'Info')
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_box, self, 'No input corpus detected.')

        # Commit checkbox and commit button.
        output_box = gui.widgetBox(self.controlArea, 'Output')
        auto_commit_box = gui.auto_commit(output_box, self, 'autocommit',
                                          'Commit', box=False)
        auto_commit_box.setMinimumWidth(170)

        # -- PIPELINE --
        frame = QFrame()
        frame.setContentsMargins(0, 0, 0, 0)
        frame.setFrameStyle(QFrame.Box)
        frame.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')
        frame_layout = QVBoxLayout()
        frame_layout.setContentsMargins(0, 0, 0, 0)
        frame_layout.setSpacing(0)
        frame.setLayout(frame_layout)

        # Load the previous states.
        persistent_data = [
            self.persistent_data_tokenizer,
            self.persistent_data_stemmer,
            self.persistent_data_casing,
            self.persistent_data_filter,
        ]
        # Instantiate each module with its saved state and wire its
        # change/error signals back to this widget.
        for ModuleClass, ModuleData in zip(PREPROCESSOR_MODULES, persistent_data):
            pp_module_widget = ModuleClass(ModuleData)  # Create pp instance.
            self.preprocessors.append(pp_module_widget)
            pp_module_widget.change_signal.connect(self.settings_invalidated)
            pp_module_widget.error_signal.connect(self.display_message)
            frame_layout.addWidget(pp_module_widget)
        self.store_pipeline()  # Store the pipeline after loading it.

        frame_layout.addStretch()
        self.mainArea.layout().addWidget(frame)
        self.progress_bar = None  # Progress bar initialization.

    def set_data(self, data=None):
        """Input handler: accept a new corpus and re-run the pipeline."""
        self.corpus = data
        self.update_info()
        self.commit()

    def update_info(self):
        """Refresh the info label with the current document count."""
        if self.corpus is not None:
            info = 'Document count: {}'.format(len(self.corpus))
        else:
            info = 'No input corpus detected.'
        self.info_label.setText(info)

    def commit(self):
        """Persist the pipeline and, if a corpus is present, apply it."""
        self.store_pipeline()  # Store the new pipeline.
        if self.corpus is not None:
            pp = self.assemble_preprocessor()
            if pp is not None:
                self.apply(pp)

    def apply(self, preprocessor):
        """Run *preprocessor* over the corpus and send the result."""
        with self.progressBar(len(self.corpus) * 2) as progress_bar:
            self.progress_bar = progress_bar
            output = preprocessor(self.corpus)
        self.progress_bar = None
        self.send(Output.PP_CORPUS, output)

    def assemble_preprocessor(self):
        """Collect settings from enabled modules and build a Preprocessor,
        returning None (with an error shown) if construction fails."""
        self.error(0, '')
        pp_settings = {
            # If disabled, this defaults to True, which is not what we want.
            'lowercase': False,
        }
        for pp in self.preprocessors:
            if pp.enabled:
                pp_settings.update(pp.get_pp_setting())
        pp_settings['callback'] = self.document_finished
        try:
            preprocessor = Preprocessor(**pp_settings)
        except Exception as e:
            self.error(0, str(e))
            return None
        return preprocessor

    def store_pipeline(self):
        """Save each module's exported state into its persistent setting."""
        for pp in self.preprocessors:
            if isinstance(pp, TokenizerModule):
                self.persistent_data_tokenizer = pp.export_data()
            elif isinstance(pp, CasingModule):
                self.persistent_data_casing = pp.export_data()
            elif isinstance(pp, TransformationModule):
                self.persistent_data_stemmer = pp.export_data()
            elif isinstance(pp, FilteringModule):
                self.persistent_data_filter = pp.export_data()

    def document_finished(self):
        """Per-document callback: advance the progress bar if one is active."""
        if self.progress_bar is not None:
            self.progress_bar.advance()

    @Slot()
    def settings_invalidated(self):
        """A module changed its settings; recommit."""
        self.commit()

    @Slot(str)
    def display_message(self, message):
        """Show a module-reported error message on the widget."""
        self.error(0, message)