def compute_emb(pages_path_in, pages_path_out, vocab):

    wemb = pkl.load(open(prm.wordemb_path, 'rb'))
    dim_emb = wemb[next(iter(wemb))].shape[0]
    W = 0.01 * np.random.randn(len(vocab), dim_emb).astype(np.float32)
    for word, pos in vocab.items():
        if word in wemb:
            W[pos, :] = wemb[word]

    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    if os.path.exists(pages_path_out):
        os.remove(pages_path_out)

    # Save to HDF5
    fout = h5py.File(pages_path_out, 'a')

    if prm.att_doc:
        shape = (f['text'].shape[0], prm.max_segs_doc, prm.dim_emb)
    else:
        shape = (f['text'].shape[0], prm.dim_emb)

    embs = fout.create_dataset('emb', shape=shape, dtype=np.float32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            # Split the article into segments: by '== section ==' headers
            # or into sentences with the Punkt tokenizer.
            if prm.att_segment_type == 'section':
                segs = ['']
                for line in text.split('\n'):
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line + '\n'
            elif prm.att_segment_type == 'sentence':
                segs = tokenizer.tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter.')

            segs = segs[:prm.max_segs_doc]
            emb_ = utils.Word2Vec_encode(segs, wemb)
            embs[i, :len(emb_), :] = emb_
            mask[i] = len(emb_)
        else:
            # Bag-of-words average of the pre-trained word embeddings.
            bow0, bow1 = utils.BOW(wordpunct_tokenize(text.lower()), vocab)
            emb = (W[bow0] * bow1[:, None]).sum(0)
            embs[i, :] = emb
        i += 1
        #if i > 3000:
        #    break
        print('processing article', i, 'time', time.time() - st)

    f.close()
    fout.close()
def _build_wordset(clazz, obscurity_limit):
    # I'm sorry this method is so disgusting.
    # It's all in the cause of fast loading in the main case.

    from nltk import FreqDist

    # Ensure corpora are loaded.
    try:
        from nltk.corpus import cmudict
        cmudict.entries()
    except LookupError:
        print("CMUDict corpus not found. Downloading...")
        from nltk import download
        download('cmudict')
        print("[Done]")
    if obscurity_limit is not None:
        from nltk.corpus import brown
        try:
            brown.words()
        except LookupError:
            print("Brown corpus not found. Downloading...", end=' ')
            from nltk import download
            download('brown')
            print("[Done]")

    words = cmudict.entries()
    if obscurity_limit is not None:
        freqs = FreqDist([w.lower() for w in brown.words()])
        words = sorted(words,
                       key=lambda x: freqs[x[0].lower()],
                       reverse=True)
        return words[:obscurity_limit]
    else:
        return list(words)
def _download_nltk_data():
    """Install corpus data."""
    for directory, data in nltk_data.items():
        for datum in data:
            if not exists(join(NLTK_DATA_DIR, directory, datum)):
                nltk.download(datum, download_dir=NLTK_DATA_DIR)
def handle(self, *args, **options):
    if args is None or len(args) < 2:
        pages = Page.objects.all()
        for page in pages:
            self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
        raise CommandError('Invalid arguments. Expected: <page_id> <action>, where action might be: extract, tfidf, webidf')

    page_id = args[0]
    action = args[1]

    if page_id == 'setup':
        self._log.info("invoking nltk download")
        nltk.download()
        exit()

    self._log.info('AnalyticsCommand initializing.')
    self._log.info('Page-Id: %s' % page_id)

    page = Page.objects.get(id=page_id)

    if action == "extract":
        self.processPageExtract(page)
    elif action == "tfidf":
        self.processTfIdf(page)
    elif action == "webidf":
        self.processWebIdf(page)
    else:
        self._log.warn("Unknown action: %s" % action)

    self._log.info("All done for now.")
def installNLTKResources():
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/europarl_raw')
    except LookupError:
        nltk.download('europarl_raw')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    try:
        nltk.data.find('corpora/gutenberg')
    except LookupError:
        nltk.download('gutenberg')
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###

    return None
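# The snippet above repeats the same check-then-download pattern once per
# resource. A more compact variant (a sketch, assuming the same five resources
# and only the standard nltk.data.find / nltk.download APIs) drives the pattern
# from a list of (resource path, package name) pairs:
import nltk

_REQUIRED_NLTK_RESOURCES = [
    ('corpora/wordnet', 'wordnet'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/europarl_raw', 'europarl_raw'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/gutenberg', 'gutenberg'),
]

def install_nltk_resources_compact():
    # Download each resource only if nltk cannot already locate it locally.
    for resource_path, package in _REQUIRED_NLTK_RESOURCES:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)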
def __init__(self, save_path=None, download=False, tokenizer="wordpunct_tokenize", *args, **kwargs):
    super().__init__(save_path=save_path)
    if download:
        nltk.download()

    self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
    if not callable(self.tokenizer):
        raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer))
def _post_install(dir):
    # Reload site so the freshly installed nltk package becomes importable.
    from importlib import reload
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')
def __init__(self):
    print("Please install the brown corpus and wordnet on your machine: ")
    nltk.download()
    self.pfile = open("pcent_plurals.txt", "w")
    self.pfile.write("%s %s \n" % ("Plurals".ljust(20), "Percentages"))
    self.plural_dict = {}
    self.single_dict = {}
def search_for_all_strings(line, file_format):
    '''Search for all strings with NLTK'''
    result = []

    for regexp in Config.excluded_lines:
        for match in re.finditer(regexp, line):
            if match:
                return []

    for regexp in Config.strings_patterns[file_format]:
        for match in re.finditer(regexp, line):
            if not match:
                continue
            group = match.group(1)
            if len(group) > 0 and not contains_forbidden_patterns(group):
                try:
                    tokens = nltk.word_tokenize(group)
                    if len(tokens) > 0:
                        for word in tokens:
                            morf = wn.morphy(word)
                            if morf and len(str(morf)) > 1:
                                if (output_format == "csv") | (group not in global_word_pull):
                                    result.append(group)
                                    global_word_pull.add(group)
                                break
                except:
                    print("Unexpected error: {0}".format(sys.exc_info()))
                    traceback.print_tb(sys.exc_info()[2])
                    url = os.path.join(os.path.split(os.path.realpath(__file__))[0] + "/nltk_info.html")
                    print("See here for installation instructions:\n" + url)
                    webbrowser.open_new(url)
                    nltk.download()
                    sys.exit(2)

    return result
def main():
    nltk.download('stopwords')
    nltk.download('vader_lexicon')

    print("\n================================================================================\n")
    print("---------------------------------- Platform Information ------------------------")
    print('machine: {}'.format(platform.machine()))
    print('node: {}'.format(platform.node()))
    print('processor: {}'.format(platform.processor()))
    print('release: {}'.format(platform.release()))
    print('system: {}'.format(platform.system()))
    print('version: {}'.format(platform.version()))
    print('uname: {}'.format(platform.uname()))
    #mem = virtual_memory()
    #print('memory: {}'.format(mem.total))  # total physical memory available
    print('python_build: {}'.format(platform.python_build()))
    print('python_compiler: {}'.format(platform.python_compiler()))
    print('python_branch: {}'.format(platform.python_branch()))
    print('python_implementation: {}'.format(platform.python_implementation()))
    print('python_revision: {}'.format(platform.python_revision()))
    print('python_version: {}'.format(platform.python_version()))
    print("\n================================================================================\n")
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
def __init__(self, ngram=False, use_idf=False):
    self.ngram = ngram
    self.use_idf = use_idf

    # Load WordNet synsets and download data if necessary
    try:
        wordnet_path = nltk.data.find("corpora/wordnet")
    except LookupError:
        nltk.download("wordnet")
        wordnet_path = nltk.data.find("corpora/wordnet")
    self.wn = wordnet.WordNetCorpusReader(wordnet_path)

    # Initialize the two types of n-gram generators
    pentagram_vectorizer = CountVectorizer(
        ngram_range=(1, 5), token_pattern=r"\b[A-Za-z]+\b",
        min_df=1, stop_words=stop_list)
    unigram_vectorizer = CountVectorizer(
        ngram_range=(1, 1), token_pattern=r"\b[A-Za-z]+\b",
        min_df=1, stop_words=stop_list)

    # Function for generating five-grams through unigrams
    self.pent_analyze = pentagram_vectorizer.build_analyzer()

    # Function for generating just unigrams
    self.uni_analyze = unigram_vectorizer.build_analyzer()

    # Load IDF scores
    self.IDF = self.get_idf_scores()
    self.counts = self.get_counts()
def main():
    import io
    with io.open(os.path.join(HERE, "README.rst"), "r") as readme:
        setup(
            name=app.__project__,
            version=app.__version__,
            description=app.__doc__,
            long_description=readme.read(),
            classifiers=app.__classifiers__,
            author=app.__author__,
            author_email=app.__author_email__,
            # url = app.__url__,
            license=[
                c.rsplit("::", 1)[1].strip()
                for c in app.__classifiers__
                if c.startswith("License ::")
            ][0],
            keywords=" ".join(app.__keywords__),
            packages=["mancify"],
            package_data={},
            include_package_data=True,
            platforms=app.__platforms__,
            install_requires=app.__requires__,
            extras_require=app.__extra_requires__,
            zip_safe=True,
            entry_points=app.__entry_points__,
            tests_require=["pytest-cov", "pytest", "mock"],
            cmdclass={"test": PyTest},
        )

    # Download the required NLTK packages automatically
    import nltk
    nltk.download("cmudict")
    nltk.download("maxent_treebank_pos_tagger")
def run(self):
    _install.run(self)

    import nltk
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
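# For context, a run() override like the one above is normally registered
# through the cmdclass argument of setup(); the sketch below shows one way to
# wire it up (the package metadata is illustrative, not taken from the
# original project):
from setuptools import setup
from setuptools.command.install import install as _install

class InstallWithNltkData(_install):
    def run(self):
        _install.run(self)
        import nltk
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

setup(
    name='example-package',        # illustrative
    version='0.1.0',               # illustrative
    packages=['example_package'],  # illustrative
    install_requires=['nltk'],
    cmdclass={'install': InstallWithNltkData},
)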
def annotations_to_words(terms, dag, ipr_map, lower):
    """
    Converts a string of accessions into a string of the corresponding
    english-text representations.
    """
    try:
        sws = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        sws = stopwords.words('english')

    if lower:
        sws = set([x.lower() for x in sws])
        case = str.lower
    else:
        sws = set([x.upper() for x in sws])
        case = str.upper

    go_terms = [t.upper() for t in terms if 'GO' in t.upper()]
    ipr_terms = [t.upper() for t in terms if t.upper() in ipr_map]

    go_descriptions = ' '.join([case(dag[t].name) for t in go_terms]).split(' ')
    ipr_descriptions = ' '.join([case(ipr_map[t]) for t in ipr_terms]).split(' ')

    # Strip punctuation, then drop stopwords.
    strip_punct = str.maketrans('', '', string.punctuation)
    go_descriptions = [x.translate(strip_punct) for x in go_descriptions]
    ipr_descriptions = [x.translate(strip_punct) for x in ipr_descriptions]

    go_descriptions = [x for x in go_descriptions if case(x) not in sws]
    ipr_descriptions = [x for x in ipr_descriptions if case(x) not in sws]

    line = ' '.join(go_descriptions + ipr_descriptions)
    return line
def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False, *args, **kwargs):
    if download:
        nltk.download()

    self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
    if not callable(self.tokenizer):
        raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer))
def boostrap_nltk_data():
    nltk.data.path.append('./data/')
    nltkdata_exists = Path('./data/tokenizers/punkt/english.pickle')
    if not nltkdata_exists.exists():
        logging.info("Downloading NLTK Data")
        nltk.download('punkt', './data')
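# Hypothetical usage of the helper above: once './data/' is on nltk.data.path
# and the punkt model has been downloaded there, sentence tokenization resolves
# the model from that local directory (assuming an NLTK version whose
# sent_tokenize only needs the punkt model):
import nltk

boostrap_nltk_data()
sentences = nltk.sent_tokenize("The model lives in ./data. No other setup is needed.")
print(sentences)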
def _post_install():
    from importlib import reload
    import site
    reload(site)

    import nltk
    nltk.download('punkt')
def __init__(self):
    try:
        from sacremoses import MosesDetokenizer
        self._detokenizer = MosesDetokenizer()
    except (ImportError, TypeError) as err:
        if isinstance(err, TypeError):
            warnings.warn('The instantiation of MosesDetokenizer in sacremoses is'
                          ' currently only supported in python3.'
                          ' Now try NLTKMosesDetokenizer using NLTK ...')
        else:
            warnings.warn('sacremoses is not installed. '
                          'To install sacremoses, use pip install -U sacremoses'
                          ' Now try NLTKMosesDetokenizer using NLTK ...')
        try:
            import nltk
            try:
                nltk.data.find('perluniprops')
            except LookupError:
                nltk.download('perluniprops')
            from nltk.tokenize.moses import MosesDetokenizer
            self._detokenizer = MosesDetokenizer()
        except ImportError:
            raise ImportError('NLTK is not installed. '
                              'You must install NLTK <= 3.2.5 in order to use the '
                              'NLTKMosesDetokenizer. You can refer to the official '
                              'installation guide in https://www.nltk.org/install.html .')
def nltk_corpus(corpus_name):
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except:
        nltk.download(corpus_name)
    return corpus
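# Hypothetical usage of nltk_corpus(): the helper works when the attribute name
# on nltk.corpus matches the download package name, as it does for 'brown' and
# 'stopwords':
brown = nltk_corpus('brown')
stop = nltk_corpus('stopwords')
print(brown.words()[:10])
print(stop.words('english')[:10])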
def generate(dictionary='/usr/share/dict/british-english', output='../../gb-us-synonyms.txt'):
    nltk.download('wordnet')

    with open(dictionary) as dict_file:
        with open(output, 'w') as output_file:
            for gb, us in gen_synonyms(dict_file):
                output_file.write(gb + ', ' + us + '\n')
                print(gb + ',', us)
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the corpus data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
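# Hypothetical usage of nltk_download_corpus(): each call returns True only if
# the resource actually had to be downloaded on this run:
nltk_download_corpus('corpora/wordnet')
nltk_download_corpus('sentiment/vader_lexicon')
nltk_download_corpus('tokenizers/punkt')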
def morphy(doc):
    """Lemmatize tokens using morphy, WordNet's lemmatizer."""
    # XXX Results will be better if we do POS tagging first, but then we
    # need to map Penn Treebank tags to WordNet tags.
    nltk.download('wordnet', quiet=False)
    return map(nltk.WordNetLemmatizer().lemmatize,
               _tokenize_if_needed(fetch(doc)))
def download_dataset(data_name):
    if data_name == 'flowers':
        print('== Flowers dataset ==')
        flowers_dir = os.path.join(DATA_DIR, 'flowers')
        flowers_jpg_tgz = os.path.join(flowers_dir, '102flowers.tgz')
        make_sure_path_exists(flowers_dir)

        # the original google drive link at https://drive.google.com/file/d/0B0ywwgffWnLLcms2WWJQRFNSWXM/view
        # from https://github.com/reedscot/icml2016 is problematic to download automatically, so included
        # the text_c10 directory from that archive as a bzipped file in the repo
        captions_tbz = os.path.join(DATA_DIR, 'flowers_text_c10.tar.bz2')
        print('Extracting ' + captions_tbz)
        captions_tar = tarfile.open(captions_tbz, 'r:bz2')
        captions_tar.extractall(flowers_dir)

        flowers_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
        print('Downloading ' + flowers_jpg_tgz + ' from ' + flowers_url)
        urlretrieve(flowers_url, flowers_jpg_tgz,
                    reporthook=dl_progress_hook)
        print('Extracting ' + flowers_jpg_tgz)
        flowers_jpg_tar = tarfile.open(flowers_jpg_tgz, 'r:gz')
        flowers_jpg_tar.extractall(flowers_dir)  # archive contains jpg/ folder

    elif data_name == 'skipthoughts':
        print('== Skipthoughts models ==')
        SKIPTHOUGHTS_DIR = os.path.join(DATA_DIR, 'skipthoughts')
        SKIPTHOUGHTS_BASE_URL = 'http://www.cs.toronto.edu/~rkiros/models/'
        make_sure_path_exists(SKIPTHOUGHTS_DIR)

        # following https://github.com/ryankiros/skip-thoughts#getting-started
        skipthoughts_files = [
            'dictionary.txt', 'utable.npy', 'btable.npy',
            'uni_skip.npz', 'uni_skip.npz.pkl',
            'bi_skip.npz', 'bi_skip.npz.pkl',
        ]
        for filename in skipthoughts_files:
            src_url = SKIPTHOUGHTS_BASE_URL + filename
            print('Downloading ' + src_url)
            urlretrieve(src_url, os.path.join(SKIPTHOUGHTS_DIR, filename),
                        reporthook=dl_progress_hook)

    elif data_name == 'nltk_punkt':
        import nltk
        print('== NLTK pre-trained Punkt tokenizer for English ==')
        nltk.download('punkt')

    elif data_name == 'pretrained_model':
        print('== Pretrained model ==')
        MODEL_DIR = os.path.join(DATA_DIR, 'Models')
        pretrained_model_filename = 'latest_model_flowers_temp.ckpt'
        src_url = 'https://bitbucket.org/paarth_neekhara/texttomimagemodel/raw/74a4bbaeee26fe31e148a54c4f495694680e2c31/' + pretrained_model_filename
        print('Downloading ' + src_url)
        urlretrieve(
            src_url,
            os.path.join(MODEL_DIR, pretrained_model_filename),
            reporthook=dl_progress_hook,
        )

    else:
        raise ValueError('Unknown dataset name: ' + data_name)
def _post_install():
    # Since nltk may have just been installed,
    # we need to update our PYTHONPATH.
    from importlib import reload
    import site
    reload(site)

    # Now we can import nltk.
    import nltk
    nltk.download('stopwords')
def test_notebook_runner_2a_eco_nlp_correction(self):
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")
    import nltk
    nltk.download('stopwords')
    self.common_notebook_runner_2a_eco_nlp_enonce("correction")
def download_packages(self):
    import nltk
    for x in [comp for comp in self._missing if "/" in comp]:
        package = x.split("/")[1]
        self.updateLabel.emit(package)
        nltk.download(package, raise_on_error=True)
        self.progressTheBar.emit()
def setup_dependencies(self):
    if not self._nltk_data_downloaded and bool(
            self.section.get('shortlog_imperative_check', True)):
        nltk.download([
            'punkt',
            'averaged_perceptron_tagger',
        ])
        type(self)._nltk_data_downloaded = True
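# The snippet above relies on nltk.download accepting a list of package names
# as well as a single name; the equivalent standalone call (a sketch,
# independent of the surrounding section machinery) is simply:
import nltk

nltk.download(['punkt', 'averaged_perceptron_tagger'])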
def main(): nltk.download("punkt") parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("training_datasets", type=str, nargs="+", help="path to training data sets") args = parser.parse_args(sys.argv[1:]) run(args)
def __init__(self):
    from nltk.data import find
    from nltk import download

    try:
        find('wordnet.zip')
    except LookupError:
        download('wordnet')
import numpy as np
import pandas as pd
import time
import nltk
from textblob import TextBlob
import dill as pickle
# import pickle
from customTransfomers import *

t0 = time.time()

#-------------------------------------
#***********Setting NLTK**************
#-------------------------------------
nltk.download('punkt')
nltk.download('stopwords')

#-------------------------------------
#***********Reading Data**************
#-------------------------------------
print("Reading Data ......")
df = pd.read_csv('fakecorpusWithMeta.csv')

print("Cleaning the Data ......")
# Drop unused columns
df = df.drop(columns=["scraped_at", "index", "Unnamed: 0"])

print("Filtering Data ......")
# If title is missing drop the entry
df = df.dropna(subset=['title'])