def run(self):
    # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
    nltk_path.append(join(config.paths["rawdata"], "nltk"))

    try:
        # Check which classes are valid depending on min_docs_per_class
        nbprint('Loading classes')
        self.load_valid_classes()

        # Load the documents
        with data.document_writer(self.info) as document_writer:
            # Initialize info classes
            self.classinfo = ClassInfo()
            self.docinfo = DocumentInfo(document_writer)
            # Load documents and store class information in classinfo
            self.load_documents()

        # Print Meta Information
        self.docinfo.save_meta(self.info)
        self.classinfo.save_meta(self.info)
    except (LookupError, FileNotFoundError):
        raise ImporterError(
            self.info,
            'Directory "{}" does not contain the required corpus.'.format(nltk_path))

    # Save the classes
    classes = self.classinfo.make_class_list()
    data.save_classes(classes, self.info)
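# Note (not in the original module): NLTK raises LookupError when a resource
# cannot be found on nltk.data.path, which is why a missing corpus directory
# surfaces here as LookupError/FileNotFoundError and is re-raised as an
# ImporterError with the searched path in the message.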
def setup_directories(processed_path, nltk_data_path):
    """Delete and recreate the directory for processed files, just in case.

    Keeping the processed data in memory is fine when it is small; for
    large data it should be saved or uploaded somewhere persistent instead.
    """
    if os.path.exists(processed_path):
        shutil.rmtree(processed_path)
    os.makedirs(processed_path)

    if not os.path.exists(nltk_data_path):
        os.makedirs(nltk_data_path)
    nltk_download(['punkt', 'stopwords'], download_dir=nltk_data_path)
    nltk_path.append(nltk_data_path)
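# Usage sketch (hypothetical paths; assumes the names used inside
# setup_directories - os, shutil, nltk_download, nltk_path - are imported
# at module level):
if __name__ == '__main__':
    setup_directories('./processed', './nltk_data')
    # './nltk_data' is now on nltk.data.path, so the 'punkt' and 'stopwords'
    # resources downloaded above resolve without a system-wide NLTK install.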
import os
import logging, logging.handlers
from string import punctuation, digits, maketrans

from splunk.appserver.mrsparkle.lib.util import make_splunkhome_path
from splunk import setupSplunkLogger
from nltk import word_tokenize, pos_tag
from nltk.data import path as nltk_data_path
from nltk.corpus import wordnet, stopwords as stop_words
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from splunklib.searchcommands import dispatch, StreamingCommand, Configuration, Option, validators

BASE_DIR = make_splunkhome_path(["etc", "apps", "nlp-text-analytics"])
CORPORA_DIR = os.path.join(BASE_DIR, 'bin', 'nltk_data')
nltk_data_path.append(CORPORA_DIR)


@Configuration(local=True)
class CleanText(StreamingCommand):
    """ Tokenizes and cleans the text in a field, with options for removing
    URLs and stopwords, lemmatizing or stemming, enforcing a minimum term
    length, and generating n-grams.

    ##Syntax

    .. code-block::
        cleantext textfield=<field> [default_clean=<bool>]
            [remove_urls=<bool>] [remove_stopwords=<bool>]
            [base_word=<bool>] [base_type=<string>] [mv=<bool>]
            [force_nltk_tokenize=<bool>] [pos_tagset=<string>]
            [custom_stopwords=<comma_separated_string_list>]
            [term_min_len=<int>] [ngram_range=<int>-<int>]
            [ngram_mix=<bool>]

    ##Description
# Import the libraries
import tensorflow as tf
import numpy as np
import nltk
import csv
from nltk.data import path

# Append your path for nltk data
path.append("C:\\Users\\andri\\AppData\\Roaming\\nltk_data")

# Load the data
file_path = '.\\Data\\train.csv'  # path for the data set
X, y2 = [], []
with open(file_path, 'rt') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(reader, None)  # Skip header
    for row in reader:
        y2.append(row[1])
        X.append(row[2])

# Convert the labels to integers
y_real = []
for i in y2:
    y_real.append(int(i))

# Making vector y one-hot
y = []  # one-hot y
for i in range(len(y_real)):
        # encapsulated
        return [field_block]

    def _word(self, s):
        return [fields[1] for fields in s]

    def _tag(self, s, _):
        return [(fields[1], fields[3]) for fields in s]

    def _parse(self, s):
        # dependencygraph wants it all back together...
        block = '\n'.join('\t'.join(line) for line in s)
        return DependencyGraph(block, top_relation_label='root')


path.append(abspath(dirname(__file__)))
ud_english = LazyCorpusLoader(
    'ud_english', UniversalDependencyCorpusReader, r'.*\.conll')
mystery = LazyCorpusLoader(
    'mystery', UniversalDependencyCorpusReader, r'.*\.conll')


class Transducer(object):
    """Provides generator methods for converting between data types

    Args:
        word_list : an ordered list of words in the corpus. word_list[i] will
            be assigned the id = i + 1. root is assigned id 0 and any words
            not in the list are assigned id = len(word_list) + 2.
import os
import sys
import re
import operator
from sklearn.decomposition import TruncatedSVD
from sklearn import pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import joblib
import gensim
from gensim import corpora
import numpy as np
from nltk.data import path
from pprint import pprint

# Register the local NLTK data directory before loading corpora that need it
path.append("/home/analytics/data_partition/nltk_data")
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

import warnings
warnings.filterwarnings('ignore')

home = os.path.abspath(os.path.dirname(__file__))
sys.path.append(home + '/../../')
from src.mysql_utils import MySqlUtils
from src.NLP.preprocessing import clean_tweet

Lda = gensim.models.ldamodel.LdaModel
threshold = 0.95
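# A minimal sketch of how the imports above are commonly combined into an
# LSA-style similarity pipeline. Illustrative only: the component count and
# parameters are assumptions, not values taken from this module.
lsa_pipeline = pipeline.make_pipeline(
    TfidfVectorizer(stop_words=list(stops)),
    TruncatedSVD(n_components=100),  # sized for a realistically large corpus
    Normalizer(copy=False),
)
# vectors = lsa_pipeline.fit_transform(documents)            # documents: list of strings
# similarity = cosine_similarity(vectors[:1], vectors[1:2])  # compare first two docs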
import os, sys, re
import json
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.data import path as nltk_path
import gensim
from gensim import utils, corpora, models
from gensim.corpora.wikicorpus import remove_markup
from preprocess_text import preprocess
import logging

nltk_path.append('./nltk_data/')
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

NUM_TOPICS = 20
db_dir = '/mnt/lascar/qqiscen/src/TextTopicNet/data/VOC2007/VOCdevkit/VOC2007/'
train_dict_path = 'train_dict_ImageCLEF_Wikipedia.json'

print ' ' + sys.argv[0]
print ' Learns LDA topic model with ' + str(NUM_TOPICS) + ' topics from corpora on ' + train_dict_path
print ' (...)'

img_dir = db_dir + 'JPEGImages/'
xml_dir = db_dir + 'Annotations/'

if not os.path.isdir(db_dir):
""" import os import numpy as np import matplotlib.pyplot as plt import random try: import ConfigParser except: import configparser as ConfigParser import pandas as pd from nltk.data import path as nltk_data_path nltk_data_location = os.getenv('NLTK_DATA_PATH') if nltk_data_location is not None: nltk_data_path.append(nltk_data_location) from nltk.tag import pos_tag from nltk.tokenize import word_tokenize import utils import transferlearning as tl from stratified_split import writefile def run_experiment(transfer_exp_name): """ Run an experiment given in the experiments directory. Some configuration details (which seeds to use, which datasets to use, which transfer methods to use, which classifiers to use, number of sentences in the target training set, etc.) are given in the .cfg file within the directory transfer_exp_name.
# Ramon Ruiz Dolz
# Salvador Marti Roman
import nltk
from nltk.corpus import *
from nltk.corpus import PlaintextCorpusReader
from nltk.probability import *
from nltk.tokenize import *
from nltk.stem import SnowballStemmer
import io
import os
import re
from nltk.data import path

dir_path = os.path.dirname(os.path.realpath(__file__))
corpus_root = dir_path.replace(".idea", "")
path.append(dir_path + "\\NLTK")

# Act1
originalContent = io.open("./library/quijote.txt", encoding="utf8").read()

# Act2
originalFreq = FreqDist(w for w in RegexpTokenizer(".").tokenize(originalContent))
print("Act2")
print(sorted(originalFreq.keys()))

# Act3
filterContent = re.sub('[.|,"¡!()\-:;¿?«»\'\]\[\\n]', '', originalContent)

# Act4
print("Act4")
filterFreq = FreqDist(filterContent)
print(sorted(filterFreq.keys()))
from nltk.corpus import wordnet
from nltk.data import path
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from pathlib import Path

#
# Startup
#

# BAD. I know, vendor lock-in. But right now we're
# doing it on replit, so it's fine.
path.append(str(Path("~/B5Chatbot1MalcolmMaxim/nltk_data").expanduser()))

# Class instances
lemmatizer = WordNetLemmatizer()
spellcheck = SpellChecker()

#
# Functions
#

def tokenize(user_input):
    """Performs tokenization on user input as well as multiple other parsing
    steps including spellchecking, synonym generation and lemmatization.
    """
import operator
import md5
import urllib2
import sys

from selenium import webdriver
from nltk import word_tokenize
from nltk.data import path as nltk_path
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import config

# NLTK resource initialization
nltk_path.append(config.NLTK_DATA_PATH)
nltk_to_download = []
try:
    stopwords.words('english')
except LookupError:
    nltk_to_download.append('stopwords')
try:
    word_tokenize('token test')
except LookupError:
    nltk_to_download.append('punkt')
if nltk_to_download:
    print 'Performing first-time setup'
    from nltk import download as nltk_download
    for package in nltk_to_download:
        print '\tDownloading:', package
        nltk_download(package)
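# Note (a sketch, not part of the original script): nltk.download() accepts a
# download_dir argument, so the packages fetched during first-time setup could
# be stored alongside the path appended above, assuming it is writable:
#     nltk_download(package, download_dir=config.NLTK_DATA_PATH)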
# STOP WORDS :: used for removing words that carry little meaning, like 'of', 'the', etc.
from nltk.data import path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

path.append("F:\\nltk_data")

example_sentence = "This is an example showing of stop word filtration"
stop_words = set(stopwords.words("english"))
words = word_tokenize(example_sentence)

# filtered_sentence = []
# for w in words:
#     if w not in stop_words:
#         filtered_sentence.append(w)

filtered_sentence = [w for w in words if w not in stop_words]  # one-liner version of the loop above
print(filtered_sentence)
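# Expected output with the standard NLTK English stopword list (roughly):
#     ['This', 'example', 'showing', 'stop', 'word', 'filtration']
# Note the comparison is case-sensitive: 'This' survives because the list only
# contains the lowercase 'this'. A case-insensitive variant (sketch):
filtered_lower = [w for w in words if w.lower() not in stop_words]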
import re
import os
import multiprocessing
import time
import sqlite3

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.data import path as nltk_path

from database_manager import DatabaseManager
from search_config import DIR_FILES, DIR_DUMP, DIR_DATABASE

stemmer = SnowballStemmer('english')
nltk_path.append("../resources/nltk_data")


def get_files_to_clean(dir):
    return filter(lambda x: x[0] != '.', sorted(os.listdir(dir)))


def write_to_file(content, new_file_name):
    new_file = open(new_file_name, 'w')
    new_file.write(content)
    new_file.close()


def chunks(l, n):
    newn = int(len(l) / n)
    for i in xrange(0, n - 1):
        yield l[i * newn:i * newn + newn]
    yield l[n * newn - newn:]
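# Usage sketch for chunks() (Python 2, matching the xrange above): the list is
# split into n pieces of len(l) // n items, with any remainder folded into the
# last chunk, e.g.
#     list(chunks(range(10), 3))  ->  [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]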
from .multi_process import create_process_pool, multiprocess_search_mdx, pre_pool_search

prpool = None
thpool = None

if check_system() == 0:
    prpool = create_process_pool()
    pre_pool_search(prpool)
# else:
#     thpool = create_thread_pool()

try:
    from nltk.data import path as nltk_path
    from nltk.stem import WordNetLemmatizer

    nltk_path.append(os.path.join(ROOT_DIR, 'media', 'nltk_data'))
    lemmatizer = WordNetLemmatizer()
    # The first call to lemmatize() is slow because WordNetLemmatizer() has to
    # initialize and load the local corpus into memory (over a second), so it
    # is warmed up here.
    lemmatizer.lemmatize('a')
except Exception as e:
    lemmatizer = None
    print(e)

# The default distance of 2 is slow (about 1.6 s per query); distance=1 takes
# roughly 0.001 s.
spell = SpellChecker(distance=1)
builtin_dic_name = '内置词典'  # "built-in dictionary"


def search(query_list, group):
    record_list = []
import os
import time

from nltk.data import path as nltk_path
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

from base.sys_utils import check_system
from mdict.models import MyMdictEntry, MyMdictItem
from mdict.serializers import mdxentry
from mysite.settings import BASE_DIR

from .init_utils import init_mdict_list
from .loop_search import loop_search_sug
from .mdict_config import get_config_con

if check_system() == 0:
    from .multiprocess_search import pool, multiprocess_search_mdx, multiprocess_search_sug, check_pool_recreate, \
        loop_create_model
else:
    from .multithread_search import thpool, multithread_search_mdx, multithread_search_sug, \
        check_threadpool_recreate, loop_create_thread_model

nltk_path.append(BASE_DIR + os.sep + 'media' + os.sep + 'nltk_data')
lemmatizer = WordNetLemmatizer()
# The first call to lemmatize() is slow because WordNetLemmatizer() has to
# initialize and load the local corpus into memory (over a second), so it is
# warmed up here.
lemmatizer.lemmatize('a')
# The default distance of 2 is slow (about 1.6 s per query); distance=1 takes
# roughly 0.001 s.
spell = SpellChecker(distance=1)
builtin_dic_name = '内置词典'  # "built-in dictionary"


def search(query, is_en, group):
    record_list = []
    query = query.strip()
    t2 = time.perf_counter()
    try: