def get_list(text):
    """Tokenize *text* and return its tokens lowercased.

    The pattern keeps word-character runs, dollar amounts (e.g. ``$3.50``),
    and any other run of non-whitespace as separate tokens.

    :param text: input to tokenize (coerced to ``str`` first)
    :return: list of lowercased token strings
    """
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    # Comprehension replaces the original append loop; behavior identical.
    return [token.lower() for token in tokenizer.tokenize(str(text))]
def filter_words(text_file):
    """Read *text_file*, POS-tag its word tokens, and drop English stopwords.

    :param text_file: path-like argument forwarded to ``read_text``
    :return: list of ``(token, pos_tag)`` pairs whose token is not an
        English stopword
    """
    text = read_text(text_file)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    pos_list = nltk.pos_tag(tokens)
    # Build the stopword set ONCE: the original called
    # stopwords.words('english') (a fresh list, O(n) membership scan)
    # for every single token.
    stops = set(stopwords.words('english'))
    # NOTE(review): tokens are matched case-sensitively, so capitalized
    # stopwords ("The") pass through — preserved from the original;
    # confirm whether lowercasing first is intended.
    return [w for w in pos_list if w[0] not in stops]
def tokenization(self):
    """
    Tokenize the contents of the posts and remove the strings that
    contains punctuations, numbers or only single letter

    :return: a dataframe which each row becomes list of tokens
    """
    tqdm.pandas()
    # Pattern admits only alphabetic runs of length >= 2, which discards
    # punctuation, digits, and single letters in one pass.
    splitter = RegexpTokenizer(r'[a-zA-Z]{2,}')
    return self.text_df.progress_apply(
        lambda post: splitter.tokenize(post.lower()))
def inference(model_dir, text):
    """Classify *text* with the persisted TF-IDF + SVM pipeline.

    :param model_dir: directory prefix (as used by callers, including any
        trailing separator) containing ``tfidf_transformer.pkl`` and
        ``bigram_SVM.dat``
    :param text: raw input string
    :return: predicted label; 1 means STRESS
    """
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
    text = ' '.join(tokenizer.tokenize(text.lower()))

    # BUG FIX: the original did pickle.load(open(...)) and never closed
    # either file handle; 'with' guarantees closure.
    # NOTE(security): pickle.load executes arbitrary code during
    # deserialization — only load model files from a trusted source.
    with open(model_dir + "tfidf_transformer.pkl", 'rb') as fh:
        transformer = pickle.load(fh)

    # Rebuild a vectorizer over the saved vocabulary so the new document
    # is projected into the training-time feature space.
    vectorizer = TfidfVectorizer(stop_words='english',
                                 ngram_range=(1, 2),
                                 lowercase=True,
                                 vocabulary=transformer.vocabulary_)
    vec = vectorizer.fit_transform([text])

    with open(model_dir + "bigram_SVM.dat", "rb") as fh:
        model = pickle.load(fh)

    y_pred = model.predict(vec)
    return y_pred[0]  # 1: STRESS
# Keep only the text that follows the PROLOGUE marker.
for index, line in enumerate(lines):
    if "PROLOGUE" in line:
        lines = lines[index + 1:]

# Split every non-trivial line into sentences.
sentences = []
for line in lines:
    if len(line) > 1:
        for sentence in sent_tokenize(line):
            sentences.append(sentence)

stop_words = set(stopwords.words("english"))
tokenizer = RegexpTokenizer(r'\w+')

# One lowercased, stopword-free token list per sentence.
# Fixes from the original:
#   * membership test used `list(stop_words)`, turning every O(1) set
#     lookup into an O(n) scan — test the set directly;
#   * the join-then-word_tokenize round trip over already-regex-tokenized
#     words is dropped;
#   * the unused `strspace` variable is removed.
# The stopword check stays case-sensitive on the raw token, as before.
tokenized_sentences = []
filtered_sentences = []
for sentence in sentences:
    sent = [word.lower()
            for word in tokenizer.tokenize(str(sentence))
            if word not in stop_words]
    tokenized_sentences.append(sent)
    filtered_sentences.append(" ".join(sent))

# BUG FIX: Word2Vec expects an iterable of token LISTS. The original
# passed the joined strings, so each "sentence" was iterated character
# by character and the model learned a character-level vocabulary.
model = gensim.models.Word2Vec(tokenized_sentences)
# The original's bare `model.vocabulary` expression was a no-op (and
# raises AttributeError on gensim >= 4); it has been removed.
documents = f.readlines()
f.close()

# BUG FIX: RegexpTokenizer lives in nltk.tokenize, not nltk.corpus —
# the original `from nltk.corpus import RegexpTokenizer` raises
# ImportError.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from string import punctuation

# Hoisted out of the loop: the tokenizer is stateless and rebuilding the
# stopword/punctuation set for every sentence is expensive.
tokenizer = RegexpTokenizer(r'\w+')
custom_set = set(stopwords.words('english') + list(punctuation))

# One lowercased, stopword-free token list per input line.
texts = []
for sentence in documents:
    tokens = tokenizer.tokenize(sentence.lower())
    texts.append([w for w in tokens if w not in custom_set])

# Drop tokens that occur only once across the whole corpus.
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

from pprint import pprint  # pretty-printer
from gensim import corpora
# pprint(texts)
'''
Created on 06/05/2013

@author: Rodrigo
'''
# BUG FIX: RegexpTokenizer lives in nltk.tokenize, not nltk.corpus —
# importing it from nltk.corpus raises ImportError.
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

english_stops = set(stopwords.words('english'))

# gaps=True makes the pattern describe the SEPARATORS, so this splits on
# whitespace rather than matching tokens. The raw string fixes the
# invalid '\s' escape (a SyntaxWarning on modern Python).
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

# print(...) is valid both as the Py3 function and as a parenthesized
# Py2 print statement, replacing the Py3-invalid `print [...]`.
print([w for w in tokenizer.tokenize("This is not a common book")
       if w not in english_stops])