def run(self):
    """
    How do I run this Task?

    Luigi will call this method if the Task needs to be run.
    """
    # remove stop words and punctuation
    stop = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    wordnet = WordNetLemmatizer()
    docs = []
    for f in self.input():
        # The input() method is a wrapper around requires() that returns Target objects
        lines = 0
        words = []
        for line in f.open('r'):
            if lines == 0:
                label = line
            else:
                words.extend(tokenizer.tokenize(line))
            lines += 1
        # lemmatize and drop stop words, reusing the cached `stop` set
        filtered_words = [wordnet.lemmatize(w) for w in words if w not in stop]
        docs.append((label, '\t'.join(filtered_words)))
    out = self.output().open('w')
    for label, tokens in docs:
        out.write("%s,%s\n" % (label.strip(), tokens.strip()))
    out.close()
def map(self):
    mc = MongoClient('ec2-52-0-148-244.compute-1.amazonaws.com', 27017)
    dbmc = mc.genid
    idoc = dbmc.gentable.find_one_and_update(
        filter={}, update={"$inc": {"score": 1}}, upsert=True)
    k = Key(self.bucket)
    y = stopwords.words('english')
    i = 1
    strx = str(int(idoc['score']))
    strz = None
    filestring = ""
    for line in sys.stdin:
        line = unicode(line, "utf-8", "ignore")
        pattern = re.compile(r'\b(' + r'|'.join(y) + r')\b\s*')
        line = pattern.sub('', line)
        tokenizer = RegexpTokenizer(r'\w+')
        words = tokenizer.tokenize(line)
        strz = strx + 'a' + str(i)
        k.key = strz
        filestring = line + '\n'
        k.set_contents_from_string(filestring)
        for word in words:
            word = word.encode(encoding='UTF-8', errors='ignore')
            print '%s\t%s' % (word.strip(), strz)
        i += 1
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    @param rtepair: a L{RTEPair} from which features should be extracted
    @param stop: if C{True}, stopwords are thrown away.
    @type stop: C{bool}
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                          'have', 'is', 'are', 'were', 'and', 'very', '.', ','])
    # note the comma between 'failed' and 'rejected'; without it the two
    # string literals are concatenated into a single word
    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set([lemmatize(token) for token in self.text_tokens])
        self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
def lemmatizeall(word_list):
    """
    Lemmatizes the word_list, passing each word through every part-of-speech type.

    Input: word_list - list of words to be cleaned
    pos options: ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    """
    word_types = "v", "a", "n", "s", "r"
    wnl = nltk.WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    for x in range(0, len(word_list)):
        word_tokens = tokenizer.tokenize(str(word_list[x]))
        word_tokens_lem = word_tokens
        # lemmatize the tokens once for each part-of-speech tag
        for i in range(0, len(word_types)):
            pos = word_types[i]
            word_tokens_lem = [wnl.lemmatize(w, pos=pos) for w in word_tokens_lem]
        sep = " "
        word_list[x] = sep.join(word_tokens_lem)
    return word_list
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword = [
        'дом', 'город', "дорог", "час", "ноч", "слов", "утр", "стран",
        "пут", "путешеств", "мест", 'нов', "друз", "добр",
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens
              if i not in stop_w and r.match(i) and i not in badword]
    return tokens
def createLDAModel(texts, n_topics, n_passes):
    """Generates an LDA model from an array of texts"""
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = get_stop_words('en')
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    texts_ = []
    # loop through document list
    for i in texts:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts_.append(stemmed_tokens)
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts_)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts_]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics,
                                               id2word=dictionary, passes=n_passes)
    return ldamodel
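# A minimal usage sketch for createLDAModel above. The toy corpus and names below
# are hypothetical; it assumes gensim, corpora, PorterStemmer and get_stop_words
# are imported as used in the function, and the two tiny documents are only
# illustrative, not meaningful training data.
sample_texts = [
    "Topic models group documents by the words they share.",
    "LDA is a common algorithm for building topic models.",
]
sample_model = createLDAModel(sample_texts, n_topics=2, n_passes=5)
print(sample_model.print_topics(num_topics=2, num_words=3))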
def lda(data):
    data = get_only_text(data)
    only_tweet = data
    length = len(only_tweet)
    length = min(20, length)
    for i in xrange(0, length):
        print i
        print only_tweet[i]

    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    length = len(only_tweet)
    length = min(20, length)
    total_texts = []
    for i in xrange(0, length):
        print only_tweet[i]
        print
        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if not k in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)
    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2,
                                               id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print i
def generate_stemmed_tokens(page_content):
    lowered = page_content.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lowered)
    stems = create_stems(tokens)
    return stems
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    @param rtepair: a L{RTEPair} from which features should be extracted
    @param stop: if C{True}, stopwords are thrown away.
    @type stop: C{bool}
    """
    self.stop = stop
    self.stopwords = set(
        ["a", "the", "it", "they", "of", "in", "to", "have", "is",
         "are", "were", "and", "very", ".", ","]
    )
    # note the comma between "failed" and "rejected"; without it the two
    # string literals are concatenated into a single word
    self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r"([A-Z]\.)+|\w+|\$[\d\.]+")

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set([lemmatize(token) for token in self.text_tokens])
        self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
def get_structuredsteps(soup, dct):
    dct['structuredsteps'] = []
    new_steps = dct['steps']
    new_ingredients = dct['ingredients']
    tokenizer = RegexpTokenizer(r'\w+')
    time_units = ['min', 'min.', 'minutes', 'minute', 'hour', 'hours',
                  'hr', 'hrs', 'hr.', 'hrs.']
    ingredient_names = []
    for y in new_ingredients:
        ingredient_names.append(y['name'])
    for step in new_steps:
        if step != '':
            # collect cooking methods mentioned in this step, either directly
            # or through common inflections
            method_list = []
            for method in methods:
                if method in step:
                    method_list.append(method)
                elif method + "ing" in step:
                    method_list.append(method)
                elif method + "s" == step:
                    method_list.append(method)
                elif method + "er" == step:
                    method_list.append(method)
                elif method + "ed" == step:
                    method_list.append(method)
                elif method + "ing" == step:
                    method_list.append(method)
            # collect tools, both named directly and implied by action verbs
            tools_list = []
            for tool in tools:
                if tool in step:
                    tools_list.append(tool)
            for verb in actions:
                if verb in step:
                    tools_list.append(actions[verb])
            # match ingredient names word by word
            ingredient_list = []
            for x in ingredient_names:
                for y in x.split():
                    if y in step:
                        ingredient_list.append(x)
            # pull out "<number> <time unit>" phrases as the cooking time
            cooking_time = " "
            step_list = tokenizer.tokenize(step)
            for x in range(0, len(step_list) - 2):
                if step_list[x].isdigit():
                    if step_list[x + 1] in time_units:
                        cooking_time += step_list[x] + ' ' + step_list[x + 1] + ' '
            d = {
                'step': step,
                'tools': set(tools_list),
                'methods': set(method_list),
                'cooking time': cooking_time,
                'ingredients': ingredient_list,
            }
            dct["structuredsteps"].append(d)
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()
    # create English stop words list
    en_stop = get_stop_words('en')
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(
        os.pardir,
        "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
def extractWords(text):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    # words = word_tokenize(text)  # overridden: keep the punctuation-free regexp tokens
    sWords = stopwords.words("english")
    return [w.lower() for w in words if w not in sWords]
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned text as a single space-joined string
       (or an empty list for null input)
    '''
    if pd.isnull(text):
        return []
    # Tokenize
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    # Remove any stopwords
    text_processed = [word.lower() for word in text_processed
                      if word.lower() not in stopwords.words('english')]
    # Stem
    porterStemmer = PorterStemmer()
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    # drop the stray 'b' token left over from byte-string prefixes, if present
    try:
        text_processed.remove('b')
    except ValueError:
        pass
    return " ".join(text_processed)
def trainMarkovChain(self, n=1):
    self.ngram_degree = n
    self.markov_model = defaultdict(lambda: defaultdict(int))
    sentences = self.corpus_sentences
    if sentences is None:
        sentences = self.sentenceTokenizeCorpus()
    print("Training markov model on corpus.")
    word_tokenizer = RegexpTokenizer(r"\w+")
    for sentence in sentences:
        words = word_tokenizer.tokenize(sentence)
        # "#" marks the sentence boundary in the n-gram context
        last_word_list = ["#"] * n
        for word in words:
            last_token = " ".join(last_word_list)
            self.markov_model[last_token][word] += 1
            last_word_list.append(word)
            last_word_list = last_word_list[1:]
        last_token = " ".join(last_word_list)
        self.markov_model[last_token]["#"] += 1
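# For n=1 and a single hypothetical sentence "the cat sat", the model built
# above would hold (approximately, ignoring the defaultdict wrappers):
# {"#": {"the": 1}, "the": {"cat": 1}, "cat": {"sat": 1}, "sat": {"#": 1}}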
def stripped_words(self, original_sentence):
    _sentence = filter(self.printable_char_filter, original_sentence)
    _sentence = _sentence.replace(u'\u2013', ' ')
    _sentence = _sentence.replace(u'\u2014', ' ')
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    tokens = tokenizer.tokenize(_sentence)
    return [word.lower() for word in tokens if word.lower() not in stop_words]
def write_summary(texts, ofile):
    word_tokenizer = RegexpTokenizer(r"\w+")
    with codecs.open(ofile, u"w", u"utf-8") as f:
        for text in texts:
            f.write(u" ".join([w.lower() for w in word_tokenizer.tokenize(text)]))
            f.write(u"\n")
        f.flush()
def relevance_features(doc):
    print "relfeatures"
    print doc[:10]
    features = {}
    #print doc

    # Test 1: has synonyms of NIT Warangal
    features['contains synonym'] = 'false'
    for word in synonyms:
        if word in doc:
            features['contains synonym'] = 'true'
            break

    # Test 2: has a person name that appears in Almabase's DB
    count = 0
    names = ner.get_names(data)
    count = ner.query_db(names)
    print 'count is {}'.format(count)
    # if count == 0:
    #     features['hasAlumnus'] = 'none'
    # elif count <= 3:
    #     features['hasAlumnus'] = 'medium'
    # elif count > 3:
    #     features['hasAlumnus'] = 'high'
    # print count

    # Test 3: bag-of-words approach
    tokenizer = RegexpTokenizer(r'\w+')
    document_words = tokenizer.tokenize(doc)
    for word in word_features:
        if word.lower() in document_words:
            print "{} is present".format(word)
        features['contains({})'.format(word.lower())] = (word in document_words)
    return features
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List = []
    for i in range(0, 50):
        Issue_List.append(df_sub.groupby(['Issue']).sum()
                          .sort_index(by='count', ascending=False).ix[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # set tokenize Reg
    en_stop = get_stop_words('en')                  # create English stop words list
    p_stemmer = PorterStemmer()                     # create p_stemmer of class PorterStemmer
    texts = []                                      # list for tokenized documents in loop
    text_view = ''

    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    fig1 = fig.add_subplot(1, 1, 1)
    fig1.set_title("Top issued words", fontdict={'fontsize': 25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
def preprocess_wikidata(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # drop very short tokens (one or two characters)
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)
def get_product_vocab(dict_queries):
    tok = RegexpTokenizer(r'\w+')
    vocab = {}
    for query, v in dict_queries.items():
        words = defaultdict(int)
        for prod in v:
            w_prod = tok.tokenize(prod[1])
            for w in w_prod:
                #wt = stem(wt)
                if not re.match(r'\d+$', w) and \
                   len(w) > 1 and \
                   w not in stop_words:
                    words[w] += 1
        vocab[query] = words.keys()
        #vocab[query] = [k for (k, v) in words.iteritems() if v > 1]
        """
        print "Query: " + query
        sorted_w = sorted(words.items(), key=lambda x:x[1], reverse=True)
        print sorted_w
        """
    return vocab
def preprocess(TWEETS, typeTweet):
    wordlist = []
    # normalize text -- TOKENIZE USING REGEX TOKENIZER (keeps hashtags)
    tokenizer = RegexpTokenizer(r'#?\w+')
    cnt = 0
    for item in TWEETS:
        text = TWEETS[cnt]
        tweet = ''.join(text)
        tweet = tweet.lower().strip('\n')
        tweet = re.sub(r'[0-9]+', "", tweet)
        tweet = re.sub(r'@[^\s]+', "", tweet)
        tweet = re.sub(r'#\w+primary', "", tweet)
        wordlist.extend(tokenizer.tokenize(tweet))
        cnt += 1

    # remove stopwords
    stop = stopwords.words('english') + [
        'rt', 'via', 'u', 'r', 'b', '2', 'http', 'https', 'co', 'live',
        'hall', 'town', 'watch', 'tune', 'time', 'tonight', 'today',
        'campaign', 'debate', 'wants', 'without', 'dont',
        '#hillaryclinton', '#berniesanders', '#donaldtrump', '#tedcruz',
        '#johnkasich', '#politics']
    filtered = [term for term in wordlist if term not in stop]
    filtered_final = [term for term in filtered if len(term) > 3]

    print 'Preprocessed %s tweets' % (typeTweet)
    return filtered_final
def getData():
    tokenizer = RegexpTokenizer(r'\w+')

    f = open("msr_paraphrase_train.txt", "r")
    f.readline()
    trainInput = []
    trainClass = [0] * 8160
    i = 0
    while i < 8160:
        tokens = f.readline().strip().split('\t')
        trainClass[i] = trainClass[i + 1] = int(tokens[0])
        i += 2
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix1 = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix2 = sentenceToMatrix(S)
        trainInput.append([np.transpose(Smatrix1 + Smatrix2)])
        trainInput.append([np.transpose(Smatrix2 + Smatrix1)])
    f.close()

    f = open("msr_paraphrase_test.txt", "r")
    f.readline()
    testInput = []
    testClass = [0] * 1725
    for i in range(0, 1725):
        tokens = f.readline().strip().split('\t')
        testClass[i] = int(tokens[0])
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix.extend(sentenceToMatrix(S))
        testInput.append([np.transpose(Smatrix)])
    f.close()

    return trainInput, trainClass, testInput, testClass
def __init__(self, oldid, newid, data, general):
    self.newid = newid
    self.oldid = oldid
    self.data = data
    self.tfidfatt = []
    self.tfidfval = []
    self.freatt = []
    self.freval = []
    self.text = ''
    self.ntlk = []
    self.idfvalue = []
    self.general = general

    tokenizer = RegexpTokenizer(r'\w+')
    #stemmer = SnowballStemmer("english")
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    # keep stemmed, non-numeric, non-stop-word tokens
    for r in tokenizer.tokenize(data):
        if r not in stop:
            if not any(i.isdigit() for i in r):
                r = stemmer.stem(r)
                if r not in self.ntlk:
                    self.ntlk.append(r)
                self.text = self.text + ' ' + r
def parse_raw_data(self, new_art):
    self.startClass = default_timer()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(new_art.body)
    stemmer = LancasterStemmer()
    article_dic = new_art.words
    global_dic = self.raw_dictionary

    for word in tokens:
        word = word.lower()
        if not self.is_stop_word(word) and not word.isnumeric():
            s_word = stemmer.stem(word)
            # s_word = word
            ## It is not a stop word; check if the word is already part of the
            ## article dictionary. If yes, increment the count, else add it.
            ## When adding, check if it is part of the big corpus; if yes,
            ## increment the count of the number of articles with that word.
            self.globalWordCount += 1
            new_art.doc_len = new_art.doc_len + 1
            if s_word in article_dic:
                article_dic[s_word].wrd_count += 1
                global_dic[s_word].wrd_count += 1
            else:
                article_dic[s_word] = local_word_attributes(1)
                if s_word in global_dic:
                    global_dic[s_word].art_count += 1
                    global_dic[s_word].wrd_count += 1
                else:
                    global_dic[s_word] = global_word_attributes(1, 1, 1, 0)
def mean_stdDeviation(self, query, stopWordInstruction):
    list_count_postTitles = []
    list_postTitles = self.data[:][query].tolist()
    tokenizer = RegexpTokenizer(r'\w+')

    stopwords_mine = []
    stopwords_mine += (word.encode('ascii', 'ignore')
                       for word in stopwords.words('english'))

    tokenized_list = []
    new_list_tokenized = []
    for item in list_postTitles:
        tokenized_list.append(tokenizer.tokenize(item))

    if stopWordInstruction == True:
        for item in tokenized_list:
            temp = []
            temp += (word for word in item if word.lower() not in stopwords_mine)
            #print temp
            #raw_input()
            new_list_tokenized.append(temp)
    else:
        new_list_tokenized = copy.deepcopy(tokenized_list)

    for x in new_list_tokenized:
        list_count_postTitles.append(len(x))
    #print list_count_postTitles

    npArray = np.asarray(list_count_postTitles)
    print npArray.mean()
    print npArray.std()
    return [npArray.mean(), npArray.std(), list_postTitles, list_count_postTitles]
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    # cache the stop-word list once instead of rebuilding it for every token
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if w not in stop_words]
    #filtered_words = filter(lambda token: token not in stop_words, tokens)
    return " ".join(filtered_words)
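# A minimal usage sketch for preprocess above (assumes the NLTK stopwords corpus
# has been downloaded); the hypothetical sentence should reduce to roughly
# "simple example sentence tokenizer":
print(preprocess("This is a simple example sentence for the tokenizer"))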
def count_ngrams(sessions, length):
    data = sessions
    data = data.replace(',', ' ')
    tokenizer = RegexpTokenizer("[0-9]+")  # include only numbers (pageIDs) for tokens
    token = tokenizer.tokenize(data)

    from nltk.util import ngrams
    #print list(ngrams(token, 2))
    generated_ngrams = list(ngrams(token, length))
    #print generated_ngrams
    try:
        # renamed from "ngrams" so the imported ngrams() function is not shadowed
        joined_ngram = ' '.join(generated_ngrams[0])
    except IndexError:
        global non_list
        non_list += 1
        #print 'Failed to generate ngrams as there is no minimum'
    #print joined_ngram

    for ngram in generated_ngrams:
        if not ngrams_statistics.has_key(ngram):
            ngrams_statistics.update({ngram: 1})
        else:
            ngram_occurrences = ngrams_statistics[ngram]
            ngrams_statistics.update({ngram: ngram_occurrences + 1})
def run(self, data):
    results = []
    tokenizer = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)
    for corpus in data:
        corpus.contents = " ".join(tokenizer.tokenize(corpus.contents))
        results.append(corpus)
    return results
def tokenize(self, doc):
    '''
    use NLTK RegexpTokenizer
    '''
    tokenizer = RegexpTokenizer(r"\w{3,}")
    return [self.stemmer.stem(x) for x in tokenizer.tokenize(doc)]
def parse_questions(self):
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    for questions_key in self.rawSamples:
        # Stem the question text
        question_text = self.rawSamples[questions_key][0]
        words_array = tokenizer.tokenize(question_text)
        question_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
            question_text += (word + " ")
        self.rawSamples[questions_key][0] = question_text

        # Stem the topic names
        topics_text = self.rawSamples[questions_key][2]
        words_array = tokenizer.tokenize(topics_text)
        topics_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
            topics_text += (word + " ")
        self.rawSamples[questions_key][2] = topics_text
"""
Mithilesh Ganesh Shinde
Sachidanand Tripathi

@author: Ganesh
"""
import nltk
# from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

with open(r"InputDataForNgram") as inputData:
    data = inputData.read()

# tokenize on word characters, discarding punctuation
dataWithoutPun = RegexpTokenizer(r'\w+')
token = dataWithoutPun.tokenize(data)


def ngramfunction(token, number):
    totCount = len(token)
    ngramlist = ngrams(token, number)
    fdist = nltk.FreqDist(ngramlist)
    # FreqUnigramData = list(fdist.keys())
    # unigramResult = FreqUnigramData[0:100]
    # print(unigramResult)
    # fdist.plot(50, cumulative=False)
# STOPWORDS = maketrie(stopwords.words('english'))
# STOPWORDS = set(stopwords.words('english'))
CORPUSDIR = './presidential_debates'
# DFTS is a dict {'token': int(doc_freq)} that maps tokens to the qty of docs
# containing that term (ie NOT IDF, just the qty of docs containing the token)
DFTS = Counter()  # PriorityQueue()
DOCS = {}
FILENAMES = ''
IDFS = Counter()
N = float(0)
STEMMER = PorterStemmer().stem
STOP = 'STOP'
STOPWORDS = stopwords.words('english')
# TFIDFS = Counter()
TFIDFS = defaultdict(lambda: Counter())
TOKENIZER = RegexpTokenizer(r'[a-zA-Z]+').tokenize
# Can't DFTS.keys() be used instead of TOKENCORPUS??
# TOKENCORPUS =


def setup():
    global FILENAMES
    global IDFS
    global N
    FILENAMES = listdir(CORPUSDIR)
    N = float(len(FILENAMES))
    t = time()
    with Pool() as p:
        DOCS = dict(p.map(process_document, FILENAMES))
    for tokens in DOCS.values():
        # TOKENCORPUS.update(tokens)
        pass
            evaluation['#2 Rating'].append(0)
            evaluation['#3 Simplification'].append(unicode(baseline_result[key], 'ascii', errors='ignore'))
            evaluation['#3 Rating'].append(0)
            evaluation['#4 Simplification'].append(unicode(clustered_distances[key], 'ascii', errors='ignore'))
            evaluation['#4 Rating'].append(0)
            num_eval_sentences = num_eval_sentences + 1
            indeces_eval_sentences.append(random)

    eval_df = pd.DataFrame(evaluation,
                           index=[i for i in range(0, len(evaluation['Annotated Sentence']))],
                           columns=['Annotated Sentence', '#1 Simplification', '#1 Rating',
                                    '#2 Simplification', '#2 Rating', '#3 Simplification',
                                    '#3 Rating', '#4 Simplification', '#4 Rating'])
    eval_df.to_html(output_eval_dir, index=False, escape=False)
    """

tokenizer = RegexpTokenizer(r'\w+')
for i in range(0, len(dictionary)):
    flag = False
    for original in output['Annotated Sentence']:
        if tokenizer.tokenize(dictionary[i]['value']) == tokenizer.tokenize(get_original(original)):
            dictionary[i]['simplification'] = result[original][score[original].index(max(score[original]))]
            flag = True
            break
    if flag is False:
        print dictionary[i]['annotated_sentence']

xml_dict = []
for i in range(0, len(dictionary)):
    xml_dict.append({'annotated_sentence': dictionary[i]['annotated_sentence'], \
                     'value': dictionary[i]['value'], \
# coding=utf-8
import unidecode
import inflection
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def change_alphabet(sent):
    return unidecode.unidecode(sent.decode('utf-8'))


def clean_sent(sent):
    sent = re.sub(r"http\S+", "", sent.lower()).decode('utf-8')
    sent = re.sub(r"@\S+", "", sent.lower()).decode('utf-8')
    #words = sent.split(" ")
    words = tokenizer.tokenize(sent)
    words_refined = [lemmatizer.lemmatize(inflection.singularize(word))
                     for word in words]
    words = [inflection.transliterate(word.decode('utf-8'))
             for word in words_refined
             if not word.isdigit() and len(word) > 2]
    p_stemmer = PorterStemmer()
    _digits = re.compile(r'\d')
def init():
    ps = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    return ps, tokenizer
def my_tokenizer(text: str):
    """Return tokens, removing punctuation as well."""
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(text)
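# A minimal usage sketch for my_tokenizer above; with a hypothetical input the
# expected output is ['Hello', 'world', 'It', 's', '2020'], since the \w+
# pattern splits on punctuation and the apostrophe:
print(my_tokenizer("Hello, world! It's 2020."))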
def process_word(cont):
    c = []
    for i in cont:
        i = i.lower()
        clean_tweet = re.sub(r"http\S+", "", i)
        tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
        clean_tweet = tokenizer.tokenize(clean_tweet)
        a = list(clean_tweet)
        b = " ".join(a)
        c.append(b)
    return c


tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
train = readtrain()
data_train = process_word(train[1])
all_data = []


def cout_word():
    result = {}
    for content in data_train:
        for word in content.split():
            if word not in result:
                result[word] = 0
            result[word] += 1
    return result
reasonsList = list(reasons["Response"])
suggestionsList = list(suggestions["Response"])

#########################################################################################
# Part-of-Speech (POS) tag the words, then lemmatize (reduce to the root word) them     #
#########################################################################################
"""We may not actually want to lemmatize, since we will be making bigrams. Check."""

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')


# Function to convert an nltk POS tag to a wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
class Text_Preprocessing():

    def __init__(self, doc_map):
        self.posting_list = {}
        self.mine = ['br', '\'', 'http', 'url', 'web', 'www', 'blp', 'ref',
                     'external', 'links']
        self.stop_words = set(stopwords.words('english')).union(self.mine)
        # self.ps = PorterStemmer().stem
        self.ps = SnowballStemmer("english").stem
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+|[0-9]{,4}')
        self.d = doc_map
        self.sent = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
        self.toktok = ToktokTokenizer()

    def check(self, t1, t2, t3):
        # make sure posting_list[term][doc_id][field] exists
        if t1 not in self.posting_list:
            self.posting_list[t1] = {}
        if t2 not in self.posting_list[t1]:
            self.posting_list[t1][t2] = {}
        if t3 not in self.posting_list[t1][t2]:
            self.posting_list[t1][t2][t3] = 0
        return self.posting_list

    def process_title(self, text, pageNumber):
        token_list = self.tokenizer.tokenize(text.lower())
        token_list = list(filter(None, token_list))
        filtered_sentence = [w for w in token_list if not w in self.stop_words]
        stemmed_list = [self.ps(word) for word in filtered_sentence if len(word) < 11]
        stemmed_list = list(filter(None, stemmed_list))
        # print('stemmedList title: ', stemmed_list)
        for word in stemmed_list:
            self.posting_list = self.check(word, pageNumber, 't')
            self.posting_list = self.check(word, pageNumber, 'n')
            self.posting_list[word][pageNumber]['t'] += 1
            self.posting_list[word][pageNumber]['n'] += 1

    def process_categories(self, text, pageNumber):
        c = 0
        category_regex = compile(".*\[\[Category:(.*?)\]\].*")
        match_cat_list = category_regex.findall(text)
        total_stems = []
        n = len('category') + 4
        rem = '[[Category:%s]]'
        extend = total_stems.extend
        for one_match in match_cat_list[:4]:
            text = text.replace(rem % (one_match), '')
            category_name = one_match[n:-3]  # say, Indian Culture
            category_name = category_name.lower()
            token_list = self.tokenizer.tokenize(category_name)
            token_list = list(filter(None, token_list))
            filtered_sentence = [w for w in token_list if not w in self.stop_words]
            stemmed_list = [self.ps(word) for word in filtered_sentence if len(word) < 11]
            extend(stemmed_list)
        for word in total_stems:  # e.g. ['data', 'scienc', 'peopl', 'birth']
            # if word == '':
            #     print('here null category')
            self.posting_list = self.check(word, pageNumber, 'c')
            self.posting_list = self.check(word, pageNumber, 'n')
            self.posting_list[word][pageNumber]['c'] += 1
            self.posting_list[word][pageNumber]['n'] += 1
        return text

    def process_infobox(self, text, pageNumber):
        infobox_start = compile("{{Infobox")
        start_match = search(infobox_start, text)
        if start_match:
            start_pos = start_match.start()
            brack_count = 2
            end_pos = start_pos + len("{{Infobox ")
            # walk forward until the braces opened by "{{Infobox" are balanced
            while end_pos < len(text):
                if text[end_pos] == '}':
                    brack_count = brack_count - 1
                if text[end_pos] == '{':
                    brack_count = brack_count + 1
                if brack_count == 0:
                    break
                end_pos = end_pos + 1
            if end_pos + 1 >= len(text):
                return
            infobox_string = text[start_pos:end_pos + 1]
            text = text.replace(infobox_string, '')
            content = infobox_string.split('\n')
            content = list(map(lambda x: x.lower(), content))
            tokens = []
            add = tokens.append
            heading = content[0][len('{{infobox '):-1]
            add(heading)
            for idx in range(1, len(content) - 2):
                try:
                    value = " ".join(findall(r'\w+', content[idx].split('=', 1)[1])).strip()
                    add(value)
                except:
                    pass
            tokens = list(filter(lambda x: x.strip(), tokens))
            tokens = list(filter(None, tokens))
            total_stems = []
            extend = total_stems.extend
            for one_token in tokens:
                token_list = self.tokenizer.tokenize(one_token)
                filtered_sentence = [w for w in token_list if not w in self.stop_words]
                stemmed_list = [self.ps(word) for word in filtered_sentence if len(word) < 11]
                extend(stemmed_list)
            total_stems = list(filter(None, total_stems))
            for word in total_stems:
                # if word == '':
                #     print('here null ibox; ', total_stems)
                self.posting_list = self.check(word, pageNumber, 'i')
                self.posting_list = self.check(word, pageNumber, 'n')
                self.posting_list[word][pageNumber]['i'] += 1
                self.posting_list[word][pageNumber]['n'] += 1
        return text

    def process_ref(self, text, pageNumber):
        ref_start = compile('< ref.* >(.*?)< /ref >', DOTALL)
        title_start = compile('.*title =|.*title=')
        n = 2
        tokenized_corpus = [ref_start.findall(sent) for sent in sent_tokenize(text)
                            if len(ref_start.findall(sent)) > 0]
        tokenized_corpus = list(chain(*tokenized_corpus))
        if len(tokenized_corpus) > n:
            tokenized_corpus = tokenized_corpus[:n]
        total_stems = []
        extend = total_stems.extend
        # print('ref len %f' % len(tokenized_corpus))
        for match_list in tokenized_corpus:
            text = text.replace(match_list, '')
            pipe_tokens = match_list.split('|')
            for one_token in pipe_tokens:
                if title_start.match(one_token):
                    title = one_token.split('=')[1]
                token_list = self.tokenizer.tokenize(one_token)
                filtered_sentence = [w.lower() for w in token_list if not w in self.stop_words]
                stemmed_list = [self.ps(word) for word in filtered_sentence]
                stemmed_list = list(filter(None, stemmed_list))
                extend(stemmed_list)
        for word in total_stems:
            self.posting_list = self.check(word, pageNumber, 'r')
            self.posting_list = self.check(word, pageNumber, 'n')
            self.posting_list[word][pageNumber]['r'] += 1
            self.posting_list[word][pageNumber]['n'] += 1

    def process_body_text(self, text, pageNumber):
        body_ = compile(r'==(.*)==|{{(.*)}}|#(.*)|{{(.*)|{{(.*)|\|(.*)|\}\}|\*.*|!.*|\[\[|\]\]|;.*|<.*>.*</.*>|<.*>.*</.*>|<.*>')
        matches = list(chain.from_iterable(body_.findall(text)))
        matches = list(filter(None, matches))
        # text = filter(lambda x: text.replace(x, ''), matches)
        big_regex = compile('|'.join(map(escape, matches)))
        text = big_regex.sub('', text)
        content = text.splitlines()
        content = list(filter(lambda x: x.strip(), content))
        content = [" ".join(findall("[a-zA-Z]+", x)).strip() for x in content]
        content = list(filter(None, content))
        content = list(map(lambda x: x.lower(), content))
        total_stems = []
        extend = total_stems.extend
        # for very long pages, sample every fifth line to bound indexing time
        if len(content) > 200:
            for one_line in range(0, len(content), 5):
                token_list = word_tokenize(content[one_line])
                filtered_sentence = [w for w in token_list if not w in self.stop_words]
                stemmed_list = [self.ps(word) for word in filtered_sentence]
                extend(stemmed_list)
        else:
            for one_line in content:
                token_list = word_tokenize(one_line)
                filtered_sentence = [w for w in token_list if not w in self.stop_words]
                stemmed_list = [self.ps(word) for word in filtered_sentence]
                extend(stemmed_list)
        for word in total_stems:
            # if word == '':
            #     print('here null body')
            self.posting_list = self.check(word, pageNumber, 'b')
            self.posting_list = self.check(word, pageNumber, 'n')
            self.posting_list[word][pageNumber]['b'] += 1
            self.posting_list[word][pageNumber]['n'] += 1
        return text
    def make_index(self):
        limit_one_doc = 30 / 60000.0  # time budget per document, in seconds
        title_regex = compile('.*?:')
        for k, v in self.d.items():
            t1, t2, t3, t4, t5 = 0, 0, 0, 0, 0
            t = time.time()
            match_title = title_regex.match(v['title'])
            self.process_title(v['title'], v['id'])
            t1 = time.time() - t
            if not match_title:
                body = v['body']
                t = time.time()
                x = self.process_categories(body, v['id'])
                t2 = time.time() - t
                t = time.time()
                x = self.process_infobox(x, v['id'])
                t3 = time.time() - t
                if x is not None:
                    # t = time.time()
                    self.process_ref(x, v['id'])
                    t4 = 0
                if x is not None:
                    t = time.time()
                    x = self.process_body_text(x, v['id'])
                    t5 = time.time() - t
            T = t1 + t2 + t3 + t4 + t5
            if T >= limit_one_doc:
                pass
                # print('id %s title %f cat %f infobox %f ref %f body %f' % (v['id'], t1, t2, t3, t4, t5))
                # print('--> T: %f limit: %f exceed: %f' % (T, limit_one_doc, T - limit_one_doc))
            # print(i, end=' ')
        return

    def parse_posting_list(self, path2index):
        complete_index = dict(sorted(self.posting_list.items()))
        for term, posting_list in complete_index.items():
            one_line = ""
            one_line = term + "|"
            for doc_id, occurences in posting_list.items():
                one_line += str(doc_id) + "$"
                for field, count in occurences.items():
                    one_line += field + ":" + str(count) + "#"
                one_line += "|"
            one_line += "\n"
            with open(path2index, 'a+') as i:
                i.write(one_line)
        # one line: 0|29$i:1#n:1#|61$i:1#n:1#|..
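        # Reading the sample line above: term "0" occurs in document 29 once in
        # the infobox field (i:1) and once overall (n:1), and likewise in
        # document 61; every field hit also bumps the aggregate "n" counter
        # maintained by check().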