def Granularity(sentenceArray):
    for sentence in sentenceArray:
        # print(sentence)
        try:
            stemmer = EnglishStemmer()
            sentence = re.sub(r'\#.*?$', '', sentence)
            sentence = re.sub(r'\#.*? ', '', sentence)
            sentence = re.sub(r'\@.*?$', '', sentence)
            sentence = re.sub(r'\@.*? ', '', sentence)
            sentence = re.sub(r'pic.twitter.*?$', '', sentence)
            sentence = re.sub(r'pic.twitter.*? ', '', sentence)
            sentence = re.sub(r'\'m', ' am', sentence)
            sentence = re.sub(r'\'d', ' would', sentence)
            sentence = re.sub(r'\'ll', ' will', sentence)
            sentence = re.sub(r'\&', 'and', sentence)
            sentence = re.sub(r'don\'t', 'do not', sentence)
            data = stemmer.stem(sentence)
            print(data)
            from nltk.corpus import stopwords
            sentence = str(data)
            stop = stopwords.words('english')
            final = [i for i in sentence.split() if i not in stop]
            finalstring = ' '.join(final)
            os.system("printf \"" + str(finalstring) + "\n\">> stemstop/" + word)
        except Exception as e:
            print(e)
def getAllStemEntities(entities):
    st = EnglishStemmer()
    q = [",", ".", "!", "?", ":", ";"]
    tmp = []
    sourceEntities = [x for x in entities if len(x) > 0]
    np.random.shuffle(entities)
    for i in xrange(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print i
        entities[i] = entities[i].lower()
        entities[i] = entities[i].replace(" - ", " \u2013 ", entities[i].count(" - "))
        entities[i] = entities[i].replace(" -", " \u2013", entities[i].count(" -"))
        entities[i] = entities[i].replace("- ", "\u2013 ", entities[i].count("- "))
        entities[i] = entities[i].replace("-", " - ", entities[i].count("-"))
        entities[i] = entities[i].replace(")", " )", entities[i].count(")"))
        entities[i] = entities[i].replace("(", "( ", entities[i].count("("))
        entities[i] = entities[i].replace("\u0027", " \u0027", entities.count("\u0027"))
        for w in q:
            entities[i] = entities[i].replace(w, " " + w, entities[i].count(w))
        word = entities[i].split(" ")
        s = ""
        for w in word:
            s += st.stem(unicode(w)) + " "
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break
    return tmp, entities[: len(tmp)]
def str_to_dict(s):
    '''
    creates dictionary of words and counts
    input:  s string
    output: dictionary {word: count}
    '''
    s = s.encode('ascii', 'ignore')
    s = str(s)
    word_dict = {}
    l = re.findall(WORDRE, s)
    for w in l:
        w = w.lower()                 # make all letters lowercase
        if w[0] == "'":               # remove single quotes from beginning/
            w = w[1:]                 # end of words in l
        elif w[-1] == "'":
            w = w[:-1]
        w = EnglishStemmer().stem(w)  # stems non-noun/verbs
        w = w.encode('ascii', 'ignore')
        if w != '':
            if w not in word_dict:    # build dictionary
                word_dict[w] = 1
            else:
                word_dict[w] += 1
    return word_dict
def query(word):
    db = MySQLdb.connect("127.0.0.1", "dizing", "ynr3", "dizing")
    cursor = db.cursor()
    snowball_stemmer = EnglishStemmer()
    stem2 = snowball_stemmer.stem(word)
    cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s",
                   (word, word, stem2))
    rows = cursor.fetchall()
    words1 = dict()
    words2 = dict()
    for row in rows:
        if row[1] == word or row[3] == word:
            words1[word] = row[0]
        else:
            words2[word] = row[0]
    scenes1 = []
    scenes2 = []
    for (i, words_dict) in [(1, words1), (2, words2)]:
        wids = words_dict.values()
        for wid in wids:
            sql = "SELECT s.sentence, s.start, s.stop, s.ready, m.title FROM scene AS s, words_scenes AS ws, movie as m " + \
                  "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid)
            # print sql
            cursor.execute(sql)
            rows = cursor.fetchall()
            if (i == 1):
                scenes1 += rows
            else:
                scenes2 += rows
    print scenes1
    print scenes2
    db.close()  # close the connection before returning
    return scenes1 + scenes2
def _execute(self):
    corpus = mongoExtractText(self.name)
    stemmer = EnglishStemmer()
    for item in corpus:
        line = item.replace(',', ' ')
        stemmed_line = stemmer.stem(line)
        self.sentiment.append((sentiment.sentiment(stemmed_line), stemmed_line))
def stem_word(word):
    """
    Stem words
    :param word: (str) text word
    :returns: stemmed word
    """
    stemmer = EnglishStemmer()
    return stemmer.stem(word)
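# Illustrative usage of stem_word above (added example, not from the original
# source): the Snowball English stemmer maps inflected forms to a shared stem.
from nltk.stem.snowball import EnglishStemmer

print(stem_word("running"))   # expected: "run"
print(stem_word("stemming"))  # expected: "stem"
print(stem_word("cats"))      # expected: "cat"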
def as_eng_postagged_doc(doc):
    '''Uses nltk default tagger.'''
    tags = [t for _, t in nltk.pos_tag(list(doc.word))]
    stemmer = EnglishStemmer()
    lemmata = [stemmer.stem(w) for w in list(doc.word)]
    doc['pos'] = Series(tags)
    doc['lemma'] = Series(lemmata)
    return doc
def use_snowball_stemmer(self, word):
    """
    Return the stemmed word, using the Snowball algorithm.
    :param word:
    :return:
    """
    englishStemmer = EnglishStemmer()
    stemmed_word = englishStemmer.stem(word)
    return stemmed_word
def getLemmatizerInfo(pathArticle): data = open(pathArticle, "r") text1 = data.read().decode('utf-8') sourceText = text1 links1 = [] l = 0 for q in text1.split(): if q == '\ufeff': continue links1.append([text1.find(q,l), q]) l = len(q) + 1 + text1.find(q,l) text1 = text1.replace(' - ', ' \u2013 ', text1.count(' - ')) text1 = text1.replace(' -', ' \u2013', text1.count(' -')) text1 = text1.replace('- ', '\u2013 ', text1.count('- ')) text1 = text1.replace('-', ' - ', text1.count('-')) text1 = text1.replace('(', '( ', text1.count('(')) text1 = text1.replace(')', ' )', text1.count(')')) text1 = text1.replace(' \u0027', ' \u301E', text1.count(' \u0027')) text1 = text1.replace('\u0027', ' \u0027', text1.count('\u0027')) text1 = text1.split() if text1[0] == u'\ufeff': text1=text1[1:] text = [] for word in text1: text2 = [] if len(word) == 0: continue while word[len(word)-1] in [',','.','!','?',':',';']: text2.append(word[len(word)-1]) word = word[:-1] if len(word) == 0: break text.append(word) for i in range(len(text2)-1, -1,-1): text.append(text2[i]) out = '' st = EnglishStemmer() l = 0 links = [] for word in text: if isOk(word): q = st.stem(word) + ' ' else: q = word + ' ' out += q.lower() links.append([l, q]) l += len(q) return out, links, links1, sourceText
def getLemmatizerInfo(pathArticle): data = open(pathArticle, "r") text1 = data.read().decode("utf-8") sourceText = text1 links1 = [] l = 0 for q in text1.split(): if q == "\ufeff": continue links1.append([text1.find(q, l), q]) l = len(q) + 1 + text1.find(q, l) text1 = text1.replace(" - ", " \u2013 ", text1.count(" - ")) text1 = text1.replace(" -", " \u2013", text1.count(" -")) text1 = text1.replace("- ", "\u2013 ", text1.count("- ")) text1 = text1.replace("-", " - ", text1.count("-")) text1 = text1.replace("(", "( ", text1.count("(")) text1 = text1.replace(")", " )", text1.count(")")) text1 = text1.replace(" \u0027", " \u301E", text1.count(" \u0027")) text1 = text1.replace("\u0027", " \u0027", text1.count("\u0027")) text1 = text1.split() if text1[0] == u"\ufeff": text1 = text1[1:] text = [] for word in text1: text2 = [] if len(word) == 0: continue while word[len(word) - 1] in [",", ".", "!", "?", ":", ";"]: text2.append(word[len(word) - 1]) word = word[:-1] if len(word) == 0: break text.append(word) for i in range(len(text2) - 1, -1, -1): text.append(text2[i]) out = "" st = EnglishStemmer() l = 0 links = [] for word in text: if isOk(word): q = st.stem(word) + " " else: q = word + " " out += q.lower() links.append([l, q]) l += len(q) return out, links, links1, sourceText
def stemming(tweet):
    tweets = tweet.split()
    wrdStemmer = EnglishStemmer()
    stemTweet = []
    try:
        for tweet in tweets:
            tweet = wrdStemmer.stem(tweet)
            stemTweet.append(tweet)
    except:
        print("Error: Stemming")
    return " ".join(stemTweet)
def fix_lemma_problem(pred_scores, targets, space):
    from nltk.stem.snowball import EnglishStemmer
    es = EnglishStemmer()
    r = pred_scores.copy()
    lemmas = np.array([es.stem(v) for v in space.vocab])
    for i, t in enumerate(targets):
        g = es.stem(space.vocab[t])
        mask = (lemmas == g)
        # print space.vocab[t], np.sum(mask)
        r[i][mask] = -1e9
        # print r[i][mask]
    return r
def main(fname):
    e = EnglishStemmer()
    n, a = 0, 0
    for line in open(sys.argv[1]):
        title, body, tags, creationdate, acceptedanswerid, score, viewcount = eval(line)
        # Process text into tokens
        html_tags = RX_OPEN_TAGS.findall(body)
        body = RX_TAGS.sub("", body)
        print " ".join(e.stem(s) for s in RX_NONWORD.split(body))
    M = bayes.NaiveLearner(adjust_threshold=True, name="Adjusted Naive Bayes")
def stemmed(text, snowball=False):
    """Returns stemmed text
    """
    if snowball:
        st = EnglishStemmer()
    else:
        st = PorterStemmer()
    words = wordpunct_tokenize(text)
    words = [st.stem(w) for w in words]
    text = ' '.join(words)
    return text
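# Illustrative comparison of the two backends of stemmed() (added example, not
# from the original source). Porter and Snowball mostly agree, but differ on
# some words; "generously" is the example used in the NLTK documentation.
from nltk.stem import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize

print(stemmed("generously running", snowball=True))   # expected: "generous run"
print(stemmed("generously running", snowball=False))  # expected: "gener run"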
def get_stemmed_keywords(keywords):
    stemmer = EnglishStemmer()
    stemmed_keywords = list(keywords)
    # split into list of list
    stemmed_keywords = [keyword.split() for keyword in stemmed_keywords]
    # stem individual words
    stemmed_keywords = [list(stemmer.stem(word) for word in keyword) for keyword in stemmed_keywords]
    # list of words to string
    stemmed_keywords = [' '.join(keyword).encode('ascii') for keyword in stemmed_keywords]
    return stemmed_keywords
def similarity_score(word1, word2):
    """
    see sections 2.3 and 2.4 of
    http://dx.doi.org.ezp-prod1.hul.harvard.edu/10.1109/TKDE.2003.1209005
    :type word1: string
    :type word2: string
    :return: float: between 0 and 1; similarity between two given words
    """
    stemmer = EnglishStemmer()
    if stemmer.stem(word1) == stemmer.stem(word2):
        return 1
    alpha = 0.2
    beta = 0.6
    l, h = get_path_length_and_subsumer_height(word1, word2)
    # sim = exp(-alpha * l) * tanh(beta * h), with tanh written out via exponentials
    return exp((-1)*alpha*l)*((exp(beta*h)-exp((-1)*beta*h))/(exp(beta*h)+exp((-1)*beta*h)))
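# Worked example (added, not from the original source): with alpha = 0.2,
# beta = 0.6 and hypothetical values l = 3 (path length) and h = 5 (subsumer
# height), the score is exp(-0.2 * 3) * tanh(0.6 * 5) ≈ 0.5488 * 0.9951 ≈ 0.546.
from math import exp, tanh

l_example, h_example = 3, 5  # hypothetical values for illustration only
print(exp(-0.2 * l_example) * tanh(0.6 * h_example))  # ~0.546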
def normalize_tags():
    cursor.execute('SELECT app_id, tag, times FROM tag_app_rel;')
    all_tag_data = defaultdict(dict)
    for r in cursor:
        all_tag_data[r[0]][r[1]] = r[2]
    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer()
    for app_id, tag_to_times in all_tag_data.iteritems():
        normalized_app_tag_dict = defaultdict(int)
        for tag, times in tag_to_times.iteritems():
            normalized_app_tag_dict[stemmer.stem(tag)] += times
        for tag, times in normalized_app_tag_dict.iteritems():
            cursor.execute('INSERT INTO tag_app_relation (app_id, tag, times) VALUES (%s, %s, %s)',
                           (app_id, tag, times))
def nltk_tokenizer(text, min_size=4, *args, **kwargs):
    from nltk.stem.snowball import EnglishStemmer
    from nltk.corpus import stopwords as stwds
    from nltk.tokenize import TreebankWordTokenizer

    stemmer = EnglishStemmer()
    stopwords = set(stwds.words('english'))
    text = [stemmer.stem(w) for w in TreebankWordTokenizer().tokenize(text)
            if not w in stopwords and len(w) >= min_size]
    return text
def tokenize_documents(documents):
    stop_words = stopwords.words('english') + stopwords.words('spanish')  # common words to be filtered
    english = EnglishStemmer()
    arabic = ISRIStemmer()
    punctuation = {ord(char): None for char in string.punctuation}

    def valid_word(token, filtered=stop_words):
        # Returns false for common words, links, and strange patterns
        if (token in filtered) or (token[0:4] == u'http') or \
                (token in string.punctuation):
            return False
        else:
            return True

    for doc in documents:
        row = doc[0]
        doc = doc[1]
        if doc is not None:
            # remove trailing whitespace
            doc = doc.strip()
            # remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+|\b@\w+", "", doc)
            # lowercase letters
            doc = doc.lower()
            # remove punctuation
            doc = doc.translate(punctuation)
            # tokenization: handles documents with arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)
            cleaned_tokens = []
            for token in tokens:
                # for valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi', u'kadhafi', u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token))
                    cleaned_tokens.append(token)
            yield row
            yield cleaned_tokens
def stem_sen(list_sentences): stemmer = EnglishStemmer() # map back should be a dict with words, # each word map to 3 version: noun, adj, verb, # and each version is a list of pair lem = WordNetLemmatizer() mapping_back = {} res_list = [] res_sen = [] stemmer = EnglishStemmer() # of course we want to return a list of sentences back as well for sent in list_sentences: tmp_list = [] tok_list = word_tokenize(sent) tok_pos = nltk.pos_tag(tok_list) for tok,pos in tok_pos: if (tok.lower() in stopwords.words('english')): continue if len(tok) == 1: continue tok = lem.lemmatize(tok) pos = pos[:2] if ('NN' not in pos) and ('JJ' not in pos) and ('VB' not in pos): continue stem_tok = stemmer.stem(tok) if (stem_tok not in mapping_back): mapping_back[stem_tok] = {} if pos not in mapping_back[stem_tok]: mapping_back[stem_tok][pos] = {} # increase count if tok not in mapping_back[stem_tok][pos]: mapping_back[stem_tok][pos][tok] = 1 else: mapping_back[stem_tok][pos][tok] += 1 tmp_list.append(stem_tok + '-' + pos) res_sen.append(tmp_list) res_map = {} # do the second run through to find the most frequent - mapping for tok in mapping_back: for pos in mapping_back[tok]: tmp_tok = tok + '-' + pos # find the most frequently, unstemmed word correspond to the stemmer + tagged most_freq = max(mapping_back[tok][pos], key = mapping_back[tok][pos].get) res_map[tmp_tok] = most_freq.encode('ascii') res_list.append(tmp_tok) return res_sen, res_list, res_map
def tokenize(self):
    terms = word_tokenize(self.text)
    self.tokens = []
    self.lemmas = []
    stemmer = EnglishStemmer()
    lemmatizer = WordNetLemmatizer()
    for term in terms:
        try:
            self.tokens.append(stemmer.stem(term).lower())
            self.lemmas.append(lemmatizer.lemmatize(term.lower()))
        except Exception, e:
            print 'current text:', self.text
            print 'current term:', term
            print str(e)
            sys.exit(-1)
def exe_compress_word(argv):
    word_stat_path, comp_word_stat_path = argv
    stemmer = EnglishStemmer()
    word_stat = load_word_stat(word_stat_path)
    compress_word_stat = {}
    for word, count in word_stat.items():
        if count <= 0:
            continue
        word = stemmer.stem(word.lower().decode('utf8'))
        compress_word_stat[word] = max(word_stat.get(word, 0), count)
    words = compress_word_stat.keys()
    words.sort()
    f = open(comp_word_stat_path, 'w')
    for word in words:
        f.write('%s %d\n' % (word.encode('utf8'), compress_word_stat[word]))
    f.close()
def __init__(self): """ class initialization: tokenizer- NLTK compatible tokenizer function stemmer- NLTK compatible stemmer stop_words- list of ignored words lemm- NLTK compatible lemmatizer inv_index- (defaultdict) the inverted index positional_index- (defaultdict of defaultdicts) relevant for the bonus task only """ # Tokenization self.tokenizer = word_tokenize # Stemming self.stemmer = EnglishStemmer() #self.stemmer = nltk.PorterStemmer() #self.stemmer = nltk.LancasterStemmer() # Stopwords self.stop_words = stopwords.words('english') # Lemmatization self.lemm = nltk.WordNetLemmatizer() # The invereted index self.inv_index = defaultdict(list) # The positional index (for the bonus task) self.positional_index = defaultdict(lambda: defaultdict(list))
def pre_proc(in_str, removestop=True, alwayskeep=False, word_punc=False, unquote=False): # remove accents, wordify punctuation in_str = strip_accents(in_str, wordify=word_punc, unquote=unquote) en_stem = EnglishStemmer() # tokenize string if removestop: # remove stop words tok_list = filter(lambda x: x not in stopwords.words('english'), wordpunct_tokenize(in_str)) else: tok_list = wordpunct_tokenize(in_str) new_tok_list = [] for tok in tok_list: if tok not in WORD_PUNC_LIST: correct_spell = HOBJ.spell(tok) if not correct_spell: suggestions = [strip_accents(tmp_sug).lower() for tmp_sug in HOBJ.suggest(tok)] else: suggestions = [] if correct_spell or (tok.lower() in suggestions): new_tok_list.append(tok) tok_stem = en_stem.stem(tok) if tok_stem != tok: new_tok_list.append(tok_stem) elif len(tok) >= 3: tok_sug = None lev_perc = .34 for sug in suggestions: if not tok_sug and tok == sug[1:]: tok_sug = sug if not tok_sug: for sug in suggestions: tmp_lev_perc = float(lev_dist(tok, sug)) / float(max(len(tok),len(sug))) if not tok_sug and tmp_lev_perc < lev_perc: tok_sug = sug lev_perc = tmp_lev_perc if tok_sug: new_tok_list.append(tok_sug) tok_stem = en_stem.stem(tok_sug) if tok_stem != tok_sug: new_tok_list.append(tok_stem) elif alwayskeep: new_tok_list.append(tok) elif alwayskeep: new_tok_list.append(tok) else: new_tok_list.append(tok) out_str = string.join(new_tok_list, ' ') return out_str.lower()
def tokenizeTweet(tweet,unique = True): allWords = [word.lower() for word in word_tokenize(tweet)] # deletes @users, RT and URLs and saves #hashtags nWords, i = len(allWords), 0 hashtags = [] while i < nWords: if allWords[i] == '@': # @users allWords[i:i + 2] = [] nWords -= 2 elif allWords[i] == 'rt': # delete RT allWords[i:i + 1] = [] nWords -= 1 elif allWords[i] == '#': # save the hashtag try: hashtags.append(allWords[i + 1]) allWords[i:i + 2] = [] nWords -= 2 except: allWords[i:i + 1] = [] nWords -= 1 elif allWords[i] == "http": # delete url starting with http: allWords[i:i + 3] = [] nWords -= 3 elif allWords[i][0:3] == 'www': # delete urls starting with www. allWords[i:i + 1] = [] nWords -= 1 else: i += 1 possibleWords = filter(lambda x: x not in ourStopWords and x.isdigit() == False, allWords) stemmer = EnglishStemmer() tokens = [] for word in possibleWords: aux = str(stemmer.stem(word)) if unique: if(aux not in tokens): # this makes each token appears only once tokens.append(aux) else: tokens.append(aux) for tag in hashtags: # this makes each token appears only once if unique: if '#' + tag not in tokens: tokens.append('#' + tag) else: tokens.append('#'+tag) return tokens
def textrank(text):
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r"\w+")
    lmtzr = EnglishStemmer()
    words = [set(lmtzr.stem(word) for word in tokenizer.tokenize(sentence))
             for sentence in sentences]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)
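# The similarity() helper used by textrank() is not shown in this snippet. A
# common choice for TextRank (an assumption here, not necessarily what the
# original code used) is word overlap normalized by log sentence lengths:
import math

def similarity_example(words_i, words_j):
    # overlap of the two token sets, normalized by the log of their sizes
    if len(words_i) <= 1 or len(words_j) <= 1:
        return 0.0
    return len(words_i & words_j) / (math.log(len(words_i)) + math.log(len(words_j)))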
def GET(self): #return "Hello, world!" #def query(req): data = web.input() word=str(data.word) """ parameters = util.FieldStorage(req) word = parameters['word'] req.write(word) """ #print word db = MySQLdb.connect("127.0.0.1","root","","dizing" ) cursor=db.cursor() snowball_stemmer = EnglishStemmer() stem2 = snowball_stemmer.stem(word) cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s", (word,word,stem2)) rows = cursor.fetchall() words1 = dict() words2 = dict() for row in rows: if row[1] == word or row[3]==word: words1[word] = row[0] else: words2[word] = row[0] scenes1 = [] scenes2 = [] for (i,words_dict) in [(1,words1), (2,words2)]: wids = words_dict.values() for wid in wids: sql = "SELECT s.sentence, s.start, s.stop, m.title FROM scene AS s, words_scenes AS ws, movie as m " + \ "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid) print sql cursor.execute(sql) rows = cursor.fetchall() if (i==1): scenes1 += rows else: scenes2 += rows print scenes1 print scenes2 #req.write(str(scenes1)) #req.write(str(scenes2)) db.close() result = { 'scenes1': scenes1, 'scenes2': scenes2 } return json.dumps(result)
def text_processing(text, min_size=4, sep_char=' '):
    from nltk.stem.snowball import EnglishStemmer
    from nltk.corpus import stopwords as stwds

    stemmer = EnglishStemmer()
    stopwords = set(stwds.words('english') + contractions_without_punc)
    text = [stemmer.stem(w) for w in text.split(sep_char)
            if not w in stopwords and len(w) >= min_size]
    return text
    # the code below is unreachable: it follows the return above
    words = list()
    for word in text:
        words.append(stemmer.stem(word))
    return words
def computeSentiment(tweet_text):
    pos_count = 0
    neg_count = 0
    pos_terms = []
    neg_terms = []
    st = EnglishStemmer()
    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        # print st.stem(t.lower())
        if st.stem(t.lower()) in negative_terms:
            neg_terms.append(t.lower())
            neg_count += 1
        elif st.stem(t.lower()) in positive_terms:
            pos_terms.append(t.lower())
            pos_count += 1
    return pos_count, neg_count, set(pos_terms), set(neg_terms)
def query(index: dict, request: str): request = request.replace("."," ") request = request.replace(","," ") request = request.translate(str.maketrans('', '', string.punctuation)) request = request.lower() request = remove_stopwords(request) raw_request = request words = request.split() words2 = [] for i in words: words2.append(EnglishStemmer().stem(i)) words = words2 if len(words) == 1: if request in index: if len(index[request]) >=5: for i in range(5): print(index[request][i]) else: candidates = [] synonyms = [] for i in index[request]: candidates.append(i) for syn in wordnet.synsets(raw_request): for l in syn.lemmas(): synonyms.append(l.name()) for i in set(synonyms): s = EnglishStemmer().stem(i) for i in range(min(5,len(index[s]))): candidates.append([index[s][i][0],index[s][i][1]/2,index[s][i][2],index[s][i][3],index[s][i][4]]) candidates.sort(key = lambda x: x[1], reverse=True) for i in range(5): print(candidates[i]) else: candidates = [] synonyms = [] for syn in wordnet.synsets(raw_request): for l in syn.lemmas(): synonyms.append(l.name()) for i in set(synonyms): s = EnglishStemmer().stem(i) if s in index: for i in range(min(5,len(index[s]))): candidates.append([index[s][i][0],index[s][i][1]/2,index[s][i][2],index[s][i][3],index[s][i][4]]) else: continue candidates.sort(key = lambda x: x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) if len(words) == 2: counter = 0 candidates = [] for i in index[words[0]]: if counter > 10: break for j in index[words[1]]: if i[0] == j[0]: candidates.append([i[0],i[1]+j[1],i[2],i[3],i[4]]) counter+=1 if counter > 10: break if len(candidates) >= 5: candidates.sort(key = lambda x: x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) else: if words[0] in index: for i in range(min(5,len(index[words[0]]))): candidates.append([index[words[0]][i][0],index[words[0]][i][1]/10000,index[words[0]][i][2],index[words[0]][i][3],index[words[0]][i][4]]) if words[1] in index: for i in range(min(5,len(index[words[1]]))): candidates.append([index[words[1]][i][0],index[words[1]][i][1]/10000,index[words[1]][i][2],index[words[1]][i][3],index[words[1]][i][4]]) candidates.sort(key = lambda x: x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) if len(words) == 3: counter = 0 candidates = [] for i in index[words[0]]: if counter > 10: break for j in index[words[1]]: if i[0] == j[0]: candidates.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter+=1 if counter > 10: break counter2 = 0 candidates2 = [] for i in index[words[1]]: if counter2 > 10: break for j in index[words[2]]: if i[0] == j[0]: candidates2.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter2+=1 if counter2 > 10: break for i in candidates2: candidates.append(i) if len(candidates) >= 5: candidates.sort(key = lambda x: x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) else: if words[0] in index: for i in range(min(5,len(index[words[0]]))): candidates.append([index[words[0]][i][0],index[words[0]][i][1]/10000,index[words[0]][i][2],index[words[0]][i][3],index[words[0]][i][4]]) if words[1] in index: for i in range(min(5,len(index[words[1]]))): candidates.append([index[words[1]][i][0],index[words[1]][i][1]/10000,index[words[1]][i][2],index[words[1]][i][3],index[words[1]][i][4]]) if words[2] in index: for i in range(min(5,len(index[words[2]]))): candidates.append([index[words[2]][i][0],index[words[2]][i][1]/10000,index[words[2]][i][2],index[words[2]][i][3],index[words[2]][i][4]]) candidates.sort(key = lambda x: 
x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) if len(words) == 4: counter = 0 candidates = [] for i in index[words[0]]: if counter > 10: break for j in index[words[1]]: if i[0] == j[0]: candidates.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter+=1 if counter > 10: break counter2 = 0 candidates2 = [] for i in index[words[1]]: if counter2 > 10: break for j in index[words[2]]: if i[0] == j[0]: candidates2.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter2+=1 if counter2 > 10: break counter3 = 0 candidates3 = [] for i in index[words[2]]: if counter3 > 10: break for j in index[words[3]]: if i[0] == j[0]: candidates3.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter3+=1 if counter3 > 10: break for i in candidates3: candidates.append(i) if len(candidates) >= 5: candidates.sort(key = lambda x: x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) else: if words[0] in index: for i in range(min(5,len(index[words[0]]))): candidates.append([index[words[0]][i][0],index[words[0]][i][1]/10000,index[words[0]][i][2],index[words[0]][i][3],index[words[0]][i][4]]) if words[1] in index: for i in range(min(5,len(index[words[1]]))): candidates.append([index[words[1]][i][0],index[words[1]][i][1]/10000,index[words[1]][i][2],index[words[1]][i][3],index[words[1]][i][4]]) if words[2] in index: for i in range(min(5,len(index[words[2]]))): candidates.append([index[words[2]][i][0],index[words[2]][i][1]/10000,index[words[2]][i][2],index[words[2]][i][3],index[words[2]][i][4]]) if words[3] in index: for i in range(min(5,len(index[words[3]]))): candidates.append([index[words[3]][i][0],index[words[3]][i][1]/10000,index[words[3]][i][2],index[words[3]][i][3],index[words[3]][i][4]]) candidates.sort(key = lambda x: x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) if len(words) == 5: counter = 0 candidates = [] for i in index[words[0]]: if counter > 10: break for j in index[words[1]]: if i[0] == j[0]: candidates.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter+=1 if counter > 10: break counter2 = 0 candidates2 = [] for i in index[words[1]]: if counter2 > 10: break for j in index[words[2]]: if i[0] == j[0]: candidates2.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter2+=1 if counter2 > 10: break counter3 = 0 candidates3 = [] for i in index[words[2]]: if counter3 > 10: break for j in index[words[3]]: if i[0] == j[0]: candidates3.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter3+=1 if counter3 > 10: break for i in candidates3: candidates.append(i) counter4 = 0 candidates4 = [] for i in index[words[3]]: if counter4 > 10: break for j in index[words[4]]: if i[0] == j[0]: candidates4.append([i[0],i[1]+j[1],j[2],i[3],i[4]]) counter4+=1 if counter4 > 10: break for i in candidates4: candidates.append(i) if len(candidates) >= 5: candidates.sort(key = lambda x: x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) else: if words[0] in index: for i in range(min(5,len(index[words[0]]))): candidates.append([index[words[0]][i][0],index[words[0]][i][1]/10000,index[words[0]][i][2],index[words[0]][i][3],index[words[0]][i][4]]) if words[1] in index: for i in range(min(5,len(index[words[1]]))): candidates.append([index[words[1]][i][0],index[words[1]][i][1]/10000,index[words[1]][i][2],index[words[1]][i][3],index[words[1]][i][4]]) if words[2] in index: for i in range(min(5,len(index[words[2]]))): candidates.append([index[words[2]][i][0],index[words[2]][i][1]/10000,index[words[2]][i][2],index[words[2]][i][3],index[words[2]][i][4]]) if 
words[3] in index: for i in range(min(5,len(index[words[3]]))): candidates.append([index[words[3]][i][0],index[words[3]][i][1]/10000,index[words[3]][i][2],index[words[3]][i][3],index[words[3]][i][4]]) if words[4] in index: for i in range(min(5,len(index[words[4]]))): candidates.append([index[words[4]][i][0],index[words[4]][i][1]/10000,index[words[4]][i][2],index[words[4]][i][3],index[words[4]][i][4]]) candidates.sort(key = lambda x: x[1], reverse=True) for i in range(min(5,len(candidates))): print(candidates[i]) return
test = 'this is a test \'string\' where the stop can\'t words should be removed, also we want to use synonyms to get a better result.'
stop = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+')
result1 = ([i for i in tokenizer.tokenize(test)])
print result1
tokenizer2 = TweetTokenizer()
# note: tokenizer2 is created but the next line still uses `tokenizer`
result12 = ([i for i in tokenizer.tokenize(test)])
print result12
result2 = ([i for i in result1 if i not in stop])
print result2
st1 = LancasterStemmer()
result3 = ([st1.stem(i) for i in result2])
print result3
st2 = EnglishStemmer()
result4 = ([st2.stem(i) for i in result2])
print result4
st3 = WordNetLemmatizer()
result5 = ([st3.lemmatize(i) for i in result2])
print result5
st4 = PorterStemmer()
result6 = ([st4.stem(i) for i in result2])
print result6
def __init__(self):
    self.stemmer = EnglishStemmer()
    return
import nltk
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from flask import abort
from helpers import ret_success
from helpers import ret_failure
from helpers import parse_input
from helpers import penn_to_wn

LancasterSt = LancasterStemmer()
PorterSt = PorterStemmer()
SnowballSt = EnglishStemmer()
WordnetLm = WordNetLemmatizer()


def stemmer(method, data):
    """
    Takes an array of words in JSON format.
    """
    data = parse_input(data)
    if data == False:
        return ret_failure(703)
    else:
        res = []
        if method == "lancaster":
            for word in data:
                try:
                    res.append([word, LancasterSt.stem(word)])
                except:
class EnglishAnalyzer(object): extra_stop_words = None ngram_range = (1, 1) alphafilter = re.compile(r"(?u)[^a-z ]+") token_pattern = re.compile(r"(?u)\b\w\w+\b") stemmer = None def __init__(self, extra_stop_words=None, ngram_range=(1, 1)): self.stemmer = EnglishStemmer() self.ngram_range = ngram_range self.stop_words = ENGLISH_STOP_WORDS if extra_stop_words is not None: self.stop_words = self.stop_words | set(extra_stop_words) def analyze(self, doc): ''' replaces the analyze function of any sk-learn text vectorizer with improved text handling, namely: - filter only alphabetic words - stem words - enhanced stopwords ''' return self.word_ngrams( self.filter_stem(self.tokenize(self.preprocess(doc))), self.ngram_range) def preprocess(self, doc): return self.alphafilter.sub(' ', strip_accents_ascii(doc.lower())) def tokenize(self, doc): return self.token_pattern.findall(doc) def filter_stem(self, tokens): return [ self.stemmer.stem(w) for w in tokens if w not in self.stop_words ] def word_ngrams(self, tokens, ngram_range): # handle token n-grams, copied from # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py#L146-L175 min_n, max_n = self.ngram_range if max_n != 1: original_tokens = tokens if min_n == 1: # no need to do any slicing for unigrams # just iterate through the original tokens tokens = list(original_tokens) min_n += 1 else: tokens = [] n_original_tokens = len(original_tokens) # bind method outside of loop to reduce overhead tokens_append = tokens.append space_join = " ".join for n in range(min_n, min(max_n + 1, n_original_tokens + 1)): for i in range(n_original_tokens - n + 1): tokens_append(space_join(original_tokens[i:i + n])) return tokens
def pre_process_dev_test(root_d, input_f, min_freq, word2inx_path, count_in_sent=False, data_type='dev'): """ :param root_d: :param input_f: :param min_freq: minimum frequency, an integer :param count_in_sent: if need to count words in sentence :return: """ y = {} X = {} i = 0 stemmer = EnglishStemmer() with open(os.path.join(root_d, input_f)) as f_handle: for each_line in f_handle: # each_line = '2 If you \'re paying attention , the `` big twists '' are pretty easy to guess - but that does n\'t make the movie any less entertaining .' each_line = each_line.strip() if data_type == 'dev': y[i] = each_line[0] x = each_line[1:].strip().lower() else: y[i] = 0 x = each_line[:].strip().lower() x = re.sub(r'[\']?\d+[st]*', 'number', x) x = re.sub(r'\\/', ' ', x) x = re.sub(r'ca n\'t', 'can not', x) x = re.sub(r'n\'t', 'not', x) x = re.sub(r'\'re', 'are', x) x = re.sub(r'\'m', 'am', x) x = re.sub(r'it \'s', 'it is', x) x = re.sub(r'that \'s', 'that is', x) x = re.sub(r'there \'s', 'there is', x) x = re.sub(r'\?', 'question_mark', x) x = re.sub(r'!', 'exclamation_mark', x) x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore') x_list = x.decode('utf-8').split(' ') x_list_stemmed = [stemmer.stem(word) for word in x_list] # stemming X[i] = x_list_stemmed i += 1 print(i) if data_type == 'dev': y_json_path = os.path.join(root_d, 'y_dev.json') X_json_path = os.path.join(root_d, 'X_dev.json') if not os.path.exists(y_json_path): with open(y_json_path, 'a') as f_handle: json.dump(y, f_handle, indent=2) if not os.path.exists(X_json_path): with open(X_json_path, 'a') as f_handle: json.dump(X, f_handle, indent=2) # sentence to index and convert y to np.array X_inx = {} X_new = {} y_array = np.zeros([len(y)], dtype=np.int) if os.path.exists(word2inx_path): with open(word2inx_path, 'r') as f_handle: word2inx = json.load(f_handle) else: print("There is no 'word2inx.json' file, please run train.py first") for i, m in enumerate(sorted(list(X.keys()))): each_sentence = X[m] X_inx[m] = [] X_new[m] = [] y_array[i] = y[i] for word in each_sentence: if word in word2inx: X_new[m].append(word) X_inx[m].append(word2inx[word]) # print(' '.join(X_new[m])) # convert sentence to vector X_matrix = np.zeros([len(X_inx), len(word2inx)], dtype=np.int) total_sentence_inx = sorted(list(X_inx.keys())) # print(total_sentence_inx) for i in total_sentence_inx: if count_in_sent: inx_num = count_num_in_sent(X_inx[i]) X_matrix[i][inx_num['inxs']] = inx_num['num'] else: X_matrix[i][X_inx[i]] = 1 # if i == 1: # print(X_inx[i]) # print(inx_num) # delete the sentences that have no word if data_type == 'test': return {'X': X_matrix} sen_inx = np.where(X_matrix.sum(1) != 0) X_matrix = X_matrix[sen_inx] y_array = y_array[sen_inx] if data_type == 'dev': return {'X': X_matrix, 'y': y_array}
###############################################################################
# Process data
###############################################################################
# Processing the text
#   1) Tokenize and lowercase
#   2) Remove punctuation and special characters
#   3) Remove stopwords
#   4) Apply stemming
#   5) Removing blank/empty strings and single characters from analysis

stopwords = nltk.corpus.stopwords.words('english')
word_lemma = WordNetLemmatizer()
word_stem = EnglishStemmer()


def process_text(text):
    punctuation_to_remove = string.punctuation + "’‘—“”"
    strip = str.maketrans('', '', punctuation_to_remove)
    sub_filter = r"\b[a-zA-Z]\b"
    p_text = list(
        filter(None, [
            re.sub(sub_filter, "", word_stem.stem(word.translate(strip)))
            for word in tknzr(text.lower()) if word not in stopwords
        ]))
    return p_text
###############################################################################
def set_stemmer(stemmer_language):
    if (stemmer_language == "GER"):
        stemmers = GermanStemmer()
    else:
        stemmers = EnglishStemmer()
    return stemmers
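# Illustrative usage of set_stemmer (added example, not from the original
# source): "GER" selects the German Snowball stemmer, anything else falls back
# to the English one.
from nltk.stem.snowball import GermanStemmer, EnglishStemmer

print(set_stemmer("GER").stem("Häuser"))  # expected: "haus"
print(set_stemmer("ENG").stem("houses"))  # expected: "hous"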
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
print('after: %s ...' % y_train[:5])


# In[435]:

# Porter Stemmer
import nltk
import string
import re
from nltk.stem.snowball import EnglishStemmer

snowball = EnglishStemmer()
porter_stemmer = nltk.stem.porter.PorterStemmer()


def porter_tokenizer(text, stemmer=porter_stemmer):
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    stems = [porter_stemmer.stem(t) for t in tokens]
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct


def snowball_tokenizer(text, stemmer=snowball):
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    stems = [snowball.stem(t) for t in tokens]
if tempEmail.is_multipart():
    for part in tempEmail.walk():
        ctype = part.get_content_type()
        cdispo = str(part.get('Content-Disposition'))
        if ctype == 'text/plain' and 'attachment' not in cdispo:
            tempMessage = part.get_payload(decode=False)
            break
else:
    tempMessage = tempEmail.get_payload(decode=False)

tempSubject = tempSubject.lower()
tempMessage = tempMessage.lower()
es = EnglishStemmer()
dist = re.sub(r'[^a-zA-Z]', " ", tempMessage)
dis = word_tokenize(dist)
for token in dis:
    if token in stopwords.words('english'):
        continue
    else:
        if token in dictionary:
            dictionary[token] += 1
        else:
            dictionary[token] = 1
forwardIndex[route] = dictionary
def pre_process(root_d, input_f, min_freq, count_in_sent=False): """ :param root_d: :param input_f: :param min_freq: minimum frequency, an integer :param count_in_sent: if need to count words in sentence :return: """ y = {} X = {} word_count = {} word2inx = {} inx2word = [] i = 0 stemmer = EnglishStemmer() with open(os.path.join(root_d, input_f)) as f_handle: for each_line in f_handle: # each_line = '2 If you \'re paying attention , the `` big twists '' are pretty easy to guess - but that does n\'t make the movie any less entertaining .' each_line = each_line.strip() y[i] = each_line[0] x = each_line[1:].strip().lower() x = re.sub(r'[\']?\d+[st]*', 'number', x) x = re.sub(r'\\/', ' ', x) x = re.sub(r'ca n\'t', 'can not', x) x = re.sub(r'n\'t', 'not', x) x = re.sub(r'\'re', 'are', x) x = re.sub(r'\'m', 'am', x) x = re.sub(r'it \'s', 'it is', x) x = re.sub(r'that \'s', 'that is', x) x = re.sub(r'there \'s', 'there is', x) x = re.sub(r'\?', 'question_mark', x) x = re.sub(r'!', 'exclamation_mark', x) x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore') x_list = x.decode('utf-8').split(' ') x_list_stemmed = [stemmer.stem(word) for word in x_list] # stemming X[i] = x_list_stemmed # X[i] = x_list i += 1 # y_json_path = os.path.join(root_d, 'y_train.json') # X_json_path = os.path.join(root_d, 'X_train.json') # if not os.path.exists(y_json_path): # with open(y_json_path, 'a') as f_handle: # json.dump(y, f_handle, indent=2) # if not os.path.exists(X_json_path): # with open(X_json_path, 'a') as f_handle: # json.dump(X, f_handle, indent=2) # count for m in X: each_sentence = X[m] for word in each_sentence: if word not in word_count: word_count[word] = 1 else: word_count[word] += 1 # word_count_path = os.path.join(root_d, 'word_count_in_train.json') # if not os.path.exists(word_count_path): # with open(word_count_path, 'a') as f_handle: # json.dump(word_count, f_handle, indent=2) inx = 0 for m in sorted(word_count.keys()): if (not re.search(r'^\W+$', m)) and (word_count[m] >= min_freq) and (len(m) > 1): word2inx[m] = inx # these words will become word vector inx2word.append((inx, m)) inx += 1 # inx2word_path = os.path.join(root_d, 'inx2word.json') # if not os.path.exists(inx2word_path): # with open(inx2word_path, 'a') as f_handle: # json.dump(inx2word, f_handle, indent=2) # used in test word2inx_path = os.path.join(root_d, 'word2inx.json') if not os.path.exists(word2inx_path): with open(word2inx_path, 'a') as f_handle: json.dump(word2inx, f_handle, indent=2) # sentence to index and convert y to np.array X_inx = {} X_new = {} y_array = np.zeros([len(y)], dtype=np.int) for i, m in enumerate(sorted(list(X.keys()))): each_sentence = X[m] X_inx[m] = [] X_new[m] = [] y_array[i] = y[i] for word in each_sentence: if word in word2inx: X_new[m].append(word) X_inx[m].append(word2inx[word]) # print(' '.join(X_new[m])) # convert sentence to vector X_matrix = np.zeros([len(X_inx), len(word2inx)], dtype=np.int) total_sentence_inx = sorted(list(X_inx.keys())) # print(total_sentence_inx) inx_num = {} for i in total_sentence_inx: if count_in_sent: inx_num = count_num_in_sent(X_inx[i]) X_matrix[i][inx_num['inxs']] = inx_num['num'] else: X_matrix[i][X_inx[i]] = 1 # if i == 1: # print(X_inx[i]) # print(inx_num) # delete the sentences that have no word sen_inx = np.where(X_matrix.sum(1) != 0) X_matrix = X_matrix[sen_inx] y_array = y_array[sen_inx] # np.save('X', X_matrix) # np.save('y', y_array) return {'X': X_matrix, 'y': y_array}
import pandas as pd
import numpy as np
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize
import sys

reload(sys)
sys.setdefaultencoding('ISO-8859-1')

stemmer = EnglishStemmer()

with open('./input/train.csv', mode='r') as f1:
    with open('./input/train_stemmed.csv', 'w') as f2:
        lines = f1.readlines()
        for i, line in enumerate(lines):
            f2.write(" ".join([
                stemmer.stem(word.lower()) for word in wordpunct_tokenize(line)
            ]) + "\n")

with open('./input/test.csv', mode='r') as f1:
    with open('./input/test_stemmed.csv', 'w') as f2:
        lines = f1.readlines()
        for i, line in enumerate(lines):
            f2.write(" ".join([
                stemmer.stem(word.lower()) for word in wordpunct_tokenize(line)
            ]) + "\n")

with open('./input/product_descriptions.csv', mode='r') as f1:
    with open('./input/product_descriptions_stemmed.csv', 'w') as f2:
        lines = f1.readlines()
        for i, line in enumerate(lines):
            f2.write(" ".join([
import os
from nltk.stem.snowball import EnglishStemmer
from pkgutil import get_data

# Variables
# ---------------------------------------------------------------------------------------------
stopwords = {
    word.strip()
    for word in str(get_data('data', 'stopwords.txt').decode('utf-8')).split('\n')
}
stemmer = EnglishStemmer()    # stemmer class
wordPattern = "^[^\W\d_]+$"   # regex pattern to match a word
epsilon = 1e-4                # epsilon value for algorithm
damping = 0.85                # damping value for algorithm
delta = 1e-7                  # delta value for algorithm
# ---------------------------------------------------------------------------------------------
text_rem = [x for x in text_3 if x not in text_4]

## we're going to use a similar format to apply various stemming/lemmatizing/synonyms algorithms
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.stem import PorterStemmer
pt = PorterStemmer()
from nltk.stem.snowball import EnglishStemmer
sb = EnglishStemmer()
from nltk.stem.wordnet import WordNetLemmatizer
wn = WordNetLemmatizer()

## let's examine the word ``better"
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')
wn.lemmatize('families', 'n')
##
## applying the porter stemmer to the gettysburg address
from nltk.corpus import stopwords                # import stopwords from nltk corpus
from nltk.stem.snowball import FrenchStemmer    # import the French stemming library
from nltk.stem.snowball import EnglishStemmer   # import the English stemming library
from nltk.tokenize import TreebankWordTokenizer # import the Treebank tokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.probability import FreqDist

# import lib to detect language
import elastic.detect_lang
import elastic.common as common

# name stemmers
stemmer_fr = FrenchStemmer()
stemmer_en = EnglishStemmer()

# Load tokenizer
# You can choose the most efficient, however wordpunct is working well
# tokenizer = TreebankWordTokenizer()
tokenizer = WordPunctTokenizer()


# stemmer function: stems each token in a list
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
def stemming_tokenizer(text, stemmer=EnglishStemmer()):
    stemmed_text = [stemmer.stem(word) for word in word_tokenize(text, language='english')]
    return stemmed_text
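# Illustrative usage of stemming_tokenizer (added example, not from the
# original source); note the default-argument stemmer is created once and
# shared across calls, which is fine since it is stateless.
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer

print(stemming_tokenizer("The cats were running quickly"))
# expected: ['the', 'cat', 'were', 'run', 'quick']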
class DocParse: max_sentence = 10000 include_file = '' outfile = '' count = 0 def __init__(self): self.stemming_on = False self.stop_word_on = False self.summary = False self.use_threshold = False self.max_words_in_summary = 100 self.keep_all = True # by default do not exclude dupe words self.normalize = False self.rval = 0 self.score = 'size' # size | tfidf | stfidf self.update = False self.penalty = False self.total_sentences = 0 self.sentence_dictionary = defaultdict(list) # map of modified sentences to actual sentences (tokenized) self.dictionary = {} # keys are final tokenized output # values are 2-tuple of original sentence and size self.mod_words = () # all unique words of document self.mod_sentences = ((),) self.unique_sent = ((),) self.alg = Alg.Algorithms() self.stemmer = EnglishStemmer() self.doc_size = 0 def tokenize(self, in_file): """Reads in_file and tokenizes into words.""" global debug_on global punctuations if debug_on: print('stem:', self.stemming_on) if debug_on: print('stop:', self.stop_word_on) if debug_on: print('keep:', self.keep_all) f = open(in_file) raw = f.read() sentences_list = [] words_list = [] dictionary_values = [] sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') raw_sentences = sent_tokenizer.tokenize(raw) self.total_sentences = len(raw_sentences) # regex to match for xml type strings regex = re.compile('</|<[A-Za-z]|[A-Za-z]>') # operate on each sentence string from in_file for s, sentence in enumerate(raw_sentences): if debug_on: print('sentence #', str(s + 1)) # if regex match count greater than 2, reduce sentence to nothing count = len(re.findall(regex, sentence)) if count > 2: sentence = " " # remove newlines, after tokenizer. sentence = sentence.replace('\n', ' ') if debug_on: print(s, sentence[0], sentence) # change sentence into tokens (words) tokens = nltk.word_tokenize(sentence) # create table that maps all punctuation marks to None table = str.maketrans({key: None for key in string.punctuation if key != '\''}) # keep only words and numbers words = [word.lower() for word in tokens if (word.translate(table) and word != '\'')] if debug_on: print("nltk tokens", end=":") print(tokens) print("parsed words", end=": ") print(words) print(len(words)) sentence_size = len(words) if debug_on: print('sent len:', str(sentence_size)) # remove stop words if self.stop_word_on: filtered_words = [word for word in words if word not in stopwords.words('english')] words = filtered_words # stem words if self.stemming_on: filtered_words = [self.stemmer.stem(word) for word in words] words = filtered_words if debug_on: print('after filters:', str(words)) # compress sentences to unique words only if not doing greedy3 or tf-idf if self.keep_all: unique_words = words # removes repeated sentences if words not in sentences_list: sentences_list.append(words) dictionary_values.append((sentence, sentence_size, s)) else: # make list of unique words from current sentence unique_words = list(set(words)) # if unique word set not in sentence list than add this set # all repeated sentences will be removed at this stage if unique_words not in sentences_list: sentences_list.append(unique_words) # update local dictionary that maps index to tuple (original sentence, and length) dictionary_values.append((sentence, sentence_size, s)) if debug_on: print(sentences_list) # add unique words to doc word list for w, word in enumerate(words): if word not in words_list: words_list.append(word) # add the modified sentence into dictionary 
self.sentence_dictionary[tuple(unique_words)].append(sentence) f.close() # this loop changes all the sentences of sentence_list into tuples for s, sentence in enumerate(sentences_list): sentences_list[s] = tuple(sentence) self.dictionary[sentences_list[s]] = dictionary_values[s] # store word list as tuple # store sentence list as tuple self.mod_words = tuple(words_list) self.mod_sentences = tuple(sentences_list) self.doc_size = len(self.mod_sentences) def find_dominating_set(self, option='greedy'): if option == 'greedy': if self.score == 'size': if self.normalize: self.do_g_unique() else: self.do_g_size() elif self.score == 'tfidf': self.do_g_tfidf() elif self.score == 'stfidf': self.do_g_stfidf() elif option == 'dynamic': self.do_dynamic() elif option == 'optimal': if self.optimal_type == 'dp': self.do_bottomup() elif self.optimal_type == 'ilp': self.do_ilp() elif option == 'mcdonald': self.do_mcdonald() def do_mcdonald(self): global debug_on self.alg.mcdonald(self.mod_sentences, self.mod_words, self.dictionary, use_threshold=self.use_threshold, word_count=self.max_words_in_summary) print(self.make_summary(self.alg.dynamic_ans)) def do_g_size(self): global debug_on answer = self.alg.greedy(self.mod_sentences, self.mod_words, self.dictionary, update=self.update, penalty=self.penalty, word_threshold=self.use_threshold, word_count=self.max_words_in_summary) if debug_on: print('greedy answer', answer) if self.summary: print(self.make_summary(answer)) else: print('len(ans):', len(answer)) print('len(doc):', self.total_sentences) if debug_on: print('*****') print(self.sentence_dictionary) print('*****') def do_g_unique(self): global debug_on answer = self.alg.greedy2(self.mod_sentences, self.mod_words, self.sentence_dictionary, self.dictionary, rval=self.rval, update=self.update, penalty=self.penalty, word_threshold=self.use_threshold, word_count=self.max_words_in_summary) if debug_on: print('greedy answer', answer) if self.summary: print(self.make_summary(answer)) else: print('len(ans):', len(answer)) print('len(doc):', self.total_sentences) if debug_on: print('*****') print(self.sentence_dictionary) print('*****') def do_g_tfidf(self): global debug_on self.answer = self.alg.tfidf(self.mod_sentences, self.mod_words, self.dictionary, rval=self.rval, ratio=self.normalize, update=self.update, penalty=self.penalty, word_count=self.max_words_in_summary, use_threshold=self.use_threshold) if debug_on: print('tfidf answer', self.answer) if self.summary: print(self.make_summary(self.answer)) else: print('len(ans):', len(self. 
answer)) print('len(doc):', self.total_sentences) if debug_on: print('*****') print(self.sentence_dictionary) print('*****') def do_g_stfidf(self): global debug_on answer = self.alg.stfidf(self.mod_sentences, self.mod_words, self.dictionary, rval=self.rval, update=self.update, penalty=self.penalty, ratio=self.normalize, word_count=self.max_words_in_summary, use_threshold=self.use_threshold) if debug_on: print('tfidf answer', answer) if self.summary: print(self.make_summary(answer)) else: print('len(ans):', len(answer)) print('len(doc):', self.total_sentences) if debug_on: print('*****') print(self.sentence_dictionary) print('*****') def do_g_rtfidf(self): global debug_on answer = self.alg.tfidf(self.mod_sentences, self.mod_words, self.dictionary, ratio=True, use_threshold=self.use_threshold) if debug_on: print('tfidf answer', answer) if self.summary: print(self.make_summary(answer)) else: print('len(ans):', len(answer)) print('len(doc):', self.total_sentences) if debug_on: print('*****') print(self.sentence_dictionary) print('*****') def do_bottomup(self): global debug_on self.alg.bottom_up(self.mod_sentences) if self.summary: # print(self.alg.dynamic_ans) print(self.make_summary(self.alg.dynamic_ans)) else: print('len(ans):', len(self.alg.dynamic_ans)) print('len(doc):', self.total_sentences) def do_ilp(self): global debug_on self.alg.ilp(self.mod_sentences) print('alg.dynamic_ans:\n', self.alg.dynamic_ans) if self.summary: # print(self.alg.dynamic_ans) print(self.make_summary(self.alg.dynamic_ans)) else: print('len(ans):', len(self.alg.dynamic_ans)) print('len(doc):', self.total_sentences) def do_dynamic(self): global debug_on if debug_on: print(self.mod_sentences) if debug_on: print(self.mod_words) if self.doc_size > 20: print('too many sentences:', self.doc_size) return # else: # print('there are', self.doc_size, 'sentences') self.alg.dynamic(self.mod_sentences, self.mod_words) # self.sd.dynamic_lookup(set_of_sents, set_of_words) if debug_on: print('') self.alg.dynamic_calc_answer(self.mod_sentences, self.mod_words) if debug_on: print(self.alg.dynamic_ans) if debug_on: for i, items in enumerate(self.alg.dynamic_ans): print(i, ":", items) if self.summary: print(self.make_summary(self.alg.dynamic_ans)) else: print('len(ans):', len(self.alg.dynamic_ans)) print('len(doc):', self.total_sentences) # print 'dynamic answer', answer pass def make_summary(self, sentences): global debug_on ret_val = [] word_count = 0 for sentence in sentences: if self.dictionary[sentence][1] <= (self.max_words_in_summary - word_count) or \ self.max_words_in_summary == 0: if debug_on: print(str(self.dictionary[sentence][1]) + ": " + self.dictionary[sentence][0]) ret_val.append(self.dictionary[sentence][0]) word_count += self.dictionary[sentence][1] else: if debug_on: print(str(self.dictionary[sentence][1]) + ": " + self.dictionary[sentence][0]) ret_val.append(self.shorten(self.dictionary[sentence][0], self.max_words_in_summary - word_count)) break pass if self.outfile: with open(self.outfile, 'w') as f: f.write(" ".join(ret_val)) pass return " ".join(ret_val) pass def shorten(self, sentence, length): global punctuations global debug_on tokens = nltk.word_tokenize(sentence) # remove all non-alphanumeric characters words_used = 0 words = [] for word in tokens: if words_used == length: break if debug_on: print(word, end=' ') words.append(word) if word not in punctuations: words_used += 1 if debug_on: print('keep', words_used) else: if debug_on: print('remove') # words = [word for word in tokens if word not in 
punctuations] # return " ".join(words[:length-len(words)]) return " ".join(words)
    count2 = 0
    for word, index in self.index.items():
        print(word, ':', index)
        temp = index[0][0]
        for i in index:
            if (i[0] != temp):
                temp = i[0]
                count = count + 1
            if (i[0] == 0):
                count1 = count1 + 1
            else:
                count2 = count2 + 1
    return [count / count1, count / count2, count]


index = Index(nltk.word_tokenize, EnglishStemmer(), nltk.corpus.stopwords.words('english'))

for i in range(0, 2):
    # Converting .pdf file to .txt file
    reader = PyPDF2.PdfFileReader('E:/C/Python/IWP_Project/test' + str(i) + '.pdf')
    text = data_func.convert_pdf_to_string('E:/C/Python/IWP_Project/test' + str(i) + '.pdf')
    text_file = open('E:/C/Python/IWP_Project/test' + str(i) + '.txt', 'w')
    n = text_file.write(text)
    text_file.close()
    file = open('E:/C/Python/IWP_Project/test' + str(i) + '.txt')
    read = file.read()
    file.seek(0)
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from stop_words import *
import string
from pymystem3 import Mystem
from nltk.corpus import stopwords
import nltk

nltk.download("stopwords")
nltk.download('punkt')

rus_stem = Mystem()
en_stem = EnglishStemmer()


def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "en"


def stem(text):
    # Russian stop words plus a couple of extra tokens to drop
    stop_list = stopwords.words('russian') + ['https', 'ru']
    return [
        word for word in rus_stem.lemmatize(text)
        if not set(string.punctuation) & set(word) and word.strip()
        and word not in stop_list
    ]
def stem_lokens(tokens):
    sst = EnglishStemmer()
    return [[sst.stem(word) for word in tokens_i] for tokens_i in tokens]
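# Illustrative usage of stem_lokens (added example, not from the original
# source): it stems a nested list of tokens, one inner list per sentence.
from nltk.stem.snowball import EnglishStemmer

print(stem_lokens([["cats", "running"], ["stemming", "words"]]))
# expected: [['cat', 'run'], ['stem', 'word']]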
def __init__(self):
    self.featureLookup = {}
    self.class_map_dic = {}
    self.index = Index(nltk.word_tokenize, EnglishStemmer(),
                       nltk.corpus.stopwords.words('english'))
def __init__(self):
    self.stoplist = set(stopwords.words('english'))
    self.stemmer = EnglishStemmer()
import os
import re

from nltk.stem.snowball import EnglishStemmer


class CleanText:
    def __init__(self, stop_en=None, stop_th=None, keyword=None):
        # Regular expressions used while cleaning mixed Thai/English text.
        self.pattern_thai_char = re.compile(u'[\u0e00-\u0e7f]')
        self.pattern_new_sentence = re.compile(r'\.[0-9]+(\)|\.) ')
        self.pattern_th_out = re.compile(u'[\u0e00-\u0e7f][^\u0e00-\u0e7f]')
        self.pattern_th_in = re.compile(u'[^\u0e00-\u0e7f][\u0e00-\u0e7f]')
        self.pattern_num_bullet = re.compile(r'^[0-9]+(\)|\.)*$')
        self.pattern_eng_token = re.compile('^[a-zA-Z]+$')
        self.pattern_number = re.compile(r'\+*[0-9]+')
        self.pattern_phone_number = re.compile('[0-9]+-[0-9]+-[0-9]+')
        self.pattern_email = re.compile(r'[a-zA-Z._\-0-9]+@[a-zA-Z._\-0-9]+')
        self.pattern_url = re.compile(r'(https://|www\.)[a-zA-Z0-9]+\.[a-z]+[^\s]*')
        self.pattern_sentence_collide = re.compile('[a-z][A-Z]')  # stray ']' removed
        self.pattern_thai_name = re.compile(u'\u0e04\u0e38\u0e13\s*[\u0e00-\u0e7f]+\s+')
        self.pattern_prefix_garbage = re.compile(
            r'^\-|^\||^\.|^\#{1,2}|^(\-\|)|^(\+\|)|^(\#\|)|^(\.\|)')  # missing '|' before the last alternative restored

        # Character whitelist: short entries are literal characters,
        # longer entries are hexadecimal code points.
        dict_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'dict')
        self.charset = {}
        with open(os.path.join(dict_dir, 'charset'), 'rt') as charfile:
            for item in charfile.read().split('\n'):
                if not item:  # skip blank lines
                    continue
                if len(item) < 4:
                    self.charset[item] = ord(item)
                else:
                    self.charset[chr(int(item, 16))] = int(item, 16)

        self.stemming = EnglishStemmer()

        if stop_en:
            with open(os.path.join(dict_dir, stop_en), 'rt', encoding='utf-8') as stop_file:
                self.stop_en = set(stop_file.read().split('\n'))
        else:
            self.stop_en = set()
        if stop_th:
            with open(os.path.join(dict_dir, stop_th), 'rt', encoding='utf-8') as stop_file:
                self.stop_th = set(stop_file.read().split('\n'))
        else:
            self.stop_th = set()
        if keyword:
            with open(os.path.join(dict_dir, keyword), 'rt', encoding='utf-8') as keyword_file:
                self.keyword = set(keyword_file.read().split('\n'))
        else:
            self.keyword = set()

    def clean_text(self, text):
        def validate_char(val_text):
            # Replace every character that is not in the whitelist with a space.
            val_text = val_text.replace('&', ' ')
            val_text = val_text.replace(u'\xa0', ' ')  # assumed: the original replaced a non-breaking space
            ret_text = ''
            for cha in val_text:
                try:
                    self.charset[cha]
                except KeyError:
                    ret_text += ' '
                else:
                    ret_text += cha
            while self.pattern_prefix_garbage.search(ret_text):
                ret_text = self.pattern_prefix_garbage.sub(' ', ret_text)
            while ret_text.find('  ') != -1:
                ret_text = ret_text.replace('  ', ' ')  # collapse runs of spaces
            return ret_text

        def split_th_en(splt_text):
            # Insert a space at every Thai/English script boundary.
            insert_pos = []
            splt_text = splt_text[:]
            for pos, item in enumerate(splt_text[:-2]):
                if (self.pattern_th_in.search(splt_text[pos:pos + 2])
                        or self.pattern_th_out.search(splt_text[pos:pos + 2])):
                    insert_pos.append(pos + 1)
            for pos in reversed(insert_pos):
                splt_text = splt_text[:pos] + ' ' + splt_text[pos:]
            return splt_text

        def remove_thai_stop(th_text):
            ## TH : do longest matching against the Thai stop-word list
            stop_pos = [[0, 0]]
            for j in range(len(th_text) - 1):
                for k in range(j + 1, min(len(th_text), j + 36)):
                    if th_text[j:k] in self.stop_th:
                        # found keyword: instead of returning a string,
                        # record the matched positions j to k
                        if j <= stop_pos[-1][1]:
                            stop_pos[-1] = [stop_pos[-1][0], k]
                        else:
                            stop_pos.append([j, k])
                        break
            newstr = ''
            if len(stop_pos) == 1:
                newstr = th_text
            else:
                for j in range(len(stop_pos) - 1):
                    newstr += th_text[stop_pos[j][1]:stop_pos[j + 1][0]] + ' '
            return newstr

        text = text.replace(u'\u0e46', ' ')
        text = self.pattern_email.sub(' ', text)
        text = self.pattern_url.sub(' ', text)
        text = self.pattern_phone_number.sub(' ', text)
        text = self.pattern_thai_name.sub(' ', text)
        text = split_th_en(text)
        text = self.pattern_new_sentence.sub(' . ', text)
        text = text.replace('.', ' . ')
        text = validate_char(text)
        text = remove_thai_stop(text)
        text_split = text.split(' ')
        text_split = [item for item in text_split
                      if item not in self.stop_en and not self.pattern_num_bullet.search(item)]
        text_split = [self.stemming.stem(item)
                      if self.pattern_eng_token.search(item) and item not in self.keyword
                      else item
                      for item in text_split]
        text = '|'.join(text_split)
        return text
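# A minimal usage sketch (assumed): the constructor arguments are file names
# inside the package's 'dict' directory; 'stop_en.txt', 'stop_th.txt', and
# 'keyword.txt' below are hypothetical placeholders for whatever files ship
# with the project.
if __name__ == '__main__':
    cleaner = CleanText(stop_en='stop_en.txt', stop_th='stop_th.txt', keyword='keyword.txt')
    print(cleaner.clean_text(u'Contact info@example.com or visit www.example.com today'))
    # -> a '|'-joined string of stemmed, stop-word-filtered tokens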
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.snowball import EnglishStemmer

with open('test.txt', 'r') as f:
    # Decode the contents to unicode and encode back to utf-8 (Python 2),
    # stripping a possible BOM via the utf-8-sig codec.
    f_contents = f.read().decode("utf-8-sig").encode("utf-8")

words = word_tokenize(f_contents)

stemmer = PorterStemmer()
# for w in words:
#     print(stemmer.stem(w))
print stemmer.stem('having')

stemmer2 = SnowballStemmer('english')
print stemmer2.stem('distribution')

stemmer3 = EnglishStemmer()
print stemmer3.stem('require')
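# A small comparison sketch (an assumed helper, not part of the original
# script): stem the same tokens with Porter and with the English Snowball
# stemmer to see where the two algorithms differ.
def compare_stemmers(tokens):
    porter = PorterStemmer()
    snowball = EnglishStemmer()  # EnglishStemmer is the English Snowball ("Porter2") stemmer
    return [(t, porter.stem(t), snowball.stem(t)) for t in tokens]

# e.g. compare_stemmers(['having', 'distribution', 'require'])
# -> [('having', 'have', 'have'), ('distribution', 'distribut', 'distribut'),
#     ('require', 'requir', 'requir')]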
import pickle
from collections import defaultdict, Counter
from string import punctuation

from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
from tensorflow.python.platform import gfile

# Project-specific module and special-token constants; assumed to be defined
# elsewhere in the repository (adjust the import to wherever PAD/UNK/START/EOS live).
import globals
from globals import PAD, UNK, START, EOS


class VocabProcessor:
    def __init__(self, tokenizer_fn, batch_size, remove_stop_words, should_stem,
                 limit_vocab=True, max_vocab_size=80000):
        self.vocab = defaultdict(self.next_value)  # map tokens to ids; automatically gets the next id when needed
        self.token_counter = Counter()             # counts the token frequency
        self.vocab[PAD] = globals.PAD_ID
        self.vocab[UNK] = globals.UNK_ID
        self.vocab[START] = globals.START_ID
        self.vocab[EOS] = globals.END_ID
        self.next = globals.END_ID                 # after 3 comes 4
        self.tokenizer = tokenizer_fn
        self.reverse_vocab = {}
        self.batch_size = batch_size
        self.remove_stop_words = remove_stop_words
        self.should_stem = should_stem
        self.limit_vocab = limit_vocab
        self.max_vocab_size = max_vocab_size

        if remove_stop_words:
            # Punctuation is removed under the same flag.
            remove_words = stopwords.words("english") + list(punctuation)
            # not included in punctuation for some reason
            remove_words.append("+")
            remove_words.append("``")
            remove_words.append("''")
            remove_words.append("'s")
            self.remove_word_set = set(remove_words)

        if self.should_stem:
            self.stemmer = EnglishStemmer()

    def next_value(self):
        """Gets the next index for the defaultdict."""
        self.next += 1
        return self.next

    def reset_processor(self):
        """Resets the vocabulary back to just the special tokens."""
        self.vocab = defaultdict(self.next_value)  # map tokens to ids; automatically gets the next id when needed
        self.token_counter = Counter()             # counts the token frequency
        self.vocab[PAD] = globals.PAD_ID
        self.vocab[UNK] = globals.UNK_ID
        self.vocab[START] = globals.START_ID
        self.vocab[EOS] = globals.END_ID
        self.next = globals.END_ID                 # after 3 comes 4
        self.reverse_vocab = {}

    def ids_to_string(self, tokens, length=None):
        """Convert the ids back to strings."""
        string = ''.join([self.reverse_vocab[x] for x in tokens[:length]])
        return string

    def convert_token_to_id(self, token):
        '''
        Gets a token and looks it up in the vocabulary.
        If it doesn't exist in the vocab, it gets added with a new id.
        Then we return the id.
        :param token:
        :return: the token id in the vocab
        '''
        if self.limit_vocab:
            if token in self.vocab:
                self.token_counter[token] += 1
                return self.vocab[token]
            else:
                if self.next < self.max_vocab_size:
                    self.token_counter[token] += 1
                    return self.vocab[token]
                else:
                    self.token_counter[UNK] += 1
                    return self.vocab[UNK]
        else:
            self.token_counter[token] += 1
            return self.vocab[token]

    # does more than just tokenization
    def tokenize(self, text):
        words = self.tokenizer(text)
        # 7-17-18 1:12 PM Testing with lowercase all words, works, switched to this!
        if self.remove_stop_words:
            if globals.VOCAB_LOWERCASE:
                words = [word.lower() for word in words
                         if word.lower() not in self.remove_word_set]
            else:
                words = [word for word in words
                         if word.lower() not in self.remove_word_set]
        if self.should_stem:
            words = [self.stemmer.stem(word) for word in words]
        return words

    def tokens_to_id_list(self, tokens):
        return list(map(self.convert_token_to_id, tokens))

    # def sentence_to_id_list(self, sent):
    #     tokens = self.sentence_to_tokens(sent)
    #     id_list = self.tokens_to_id_list(tokens)
    #     return id_list

    # def sentence_to_numpy_array(self, sent):
    #     id_list = self.sentence_to_id_list(sent)
    #     return np.array(id_list)

    # All used to map back to vocab, not tested fully or used
    # def update_reverse_vocab(self):
    #     self.reverse_vocab = {id_: token for token, id_ in self.vocab.items()}

    # def id_list_to_text(self, id_list):
    #     tokens = ''.join(map(lambda x: self.reverse_vocab[x], id_list))
    #     return tokens

    # Untested
    def save(self, filename):
        """Saves vocabulary processor into given file.

        Args:
          filename: Path to output file.
        """
        with gfile.Open(filename, 'wb') as f:
            f.write(pickle.dumps(self))

    @classmethod
    def restore(cls, filename):
        """Restores vocabulary processor from given file.

        Args:
          filename: Path to file to load from.

        Returns:
          VocabularyProcessor object.
        """
        with gfile.Open(filename, 'rb') as f:
            return pickle.loads(f.read())
        # feat_tf_map = str(feat) + ':' + str(tf)
        f.write(feat_tf_idf_map)
        f.write(' ')
        doc_list[class_label].append(feat_tf_idf_map)
        # doc_list[k].append(feat_tf_map)
    f.write('\n')


# main method
if __name__ == '__main__':
    # instantiate class 'Index'
    index = Index(nltk.word_tokenize, EnglishStemmer(), nltk.corpus.stopwords.words('english'))
    inv_index = index.indexed_docs()  # saves the inverted index into a variable

    # reads arguments from command line
    dir_newsgroups_data = sys.argv[1]  # directory of newsgroups data (the root directory mini_newsgroup)
    subject_body_index(dir_newsgroups_data)

    feature_defn_file = sys.argv[2]  # file name to use to write the feature definition file
    feature_defn_gen(feature_defn_file)
    print('Produced feature definition file')

    class_defn_file = sys.argv[3]  # file name to use to write the class definition file
    class_defn_gen(class_defn_file, dir_newsgroups_data)
    print('Produced class definition file')
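# The Index class itself is not shown in this fragment.  A minimal hypothetical
# sketch, matching only the constructor and indexed_docs() calls used above
# (tokenizer, stemmer, and stop-word list in; {term: {doc_id: tf}} out):
class Index:
    def __init__(self, tokenizer, stemmer, stopwords):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.stopwords = set(stopwords)
        self._index = {}  # term -> {doc_id: term frequency}

    def add(self, doc_id, text):
        # Tokenize, drop stop words, stem, and count occurrences per document.
        for token in self.tokenizer(text.lower()):
            if token in self.stopwords:
                continue
            term = self.stemmer.stem(token)
            self._index.setdefault(term, {}).setdefault(doc_id, 0)
            self._index[term][doc_id] += 1

    def indexed_docs(self):
        return self._index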
global_path = "C:\\Users\\surat_000\\Documents\\Visual Studio 2013\\Projects\\searchDB_CS\\searchDB_CS\\bin\\Debug\\" global_path = global_path + "text-data-G06F-20000-1\\" train_path = global_path + "train\\" test_path = global_path + "test\\" #stop_words_list = ['it', 'a', 'is', ] #Trying to use PorterStemmer import nltk from nltk import word_tokenize from nltk.stem.porter import PorterStemmer from nltk.stem.snowball import EnglishStemmer ####### # based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html stemmer = EnglishStemmer() def stem_tokens(tokens, stemmer): stemmed = [] for item in tokens: stemmed.append(stemmer.stem(item)) return stemmed import string import unicodedata import re def tokenize(text):
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


def stemmed_words(doc):
    # Run scikit-learn's default analyzer over the document, then stem each token.
    stemmer = EnglishStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return ' '.join(stemmer.stem(w) for w in analyzer(doc))
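# A hedged usage sketch: plug stemmed_words in as the vectorizer's preprocessor
# so each document is stemmed before the vectorizer's own analysis pass.  The
# two-document corpus below is made up for illustration.
corpus = ["distributions are being distributed",
          "requiring required requirements"]
vectorizer = TfidfVectorizer(preprocessor=stemmed_words)
X = vectorizer.fit_transform(corpus)
print(sorted(vectorizer.vocabulary_))  # stemmed terms, e.g. 'distribut', 'requir'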
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import TweetTokenizer


def tweet_tokenizer(text, stemmer=EnglishStemmer()):
    # Tokenize with the tweet-aware tokenizer (lowercase, strip @handles,
    # shorten repeated characters), then stem every token.
    stemmed_text = [stemmer.stem(word)
                    for word in TweetTokenizer(preserve_case=False,
                                               strip_handles=True,
                                               reduce_len=True).tokenize(text)]
    return stemmed_text
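# A hedged usage sketch with a made-up tweet: the handle is stripped, case is
# folded, character elongations are reduced, and each token is stemmed.
print(tweet_tokenizer("@someuser LOVING these new distributions!!! #nlp"))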
dataloc = '/Volumes/Seagate Backup Plus Drive/PoliTweet/TwitterData/'

import string                                  # presumably imported in an earlier notebook cell
import emoji

from nltk.tokenize import TweetTokenizer       # presumably imported in an earlier notebook cell
from nltk.stem.snowball import EnglishStemmer  # presumably imported in an earlier notebook cell

# Sci-Kit Learn
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


# ### Required Data

# In[7]:

twtokenizer = TweetTokenizer()


# In[ ]:

stemmer = EnglishStemmer()  # get an instance of the SnowballStemmer for English


# In[8]:

punctuation = list(set(string.punctuation)) + ['…', '’', '...', '—', ':/', '”', '..', '“']


# In[9]:

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
             'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
             'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
             'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
             'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',