def run_search(dict_file, postings_file, queries_file, results_file):
    """Using the given dictionary file and postings file, perform searching
    on the given queries file and output the results to a file."""
    print('running search on the queries...')
    stemmer = stem.PorterStemmer()

    # Load the dictionary and the document length vectors into memory.
    with open(dict_file, "rb") as dictionary_f:
        dictionary = pickle.load(dictionary_f)
    with open(os.path.join(os.getcwd(), "lengths.txt"), "rb") as lengths_f:
        lengths = pickle.load(lengths_f)

    # Open and read each line of the queries file.
    try:
        fd = open(queries_file, 'r', encoding="utf8")
        line = fd.readline()
    except OSError:
        error_opening_file(queries_file)
        sys.exit(2)

    # Truncate the results file, then reopen it for appending.
    open(results_file, "w").close()
    output_file = open(results_file, "a")

    # Evaluate each line (one query per line).
    while line:
        # A blank query produces a blank output line.
        if not line.strip():
            output_file.write('\n')
            line = fd.readline()
            continue

        # Tokenize and stem the query.
        tokens = nltk.word_tokenize(line)
        stemmed_tokens = [stemmer.stem(token.lower()) for token in tokens]
        unique_tokens = set(stemmed_tokens)

        # COSINE SCORE: for each query term t, accumulate a per-document score.
        scores = defaultdict(float)
        for token in unique_tokens:
            docFreq_pointer = dictionary.get(token)
            if docFreq_pointer is None:
                continue
            # docFreq_pointer = (document frequency, byte offset in postings file)
            document_frequency = docFreq_pointer[0]

            # Read only this token's postings list: seek to the stored
            # offset and unpickle the single object found there.
            with open(postings_file, "rb") as postings_f:
                postings_f.seek(docFreq_pointer[1])
                token_postings_list = pickle.load(postings_f)

            query_idf = (len(lengths) + 1) / (document_frequency + 1)
            query_weight = ((1 + math.log(stemmed_tokens.count(token), 10))
                            * math.log(query_idf, 10))
            for docID, _term_freq in token_postings_list:
                doc_vector = lengths[docID]
                scores[docID] += doc_vector[token] * query_weight

        # Build the heap once all scores are final; pushing partial scores
        # inside the loop (as before) left stale duplicates in the heap.
        # nsmallest comes from heapq, alongside heapify and heappush.
        heap = [(-score, docID) for docID, score in scores.items()]
        heapify(heap)
        maxTen = nsmallest(10, heap)  # note: heap[:10] is NOT the top ten
        result = [docID for _neg_score, docID in maxTen]

        # Write the result with the specified format, then a newline.
        output_file.write(' '.join(map(str, result)))
        output_file.write("\n")
        line = fd.readline()

    output_file.close()
    fd.close()
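A minimal usage sketch, assuming the matching index step produced the two pickled files named below; all four file names are placeholders:

# Hypothetical invocation; the file names must match whatever the
# corresponding indexing script actually wrote out.
if __name__ == "__main__":
    run_search("dictionary.txt", "postings.txt", "queries.txt", "results.txt")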
def stem_word(word):
    # stemming of words
    new_word = word.replace(",", "")
    new_word = new_word.replace("\'", "")
    stemmer = stem.PorterStemmer()
    return stemmer.stem(new_word)
def __init__(self):
    self.stemmer = stem.PorterStemmer()
    self.validreg = re.compile(r'^[-=!@#$%^&*()_+|;";,.<>/?]+$')
    self.splitreg = re.compile(r'\s|,|\.|\(|\)|\'|/|\'|\[|\]|-')
def train_knn(trainlines, model_fname):
    # worddict[fileindex] -> list of stemmed words for that document
    worddict = defaultdict(lambda: defaultdict(dict))
    gtruth_dict = defaultdict(int)
    idf_dict = defaultdict(float)
    chicount_dict = defaultdict(lambda: defaultdict(int))
    fileindex = 0
    # counting the instances of each class
    class_count = defaultdict(int)
    #chifeat_words = ["_RARE_"]
    chifeat_words = []
    total_docs = len(trainlines)
    uniquewords = set()
    stemmer = stem.PorterStemmer()
    stopwords = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
        'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
        'about', 'between', 'into', 'through', 'during', 'before', 'after',
        'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'over',
        'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
        'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
        'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than',
        'too', 'very', 's', 'can', 'will', 'just', 'should', 'now', 'movie',
        'would', 'thing', 'film', 'cinema', 'movie', 'movies', 'cinemas',
        'tv', 'documentary', 'y'
    ]
    '''
    stopwords.extend(string.punctuation)
    stopwords = set(stopwords)
    '''
    tokenizer = RegexpTokenizer(r'\w+')
    exp_allnums = re.compile('^[-./]*[0-9][-0-9.,:/]+$')
    exp_wordnums = re.compile('^[A-Za-z]+[0-9]+[-/0-9.A-Za-z]*$')
    exp_numwords = re.compile('^[0-9]?[0-9]+[-./A-Za-z]+$')
    for line in trainlines:
        line = line.strip().lower()
        fileindex = fileindex + 1
        gtruth_dict[fileindex] = int(line[0])
        class_count[int(line[0])] += 1
        #tokenlist = word_tokenize(commentwords)
        commentline = line[3:-1]
        commentline = commentline.replace("it's", "it is")
        commentline = commentline.replace("won't", "will not")
        commentline = commentline.replace("can't", "cannot")
        commentline = commentline.replace("n't", " not")
        commentline = commentline.replace("'ll", " will")
        commentwords = tokenizer.tokenize(commentline)
        # purely numeric tokens
        allnumkeys = set(m.group(0) for word in commentwords
                         for m in [exp_allnums.search(word)] if m)
        # identify exclusive alphanumeric (letters then digits)
        wordnumset = set(m.group(0) for word in commentwords
                         for m in [exp_wordnums.search(word)] if m)
        # identify exclusive numericalpha (digits then letters)
        numwordset = set(m.group(0) for word in commentwords
                         for m in [exp_numwords.search(word)] if m)
        # removing the punctuation, stop words and numeric tokens
        commentwords = [
            stemmer.stem(x) for x in commentwords
            if stemmer.stem(x) not in
            set(stopwords) | allnumkeys | wordnumset | numwordset
        ]
        wordset = set(commentwords)
        # count each word once per document (document frequency)
        for word in wordset:
            idf_dict[word] += 1
        uniquewords.update(wordset)
        worddict[fileindex] = commentwords
        # per-class counts for the chi-square statistic
        for word in commentwords:
            chicount_dict[word][int(line[0])] += 1
    # convert document frequencies into IDF values
    for key, value in idf_dict.items():
        idf_dict[key] = float(total_docs) / value
    # calculating the chi-square value for each word
    for word in uniquewords:
        # contingency counts; subscripts are (term present, class)
        n11 = chicount_dict[word][1] + 1
        n10 = chicount_dict[word][0] + 1
        n01 = class_count[1] - n11
        n00 = class_count[0] - n10
        num = ((n11 + n10 + n01 + n00)
               * (n11 * n00 - n10 * n01) * (n11 * n00 - n10 * n01))
        den = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
        chivalue = float(num) / den
        if chivalue > 0:
            # other critical values:
            # 10.83 -- 0.001
            # 7.88  -- 0.005
            # 6.63  -- 0.01
            # 3.84  -- 0.05
            # 2.71  -- 0.1
            chifeat_words.append(word)
    #print(chifeat_words)
    print(len(chifeat_words), "\n")
    uniquewords = chifeat_words
    # assign each unique word an index into the feature vector
    feat_len = len(uniquewords)
    uniquewords = list(uniquewords)
    enumwords_dict = defaultdict(int)
    for i, word in enumerate(uniquewords):
        enumwords_dict[word] = i
    # initialising one centroid per class
    centroid_mat = defaultdict(list)
    centroid_mat[0] = [0] * feat_len
    centroid_mat[1] = [0] * feat_len
    for findex in range(1, len(trainlines) + 1):
        featvect = [0] * feat_len
        words = worddict[findex]
        for word in words:
            # replace this value 1 with frequency or tf-idf for better
            # results; dividing by len(words) would normalise by length
            if word in uniquewords:
                #featvect[enumwords_dict[word]] += idf_dict[word]/len(words)
                featvect[enumwords_dict[word]] += 1.0
            else:
                featvect[enumwords_dict["_RARE_"]] += 1.0
        centroid_mat[gtruth_dict[findex]] = list(
            map(add, centroid_mat[gtruth_dict[findex]], featvect))
    for cl in [0, 1]:
        centroid_mat[cl] = [x / class_count[cl] for x in centroid_mat[cl]]
    fopen = open(model_fname, "wb")
    pickle.dump([centroid_mat, enumwords_dict, idf_dict], fopen)
    fopen.close()
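A quick arithmetic check of the chi-square expression above, on a toy contingency table with invented counts:

# Word appears in 30 of 40 class-1 docs and 10 of 40 class-0 docs.
n11, n10 = 30, 10            # docs containing the word, per class
n01, n00 = 40 - 30, 40 - 10  # docs missing the word, per class
num = (n11 + n10 + n01 + n00) * (n11 * n00 - n10 * n01) ** 2
den = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
print(float(num) / den)  # 20.0 -- above 10.83, significant at the 0.001 level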
def __init__(self):
    self._stemmer = stem.PorterStemmer()
def regex_str(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = stem.PorterStemmer()
    words = tokenizer.tokenize(sentence)
    return [stemmer.stem(word) for word in words if not stopwords_exsits(word)]
def extract_features(sentence):
    words = sentence.split()
    stemmer = stem.PorterStemmer()
    result = [stemmer.stem(word) for word in words if not check(word)]
    return ' '.join(result)
def lower_and_stem(s):
    # PorterStemmer.stem_word was removed from NLTK long ago; stem() is
    # the supported method
    return stem.PorterStemmer().stem(s.lower())
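For reference, a few classic Porter suffix-stripping results; stems are not guaranteed to be dictionary words:

ps = stem.PorterStemmer()
print(ps.stem("caresses"))  # caress
print(ps.stem("ponies"))    # poni
print(ps.stem("running"))   # run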
def preprocess(trainlines):
    worddict = defaultdict(lambda: defaultdict(int))
    chisquare_dict = defaultdict(float)
    chicount_dict = defaultdict(lambda: defaultdict(int))
    gtruth_dict = defaultdict(int)
    idf_dict = defaultdict(float)
    index = 0
    stemmer = stem.PorterStemmer()
    total_docs = len(trainlines)
    stopwords = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
        'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
        'about', 'between', 'into', 'through', 'during', 'before', 'after',
        'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'over',
        'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
        'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
        'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than',
        'too', 'very', 's', 'can', 'will', 'just', 'should', 'now', 'movie',
        'would', 'thing', 'film', 'cinema', 'movie', 'movies', 'cinemas',
        'tv', 'documentary', 'y'
    ]
    stopwords = set(stopwords)
    tokenizer = RegexpTokenizer(r'\w+')
    # regular expressions to catch numeric and alphanumeric tokens
    exp_allnums = re.compile('^[-./]*[0-9][-0-9.,:/]+$')
    exp_wordnums = re.compile('^[A-Za-z]+[0-9]+[-/0-9.A-Za-z]*$')
    exp_numwords = re.compile('^[0-9]?[0-9]+[-./A-Za-z]+$')
    for line in trainlines:
        line = line.strip().lower()
        index = index + 1
        gtruth_dict[index] = int(line[0])
        #tokenlist = word_tokenize(commentwords)
        commentline = line[3:-1]
        commentline = commentline.replace("it's", "it is")
        commentline = commentline.replace("won't", "will not")
        commentline = commentline.replace("can't", "cannot")
        commentline = commentline.replace("n't", " not")
        commentline = commentline.replace("'ll", " will")
        commentwords = tokenizer.tokenize(commentline)
        commentwords = [
            stemmer.stem(x) for x in commentwords
            if stemmer.stem(x) not in stopwords
        ]
        wordset = list(set(commentwords))
        # counting the number of docs containing each word
        for word in wordset:
            idf_dict[word] += 1
        # adding to the per-class dictionaries
        for word in commentwords:
            worddict[int(line[0])][word] += 1
            chicount_dict[word][int(line[0])] += 1
    allwords = set(worddict[0]) | set(worddict[1])
    # purely numeric key words
    allnumkeys = [m.group(0) for word in allwords
                  for m in [exp_allnums.search(word)] if m]
    # identify exclusive alphanumeric (letters then digits)
    wordnumset = [m.group(0) for word in allwords
                  for m in [exp_wordnums.search(word)] if m]
    # identify exclusive numericalpha (digits then letters)
    numwordset = [m.group(0) for word in allwords
                  for m in [exp_numwords.search(word)] if m]
    # removing all numerics and alphanumerics
    allremovablekeys = allnumkeys + numwordset + wordnumset
    for numstr in allremovablekeys:
        worddict[1].pop(numstr, None)
        worddict[0].pop(numstr, None)
        # chicount_dict is keyed word -> class, so pop the word itself
        # (the original popped chicount_dict[0]/[1], which never matched)
        chicount_dict.pop(numstr, None)
        idf_dict.pop(numstr, None)
    # convert document frequencies into IDF values
    for key, value in idf_dict.items():
        idf_dict[key] = float(total_docs) / value
    return [worddict, gtruth_dict, idf_dict, chicount_dict]
def nlp52(sentences: list) -> list:
    stemmer = stem.PorterStemmer()
    return [
        '\n'.join([stemmer.stem(word) for word in sentence.split('\n')])
        for sentence in sentences
    ]
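nlp52 expects each sentence to arrive with one token per line, which is why it splits on '\n'; for example:

# One token per line, matching the split('\n') above.
print(nlp52(["Natural\nlanguage\nprocessing"]))
# ['natur\nlanguag\nprocess']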
def __init__(self, filename):
    self.filename = filename
    self.stemmer = stem.PorterStemmer()
import gensim
from gensim.models.doc2vec import *
#import cPickle as pickle
import numpy
#from collections import namedtuple
import nltk
import nltk.corpus as nc
import nltk.stem as nsl
import re
import scipy.io as sio
#import numpy as np
#from sklearn.manifold import TSNE

# opens the interactive downloader; nltk.download('stopwords') would
# suffice for the corpus used below
nltk.download()

stops = set(nc.stopwords.words("english"))
st = nsl.PorterStemmer()


def rev_to_words(rev_line):
    ## from stopping to stemming
    # 1. remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", rev_line)
    # 2. convert to lower case, and split
    line = letters_only.lower()
    # 3. stopword removal
    # meaningful_words = [w for w in words if not w in stops]
    # 4. return the result
    return line

#f = open('books_processed_positive.txt','rb')
#text = pickle.load(f)
def stemWordsList(words_list):
    stemmer = stem.PorterStemmer()  # build once rather than per word
    return [[(word, stemmer.stem(word)) for word in words]
            for words in words_list]
def get_stat_dict(nametxt_dict, test=False):
    if test:
        pp_dir = "test/partisan_phrases/"
    else:
        pp_dir = "src/data/init/partisan_phrases/"
    pp_txts = os.listdir(pp_dir)
    score_dict = {}
    for i in pp_txts:
        with open(pp_dir + i) as curtxt:
            for line in curtxt.readlines()[1:]:
                splt = line.split("|")
                score_dict[splt[0]] = float(splt[1].strip())
    nltk.download("wordnet")
    nltk.download("stopwords")
    stpwrds = stopwords.words("english")
    porter = stem.PorterStemmer()

    def preproc_strn(strn):
        # lowercase; remove punctuation, digits and double spaces
        curstr = strn.lower().translate(
            str.maketrans('', '', string.punctuation))
        curstr = re.sub(r'[0-9]+', '', curstr)
        curstr = re.sub(r'\n', ' ', curstr)
        curstr = re.sub(r' +', ' ', curstr)
        plst = []
        for word in curstr.split():
            # check for stopwords
            if word not in stpwrds:
                # Porter-stem the word
                pword = porter.stem(word)
                plst.append(pword)
        numwords = len(plst)
        curstr = ' '.join(plst)
        return (curstr, numwords)

    def string_score(strn, score_dict):
        # pre-process, returning the processed string and its word count
        curstr, numwords = preproc_strn(strn)
        absscore = 0      # absolute bias sum
        sumscore = 0      # bias sum
        totphrs = 0       # total occurrences of phrases from G&S
        counts_dict = {}  # phrase -> (count, score), for the top-10 list
        for key, value in score_dict.items():
            numoccurs = curstr.count(key)
            totphrs += numoccurs
            counts_dict[key] = (numoccurs, value)
            curscore = numoccurs * value
            absscore += abs(curscore)
            sumscore += curscore
        counts_list = sorted(counts_dict.items(),
                             key=lambda item: item[1],
                             reverse=True)[:10]
        return [absscore, sumscore, numwords, counts_list, totphrs]

    namestat_dict = {}
    for name, txt in nametxt_dict.items():
        namestat_dict[name] = string_score(txt, score_dict)
    # flag entries whose top phrases appear in their own (processed) title
    for name, stat in namestat_dict.items():
        dispcnt = 1
        procname = preproc_strn(name)[0]
        is_intitle = False
        for phr, freq in stat[3]:
            if phr in procname:
                is_intitle = True
            dispcnt += 1
        namestat_dict[name].append(is_intitle)
    return namestat_dict
def stem_make():
    stemmer = stem.PorterStemmer()
    for word in word_make():
        yield word + "\t" + stemmer.stem(word)
def stemming_porter():
    # build the stemmer once and reuse it for every word
    stemmer = stem.PorterStemmer()
    for word_stem in separate_word():
        yield (word_stem, stemmer.stem(word_stem))
from knock71 import stop
from collections import defaultdict
from nltk import stem

stemming = stem.PorterStemmer()
feature_dict = defaultdict(int)

for line in open('sentiment.txt'):
    word_list = line.strip('\n').split()
    word_list.pop(0)  # drop the leading polarity label
    for word in word_list:
        word = stemming.stem(word)
        if not stop(word):
            feature_dict[word] += 1

for word, freq in sorted(feature_dict.items()):
    print(word + '\t' + str(freq))
def normalize(self, s, stemmer=stem.PorterStemmer()):
    # the default-argument stemmer is built once and shared across calls
    words = tokenize.wordpunct_tokenize(s.lower().strip())
    return ' '.join([stemmer.stem(w) for w in words])
def search(query, docTermIndex):
    print('\nRetrieving documents for query \'{}\'\n'.format(query))
    qlist = query.strip().split()
    # remove stop words
    modified = [
        term for term in qlist if term not in stopwords.words('english')
    ]
    EXPAND = False
    '''Expansion
    EXPAND = True
    for term in modified:
        syns = wordnet.synsets(term)
        for i in range(len(syns)):
            #print(syns[i].lemmas()[0].name())
            new.append(syns[i].lemmas()[0].name())
    '''
    if not EXPAND:
        new = modified
    before_stem = np.unique(new)
    ps = stem.PorterStemmer()
    after_stem = [ps.stem(word) for word in before_stem]
    mQuery = np.unique(after_stem)

    # read potentially relevant docs into a matrix using tf*idf weights
    qMatrix = pd.DataFrame(np.zeros((0, len(mQuery))), columns=mQuery)
    for term in mQuery:
        if term in docTermIndex.keys():
            termInfo = docTermIndex.get(term)
            for occurence in termInfo.occList:
                if occurence.docID not in qMatrix.index:
                    toAppend = pd.Series(np.zeros(len(qMatrix.columns)),
                                         index=qMatrix.columns,
                                         name=occurence.docID)
                    toAppend[term] = occurence.count * termInfo.idf
                    # DataFrame.append was removed in pandas 2.0;
                    # concatenate a one-row frame instead
                    qMatrix = pd.concat([qMatrix, toAppend.to_frame().T])
                else:
                    qMatrix.loc[occurence.docID, term] = \
                        occurence.count * termInfo.idf

    # compute the tf*idf vector of the query
    #print(qMatrix.columns)
    q_vect = [docTermIndex.get(term).idf for term in qMatrix.columns]

    # cosine similarities between the query and each candidate document
    matrix_norm = np.array(
        [np.linalg.norm(qMatrix.iloc[i]) for i in range(len(qMatrix))])
    q_norm = np.linalg.norm(q_vect)
    sims = np.dot(qMatrix, q_vect) / (matrix_norm * q_norm)
    dists = 1 - sims
    idx = np.argsort(dists)
    user_docs = qMatrix.iloc[idx[:10]].index
    classes = pd.read_csv('classes.csv', index_col=1)
    for i, path in enumerate(user_docs):
        parts = path.split('/')
        group = parts[1]
        file = parts[2]
        print('----{}: File {} in folder {}-----\n'.format(i + 1, file, group))
        with open(path, 'r', errors='ignore') as myfile:
            data = myfile.read()
        art = data
        ind = art.find('\n\n')
        art = art[ind + 2:]
        # if this is an article (not a post), re-index again to skip the tags
        if art[0:10].find('archive') != -1:
            ind = art.find('\n\n')
            art = art[ind + 2:]
        mid = len(art) // 2
        print('---------------------------------------------\n')
        print(art[mid:mid + 200])
        print('---------------------------------------------\n')
    return user_docs
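The sims line above is the standard cosine formula; on toy numbers (values invented) it behaves like this:

import numpy as np

# Two candidate-document rows against a two-term query vector.
M = np.array([[1.0, 0.0], [1.0, 1.0]])
q = [1.0, 1.0]
sims = np.dot(M, q) / (np.linalg.norm(M, axis=1) * np.linalg.norm(q))
print(sims)  # [0.7071..., 1.0] -- the second row points along the query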
def stemmer(text):
    # the class constant NLTK_EXTENSIONS is the string "NLTK_EXTENSIONS",
    # so passing the literal works; the constant form is clearer
    stemmer = stem.PorterStemmer(mode=stem.PorterStemmer.NLTK_EXTENSIONS)
    stemmed_tokens = [stemmer.stem(token) for token in text.split()]
    return stemmed_tokens
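The modes differ only on a small set of words; NLTK_EXTENSIONS carries a table of irregular forms that the original 1980 algorithm mangles:

nltk_mode = stem.PorterStemmer(mode=stem.PorterStemmer.NLTK_EXTENSIONS)
orig_mode = stem.PorterStemmer(mode=stem.PorterStemmer.ORIGINAL_ALGORITHM)
print(nltk_mode.stem("dying"))  # die
print(orig_mode.stem("dying"))  # dy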
class PorterStemmer(BaseNormalizer):
    name = 'Porter Stemmer'
    normalizer = stem.PorterStemmer().stem
def get_all_history_stats(test=False):
    # set-up for string cleaning
    stpwrds = stopwords.words("english")
    porter = stem.PorterStemmer()
    nltk.download("wordnet")
    nltk.download("stopwords")

    # base directory for saving/processing article histories
    if test:
        xmls_base = "test/temp/wiki_xmls/"
    else:
        xmls_base = "src/data/temp/wiki_xmls/"
    if not os.path.exists(xmls_base):
        os.makedirs(xmls_base)

    # base directory for saving resdicts
    rd_base = "src/data/temp/resdicts/"
    if not os.path.exists(rd_base):
        os.makedirs(rd_base)

    # get and split anames into chunks of 20 for chunk-wise processing
    anames = retrieve_anames()
    alst = [anames[i:i + 20] for i in range(0, len(anames), 20)]

    # load in the score dictionary
    if test:
        pp_dir = "test/partisan_phrases/"
    else:
        pp_dir = "src/data/init/partisan_phrases/"
    pp_txts = os.listdir(pp_dir)
    score_dict = {}
    for i in pp_txts:
        with open(pp_dir + i) as curtxt:
            for line in curtxt.readlines()[1:]:
                splt = line.split("|")
                score_dict[splt[0]] = float(splt[1].strip())

    # helper for cleaning strings
    def preproc_strn(strn):
        # lowercase; remove punctuation, digits and double spaces
        curstr = strn.lower().translate(
            str.maketrans('', '', string.punctuation))
        curstr = re.sub(r'[0-9]+', '', curstr)
        curstr = re.sub(r'\n', ' ', curstr)
        curstr = re.sub(r' +', ' ', curstr)
        plst = []
        for word in curstr.split():
            # check for stopwords, Porter-stem the rest
            if word not in stpwrds:
                pword = porter.stem(word)
                plst.append(pword)
        numwords = len(plst)
        curstr = ' '.join(plst)
        return (curstr, numwords)

    def get_art_hists(for_hist):
        # download the full revision history of each article as XML
        for_hist_und = ["_".join(i.split()) for i in for_hist]
        exp_base = ("https://en.wikipedia.org/w/index.php"
                    "?title=Special:Export&pages=")
        exp_end = "&history=1&action=submit"
        for tit in for_hist_und:
            url = exp_base + tit + exp_end
            try:
                resp = requests.get(url)
            except Exception:
                try:
                    time.sleep(10)
                    resp = requests.get(url)
                except Exception:
                    print(tit + " did not get processed")
                    continue  # skip writing; resp would be stale or unbound
            with open(xmls_base + tit + ".xml", mode="wb") as wfile:
                wfile.write(resp.content)
            resp.close()

    def get_hist_stats(rdname):
        xmls_list = [x for x in os.listdir(xmls_base) if ".xml" in x]
        resdict = {}
        ns = "{http://www.mediawiki.org/xml/export-0.10/}"
        for fn in xmls_list:
            # this block fixes broken XMLs that have no closing tags
            try:
                tree = ET.parse(xmls_base + fn)
            except Exception:
                with open(xmls_base + fn, "a") as app:
                    app.write(" </page>")
                    app.write("</mediawiki>")
                tree = ET.parse(xmls_base + fn)
            # set up the tree and the result list for the current article
            root = tree.getroot().find(ns + "page")
            revlist = []
            for rev in root.findall(ns + "revision"):
                # one dictionary per revision
                curdict = {}
                curdict["time"] = rev.find(ns + "timestamp").text
                txt = rev.find(ns + "text").text
                curdict["text"] = txt if txt is not None else ""
                comm = rev.find(ns + "comment")
                curdict["comm"] = comm.text if comm is not None else ""
                cont = rev.find(ns + "contributor")
                user = cont.find(ns + "username")
                if user is not None:
                    curdict["user"] = user.text
                else:
                    curdict["user"] = cont.find(ns + "ip").text
                revlist.append(curdict)
            resdict[fn[:-4]] = revlist

        cnt = 0
        # populate resdict with per-revision diff statistics
        for name, revl in resdict.items():
            prevr = db.bigram(preproc_strn(revl[0]["text"])[0])
            for rev in revl:
                # if cnt % 1000 == 1:
                #     print(cnt)
                cnt += 1
                curr = db.bigram(preproc_strn(rev["text"])[0])
                rem, add = db.unique_items(prevr, curr)
                # mirroring string_score's output (abs score, signed score,
                # counts, phrase hits), separately for removed/added bigrams
                rem_abs = 0
                add_abs = 0
                rem_sum = 0
                add_sum = 0
                rem_num = len(rem)
                add_num = len(add)
                add_phrs = 0
                rem_phrs = 0
                for bigr in rem:
                    if bigr in score_dict.keys():
                        rem_abs += abs(score_dict[bigr])
                        rem_sum += score_dict[bigr]
                        rem_phrs += 1
                for bigr in add:
                    if bigr in score_dict.keys():
                        add_abs += abs(score_dict[bigr])
                        add_sum += score_dict[bigr]
                        add_phrs += 1
                rev["rem"] = rem
                rev["add"] = add
                rev["rem_abs"] = rem_abs
                rev["add_abs"] = add_abs
                rev["rem_sum"] = rem_sum
                rev["add_sum"] = add_sum
                rev["rem_num"] = rem_num
                rev["add_num"] = add_num
                rev["rem_phrs"] = rem_phrs
                rev["add_phrs"] = add_phrs
                del rev["text"]
                prevr = curr
        with open(rd_base + rdname + ".json", "w") as outfile:
            json.dump(resdict, outfile)

    def del_art_hists():
        files = glob.glob(xmls_base + "*")
        for f in files:
            os.remove(f)

    if not test:
        # for each chunk of 20: scrape, process, then delete the articles
        for ind, hst in enumerate(alst):
            get_art_hists(hst)
            get_hist_stats("rd" + str(ind + 1))
            del_art_hists()
    else:
        get_hist_stats("testrd")
"""
Created on Tue Apr 4 17:08:23 2017

@author: konodera
"""
import os

print("""#==============================================================================
# START !!! {} PID: {}
#==============================================================================
""".format(__file__, os.getpid()))

import utils
import numpy as np
import pandas as pd
from nltk import stem
pt = stem.PorterStemmer().stem

from gensim.models import Doc2Vec
d2v = Doc2Vec.load('../nlp_source/d2v/enwiki_dbow/doc2vec.bin')

from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format(
    '../nlp_source/w2v/GoogleNews-vectors-negative300.bin.gz', binary=True)

import gc

train, test = utils.load(7, 1)
train_, test_ = utils.load(2)
train = pd.merge(train, train_, on='id', how='left')
test = pd.merge(test, test_, on='test_id', how='left')
del train_, test_
def stemming(word):
    stemmer = stem.PorterStemmer()
    try:
        return stemmer.stem(word)
    except Exception:
        # fall back to the raw word if the stemmer cannot handle it
        return word
"""
* filtering stopwords
* stemming
* lemmatization
* custom string similarity metric based on string-edit distance
"""
from nltk import stem
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

from DAS.keywordsearch.config import USEFUL_STOPWORDS
from DAS.keywordsearch.config import get_setting
from DAS.keywordsearch.entity_matchers.string_dist_levenstein \
    import levenshtein_normalized as levenshtein_norm
from DAS.keywordsearch.utils import memo

STEMMER = stem.PorterStemmer()
LMTZR = WordNetLemmatizer()
EN_STOPWORDS = stopwords.words('english')
EN_STOPWORDS_SET = set(EN_STOPWORDS)
USEFUL_STOPWORDS_SET = set(USEFUL_STOPWORDS)

lemmatize = memo(LMTZR.lemmatize)
lemmatize.__doc__ = "cached version of lmtzr.lemmatize"
getstem = memo(STEMMER.stem)
getstem.__doc__ = "cached version of PorterStemmer() stem"

# load the lemmatization DB now
lemmatize("dataset")
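Assuming memo is a standard memoisation decorator (it comes from this project's utils), repeated lookups of the same token are cache hits rather than fresh stemmer runs; usage is identical to calling the stemmer directly:

print(getstem("datasets"))   # dataset (computed, then cached)
print(getstem("datasets"))   # dataset (served from the memo cache)
print(lemmatize("files"))    # file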
def normalize_word(word):
    word = word.lower()
    stemmer = stem.PorterStemmer()
    return stemmer.stem(word)
import pandas as pd
import re
import nltk
from nltk import stem
import pickle

# data file
filename = "labeledTrainData.tsv"
# stop-words file
stopfile = "stop_words.txt"

stemmer = stem.PorterStemmer()  # initialise the nltk.stem stemmer object


# Text preprocessing step 1: takes a raw string, returns a list of words.
def text_pre_process1(text):
    # strip HTML tags, '\', English contractions and non-word characters
    text_1 = re.sub(r"</?[^>]*>|\\|n*'[\w]*|[^(\w|\s)]", ' ', text)
    text_2 = nltk.word_tokenize(text_1)  # nltk tokenisation
    text_3 = []
    for word in text_2:
        text_3.append(stemmer.stem(word))  # nltk.stem stemming
    return text_3


# Text preprocessing step 2: removes stop words from a word list.
def stop_words_process2(word_list, stop_list):
    word_clean = []
    for word in word_list:
        if word.lower() in stop_list:
            continue
        word_clean.append(word)
    return word_clean
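A quick end-to-end check of the two helpers; the sample text and stop list are invented for illustration (the real stop list would come from stop_words.txt):

sample = "<br />This movie wasn't bad, I liked it!"
tokens = text_pre_process1(sample)
print(tokens)
print(stop_words_process2(tokens, ["this", "it", "i"]))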
    # ... end of make_dictionary
    dictionary.save(save_direc)
    return dictionary


docs_direc = 'data/raw/ohsumed_docs_stemmed.pickle'
q_direc = 'data/raw/querry_content_stemmed.pickle'
dct_direc = 'data/dcts/dict'

print('Loading Data')
with open(docs_direc, 'rb') as f:
    d = pickle.load(f)

print('Loading queries')
with open(q_direc, 'rb') as f:
    q = pickle.load(f)

print('Getting docs')
docs = get_docs(d, q)
print(len(docs))

stemmer = ns.PorterStemmer()
stop_list = []
for w in stop_words:
    stop_list.append(stemmer.stem(w))

for i in range(1, 6):
    print('Making dictionary for i =', str(i))
    direc = dct_direc + str(i)
    dct = make_dictionary(docs, stop_list, i, direc)
    print(dct)
# -*- coding: utf-8 -*-
from nltk import stem
from collections import defaultdict
from knock71 import getstopword

stoplist = getstopword()
stemmer = stem.PorterStemmer()


def create_features(input_file):
    phi = defaultdict(int)
    for line in open(input_file, "r"):
        words = line.strip("\n").split()
        for word in words[1:]:
            word = stemmer.stem(word)
            if word not in stoplist:
                phi[word] += 1
    return phi


def create_feature_vector(line, features_dict):
    featureline_list = list()
    feature_vector = list()
    words = line.strip("\n").split()
    for word in words[1:]:
        word = stemmer.stem(word)
        if word not in stoplist:
            featureline_list.append(word)
def test_get_stems(self):
    porterStem = stem.PorterStemmer()
    self.assertEqual(get_stems('during', porterStem), 'dure')