def processQuery():
    f = open("Queries.txt")
    text = f.read()
    lines = text.splitlines()
    for raw_query in lines:
        raw_query = raw_query.replace("\n", "").lower()
        query = re.compile(r'(\w+)', re.DOTALL).findall(raw_query)
        print(query)
        queryNum = query[0]
        query = re.compile(r'[a-z]+', re.DOTALL).findall(raw_query)
        query = [w for w in query if w]
        stemmer = PorterStemmer()
        query = [stemmer.stem(word, 0, len(word) - 1) for word in query]
        queryLen = len(query)
        # run the models
        okapiTF(query, queryNum)
        tfIdf(query, queryNum, queryLen)
        smoothing(query, queryNum, 'Laplace')
        smoothing(query, queryNum, 'Jelinek-Mercer')
        bm25(query, queryNum)
    print("Queries processed")
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        print fileT
        if (len(fileT) > 1) and (fileT not in stopwords):
            newDict[k].append(fileT)
def process(self):
    """
    Reads the input text file and performs:
      - tokenization
      - lower-casing
      - punctuation and number removal
      - stop-word removal
      - stemming
    """
    try:
        stopWords = open(self.stopwordFile, "r").read()
        try:
            if self.writerFlag == True:
                outFile = open(self.oFile, "w")
            stemmer = PorterStemmer()
            dataDic = {}
            translator = str.maketrans('', '', string.punctuation)
            nTranslator = str.maketrans('', '', "0123456789")
            with open(self.iFile) as f:
                for line in f:
                    try:
                        (key, val) = line.split("\t")
                    except ValueError:
                        continue
                    stringToWrite = ""
                    val = val.translate(translator)
                    val = val.translate(nTranslator)
                    val = val.lower().strip().split(" ")
                    if self.writerFlag == True:
                        stringToWrite = "%s %s \t" % (stringToWrite, key.upper())
                    for words in val:
                        if words.strip() not in stopWords:
                            stringToWrite = "%s %s" % (stringToWrite, stemmer.stem(words))
                    stringToWrite = "%s \n" % (stringToWrite)
                    if self.writerFlag == False:
                        dataDic[key.strip()] = stringToWrite.strip()
                    else:
                        outFile.write(stringToWrite)
            if self.writerFlag == True:
                outFile.close()
            else:
                return dataDic
        except (OSError, IOError) as e:
            print("Wrong input file name or file path", e)
    except (OSError, IOError) as e:
        print("Wrong stopwords file name or file path", e)
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            fileT = "./wordFiles/" + fileT
            FILE = open(fileT, 'a')
            initFreq = checkforFrequency(k, fileT)
            if checkifWritten(fileT, k):
                FILE.write(str(fileT[12:]) + " " + str(k) + " " + str(initFreq))
                FILE.write("\n")
                return 1
    return 0
class Parser:
    # A processor for removing the commoner morphological and
    # inflexional endings from words in English
    stemmer = None
    stopwords = []

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.p = re.compile(r"&.{1,5}?;|[!-@[-`{-~]")
        for file in glob.glob(os.path.dirname(__file__) + "/stopwords/*/*.txt"):
            self.stopwords += [line.strip() for line in open(file).readlines()]
        self.stopwords.append("the")

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = self.p.sub(" ", string)
        string = string.lower()
        return string

    def removeStopwords(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def tokenise(self, string, stem=False):
        """ break string up into tokens and stem words """
        string = self.clean(string)
        words = string.split()
        if stem:
            return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
        else:
            return words

    def tokenize(self, string, stem=False):
        # American-spelling alias for tokenise()
        return self.tokenise(string, stem=stem)
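# A minimal usage sketch for the Parser above (illustrative, not from the
# original source). It assumes the class has been imported, that the
# stopwords/*/*.txt files read in __init__ exist next to the module, and that
# the bundled porterStemmer implementation is importable. The sample sentence
# is made up.
parser = Parser()

raw = "The runners were running and jumping quickly!"
tokens = parser.tokenise(raw, stem=True)    # cleaned, lower-cased, stemmed tokens
tokens = parser.removeStopwords(tokens)     # drop words with no search value
print(tokens)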
class Parser:
    def __init__(self):
        self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
        self.stemmer = PorterStemmer()
        self.stopWordsList = []
        self.loadStopWords()

    def fullParse(self, words_list):
        ''' words_list is an array '''
        stopped = self.removeStopWords(words_list)
        cleaned = self.cleanCaseAndPunctuation(stopped)
        stopped = self.stemWords(cleaned)
        return stopped

    def stemWords(self, words_list):
        stemmed = []
        for word in words_list:
            word = self.stemmer.stem(word, 0, len(word) - 1)
            stemmed.append(word)
        return stemmed

    def removeStopWords(self, words_list):
        # Note: the filter below iterates over the *characters* of each word,
        # so it only strips characters that appear in stopWordsList.
        non_stop_list = []
        for word in words_list:
            word = ''.join(
                filter(lambda word: word not in self.stopWordsList, word.strip()))
            non_stop_list.append(word)
        return non_stop_list

    def cleanCaseAndPunctuation(self, words_list):
        clean_list = []
        for word in words_list:
            word = word.lower()
            if not word.startswith('http'):
                clean = ''.join(
                    [c for c in word if c not in self.remove_punctuation_set])
                if clean:
                    clean_list.append(clean)
        return clean_list

    def printStopWords(self):
        print "****************************************************************"
        print "                          STOP WORDS"
        print "****************************************************************"
        print self.stopWordsList

    def loadStopWords(self):
        ''' happens on __init__ '''
        for line in open(STOPWORDS_FILE):
            self.stopWordsList.append(line.strip())
def tokenize(document, stem):
    tokens = []
    p = PorterStemmer()
    for text in document.headline, document.graphic, document.text:
        # Lowercase and split on non-alphanumerics
        text = text.lower()
        text_tokens = re.split(r'[\W]', text)
        if stem:
            stem_tokens = []
            for t in text_tokens:
                t = p.stem(t, 0, len(t) - 1)
                stem_tokens.append(t)
            text_tokens = stem_tokens
        tokens += text_tokens
    # Remove empty strings in resulting tokens list
    tokens = list(filter(None, tokens))
    return tokens
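# A small sketch of how tokenize() above might be driven. The loop only
# touches three attributes of the document object, so a light-weight stand-in
# is enough; Doc below is a hypothetical substitute for whatever document
# class the real corpus loader provides, not part of the original project.
from collections import namedtuple

Doc = namedtuple("Doc", ["headline", "graphic", "text"])

sample = Doc(headline="Rain expected tonight",
             graphic="",
             text="Heavy rain and strong winds are expected.")

print(tokenize(sample, stem=False))   # raw lower-cased tokens
print(tokenize(sample, stem=True))    # Porter-stemmed tokens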
def calculate_bm25(topic_id, topic, token_token_id, postings_list, doc_id_no,
                   average_doc_length, stem, docs_path):
    """Calculates BM25 for a topic against all LATimes documents; returns an
    ordered dictionary mapping doc_no to its ranking score."""
    query_tokens = tokenize(topic)
    doc_no_score = {}
    N = len(doc_id_no)
    p = PorterStemmer()

    # Calculate tf in query, and idf
    for token in query_tokens:
        qf = query_tokens.count(token)
        token_tf = ((K2 + 1) * qf) / (K2 + qf)

        # Calculate idf
        if stem:
            token = p.stem(token, 0, len(token) - 1)
        token_id = token_token_id[token]
        postings = postings_list[token_id]
        # Postings follow the format: [doc_id, count, doc_id, count, ...]
        n_i = len(postings[::2])
        a = (N - n_i + 0.5) / (n_i + 0.5)
        token_idf = math.log(a)

        # Calculate tf for docs
        for i in range(0, len(postings), 2):
            doc_id = postings[i]
            doc_no = doc_id_no[doc_id]
            document = getDocument.retrieve_by_docno(docs_path, doc_no)
            fi = postings[i + 1]
            K = K1 * ((1 - B) + B * (document.length / average_doc_length))
            doc_tf = ((K1 + 1) * fi) / (K + fi)
            score = doc_tf * token_tf * token_idf
            if doc_no in doc_no_score:
                doc_no_score[doc_no] = doc_no_score[doc_no] + score
            else:
                doc_no_score[doc_no] = score

    sorted_doc_no_score = OrderedDict(
        sorted(doc_no_score.items(), key=lambda t: t[1], reverse=True))
    print("Calculated scores for query: {}".format(topic_id))
    return sorted_doc_no_score
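# For reference, the per-term arithmetic inside calculate_bm25() can be
# isolated into a small helper. This is a sketch, not part of the original
# module: K1, K2 and B are defined elsewhere in that code base and their
# values are not shown, so illustrative defaults are used here.
import math

def bm25_term_score(qf, fi, n_i, N, doc_len, avg_doc_len,
                    k1=1.2, k2=100.0, b=0.75):
    """Contribution of one query term to one document's BM25 score,
    mirroring the arithmetic used in calculate_bm25() above."""
    query_tf = ((k2 + 1.0) * qf) / (k2 + qf)          # query-side tf saturation
    idf = math.log((N - n_i + 0.5) / (n_i + 0.5))     # Robertson/Sparck Jones idf
    K = k1 * ((1.0 - b) + b * (doc_len / avg_doc_len))
    doc_tf = ((k1 + 1.0) * fi) / (K + fi)             # document-side tf saturation
    return doc_tf * query_tf * idf

# e.g. a term occurring once in the query and 3 times in a 250-word document,
# present in 1,000 of 100,000 documents whose average length is 300 words:
print(bm25_term_score(qf=1, fi=3, n_i=1000, N=100000, doc_len=250, avg_doc_len=300))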
def build_word_index(doc_dict):
    ''' this method builds the word index dictionary '''
    dictionary = {}
    stopWords = set(stopwords.words('english'))
    p = PorterStemmer()
    global idx
    term_frequecy_list = []

    def append_to_word_list(text, doc_index):
        global idx
        text = " ".join(re.findall("[a-zA-Z]+", text))
        text = set(text.split(" "))
        text = list(text)
        text.sort()
        f_dt = {}
        for word in text:
            if word != "":
                if word in stopWords:
                    continue
                word = p.stem(word, 0, len(word) - 1)
                # update frequency of the term
                # (note: `text` was de-duplicated with set(), so distinct
                # surface forms only bump the count when they stem to the
                # same root)
                if word not in f_dt:
                    f_dt[word] = 1
                else:
                    f_dt[word] += 1
                # check if word is in the dictionary and append it
                if word not in dictionary:
                    dictionary[word] = idx
                    idx += 1
                term_frequecy_list.append([dictionary[word], doc_index, f_dt[word]])

    idx = 1
    for i in range(1, len(doc_dict) + 1):
        if doc_dict[i][1] != '':
            append_to_word_list(doc_dict[i][1], i)
    return dictionary, term_frequecy_list
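# A small, hypothetical driver for build_word_index() above (not from the
# original source). It assumes the NLTK stopword corpus has been downloaded
# and that doc_dict is keyed 1..N with the document text in position [1] of
# each value, which is the only element the function reads.
docs = {
    1: ("DOC-1", "Stemming reduces inflected words to a common root"),
    2: ("DOC-2", "An inverted index maps each root word to the documents containing it"),
}

dictionary, postings = build_word_index(docs)
print(dictionary)   # stemmed term -> term id
print(postings)     # [term_id, doc_index, frequency] triples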
def querySearcher(self):
    """This is the main function which performs the AND, OR, AND NOT,
    BUT NOT and OR NOT operations"""
    try:
        stemmer = PorterStemmer()
        preProcess = PreProcessing(False, self.iFile, "", self.stopwordFile)
        preProcessRes = preProcess.process()
        createIndex = InvertedIndexGenerator(False, preProcessRes, "")
        mainIndex = createIndex.generate()

        originalquery = self.query
        self.query = self.query.lower()
        self.query = self.query.replace('but', 'and')
        querySep = list(self.parenthetic_contents(self.query))
        res = self.queryCalculator(querySep, mainIndex, stemmer, preProcessRes)

        tempQuery = self.query
        tempQuery = tempQuery.replace('{', '')
        tempQuery = tempQuery.replace('}', '')
        tempQuery = tempQuery.replace('(', '')
        tempQuery = tempQuery.replace(')', '')
        tempQuery = tempQuery.replace('/', '')

        mapKey = {}
        quryStem = []
        for t in tempQuery.split(" "):
            quryStem.append(stemmer.stem(t))
        tempQuery = ' '.join(quryStem)

        for i, r in enumerate(res.keys()):
            mapKey["%d_%s" % (i, "firstItr")] = r
            tempQuery = tempQuery.replace(r, "%d_%s" % (i, "firstItr"))
        res = {**res, **mainIndex}

        """AND operation"""
        andPro = tempQuery.split(" ")
        for index, term in enumerate(andPro):
            if term == "and":
                if andPro[index + 1] == "not":
                    continue
                else:
                    if mapKey.get(andPro[index - 1], -1) == -1:
                        tempKeyFirst = andPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[andPro[index - 1]]
                    if mapKey.get(andPro[index + 1], -1) == -1:
                        tempKeySecond = andPro[index + 1]
                    else:
                        tempKeySecond = mapKey[andPro[index + 1]]
                    res["%s and %s" % (andPro[index - 1], andPro[index + 1])] = {}
                    for k in res[tempKeyFirst].keys():
                        res["%s and %s" % (andPro[index - 1], andPro[index + 1])][k] = \
                            res[tempKeyFirst][k] and res[tempKeySecond][k]
                    tempQuery = tempQuery.replace(
                        "%s and %s" % (andPro[index - 1], andPro[index + 1]),
                        "%d_%s" % (index, "secondItr"))
                    mapKey["%d_%s" % (index, "secondItr")] = "%s and %s" % (
                        andPro[index - 1], andPro[index + 1])

        """OR operation"""
        orPro = tempQuery.split(" ")
        for index, term in enumerate(orPro):
            if term == "or":
                if orPro[index + 1] == "not":
                    continue
                else:
                    if mapKey.get(orPro[index - 1], -1) == -1:
                        tempKeyFirst = orPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[orPro[index - 1]]
                    if mapKey.get(orPro[index + 1], -1) == -1:
                        tempKeySecond = orPro[index + 1]
                    else:
                        tempKeySecond = mapKey[orPro[index + 1]]
                    res["%s or %s" % (orPro[index - 1], orPro[index + 1])] = {}
                    for k in res[tempKeyFirst].keys():
                        res["%s or %s" % (orPro[index - 1], orPro[index + 1])][k] = \
                            res[tempKeyFirst][k] or res[tempKeySecond][k]
                    tempQuery = tempQuery.replace(
                        "%s or %s" % (orPro[index - 1], orPro[index + 1]),
                        "%d_%s" % (index, "thirdItr"))
                    mapKey["%d_%s" % (index, "thirdItr")] = "%s or %s" % (
                        orPro[index - 1], orPro[index + 1])

        """AND NOT, OR NOT, BUT NOT operations"""
        notPro = tempQuery.split(" ")
        for index, term in enumerate(notPro):
            if term == "not":
                tempKeyNot = {}
                if mapKey.get(notPro[index + 1], -1) == -1:
                    tempKeySecond = notPro[index + 1]
                else:
                    tempKeySecond = mapKey[notPro[index + 1]]
                for k in res[tempKeySecond].keys():
                    if not res[tempKeySecond][k] == True:
                        tempKeyNot[k] = 1
                    else:
                        tempKeyNot[k] = 0

        for index, term in enumerate(notPro):
            if term == "and":
                if mapKey.get(notPro[index - 1], -1) == -1:
                    tempKeyFirst = notPro[index - 1]
                else:
                    tempKeyFirst = mapKey[notPro[index - 1]]
                res["%s and not %s" % (notPro[index - 1], notPro[index + 2])] = {}
                for kee in res[tempKeyFirst].keys():
                    res["%s and not %s" % (notPro[index - 1], notPro[index + 2])][kee] = \
                        res[tempKeyFirst][kee] and tempKeyNot[kee]
                tempQuery = tempQuery.replace(
                    "%s and not %s" % (notPro[index - 1], notPro[index + 2]),
                    "%d_%s" % (index, "fourthItr"))
                mapKey["%d_%s" % (index, "fourthItr")] = "%s and not %s" % (
                    notPro[index - 1], notPro[index + 2])
            if term == "or":
                if mapKey.get(notPro[index - 1], -1) == -1:
                    tempKeyFirst = notPro[index - 1]
                else:
                    tempKeyFirst = mapKey[notPro[index - 1]]
                res["%s or not %s" % (notPro[index - 1], notPro[index + 2])] = {}
                for kee in res[tempKeyFirst].keys():
                    res["%s or not %s" % (notPro[index - 1], notPro[index + 2])][kee] = \
                        res[tempKeyFirst][kee] or tempKeyNot[kee]
                tempQuery = tempQuery.replace(
                    "%s or not %s" % (notPro[index - 1], notPro[index + 2]),
                    "%d_%s" % (index, "fourthItr"))
                mapKey["%d_%s" % (index, "fourthItr")] = "%s or not %s" % (
                    notPro[index - 1], notPro[index + 2])

        self.queryAnswer(originalquery, tempQuery, mapKey, res)
    except:
        print('The term is not present in the Documents')
punct_list_1 = [",", "-", "=", "/", "\\", "'", ";", "^", "+", "|", ":",
                "<", ">", "`", "&", "(", ")"]   # punctuation replaced by a space
punct_list_2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}", "$"]   # punctuation removed outright

# removing punctuation
for punct in punct_list_1:
    if punct in key_text:
        key_text = key_text.replace(punct, " ")
for punct in punct_list_2:
    if punct in key_text:
        key_text = key_text.replace(punct, "")
key_text = key_text.split()

# removing stop words, stemming, then stopping again
text_wo_stop_punct = [x for x in key_text if x not in stop_word_file]
p = PorterStemmer()
midlist = [p.stem(word, 0, len(word) - 1) for word in text_wo_stop_punct]
newlist = [x for x in midlist if x not in stop_word_file]
finaltext = ''.join(" " + x for x in newlist)
dict_map[key] = finaltext.strip()

print "Completed stemming and stopping"

dict_word_ID_map = {}
i = 1
print "Assigning IDs to words..........please wait"
for key in dict_map.keys():
    for word in dict_map[key].split():
        if dict_word_ID_map.has_key(word):
            pass
        else:
            dict_word_ID_map[word] = i
'''
Kshitij is avoiding using NLTK as much as possible for now,
as the searches will be very slow otherwise
'''
import sys
import re
import gc
from porterStemmer import PorterStemmer
from collections import defaultdict
from array import array
from math import log, sqrt
import operator
from fuzzywuzzy import fuzz

index_porter = PorterStemmer()


class Index:
    N = 4718
    token_set = set()   # to store the tokens without stemming, for correction
    id_title = {}       # to store the corresponding title for each docId

    def __init__(self):
        self.index = defaultdict(list)

    def remove_stop_words(self):
        file = open(self.stopwordsFile, 'r')
        stop_words = [line.rstrip() for line in file]
        # rstrip removes whitespace characters (by default)
        self.stop_words_dict = dict.fromkeys(stop_words)
        file.close()
def applyStem(word):
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
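# Quick usage check for applyStem() above, assuming the classic porterStemmer
# module used throughout these snippets is importable; outputs are approximate
# examples of standard Porter behaviour.
print(applyStem("running"))    # e.g. "run"
print(applyStem("caresses"))   # e.g. "caress"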
strg = ""
for elements in temp:
    strg += " " + elements

punc1 = [",", "-", "=", "/", "'", ";", "^", "+", "|", ":", "<", ">", "`", "&", "(", ")"]
punc2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}"]
for punc in punc1:
    if punc in strg:
        strg = strg.replace(punc, " ")
for punc in punc2:
    if punc in strg:
        strg = strg.replace(punc, "")
strg = strg.split()

finallist = [x for x in strg if x not in stop]
p = PorterStemmer()
midlist = [p.stem(word, 0, len(word) - 1) for word in finallist]
newlist = [x for x in midlist if x not in stop]
finalstring = ''.join(" " + x for x in newlist)
queryhashmap[key] = finalstring.strip()

avgdoclen = 46.25
#avgdoclen = 46.2484394507   # zipfs avgdoclen


def calcOBM25(OBM25dict, docid, doclen, termfreq, df):
    b = 0.6   # 0.2-1.0
    k = 1.6   # 1.2-2.0
    idf = log(3204.0 / df)
    numerator = termfreq * float(k + 1.0)
    denominator = termfreq + k * (1.0 - b + (b * doclen) / avgdoclen)
import os
import sys
import re as reg
import math
from array import array
from collections import defaultdict
from porterStemmer import PorterStemmer
import copy

# Creating object of PorterStemmer class
portstemmer_obj = PorterStemmer()


class CreateIndex:

    def __init__(self):
        self.mainindex = defaultdict(list)
        self.termfrequency = defaultdict(list)
        self.documentfrequency = defaultdict(int)
        self.totaldocuments = 0
        self.indexanditstitle = defaultdict(list)

    def findstopwords(self):
        stopwordsfile = open('stopwords.txt', 'r', encoding='UTF-8')
        stopwords = [line.rstrip() for line in stopwordsfile]
        self.stop_words = dict.fromkeys(stopwords)
        stopwordsfile.close()

    def get_important_terms(self, lines):
        # get the useful words and terms from the text
        lines = lines.lower()
        lines = reg.sub(r'[^a-z0-9 ]', ' ', lines)
        lines = lines.split()
        lines = [ele for ele in lines if ele not in self.stop_words]
import sys
import re   # regex to the rescue
from functools import reduce
from collections import defaultdict
from porterStemmer import PorterStemmer

ps = PorterStemmer()
inverteddict = defaultdict(list)
regex = re.compile("[^a-z0-9 ]+")


def Cleaning():
    # concatenate command line arguments
    words = " ".join(sys.argv[1:])
    # converting all to lowercase to ease searching
    words = words.lower()
    # removing non-alphanumeric except for space
    words = regex.sub(" ", words).strip()
    words = words.split()
    # remove prepositions and all stopwords
    # (note: `w not in file` is a substring test against the raw file contents)
    file = open("preposition.dat").read()
    filterwords = [w for w in words if w not in file]
    # stem words so that text, texting and texted are all considered "text"
    filterwords = [ps.stem(word, 0, len(word) - 1) for word in filterwords]
    filterwords = tuple(filterwords)
    return filterwords
#!/usr/bin/env python
#encoding=utf8
from porterStemmer import PorterStemmer

if __name__ == "__main__":
    stemmer = PorterStemmer()
    word = "Keyphrases"
    result = stemmer.stem(word, 0, len(word) - 1)
    print result
def processEmail(email_contents):
    """PROCESSEMAIL preprocesses the body of an email and returns a list of
    word_indices.

    word_indices = PROCESSEMAIL(email_contents) preprocesses the body of an
    email and returns a list of indices of the words contained in the email.
    """
    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with
    # the full headers (original Octave/MATLAB code):
    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >, contains
    # no < or > inside the tag, and replaces it with a space
    email_contents = re.compile('<[^<>]+>').sub(' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\\s]*').sub(
        ' httpaddr ', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\\s]+@[^\\s]+').sub(' emailaddr ',
                                                       email_contents)

    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)

    # Other: split on delimiters and drop empty tokens
    email_contents = re.split('[ @$/#.-:&*+=\\[\\]?!(){},">_<;%\\n\\r]',
                              email_contents)
    email_contents = [word for word in email_contents if len(word) > 0]

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Process file
    stemmer = PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Remove any remaining non-alphanumeric characters and stem
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        # Skip the word if it is too short
        if len(word) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found
        # ====================== YOUR CODE HERE ======================
        # Instructions: add the index of the stemmed word to word_indices if
        #               the word appears in the vocabulary list (vocabList).
        try:
            index = vocabList.index(word)
        except ValueError:
            pass
        else:
            word_indices.append(index)
        # =============================================================

    print(' '.join(processed_email))

    # Print footer
    print('\n\n=========================')
    return word_indices
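# A hypothetical driver for processEmail() above (not from the original
# exercise). It assumes getVocabList() and the vocabulary file it reads are
# available alongside this snippet; the sample email text is illustrative.
sample_email = (
    "Anyone knows how much it costs to host a web portal? "
    "It can be as little as $10 a month. "
    "Visit http://www.example.com or email me at deal@example.com"
)

indices = processEmail(sample_email)
print(indices)   # indices into the vocabulary list for the recognised words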
def stemWords(tokens):
    # one stemmer instance is enough for the whole list
    stemmer = PorterStemmer()
    for i in range(0, len(tokens)):
        tokens[i] = stemmer.stem(tokens[i], 0, len(tokens[i]) - 1)
    return tokens
for words in temp:
    temp_string += " " + words

punc_type_1 = [",", "-", "=", "/", "\\", "'", ";", "^", "+", "|", ":",
               "<", ">", "`", "&", "(", ")"]
punc_type_2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}", "$"]
for punc in punc_type_1:
    if punc in temp_string:
        temp_string = temp_string.replace(punc, " ")
for punc in punc_type_2:
    if punc in temp_string:
        temp_string = temp_string.replace(punc, "")
temp_string = temp_string.split()

final_word_list = [x for x in temp_string if x not in stop_words]
p = PorterStemmer()
mid_list = [p.stem(word, 0, len(word) - 1) for word in final_word_list]
new_list = [x for x in mid_list if x not in stop_words]
final_string = ''.join(" " + x for x in new_list)
query_hashmap[key] = final_string.strip()
#print query_hashmap[key]   # printing each query after stopping and stemming

model_dict = {}
query_word_count = defaultdict(float)
file1 = open("file1.txt", "r").readlines()

for key in query_hashmap.keys():
    query = query_hashmap[key].split()
    print query
    query = map(str.lower, query)
    query_dict = {}
import sys
import re
import copy
from functools import reduce   # needed for intersectLists on Python 3
from collections import defaultdict
from porterStemmer import PorterStemmer

porter = PorterStemmer()


class QueryIndex:

    def __init__(self):
        self.index = {}
        self.titleIndex = {}
        self.tf = {}    # term frequencies
        self.idf = {}   # inverse document frequencies

    def intersectLists(self, lists):
        if len(lists) == 0:
            return []
        # start intersecting from the smaller list
        lists.sort(key=len)
        return list(reduce(lambda x, y: set(x) & set(y), lists))

    def getStopwords(self):
        f = open(self.stopwordsFile, 'r')
        stopwords = [line.rstrip() for line in f]
        self.sw = dict.fromkeys(stopwords)
        f.close()

    def getTerms(self, line):
        line = line.lower()
def parseDocs():
    global terms
    global documents

    files = os.listdir("./cacm")

    sp = open("stopWords.txt")
    stopData = sp.read()
    stopTerms = stopData.lower().split("\n")
    stopTerms = stopTerms[:len(stopTerms) - 1]

    filep1 = open("terms.txt", "a")
    filep2 = open("mappings.txt", "a")
    filep3 = open("documents.txt", "a")

    termId = 1
    for f in files:
        fp = open("./cacm/" + f)
        documentName = f.split(".")[0]
        documentId = documentName.split("-")[1]
        line = fp.read()
        data = re.compile(r'.*?<pre>(.*?)</pre>', re.DOTALL).match(line).group(1)
        data = data.replace("\n", " ")
        splitword = re.compile(r'CA\d+', re.DOTALL).findall(data)[0]
        text = data.split(splitword)
        words = text[0]
        words = words.replace("CACM", " ")
        words = words.lower()
        words = re.compile(r'(\w+)', re.DOTALL).findall(words)
        stemmer = PorterStemmer()
        words = [stemmer.stem(word, 0, len(word) - 1) for word in words]
        docLength = len(words)

        global totalDocLength
        totalDocLength += docLength

        count = collections.Counter(words)
        filep3.write(documentId + " " + documentName + " " + str(docLength) + "\n")

        for term in words:
            if term not in stopTerms and term not in stopList:
                global numOfTerms
                numOfTerms += 1
                if term in terms:
                    attributes = terms[term]
                    idterm = attributes[0]
                    tf = count[term]
                    documentDetails = attributes[3]
                    latestDoc = len(documentDetails)
                    lastTermId = documentDetails[latestDoc - 1]
                    if documentId == lastTermId[0]:
                        ctf = attributes[1]
                        ctf = ctf + 1
                        df = attributes[2]
                        terms[term] = idterm, ctf, df, documents[term]
                    else:
                        documents[term] = documents[term] + [[documentId, documentName, docLength, tf]]
                        ctf = attributes[1]
                        ctf = ctf + 1
                        df = attributes[2]
                        df = df + 1
                        terms[term] = idterm, ctf, df, documents[term]
                if term not in terms:
                    ctf = 1
                    tf = count[term]
                    df = 1
                    documents[term] = [[documentId, documentName, docLength, tf]]
                    terms[term] = termId, ctf, df, documents[term]
                    termId += 1

    for key in terms:
        attributes = terms[key]
        key_termName = key
        key_termId = attributes[0]
        key_ctf = attributes[1]
        key_df = attributes[2]
        key_documents = attributes[3]

        offsetLength = len(str(key_termId)) + 1
        filep2.write(str(key_termId) + " ")
        for doc in key_documents:
            docId = doc[0]
            tf = doc[3]
            offsetLength += len(docId) + len(str(tf)) + 2
            filep2.write(docId + " " + str(tf) + " ")
        filep2.write("\n")

        global offset
        filep1.write(key_termName + " " + str(key_termId) + " " + str(key_ctf) + " "
                     + str(key_df) + " " + str(offset) + " " + str(offsetLength) + "\n")
        offset += offsetLength + 1