import string
import porter

def init_test(doc, word_list, i, docs):
    # counts stemmed, non-stopword occurrences of each word in word_list;
    # assumes a global `stopword` collection is defined elsewhere in the module
    p = porter.PorterStemmer()
    infile = open(doc, 'r', encoding="ISO-8859-1")
    while True:
        word = ''
        line = infile.readline()
        if line == '':
            break
        elif line == '\n':
            continue
        for c in line:
            if c.isalpha():
                word += c.lower()
            elif c.isdigit() or c in string.punctuation:
                continue
            elif word:
                if word not in stopword:
                    word = p.stem(word, 0, len(word) - 1)
                    if word in word_list:
                        idx = word_list.index(word)
                        docs[i, idx] += 1
                word = ''
            else:
                word = ''
    infile.close()
import abc
import os
import pickle
import re
import porter

class Summarizer(object):
    """Abstract base class for all summarizers."""

    NUM_DOCS = 2
    BASE_DIR = os.path.dirname(__file__)
    stopwords = set(
        open(os.path.join(BASE_DIR, 'stop_words.txt'), 'r').read().strip().split(',')
    )
    word_re = re.compile(r'\w+')
    non_space = re.compile(r'\S+')
    # drop trailing non-alphanumeric chars
    non_alnum_ending = re.compile(r'\W$')
    punctuation = re.compile(r'[\-.,?!:;\'()&\[\]\$]')
    sentence_terminator = re.compile(r'[.?!]$')
    # numbers and other strange tokens made up of underscores; re.compile(r'[\d.]*\d+')
    numbers = re.compile(r'[_\d.]+')
    stemmer = porter.PorterStemmer()

    def __init__(self):
        # tf is the frequency of a word in the document being summarized;
        # df is its frequency in the document collection (all 37 plays)
        self.df = None
        self.tf = {}

    def initialize(self, tf_filename, df_filename):
        with open(os.path.join(Summarizer.BASE_DIR, df_filename), 'rb') as f:
            self.df = pickle.load(f)
        with open(tf_filename, 'r') as f:
            for line in f:
                count, word = line.split()
                self.tf[word] = int(count)

    @abc.abstractmethod
    def summarize(self, document_path):
        return
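# Hedged usage sketch (not part of the original source): Summarizer declares
# summarize() abstract, so callers subclass it. FirstSentenceSummarizer and the
# file names below ('hamlet_tf.txt', 'df.pkl', 'hamlet.txt') are hypothetical.
class FirstSentenceSummarizer(Summarizer):
    def summarize(self, document_path):
        # trivial "summary": just the first line of the document
        with open(document_path, 'r') as f:
            return f.readline().strip()

# s = FirstSentenceSummarizer()
# s.initialize('hamlet_tf.txt', 'df.pkl')  # loads tf counts and the pickled df map
# print(s.summarize('hamlet.txt'))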
def __init__(self, all_path, stopwords_path):
    # path of the file to parse
    self.all = all_path
    # file name for the persisted BM25 weights
    self.bm25_file = 'BM25Weights.json'
    # the Porter stemmer used to stem words
    self.porter = porter.PorterStemmer()
    # the stopwords read from stopwords_path
    self.stopwords = self._read_stopwords(stopwords_path)
    # docdict: docid -> {'len': doclen, 'tfs': {term: frequency}}
    self.docdict = {}
    # idfs: term -> collection-wide frequency
    self.idfs = {}
    # BM25 value dict: docid -> {term: score}
    self.BM25 = {}
    # the k parameter of the BM25 formula
    self.k = 1
    # the b parameter of the BM25 formula
    self.b = 0.75
    # check whether the BM25 weights file already exists
    if os.path.exists(self.bm25_file):
        # load the precomputed BM25 weights
        print("Loading BM25 index from file, please wait. \n")
        self.load_bm25()
    else:
        # calculate the BM25 scores and store them
        print("BM25 index does not exist")
        print("Generating BM25 index....")
        self.calculate_bm25()
        self.save_bm25()
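# Hedged sketch (an assumption, not a method of the class above): the per-term
# BM25 contribution that calculate_bm25() presumably accumulates, written with
# the same k and b defaults and the log-2 idf used by the search() snippet
# later in this section. bm25_term_score and its parameter names are hypothetical.
import math

def bm25_term_score(tf, df, doc_len, avg_doc_len, N, k=1, b=0.75):
    # idf weight: rarer terms (small df) score higher
    idf = math.log((N - df + 0.5) / (df + 0.5), 2)
    # tf saturation, normalized by document length relative to the average
    return idf * tf * (1 + k) / (tf + k * (1 - b + b * doc_len / avg_doc_len))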
def getItemWords(list_of_words, stop_words):
    stemmer = porter.PorterStemmer()
    allwords = {}
    itemwords = []
    itemtitles = []
    ec = 0
    stemlist = {}
    # Loop over every item in list_of_words
    for item in list_of_words:
        words = separatewords(item, 1)
        words = removeStopWords(words, stop_words)
        itemwords.append({})
        itemtitles.append("Response " + str(ec + 1))
        # Increase the counts for this word in allwords and in itemwords,
        # and remember which unstemmed forms map to each stem
        for word in words:
            unstemmedword = word
            word = stemmer.stem(word, 0, len(word) - 1)
            if word in stemlist:
                if unstemmedword not in stemlist[word]:
                    stemlist[word].append(unstemmedword)
            else:
                stemlist[word] = [unstemmedword]
            allwords.setdefault(word, 0)
            allwords[word] += 1
            itemwords[ec].setdefault(word, 0)
            itemwords[ec][word] += 1
        ec += 1
    return allwords, itemwords, itemtitles, stemlist
def stem_terms(terms):
    p = porter.PorterStemmer()
    return [stem_term(term, p) for term in terms]
def normalize(word):
    # strip non-alphanumeric characters and lowercase before stemming
    word = re.sub('[^a-zA-Z0-9]', '', word).lower()
    # PorterStemmer() takes no constructor argument;
    # stem() returns the stemmed word directly
    return porter.PorterStemmer().stem(word, 0, len(word) - 1)
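# Quick check of normalize(), assuming the classic porter.py behavior:
# "connections" stems to "connect" (the example from Porter's paper).
# print(normalize("Connections!"))  # -> "connect"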
def __init__(self, docset_map_file='', punctuation_marks='', stoplist_file=''):
    self.__docMap = text.PickleStrMap.load(docset_map_file)
    self.__txtprr = text.TextProcessor(
        punctuation_marks,
        text.PickleStrMap.load(stoplist_file),
        porter.PorterStemmer())
def test_05():
    stp = text.PickleStrMap.load(my_test_dir + 'stoplist.pkl')
    fnfltr = fnfilter.TextFileFilter()
    ftdsmap = text.FileToDocSetMap()
    stm = porter.PorterStemmer()
    tp = text.TextProcessor(' `~!@#$%%^&*()_+{}|\[];\':";\',./?><', stp, stm)
    docset = tp.processFile(my_test_dir + 'poem.txt')
    print docset
def test_03():
    stp = text.PickleStrMap.load(my_test_dir + 'stoplist.pkl')
    fnfltr = fnfilter.TextFileFilter()
    ftdsmap = text.FileToDocSetMap()
    stm = porter.PorterStemmer()
    tp = text.TextProcessor(' `~!@#$%%^&*()_+{}|\[];\':";\',./?><', stp, stm)
    crwlr = crawl.Crawler(fnfltr, ftdsmap, tp)
    crwlr.crawl(my_test_dir + 'texts/')
    crwlr.pickleFileToDocSetMap(my_test_dir + 'file_to_docset_map.pkl')
import json
import math
import porter

def search(keystring):
    with open('index.txt', encoding='gbk') as file:
        dicread = json.loads(file.read())
    N = dicread['N']
    k = 1
    b = 0.75
    avg = dicread['avg_doclen']
    stemmer = porter.PorterStemmer()
    stopwords = set()
    with open('stopwords.txt', 'r') as f:
        for line in f:
            stopwords.add(line.rstrip())
    keylist = []
    for value in keystring.split(None):
        if value not in stopwords:
            value = stemmer.stem(value)
            value = value.lower()
            keylist.append(value)
    # per-term postings (doc -> raw frequency) and document frequencies
    dic_ij = {}
    dic_ni = {}
    for term in keylist:
        dicij = {}
        i = 0
        for id in dicread['house_index_info']:
            if term in dicread['house_index_info'][id]:
                dicij[id] = dicread['house_index_info'][id][term]
                i = i + 1
            else:
                dicij[id] = 0
            dic_ni[term] = i
        dic_ij[term] = dicij
    # BM25 similarity per document
    bmij = {}
    for docid in dicread['house_index_info']:
        sim = 0
        for term in keylist:
            fij = dic_ij[term][docid]
            ni = dic_ni[term]
            doc_len = dicread['indoclen'][docid]  # avoid shadowing builtin len()
            sim = sim + (fij * (1 + k) / (fij + k * (1 - b + ((b * doc_len) / avg)))) \
                * math.log(((N - ni + 0.5) / (ni + 0.5)), 2)
        bmij[docid] = sim
    # keep the top 100 documents by BM25 score
    bmrank = sorted(bmij.items(), key=lambda x: x[1], reverse=True)[0:100]
    # dic_result = {}
    keylist = []
    for key in bmrank:
        keylist.append(key[0])
    return keylist
def getSynonym(self, word):
    synonyms = []
    p = porter.PorterStemmer()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            syn = lemma.name()
            # stem the synonym if the stemming parameter is set to True
            if parameters.stemming:
                syn = p.stem(syn, 0, len(syn) - 1)
            if syn not in synonyms:
                synonyms.append(syn)
    return synonyms
def stem(li, cols=0):
    import copy
    import porter
    if cols == 0:
        cols = range(len(li[0]))
    pstemmer = porter.PorterStemmer()
    newlist = copy.deepcopy(li)
    for i in range(len(li)):
        for j in cols:
            # renamed from `string` to avoid shadowing the stdlib module name
            text = str(li[i][j])
            # strip quotes, brackets and simple punctuation before stemming
            for ch in "'" + '"+[]?!\n':
                text = text.replace(ch, '')
            words = text.split(' ')
            newlist[i][j] = ' '.join(
                [pstemmer.stem(x.strip().lower(), 0, len(x.strip()) - 1)
                 for x in words])
    return newlist
def porterTokenizer(corpusString):
    p = porter.PorterStemmer()
    output = ''
    word = ''
    lines = corpusString.split('\n')
    for line in lines:
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
        # flush the last word of the line so it is not lost or
        # merged with the first word of the next line
        if word:
            output += p.stem(word, 0, len(word) - 1)
            word = ''
        output += ' '
    return output.split()
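# Hedged example of the tokenizer above (classic Porter behavior assumed):
# print(porterTokenizer("Connected connections"))  # -> ['connect', 'connect']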
def Q2a():
    p = porter.PorterStemmer()
    text = nltk.load('text.txt', encoding='gbk')
    # code for Q2a
    token_list = nltk.word_tokenize(text)
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''", "-"
    ]
    token_list = [
        word for word in token_list if word not in english_punctuations
    ]
    token_list1 = [w.lower() for w in token_list]
    print(token_list1)
    token_list2 = [p.stem(w) for w in token_list1]
    print(token_list2)
def compare(doc1, doc2):
    """
    strip all punctuation but - and '
    convert to lower case
    store word/occurrence counts in a dict
    """
    words_db = WordsDb(stemmer=porter.PorterStemmer())
    for text in [doc1, doc2]:
        words_db.add_article(text)
    words_db.bulid()
    v1 = words_db.classify(doc1)
    v2 = words_db.classify(doc2)
    print v1
    print v2
    return float(dot(v1, v2) / (norm(v1) * norm(v2)))
def Q3():
    p = porter.PorterStemmer()
    stopwords = []
    with open('stopwords.txt', 'r') as f:
        for line in f:
            stopwords.append(line.rstrip())
    # print(stopwords)
    temp = requests.get("https://www.bbc.com/news/world-us-canada-49871909")
    temp.encoding = 'utf-8'
    soup = BeautifulSoup(temp.content, 'html.parser')
    text_1 = soup.find('div', {'class': 'story-body__inner'}).findAll('p')
    # text_1.remove('<p>')
    text_1 = [part.get_text() for part in text_1]
    text_1 = [nltk.word_tokenize(sen) for sen in text_1]
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''", "-"
    ]
    text_1 = [[word for word in sens if word not in english_punctuations]
              for sens in text_1]
    text_1 = [[word for word in sens if word not in stopwords]
              for sens in text_1]
    text_1 = [nltk.pos_tag(sen) for sen in text_1]
    # print(text_1)
    result = []
    # lemmatize verbs and nouns by POS tag; stem everything else
    for sen in text_1:
        for word in sen:
            if "V" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0].lower(), 'v')
            elif "N" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0], 'n')
            else:
                w = p.stem(word[0])
            result.append(w.lower())
    # print(result)
    fdist = FreqDist(result)
    tops = fdist.most_common(40)
    print(tops)
def preProcessing(content):
    stopSet = getStopSet()
    p = porter.PorterStemmer()
    info = []
    for line in content:
        newLine = ""
        line = line.split(" ")
        for element in line:
            # skip URLs, @mentions, #hashtags and tokens containing '/'
            temp = element.split("://")
            temp1 = element.split("@")
            temp2 = element.split("#")
            temp3 = element.split("/")
            if len(temp) < 2 and len(temp1) < 2 and len(temp2) < 2 and len(temp3) < 2:
                element = element.strip()  # clean the '\n'
                element = element.lower()
                element = element.translate(str.maketrans('', '', string.punctuation))
                element = p.stem(element)
                if element not in stopSet:
                    newLine = newLine + element + " "
        info.append(newLine)
    return info
def Porter_extraction(self, file_path):
    p = porter.PorterStemmer()
    # punctuation and digit patterns to strip after stemming
    r = re.compile('[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+')
    r_num = re.compile('[0-9]')
    text = ''
    with open(file_path, 'r') as infile:
        output = ''
        word = ''
        line = infile.read()
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
        if word:
            # flush a trailing word at end of file
            output += p.stem(word, 0, len(word) - 1)
        text += (' ' + re.sub(r, ' ', re.sub(r_num, ' ', output)))
    return text
import copy
import math
import os
import re
import sys
import porter as pstem  # assumption: the Porter module is imported under this alias

def main():
    # stores count of each word appearing in a file at least once;
    # does not double count word appearances within a single file
    ps = pstem.PorterStemmer()
    fileG = open("groundtruth.csv", "w")
    porter = False
    stopWord = None
    root_dir = sys.argv[1]
    if sys.argv[2] == "TRUE":
        porter = True
    if sys.argv[3] != "NULL":
        stopWord = sys.argv[3]
    if stopWord is not None:
        fileH = open(stopWord, "r")
        stopWord = [lines.rstrip('\n') for lines in fileH.readlines()]
        fileH.close()
    test = [
        dI for dI in os.listdir(root_dir + "/C50test")
        if os.path.isdir(os.path.join(root_dir + "/C50test", dI))
    ]
    # list the train directory (the original listed C50test here by mistake)
    train = [
        dI for dI in os.listdir(root_dir + "/C50train")
        if os.path.isdir(os.path.join(root_dir + "/C50train", dI))
    ]
    totalDict = {}
    allVectors = []

    def read_documents(subdir, names):
        # shared reader for the test and train sets (the original duplicated
        # this loop verbatim for both)
        for name in names:
            dir_name = root_dir + "/" + subdir + "/" + name
            for file in os.listdir(dir_name):
                fileG.write(file + "," + name + "\n")
                file_name = dir_name + "/" + file
                with open(file_name, 'r') as myfile:
                    document = myfile.read().replace('\n', ' ')
                document = document.replace('.', ' ').replace(",", " ") \
                    .replace("\"", "").replace("-", " ")
                document = document.replace("-", "").replace("(", "") \
                    .replace(")", "").replace("$", "")
                document = document.replace("?", "").replace("!", "") \
                    .replace("#", "").replace("/", " ")
                document = re.sub(r'\d', '', document)
                document = document.lower().split()
                hitWords = []
                thisDict = {}
                for word in document:
                    if stopWord is not None and word in stopWord:
                        continue
                    word = word.replace("'", "")
                    if word == "":
                        continue
                    if porter:
                        word = ps.stem(word, 0, len(word) - 1)
                    if word not in hitWords:
                        # document frequency: count each word once per file
                        if word in totalDict:
                            totalDict[word] += 1
                        else:
                            totalDict[word] = 1
                        hitWords.append(word)
                    if word not in thisDict:
                        thisDict[word] = 1
                    else:
                        thisDict[word] += 1
                temp = Vector(file, name)
                temp.vector = thisDict
                allVectors.append(temp)

    print("Reading test data...")
    read_documents("C50test", test)
    print("Reading train data...")
    read_documents("C50train", train)
    print(len(allVectors))

    fileNoNorm = open("plainVector.csv", "w")
    for vect in allVectors:
        fileNoNorm.write(vect.authorName + "," + vect.fileName + ",")
        vectDict = vect.vector
        for elem in vectDict:
            fileNoNorm.write(elem + " " + str(vectDict[elem]) + ",")
        fileNoNorm.write("\n")
    fileNoNorm.close()

    tfidfVect = copy.deepcopy(allVectors)
    for thing in tfidfVect:
        curDict = thing.vector
        maximum = max(curDict.values())
        for k in curDict:
            # normalized tf times log idf (5000 documents in total)
            curDict[k] /= maximum
            curDict[k] = curDict[k] * math.log(5000 / totalDict[k], 2)

    fileNoNorm = open("tfidfVector.csv", "w")
    for vect in tfidfVect:
        fileNoNorm.write(vect.authorName + "," + vect.fileName + ",")
        vectDict = vect.vector
        for elem in vectDict:
            fileNoNorm.write(elem + " " + str(vectDict[elem]) + ",")
        fileNoNorm.write("\n")
    fileNoNorm.close()
    fileG.close()
def test_02():
    stp = text.PickleStrMap.load(my_test_dir + 'stoplist.pkl')
    stm = porter.PorterStemmer()
    tp = text.TextProcessor(' `~!@#$%%^&*()_+{}|\[];\':";\',./?><', stp, stm)
    docset = tp.processFile(my_test_dir + 'poem.txt')
    print docset
def __init__(self):
    # the BM25 model
    self.model = {}
    # the Porter stemmer used to stem the input
    self.porter = porter.PorterStemmer()
class FeatureSet(object):
    """ Set of features for a dictionary. """

    splitter = re.compile("[a-z0-9]+(?:['\-][a-z0-9]+)*", re.I)
    dates = re.compile(r'\b\d\d\d\d\b|\'\d\d\b')
    numbers = re.compile(r"\d\+")
    apos = re.compile("'$")
    stemmer = porter.PorterStemmer()
    words = None
    stop_words = None
    st = None
    wt = None
    pronouns = None
    fset = None

    def __init__(self, name="", features=None):
        """ Initializes a feature set. """
        # Load various libraries / dictionaries if they haven't been
        if FeatureSet.pronouns is None:
            FeatureSet.pronouns = loadDictionary(PRONOUN_FILENAME)
        if FeatureSet.words is None:
            FeatureSet.words = loadDictionary(DICT_FILENAME)
        if FeatureSet.stop_words is None:
            FeatureSet.stop_words = loadDictionary(STOP_FILENAME)
        if FeatureSet.st is None:
            # FeatureSet.st = punkt.PunktSentenceTokenizer(gutenberg.raw(gutenberg.files()))
            FeatureSet.st = punkt.PunktSentenceTokenizer()
        if FeatureSet.wt is None:
            # assign to the class attribute (the original dropped the dot)
            FeatureSet.wt = punkt.PunktWordTokenizer()
        # predefined set of features?
        if features is None:
            self.features = {}
        else:
            self.features = features
        # article name
        self.name = name

    def getFeatures(self):
        return self.features

    def getFeature(self, f):
        if f in self.features:
            return self.features[f]
        return 0

    def incrFeature(self, f):
        if f in self.features:
            self.features[f] += 1
        else:
            self.features[f] = 1

    def setFeature(self, f, val):
        self.features[f] = val

    def __iter__(self):
        return self.features.__iter__()

    def keys(self):
        return self.features.keys()

    def __eq__(self, other):
        for i in self.features:
            if i == "SIMS":
                continue
            if i in other.features:
                if other.features[i] != self.features[i]:
                    return False
            else:
                if self.features[i] != 0:
                    return False
        return True

    def __add__(self, other):
        f = self.features.copy()
        for i in other.features:
            if i in self.features:
                f[i] = self.features[i] + other.features[i]
            else:
                f[i] = other.features[i]
        # pass as the features argument, not the name
        return FeatureSet(features=f)

    def __radd__(self, other):
        for i in other.features:
            if i in self.features:
                self.features[i] += other.features[i]
            else:
                self.features[i] = other.features[i]

    def __mul__(self, other):
        dotprod = 0
        if type(other) == dict:
            fs = other
        else:
            fs = other.features
        for i in fs:
            if i in self.features:
                dotprod += self.features[i] * fs[i]
        return dotprod

    def extractFeatures(self, text):
        """ Extracts features based on text. Clears any existing features. """
        # Working text (will have things deleted)
        wtext = text
        # Clear dict before importing new features
        self.features = {}
        # Clean text
        words = FeatureSet.splitter.findall(text)
        # Number of words, sentences, questions, exclamations
        self.features["WORD"] = len(words)
        self.features["SENT"] = len(FeatureSet.st.tokenize(text))
        self.features["QUES"] = text.count("?")
        self.features["EXCL"] = text.count("!")
        # If we have an article name provided, find instances of that
        if self.name != "":
            occurs = 0
            namesplit = FeatureSet.splitter.findall(self.name)
            for i in namesplit:
                namepart = re.compile(r'\b' + i + r'\b', re.I)
                occurs += len(namepart.findall(text))
                wtext = namepart.sub("", wtext)
            self.features["NAME"] = occurs
        # Find dates
        self.features["DATE"] = len(FeatureSet.dates.findall(wtext))
        wtext = FeatureSet.dates.sub("", wtext)
        # Remove other numbers
        self.features["NUM"] = len(FeatureSet.numbers.findall(wtext))
        wtext = FeatureSet.numbers.sub("", wtext)
        # Now look for words / bigrams / positions
        pronouns = 0  # num pronouns
        propers = 0   # num proper nouns
        prev = ""     # end marker
        i = -1.0
        length = len(wtext)
        wtext_words = FeatureSet.splitter.findall(wtext)
        for w in wtext_words:
            i += 1
            wl = w.lower()
            if wl in FeatureSet.pronouns:
                pronouns += 1
                continue
            if wl in FeatureSet.stop_words:
                # If this is a stop word, just ignore it
                continue
            if not wl in FeatureSet.words and wl != w:
                # Capitalized and not in word list, so assume it's a proper noun
                propers += 1
                continue
            ws = FeatureSet.stemmer.stem(wl, 0, len(wl) - 1)
            ws = FeatureSet.apos.sub("", ws)
            if FeatureSet.fset is None or "UNI_" + ws.upper() in FeatureSet.fset:
                self.incrFeature("UNI_" + ws.upper())
            if prev != "" and (FeatureSet.fset is None or
                               "BI_" + prev.upper() + "_" + ws.upper() in FeatureSet.fset):
                self.incrFeature("BI_" + prev.upper() + "_" + ws.upper())
            if ("POS_" + ws.upper() not in self.features) and \
                    (FeatureSet.fset is None or "POS_" + ws.upper() in FeatureSet.fset):
                self.features["POS_" + ws.upper()] = i / length
            prev = ws
        firstword = FeatureSet.stemmer.stem(words[0], 0, len(words[0]) - 1).upper()
        if FeatureSet.fset is None or "FIRST_" + firstword in FeatureSet.fset:
            self.features["FIRST_" + firstword] = 1
        if len(words) > 1:
            secondword = FeatureSet.stemmer.stem(words[1], 0, len(words[1]) - 1).upper()
            if FeatureSet.fset is None or "SECOND_" + firstword + "_" + secondword in FeatureSet.fset:
                self.features["SECOND_" + firstword + "_" + secondword] = 1
        self.features["PROP"] = propers
        self.features["PRON"] = pronouns
def call_query(query, collection, i, brf, brf_count, brf_number_words,
               brf_from, stopwords, thesaurus, normalization):
    # clean query
    if parameters.case_folding:
        query = query.lower()
    query = re.sub(r'[^ a-zA-Z0-9]', ' ', query)
    query = re.sub(r'\s+', ' ', query)
    query_words = query.split(' ')
    if thesaurus:
        # expand the query with synonyms of every term
        query_with_thesaurus = []
        for word in query_words:
            query_with_thesaurus.append(word)
            for syn in getSynonyms(word):
                query_with_thesaurus.append(syn)
        query_words = query_with_thesaurus
    # create accumulators and other data structures
    accum = {}
    filenames = []
    p = porter.PorterStemmer()
    # get N
    f = open("indexes/" + collection + "_index_N", "r")
    N = eval(f.read())
    f.close()
    # get document lengths/titles
    titles = {}
    f = open("indexes/" + collection + "_index_len", "r")
    lengths = f.readlines()
    f.close()
    # get index for each term and calculate similarities using accumulators
    for term in query_words:
        if stopwords and (term in stop_words):
            continue
        if term != '':
            if parameters.stemming:
                term = p.stem(term, 0, len(term) - 1)
            if not os.path.isfile("indexes/" + collection + "_index/" + term):
                continue
            f = open("indexes/" + collection + "_index/" + term, "r")
            lines = f.readlines()
            idf = 1
            if parameters.use_idf:
                df = len(lines)
                idf = 1 / df
                if parameters.log_idf:
                    idf = math.log(1 + N / df)
            for line in lines:
                mo = re.match(r'([0-9]+)\:([0-9\.]+)', line)
                if mo:
                    file_id = mo.group(1)
                    tf = float(mo.group(2))
                    if not file_id in accum:
                        accum[file_id] = 0
                    if parameters.log_tf:
                        tf = (1 + math.log(tf))
                    accum[file_id] += (tf * idf)
            f.close()
    # parse lengths data and divide by |N| and get titles
    for l in lengths:
        mo = re.match(r'([0-9]+)\:([0-9\.]+)\:(.+)', l)
        if mo:
            document_id = mo.group(1)
            length = eval(mo.group(2))
            title = mo.group(3)
            if document_id in accum:
                if normalization:
                    accum[document_id] = accum[document_id] / length
                titles[document_id] = title
    # collect the top ten results
    results = sorted(accum, key=accum.__getitem__, reverse=True)
    final_result = []
    # print(collection + " " + query)
    for c in range(min(len(results), 10)):
        # print("{0:10.8f} {1:5} {2}".format(accum[results[c]], results[c], titles[results[c]]))
        final_result.append([accum[results[c]], results[c]])
    if brf and brf_count == 0:
        # blind relevance feedback: append the top tf-idf words of the
        # top-ranked documents to the query and run it once more
        total = 0
        for result in results:
            if total >= brf_from:
                break
            total += 1
            document = result
            # accumulation = result[0]
            f = open("indexes/tf-idf/testbed" + str(i) + "_document_" +
                     str(document) + "_tf-idf", "r")
            lines = f.readlines()
            f.close()
            c = 0
            d = 0
            word = ""
            while c < brf_number_words and len(lines) > d:
                mo = lines[d].split(":")
                if (word == mo[1].replace("\n", "") or
                        mo[1].replace("\n", "") in stop_words):
                    d += 1
                    continue
                word = mo[1].replace("\n", "")
                query += " " + word
                d += 1
                c += 1
        final_result = call_query(query, collection, i, brf, brf_count + 1,
                                  brf_number_words, brf_from, stopwords,
                                  thesaurus, normalization)
    return final_result
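# Hedged restatement (hypothetical helper, not in the original) of the term
# weighting applied inside call_query() above: tf is optionally log-damped and
# idf is either 1/df or log(1 + N/df), mirroring the parameters.log_tf,
# parameters.use_idf and parameters.log_idf flags.
import math

def term_weight(tf, df, N, log_tf=True, use_idf=True, log_idf=True):
    if log_tf:
        tf = 1 + math.log(tf)
    idf = 1
    if use_idf:
        idf = math.log(1 + N / df) if log_idf else 1 / df
    return tf * idf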
def main(q_id, collection_name, query_text):
    print(q_id + " " + collection_name + " " + query_text)
    MIN_RESULT_LENGTH = 30
    OUT_DIR = "testbed/"
    RESULT_FILE = "control_results.txt"
    if parameters.use_thesaurus:
        RESULT_FILE = "thesaurus_results.txt"
    # construct collection and query
    query_id = q_id
    collection = collection_name
    query = query_text
    # clean query
    if parameters.case_folding:
        query = query.lower()
    query = re.sub(r'[^ a-zA-Z0-9]', ' ', query)
    query = re.sub(r'\s+', ' ', query)
    query_words = query.split(' ')
    # Check if using thesaurus.
    # The design is to get synonyms for each query term, excluding synonyms
    # that are longer than one word: breaking a multi-word synonym into its
    # constituent words might not make sense as a synonym, particularly since
    # the system searches on a term basis and not a phrase basis.
    if parameters.use_thesaurus:
        added_synonyms = []
        for term in query_words:
            thesaurus = py_thesaurus.WordAnalyzer(term)
            synonyms = thesaurus.get_synonym()
            # ignore synonyms that are more than one word long
            allowed_synonyms = []
            for s in synonyms:
                if len(s.split(" ")) == 1:
                    allowed_synonyms.append(s)
            for s in allowed_synonyms:
                if s not in added_synonyms:
                    added_synonyms.append(s)
        query_words.extend(added_synonyms)  # list of synonyms for a word
    # create accumulators and other data structures
    accum = {}
    filenames = []
    p = porter.PorterStemmer()
    # get N
    f = open(collection + "_index_N", "r")
    N = eval(f.read())
    f.close()
    # get document lengths/titles
    titles = {}
    f = open(collection + "_index_len", "r")
    lengths = f.readlines()
    f.close()
    # get index for each term and calculate similarities using accumulators
    for term in query_words:
        if term != '':
            if parameters.stemming:
                term = p.stem(term, 0, len(term) - 1)
            if not os.path.isfile(collection + "_index/" + term):
                continue
            f = open(collection + "_index/" + term, "r")
            lines = f.readlines()
            idf = 1
            if parameters.use_idf:
                df = len(lines)  # document frequency of a word
                idf = 1 / df
                if parameters.log_idf:
                    idf = math.log(1 + N / df)
            for line in lines:
                mo = re.match(r'([0-9]+)\:([0-9\.]+)', line)
                if mo:
                    file_id = mo.group(1)
                    tf = float(mo.group(2))
                    if not file_id in accum:
                        accum[file_id] = 0
                    if parameters.log_tf:
                        tf = (1 + math.log(tf))
                    accum[file_id] += (tf * idf)
            f.close()
    # parse lengths data and divide by |N| and get titles
    for l in lengths:
        mo = re.match(r'([0-9]+)\:([0-9\.]+)\:(.+)', l)
        if mo:
            document_id = mo.group(1)
            length = eval(mo.group(2))
            title = mo.group(3)
            if document_id in accum:
                if parameters.normalization:
                    accum[document_id] = accum[document_id] / length
                titles[document_id] = title
    # print top results
    result = sorted(accum, key=accum.__getitem__, reverse=True)
    for i in range(min(len(result), MIN_RESULT_LENGTH)):
        print("{0:10.8f} {1:5} {2}".format(accum[result[i]], result[i],
                                           titles[result[i]]))

    def write_to_result_file(result, query_id):
        run_id = "control"
        output = OUT_DIR + RESULT_FILE
        if parameters.use_thesaurus:
            run_id = "thesaurus"
        if not os.path.isdir(OUT_DIR):
            os.mkdir("testbed")
            output = "testbed/" + RESULT_FILE
        print("Writing results to: " + output)
        with open(output, "a") as f:
            for i in range(min(len(result), MIN_RESULT_LENGTH)):
                # <query-id> <literal '0'> <document-id> <rank> <score> <run-id>
                f.write(str(query_id) + " 0 " + str(result[i]) + " " + str(i) +
                        " " + str(accum[result[i]]) + " " + run_id + "\n")

    write_to_result_file(result, query_id)
def __init__(self):
    LanguageModule.__init__(self)
    import porter
    self._stemmer = porter.PorterStemmer()
class Compare:
    score = 0
    doc1 = ''
    doc2 = ''
    __splitter = re.compile("[a-zA-Z\-']+", re.I)
    __stemmer = porter.PorterStemmer()

    def __del__(self):
        class_name = self.__class__.__name__
        print class_name, "destroyed"

    def setDoc1(self, doc1):
        self.doc1 = doc1

    def setDoc2(self, doc2):
        self.doc2 = doc2

    def add_word(self, word, d):
        """
        Adds a word to a word/count dictionary;
        first checks for stop words,
        then converts the word to its stemmed version
        """
        w = word.lower()
        # if w not in stop_words:
        #     ws = stemmer.stem(w, 0, len(w) - 1)
        ws = w
        d.setdefault(ws, 0)
        d[ws] += 1

    def doc_vec(self, doc, key_idx):
        v = zeros(len(key_idx))
        for word in self.__splitter.findall(doc):
            # keydata = key_idx.get(stemmer.stem(word, 0, len(word) - 1).lower(), None)
            keydata = key_idx.get(word.lower(), None)
            # if keydata: v[keydata[0]] = 1
            if keydata:
                v[keydata[0]] += 1
        return v

    def compare(self):
        # strip all punctuation but - and '
        # convert to lower case
        # store word/occurrence counts in a dict
        all_words = dict()
        for dat in [self.doc1, self.doc2]:
            [self.add_word(w, all_words) for w in self.__splitter.findall(dat)]
        # build an index of keys so that we know the word positions for the vector
        key_idx = dict()  # key -> (position, count)
        keys = all_words.keys()
        keys.sort()
        for i in range(len(keys)):
            key_idx[keys[i]] = (i, all_words[keys[i]])
        del keys
        del all_words
        v1 = self.doc_vec(self.doc1, key_idx)
        v2 = self.doc_vec(self.doc2, key_idx)
        # angle between the two vectors, in degrees
        # return math.acos(float(dot(v1, v2) / (norm(v1) * norm(v2))))
        try:
            degreeScore = math.degrees(
                math.acos(float(dot(v1, v2) / (norm(v1) * norm(v2)))))
        except:
            degreeScore = 0
        return degreeScore
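# Hedged usage sketch for the (Python 2) Compare class above; the documents
# are illustrative only. compare() returns the angle in degrees between the
# two term vectors, so 0 means identical term distributions.
# c = Compare()
# c.setDoc1("the cat sat on the mat")
# c.setDoc2("the cat sat on the hat")
# print c.compare()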
def index(folder_name, i):
    print('Indexing Testbed ' + str(i))
    # Make index directories
    try:
        os.makedirs("indexes/testbed" + str(i) + "_index")
    except:
        pass
    try:
        os.makedirs("indexes/tf-idf/")
    except:
        pass
    data = {}
    print("indexing testbed" + str(i), end="")
    # read in files
    for j in range(1, 201):
        document = ''
        f = open(folder_name + "/document." + str(j), "r", encoding="ISO-8859-1")
        if parameters.case_folding:
            for line in f.readlines():
                document += line.lower() + " "
        else:
            for line in f.readlines():
                document += line + " "
        if document != '':
            data[str(j)] = document
        f.close()
    # document length/title file
    g = open("indexes/" + "testbed" + str(i) + "_index_len", "w")
    # create inverted files in memory and save titles/N to file
    index = {}
    N = len(data.keys())
    p = porter.PorterStemmer()
    for key in data:
        # write over (truncate) any previous per-document tf-idf file
        tf_idf = open("indexes/tf-idf/" + "testbed" + str(i) + "_document_" +
                      str(key) + "_tf-idf", "w")
        tf_idf.write("")
        tf_idf.close()
        content = re.sub(r'[^ a-zA-Z0-9]', ' ', data[key])
        content = re.sub(r'\s+', ' ', content)
        words = content.split(' ')
        doc_length = 0
        for word in words:
            if word != '':
                if parameters.stemming:
                    word = p.stem(word, 0, len(word) - 1)
                doc_length += 1
                if not word in index:
                    index[word] = {key: 1}
                else:
                    if not key in index[word]:
                        index[word][key] = 1
                    else:
                        index[word][key] += 1
        # docid:length:title (the document id doubles as the title here)
        print(key, doc_length, key, sep=':', file=g)
    # close the document length/title file
    g.close()
    tf_idf_arr = {}  # dict of tf-idf scores
    for key in index:
        if len(key) > 30:
            continue
        f = open("indexes/testbed" + str(i) + "_index/" + key, "w")
        for entry in index[key]:
            if not (entry in tf_idf_arr):
                tf_idf_arr[entry] = []
            # additionally calculate the tf-idf for use in blind relevance feedback
            print(entry, index[key][entry], sep=':', file=f)
            tf = float(index[key][entry])
            idf = 1
            if parameters.use_idf:
                df = len(index[key])
                idf = 1 / df
                if parameters.log_idf:
                    idf = math.log(1 + N / df)
            tf_idf_arr[entry].append([tf * idf, key])
        f.close()
    # write N
    f = open("indexes/testbed" + str(i) + "_index_N", "w")
    print(N, file=f)
    f.close()
    # sort on tf-idf
    for j in tf_idf_arr:
        tf_idf_arr[j].sort(key=lambda k: (k[0], k[1]), reverse=True)
        # Write tf-idf to file
        tf_idf = open("indexes/tf-idf/" + "testbed" + str(i) + "_document_" +
                      str(j) + "_tf-idf", "w")
        for line in tf_idf_arr[j]:
            print(line[0], line[1], sep=':', file=tf_idf)
        tf_idf.close()
    print('Indexing Testbed ' + str(i) + ' Done')
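# For orientation (inferred from index() above and the reader in call_query()
# earlier): the on-disk layout is one postings file per term containing
# "docid:tf" lines, e.g.
#   12:3
#   175:1
# plus testbed<i>_index_N (the document count) and testbed<i>_index_len
# ("docid:length:title" lines, with the doc id standing in for the title).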
def __init__(self):
    self.stemmer = porter.PorterStemmer()
linenum = 0
tweet = []
sent = []
#for row in reader:
#    sent.append(row[0])
#    tweet.append(row[1])
#print '[log]-Done reading csv'
# read the raw tweets line by line instead of via the csv reader
for line in input_file:
    tweet.append(line.rstrip('\n'))
input_file.close()

# use porter to stem english words
stemmed_tweets = []
p = porter.PorterStemmer()
for t in tweet:
    # drop URLs and @usernames before stemming
    t_n = re.sub(url + '|' + username, '', t, flags=re.MULTILINE)
    t_n = t_n.strip()
    tweet_list = t_n.split()
    s = ''
    for w in tweet_list:
        s_w = p.stem(w, 0, len(w) - 1)
        s = s + ' ' + s_w
    s = s.strip()
    stemmed_tweets.append(s)
print '[log]- stemming done'
pickle.dump(stemmed_tweets, open('prep_tweets.p', 'wb'))
def stemword(word):
    p = porter.PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
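# Example (classic Porter test case): both calls should return "caress".
# stemword("caresses")  # -> "caress"
# stemword("caress")    # -> "caress"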