def __init__(self):
    f = formatter.NullFormatter()  # formatter.AbstractFormatter(formatter.DumbWriter())
    # htmllib.HTMLParser.__init__(self, f)
    sgmllib.SGMLParser.__init__(self, f)
    self.SqliteDB = SqliteDatabase(Globals.DBName)
    self.Stemmer = PorterStemmer()
    self.ReadStopWords('stopwords.txt')
    # self.textData = ""
    # self.BitMap = BitMap
    # self.WordFrequency = {}
    # self.splitter = re.compile(r'\W+', re.I)
    self.splitter = re.compile(r'\s+', re.I)
    # self.badWords = re.compile(r'.*\\*\/*_*\d+.*\\*\/*_*.*', re.I)
    self.DigitWord = re.compile(r'\b\d+\b', re.I)
    self.AlphaNumericWord = re.compile(r'\w+', re.I)
    # self.doubleSlashes = re.compile(r'\\*', re.I)
    self.tagType = ""
    self.REUTERSTOPICS = ""
    self.LEWISSPLIT = ""
    self.CGISPLIT = ""
    self.NEWID = ""
    self.DATE = ""
    self.MKNOTE = ""
    self.TOPICS = ""
    self.PLACES = ""
    self.UNKNOWN = ""
    self.AUTHOR = ""
    self.DATELINE = ""
    self.TITLE = ""
    self.PEOPLE = ""
    self.ORGS = ""
    self.EXCHANGES = ""
    self.COMPANIES = ""
    self.TEXTTYPE = ""
    self.DateHandled = False
    self.InTagDate = False
    self.MknoteHandled = False
    self.InTagMknote = False
    self.InTagTitle = False
    self.InTagDateline = False
    self.InTagBody = False
    self.InTagTopics = False
    self.InTagPlaces = False
    self.InTagPeople = False
    self.InTagOrgs = False
    self.InTagExchanges = False
    self.InTagCompanies = False
    self.InTagAuthor = False
    self.InTagUnknown = False
def porter_stemmer(self, words_list):
    p = PorterStemmer()
    return_list = []
    for word in words_list:
        if word.isalpha():
            return_list.append(p.stem(word, 0, len(word) - 1))
        else:
            return_list.append(word)
    return return_list
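For reference: every snippet in this section that calls stem(word, 0, len(word) - 1) is using the classic porter.py interface, where the second and third arguments are the start and end offsets of the word inside the buffer. A minimal, hypothetical call sketch (the token values are invented):

p = PorterStemmer()
for w in ['running', 'flies', '42nd']:
    if w.isalpha():
        print(p.stem(w, 0, len(w) - 1))  # 'run', 'fli'
    else:
        print(w)  # non-alphabetic tokens pass through unstemmed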
def TokenStem(document):
    stemmer = PorterStemmer()
    returner = []
    document = document.lower().split(' ')
    for word in document:
        if word not in stopwords.words('english'):
            word = re.sub('[^A-Za-z0-9]+', '', word)
            if word:  # skip tokens emptied by the substitution
                word = stemmer.stem(word, 0, len(word) - 1)
                returner.append(word)
    return returner
def clear(self, dataset, patter, replace=' ', join=' '):
    corpus = []
    ps = PorterStemmer()
    stop_words = set(stopwords.words(self.lang))  # build the set once, not per review
    for e in dataset.values:
        review = re.sub(patter, replace, e)
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if word not in stop_words]
        review = join.join(review)
        corpus.append(review)
    return corpus  # hand back the cleaned corpus
def __init__(self, win, startTime, rootPath):
    import HTMLParser
    self.win = win
    self.StartTime = startTime
    self.rootPath = rootPath
    self.DocID = 0
    self.WordID = 0
    self.StemmedWordID = 0
    self.DirCount = 0
    self.FilesCount = 0
    self.WordCount = 0
    self.StemmedWordCount = 0
    self.ElapsedTime = ""
    self.ParseStatus = "Indexing in Progress..."
    self.KeyColumnNames = ""
    self.UseStemmer = False
    self.Stemmer = None
    # self.SetupTextCatDB()
    # DBFunctions.SetupTextCatTables(Globals.TextCatFileName)
    DBFunctions.SetupSqliteIndexTables(Globals.TextCatFileName)
    self.EventStart = time.time()
    if Globals.Stemmer == "Porter Stemmer":
        self.Stemmer = PorterStemmer()
    self.FileScanStartTime = time.time()
    self.fout = None
def boolean_search(self, text):
    results = []
    PS = PorterStemmer.PorterStemmer()
    stemmed_Search = []
    words = text.split()
    for word in words:
        stemmed_Search.append(PS.stem(word, 0, len(word) - 1))
    count = 0
    for word in words:
        if re.search('AND', word):  # AND query: intersect the neighbouring terms
            results += self.computeAND(words[count - 1], words[count + 1])
            return results
        if re.search('OR', word):  # OR query: union the neighbouring terms
            results += self.computeOR(words[count - 1], words[count + 1])
            return results
        count += 1  # advance the operand index alongside the loop
    # single-term query without AND/OR
    if stemmed_Search[0] in self._inverted_index:
        results = self._inverted_index[stemmed_Search[0]]
    return results
def boolean_search(self, text):
    results = []
    actualResults = ""
    stem = PorterStemmer.PorterStemmer()
    words = text.split()
    if len(words) == 1:
        results = self._inverted_index[stem.stem(words[0], 0, len(words[0]) - 1)]
    else:
        results1 = self._inverted_index[stem.stem(words[0], 0, len(words[0]) - 1)]
        results2 = self._inverted_index[stem.stem(words[2], 0, len(words[2]) - 1)]
        if words[1] == "AND":
            results = set(results1) & set(results2)
        if words[1] == "OR":
            results = set(results1).union(results2)
    for thingy in results:
        actualResults += str(thingy)
    return actualResults
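A hedged usage sketch for the two-term search above: it expects queries shaped like `term` or `term1 AND term2` / `term1 OR term2`, and the index contents below are invented for illustration.

# searcher is any object exposing boolean_search and an _inverted_index
# mapping stems to posting lists, e.g. {'footbal': [1, 3], 'basebal': [3, 5]}
print(searcher.boolean_search("football AND baseball"))  # -> "3"
print(searcher.boolean_search("football OR baseball"))   # -> "135" (set order not guaranteed)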
def stemm(line):
    p = PorterStemmer()  # the Porter stemmer is initialized here
    line += " "
    line1 = ""
    element = ''
    for c in line:
        if c.isalpha():
            element += c.lower()
        else:
            if element:
                element = p.stem(element, 0, len(element) - 1)
                line1 += element
                line1 += " "
                element = ''
    return line1
def get_stems():
    """
    Returns the array of the filtered stems according to the
    conditions mentioned in the paper.
    @return: stemarray
    """
    stemarray = []
    p = ps.PorterStemmer()
    infile = open("./part-of-speech.txt", 'r')
    word = ''  # accumulates the current alphabetic run
    while 1:
        output = ''
        line = infile.readline()
        line = line.split('\t')[0]
        if line == '':
            break
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
        if len(output) > 2 and output not in stemarray:
            stemarray.append(output)
    infile.close()
    return stemarray
def __init__(self, win, startTime):
    import HTMLParser
    self.win = win
    self.StartTime = startTime
    self.DocID = 0
    self.WordID = 0
    self.StemmedWordID = 0
    self.DirCount = 0
    self.FilesCount = 0
    self.WordCount = 0
    self.StemmedWordCount = 0
    self.ElapsedTime = ""
    self.ParseStatus = "Indexing in Progress..."
    self.KeyColumnNames = ""
    self.UseStemmer = False
    self.Stemmer = None
    # self.SetupTextCatDB()
    DBFunctions.SetupTextCatTables(Globals.TextCatFileName)
    # self.timerStatus = wx.Timer(id=wx.NewId(), owner=self)
    # self.Bind(wx.EVT_TIMER, self.OnTimerStatusTimer,
    #           id=self.timerStatus.GetId())
    self.EventStart = time.time()
    self.splitter = re.compile(r'\W*')
    # self.DigitWord = re.compile(r'[a-z]*\d+[a-z]*', re.I)
    if Globals.Stemmer == "Porter Stemmer":
        self.Stemmer = PorterStemmer()
        # self.UseStemmer = True
    self.htmlParser = HTMLParser.HTMLParser(self.Stemmer)
    self.textParser = TextParser.TextParser(self.Stemmer)
def stemming(self, tokens):
    stemmed_tokens = []
    porter = PorterStemmer.PorterStemmer()
    for token in tokens:
        stemmed_tokens.append(porter.stem(token, 0, len(token) - 1))
    return stemmed_tokens
def __init__(self, path=None, fn=None):
    fin = open(path + fn, 'rbU')
    self.stopwords = set()
    for f in fin:
        token = filter(lambda x: x in string.ascii_letters, f)
        stemmed_token = stemmer.stem(token, 0, len(token) - 1)
        self.stopwords.add(token)
        self.stopwords.add(stemmed_token)
def stemming(self, tokens):
    stemmed_tokens = []
    PS = PorterStemmer.PorterStemmer()
    for word in tokens:
        stemmed_tokens.append(PS.stem(word, 0, len(word) - 1))
    return stemmed_tokens
def stemming(self, tokens):
    stemmed_tokens = []
    stemmer = PorterStemmer.PorterStemmer()
    for token in tokens:
        stemmed = stemmer.stem(token, 0, len(token) - 1)
        stemmed_tokens.append(stemmed)
    return stemmed_tokens
def stemming(self, tokens):
    stemmed_tokens = []
    stemmer = PorterStemmer.PorterStemmer()
    for token in tokens:
        stemmed_token = stemmer.stem(token, 0, len(token) - 1)
        stemmed_tokens.append(stemmed_token)
    return stemmed_tokens
def stemming(self, tokens):
    stemmed_tokens = []
    stemmer = PorterStemmer.PorterStemmer()  # construct once, not per token
    for token in tokens:
        stemmed_tokens.append(stemmer.stem(token, 0, len(token) - 1))
    return stemmed_tokens
def Rocchio(self, invertedFile, documentsList, relevantDocs):
    p = PorterStemmer.PorterStemmer()
    weights = {}
    for term in invertedFile.iterkeys():
        sterm = term
        if STEM_IN_ROCCHIO:
            sterm = p.stem(term.lower(), 0, len(term) - 1)
        weights[sterm] = 0.0  # initialize weight vector for each key in the inverted file
    print ''
    relevantDocsTFWeights = {}
    # ------------------------------------- #
    # Compute the relevantDocsTFWeights vector
    for docId in relevantDocs:
        doc = documentsList[docId]
        for term in doc["tfVector"]:
            sterm = term
            if STEM_IN_ROCCHIO:
                sterm = p.stem(term.lower(), 0, len(term) - 1)
            if sterm in relevantDocsTFWeights:
                relevantDocsTFWeights[sterm] += doc["tfVector"][term]
            else:
                relevantDocsTFWeights[sterm] = doc["tfVector"][term]
    # ------------------------------------- #
    # Compute the Rocchio vector
    for term in invertedFile.iterkeys():
        idf = math.log(float(len(documentsList)) / float(len(invertedFile[term].keys())), 10)
        sterm = term
        if STEM_IN_ROCCHIO:
            sterm = p.stem(term.lower(), 0, len(term) - 1)
        # Term 2 of the Rocchio algorithm
        for docId in invertedFile[term].iterkeys():
            if documentsList[docId]['IsRelevant'] == 1:
                # Relevant-document weights, normalized and given BETA weight
                weights[sterm] += constants.BETA * idf * (relevantDocsTFWeights[sterm] / len(relevantDocs))
        # Term 1 of Rocchio: query terms
        if term in self.query:
            self.query[term] = constants.BETA * self.query[term] + weights[sterm]  # build new query vector of weights
        elif weights[sterm] > 0:
            self.query[term] = weights[sterm]
    with open('output_lucene_after_relevance_feedback.txt', 'w') as file:
        file.write(pickle.dumps(self.query))
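For orientation, the loop above implements a truncated Rocchio update, q' = BETA * q + BETA * idf * (tf_rel / |D_rel|), with no gamma (non-relevant) term. A tiny numeric sketch with invented values:

BETA, idf = 0.75, 2.0
tf_relevant, n_relevant = 4.0, 2      # invented numbers
old_query_weight = 1.0
new_weight = BETA * old_query_weight + BETA * idf * (tf_relevant / n_relevant)
print(new_weight)  # 0.75 + 3.0 = 3.75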
def stemWords(list_of_tokens):
    p = PorterStemmer.PorterStemmer()  # instance of a Porter stemmer
    stemmed_list = []
    for token in list_of_tokens:
        if token.isalpha():
            stemmed_list.append(p.stem(token.lower(), 0, len(token) - 1))
        else:
            # if a non-alphabetical character exists, no stemming!
            stemmed_list.append(token.lower())
    return stemmed_list
def getTopK(q, docIDList, pageRank, cache, zoneCache, normDict, zoneNorm, btree, zoneBtree):
    scores = dict()
    q = q.strip()
    # remove AND and OR
    regex1 = re.compile("AND")
    regex2 = re.compile("OR")
    q = re.sub(regex1, "", q)
    q = re.sub(regex2, "", q)
    if isWrappedInQuotes(q):
        q = q.strip('"')
    listOfTerms = re.split('[^a-zA-Z0-9*]+', q.lower())
    if listOfTerms[0] == '':
        listOfTerms.pop(0)
    if listOfTerms[-1] == '':
        listOfTerms.pop()
    stopwordedTermList = []
    for t in listOfTerms:
        if t not in stopWordSet:
            stopwordedTermList.append(t)
    docScores = dict()
    zoneScores = dict()
    # compute the scores in the "normal" index
    for term in stopwordedTermList:
        if term.find("*") > -1:
            docScores = updateDocScoresWildcard(term, docScores, docIDList, cache, normDict, btree)
        else:
            docScores = updateDocScoresRegular(term, docScores, docIDList, cache, normDict)
    # compute the scores in the "zone" index, which indexes the top section
    # of each wikipedia page
    for term in stopwordedTermList:
        if term.find("*") > -1:
            zoneScores = updateDocScoresWildcard(term, zoneScores, docIDList, zoneCache, zoneNorm, zoneBtree)
        else:
            zoneScores = updateDocScoresRegular(term, zoneScores, docIDList, zoneCache, zoneNorm)
    # stem the query words before passing them to the final ranking function
    queryTerms = []
    for term in stopwordedTermList:
        queryTerms.append(PorterStemmer.stemWord(pstemmer, term))
    return computeFinalRanking(queryTerms, docIDList, docScores, zoneScores, pageRank, cache, zoneCache)
def __init__(self):
    f = formatter.NullFormatter()  # formatter.AbstractFormatter(formatter.DumbWriter())
    # htmllib.HTMLParser.__init__(self, f)
    sgmllib.SGMLParser.__init__(self, f)
    self.SqliteDB = SqliteDatabase(Globals.DBName)
    self.Stemmer = PorterStemmer()
    # self.textData = ""
    # self.BitMap = BitMap
    # self.WordFrequency = {}
    self.splitter = re.compile(r'\W+', re.I)
    # self.splitter = re.compile(r'\s+', re.I)
    # self.badWords = re.compile(r'.*\\*\/*_*\d+.*\\*\/*_*.*', re.I)
    # self.DigitWord = re.compile(r'\b\d+\b', re.I)
    self.DigitWord = re.compile(r'[a-z]*\d+[a-z]*', re.I)
    self.AlphaNumericWord = re.compile(r'[a-z]*\W+[a-z]*', re.I)
    self.AlphabeticWord = re.compile(r'[a-z]+')
    # self.doubleSlashes = re.compile(r'\\*', re.I)
    self.BodyData = ""
def stem_text(text):
    stemmer = ps.PorterStemmer()
    output = ''
    word = ''
    for c in text:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += stemmer.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    if word:  # flush a word still pending at the end of the text
        output += stemmer.stem(word, 0, len(word) - 1)
    return output
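A minimal call sketch for stem_text, assuming `ps` is the imported porter-stemmer module used above; the output shown is what classic Porter rules would produce:

print(stem_text("Running quickly!"))  # roughly: "run quickli!"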
def content_terms_generate(self, content):
    content_dict = dict()
    # Generate a query term list from the content.
    stemmer = ps.PorterStemmer()
    content_terms = []
    term_list = content.strip().lower().split()
    # filter stopwords into a new list (avoids mutating the list while
    # iterating over it, which skips elements)
    term_list = [word for word in term_list if word not in self.stop_words]
    for term in term_list:
        content_terms.append(stemmer.stem(term, 0, len(term) - 1))
    # Turn the stemmed term list into a frequency dictionary.
    for term in content_terms:
        content_dict[term] = 1 if term not in content_dict else content_dict[term] + 1
    return content_dict
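The returned dictionary is a plain term-frequency vector. A hedged example, where `indexer` is a hypothetical owner object, 'the' is assumed to be in self.stop_words, and the stems follow classic Porter rules:

print(indexer.content_terms_generate("the cat saw the cats"))
# -> {'cat': 2, 'saw': 1}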
def tokenizeFreeText(s):
    listOfTerms = re.split('[^a-z0-9]+', s.lower())
    if listOfTerms[0] == '':
        listOfTerms.pop(0)
    if listOfTerms[-1] == '':
        listOfTerms.pop()
    newTermList = []
    for term in listOfTerms:
        if term not in stopWordSet:
            term = PorterStemmer.stemWord(pstemmer, term)
            newTermList.append(term)
    return newTermList
def add_page_to_index_re(index, url, content):
    i = 0
    # It is not a good idea to use a regular expression to parse HTML;
    # this is just a quick and dirty result. To parse HTML pages in
    # practice you should use a DOM parser.
    regex = re.compile('(?<!script)[>](?![\s\#\'-<]).+?[<]')
    p = PorterStemmer.PorterStemmer()
    for words in regex.findall(content):
        word_list = split_string(words, """ ,"!-.()<>[]{};:?!-=`&""")
        for word in word_list:
            # word = stem(word, p)
            if len(word) > 2:  # index only words longer than two characters
                add_to_index(index, word, url)
    return i
def preProcessString(s):
    isWrapped = isWrappedInQuotes(s)
    s = s.strip()
    s = s.strip('"')
    # produce nice spacing for boolean expressions
    s = s.replace("(", " ( ")
    s = s.replace(")", " ) ")
    listOfTerms = re.split('[^a-zA-Z0-9*()]+', s)
    if listOfTerms[0] == '':
        listOfTerms.pop(0)
    if listOfTerms[-1] == '':
        listOfTerms.pop()
    newTermList = []
    for term in listOfTerms:
        if term in ("AND", "OR", "(", ")"):
            newTermList.append(term)
        elif term.find("*") > -1:
            newTermList.append(term.lower())
        elif term not in stopWordSet:
            term = PorterStemmer.stemWord(pstemmer, term.lower())
            newTermList.append(term)
    # ignore consecutive ANDs or ORs
    lastWasBool = False
    newTermList2 = []
    for term in newTermList:
        if term in ("AND", "OR"):
            if not lastWasBool:
                newTermList2.append(term)
            lastWasBool = True
        else:
            lastWasBool = False
            newTermList2.append(term)
    toReturn = ' '.join(newTermList2)
    if isWrapped:
        toReturn = '"' + toReturn + '"'
    return toReturn
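A hedged before/after example for preProcessString, assuming 'the' is in stopWordSet and classic Porter stems:

print(preProcessString('(running AND AND the jumps)'))
# expected roughly: '( run AND jump )'
# stopwords are dropped, plain terms are stemmed, parentheses and a single
# AND survive, and the duplicate AND is collapsed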
def buildIndexDictionary(listOfTerms, bigDict):
    # build the inverted index; docID, stopWordSet and pstemmer are
    # expected to be defined in the enclosing module
    index = 0
    for term in listOfTerms:
        if term in stopWordSet:
            continue
        term = PorterStemmer.stemWord(pstemmer, term)
        if term in bigDict:
            littleDict = bigDict[term]
            if docID in littleDict:
                littleDict[docID].append(index)
            else:
                littleDict[docID] = [index]
        else:
            bigDict[term] = dict({docID: [index]})
        index += 1
    return bigDict
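A hedged sketch of the posting-list shape this builds; the docID value and the terms below are invented for illustration:

# with docID = 7 and no stopwords among the terms:
bigDict = buildIndexDictionary(['cat', 'dog', 'cat'], {})
# bigDict is shaped like {'cat': {7: [0, 2]}, 'dog': {7: [1]}}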
def readInTitles(titleFile):
    for t in titleFile:
        fields = t.split("\t")
        docID = int(fields[0])
        # whether the header is stub-length
        stubsHeader[docID] = int(fields[1]) != 0
        # whether the body of the article is stub-length
        stubsBody[docID] = int(fields[2]) != 0
        titleString = fields[3].lower()
        rawTitles[docID] = titleString
        listOfTerms = re.split('[^a-z0-9*()]+', titleString)
        if listOfTerms[0] == '':
            listOfTerms.pop(0)
        if listOfTerms[-1] == '':
            listOfTerms.pop()
        newListOfTerms = []
        for term in listOfTerms:
            if term not in stopWordSet:
                newListOfTerms.append(term)
        titleTermSet = set()
        for term in newListOfTerms:
            titleTermSet.add(PorterStemmer.stemWord(pstemmer, term))
        titles[docID] = titleTermSet
def removeCommonPluralsN7(fileName):
    try:
        # Remove plurals that are normally used in plural form:
        # clothes, lots, pants. Also, we need to look at plurals
        # that have suffixes.
        CommonPlurals = ["CLOTHES", "LOTS", "PANTS", "SCISSORS", "SHORTS",
                         "TROUSERS", "TONGS", "PLIERS", "GLASSES", "STAIRS"]
        SecondExamplar = ["BLOCKS", "GRAPES", "SHOES"]
        LexicalCriteria = ["N1", "N2", "N3", "N4", "N5", "N6", "N7", "N8",
                           "N9", "N10", "N11", "V1", "V2", "V8", "V10",
                           "V11", "V14", "V17", "Q1", "Q2", "Q4", "Q8",
                           "Q9", "S5", "S10"]
        ps = PorterStemmer.PorterStemmer()
        examplesToDelete = []
        for i in range(len(FileList[fileName]["N7"])):
            if FileList[fileName]["N7"][i][1] in CommonPlurals:
                examplesToDelete.append(FileList[fileName]["N7"][i])
        for i in examplesToDelete:
            FileList[fileName]["N7"].remove(i)
        if len(FileList[fileName]["N7"]) == 1:
            if FileList[fileName]["N7"][0][1] in SecondExamplar:
                FileList[fileName]["N7"].pop(0)
        for lc in LexicalCriteria:
            l2 = []
            examplesToDelete = []
            for i in range(len(FileList[fileName][lc])):
                present = False
                for l1 in l2:
                    if ps.stem(l1.lower(), 0, len(l1) - 1) == \
                       ps.stem(FileList[fileName][lc][i][1].lower(), 0,
                               len(FileList[fileName][lc][i][1].lower()) - 1):
                        present = True
                        break
                if present:
                    examplesToDelete.append(FileList[fileName][lc][i])
                else:
                    l2.append(FileList[fileName][lc][i][1])
            FileList[fileName][lc].reverse()
            for l1 in examplesToDelete:
                FileList[fileName][lc].remove(l1)
            FileList[fileName][lc].reverse()
        return
    except:
        print "Error occurred while trying to remove plurals for N7"
def updateDocScoresRegular(term, docScores, docIDList, cache, normDict):
    term = PorterStemmer.stemWord(pstemmer, term)
    if term not in cache:
        return docScores
    thetuple = cache[term]
    # maps from docID to position list
    dictionary = thetuple[0]
    # inverse document frequency score
    idf = thetuple[1]
    for ID in dictionary.keys():
        # only score documents that are in the candidate docID list
        if ID in docScores and ID in docIDList:
            docScores[ID] += (len(dictionary[ID]) * idf) / normDict[ID]
        elif ID in docIDList:
            docScores[ID] = (len(dictionary[ID]) * idf) / normDict[ID]
    return docScores
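The per-document contribution accumulated above is a length-normalized tf-idf: tf is the length of the term's position list, scaled by the cached idf and divided by the document's norm. A tiny numeric check with invented values:

tf, idf, norm = 3, 1.5, 6.0             # invented numbers
score_contribution = (tf * idf) / norm
print(score_contribution)               # 0.75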
def text_snippet(self, terms, start, length):
    """
    Return a snippet from pos start to end with highlighted terms.
    start - the "word" position (as opposed to character position)
    length - how many words to include
    """
    start_found = False
    new_start = 0
    new_end = 0
    pos = start
    for term in self.text.split(" "):
        pos = pos - 1
        if not start_found:
            new_start = new_start + 1
        else:
            new_end = new_end + 1
        if not start_found and pos <= 0:
            pos = length
            start_found = True
        elif pos <= 0:
            break
    new_end = new_start + new_end
    snippet = " ".join(self.text.split(" ")[new_start:new_end])
    for term in terms:
        p = PorterStemmer.PorterStemmer()
        term = p.stem(term, 0, len(term) - 1)
        snippet = re.sub('(?i)([\s.,=?!:@<>()\"-;\'&_\\{\\}\\|\\[\\]\\\\]' +
                         re.escape(term) +
                         "[^\s.,=?!:@<>()\"-;\'&_\\{\\}\\|\\[\\]\\\\]*)",
                         '\033[94m\\1\033[0m', snippet)
    return snippet
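A hedged call sketch for text_snippet; `doc` is a hypothetical object with a `text` attribute, and the ANSI escapes colour the matched span in a terminal:

# doc.text = "the porter stemmer reduces related words to a common stem"
print(doc.text_snippet(["stemming"], 1, 5))
# 'stemming' is stemmed to 'stem', so words beginning with that stem
# (here ' stemmer') are highlighted within the five-word window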
class Tokenizer:
    """ Tokenizer class """
    __instance = None

    @staticmethod
    def GetInstance():
        if Tokenizer.__instance is None:
            Tokenizer.__instance = Tokenizer()
            print("Tokenizer instance initialized")
        return Tokenizer.__instance

    @staticmethod
    def Split(sen):
        tkz = Tokenizer.GetInstance()
        return tkz.__split(sen)

    @staticmethod
    def ProcessToken(tok,
                     isToLower=True,
                     isUseStemmer=True,
                     isAlphaNumOnly=False,
                     isRmStopWords=False):
        """
        Process the token using the given settings: convert to lower?
        use stemmer? keep only alphanumeric chars? remove stop words?
        """
        tkz = Tokenizer.GetInstance()
        return tkz.__processToken(tok, isToLower, isUseStemmer,
                                  isAlphaNumOnly, isRmStopWords)

    def __init__(self):
        """ Constructor """
        pat = r'["\.,:;?!\(\)\[\]\<\>{}' + r"']"
        self.__rexPunct = re.compile(pat)
        self.__rexSpace = re.compile(r'\s+')
        self.__repPunctStr = r' '
        cf = ConfigFile()
        self.__stopWordsVocab = Vocab()
        stopList = cf.GetConfig("STOPLIST")
        self.__stopWordsVocab.Read(stopList)
        self.__ptStm = PorterStemmer()

    def __processToken(self, tok, isToLower, isUseStemmer,
                       isAlphaNumOnly, isRmStopWords):
        """
        Process a token according to the configuration setting,
        i.e. lower? stemmer? etc.
        """
        tok = tok.strip()
        if tok == '':
            return None
        isAllNonASCII = True
        lenTok = len(tok)
        for i in range(lenTok, 0, -1):
            idx = i - 1
            ch = tok[idx]
            ordVal = ord(ch)
            if ordVal < 128:
                isAllNonASCII = False
            replaceCh = ''
            doReplace = False
            if ordVal <= 32:
                # Special char! need to wipe out
                replaceCh = ''
                doReplace = True
            if ch == "'":
                replaceCh = "\'"
                doReplace = True
            if ch == '%':
                replaceCh = "_PERCENT_"
                doReplace = True
            # Doing replace
            if doReplace:
                tok = tok[:idx] + replaceCh + tok[idx + 1:]
        if isAllNonASCII:
            return None
        if isRmStopWords:
            if self.__stopWordsVocab.IsVocabWord(tok):
                return None
        if isToLower:
            tok = tok.lower()
        if isUseStemmer:
            tok = self.__ptStm.stem(tok, 0, len(tok) - 1)
        if isAlphaNumOnly:
            if not tok.isalnum():
                return None
        return tok

    def __isAllNonASCII(self, string):
        return all(ord(c) >= 128 for c in string)

    def __split(self, sen):
        """ Split the given sentence. Return a list of tokens. """
        # First, replace punctuation
        sen = self.__rexPunct.sub(self.__repPunctStr, sen)
        # Split
        sp = self.__rexSpace.split(sen)
        # Drop empty tokens left by contiguous spaces
        lenSp = len(sp)
        for i in range(lenSp, 0, -1):
            idx = i - 1
            c = sp[idx].strip()
            if c == '' or c is None:
                del sp[idx]
        return sp
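A hedged usage sketch for the singleton above; ConfigFile, Vocab, and the STOPLIST config entry are project-specific, so this only runs inside that project:

tokens = Tokenizer.Split("Hello, stemming world!")
processed = [t for t in (Tokenizer.ProcessToken(tok) for tok in tokens) if t]
print(processed)  # e.g. ['hello', 'stem', 'world'] after lowering and stemming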
class TweetFeaturizer:
    WORD_REGEX = "([\'A-Za-z0-9\@\#]+)"  # note: includes @ and #

    def __init__(self):
        self.stopList = set(self.readFile(os.path.join("..", "data", "english.stop")))
        self.stemmer = PorterStemmer()

    def readFile(self, fileName):
        contents = []
        f = open(fileName)
        for line in f:
            contents.append(line)
        f.close()
        result = '\n'.join(contents).split()
        return result

    def filterStopWords(self, words):
        filtered = []
        for word in words:
            if not word.lower() in self.stopList and word.strip() != '':
                filtered.append(word)
        return filtered

    def tokenizeSentence(self, sentence):
        words = []
        matches = re.findall(self.WORD_REGEX, sentence)
        for match in matches:
            words.append(match)
        return words

    def stemTokens(self, tokens):
        return [self.stemmer.stem(xx) for xx in tokens]

    def extractBigrams(self, sentence):
        # The sentence argument should be an ordered array of tokens.
        bigrams = []
        for index in range(len(sentence) - 1):
            bigrams.append(sentence[index] + " " + sentence[index + 1])
        return bigrams

    def featurizeTweet(self, tweet):
        features = []
        tokens = self.tokenizeSentence(tweet)
        filteredTokens = self.filterStopWords(tokens)
        # stemmedTokens = self.stemTokens(filteredTokens)
        features += filteredTokens
        # These are experimental (should experiment to see which
        # actually improve performance)
        if tweet.find("@") != -1:
            features.append("CONTAINS_@MENTION")
        if tweet.find("#") != -1:
            features.append("CONTAINS_HASHTAG")
        features.append(str(len(tweet)) + "_CHARACTERS")
        features.append(str(len(tokens)) + "_WORDS")
        bigrams = self.extractBigrams(tokens)
        features += bigrams  # extend with the bigram features
        # potentially add more features
        return features
def stem_tokens(self, txt):
    return [stemmer.stem(x, 0, len(x) - 1) for x in self.tokenize(txt)]
By Siamak Faridani 1/10/2012
call it by: python main.py input.txt
"""
from PorterStemmer import *
import sys   # needed: sys.argv is used below
import time

if __name__ == '__main__':
    print "Starting..."
    start = time.clock()
    wordsseen = {}
    p = PorterStemmer()
    if len(sys.argv) > 1:
        for f in sys.argv[1:]:
            infile = open(f, 'r')
            outfile = open("output.txt", "w")
            while 1:
                output = ''
                word = ''
                line = infile.readline()
                if line == '':
                    break
                for c in line:
                    if c.isalpha():
                        word += c.lower()
                    else:
                        # The snippet was truncated here; the remainder is an
                        # assumed completion following the standard porter.py
                        # demo loop: stem the pending word, then pass the
                        # delimiter through.
                        if word:
                            output += p.stem(word, 0, len(word) - 1)
                            word = ''
                        output += c.lower()
                outfile.write(output)