Example #1
    def __init__(self):

        f = formatter.NullFormatter()  # formatter.AbstractFormatter(formatter.DumbWriter())
        #htmllib.HTMLParser.__init__(self, f)
        sgmllib.SGMLParser.__init__(self, f)
        self.SqliteDB = SqliteDatabase(Globals.DBName)

        self.Stemmer = PorterStemmer()

        self.ReadStopWords('stopwords.txt')

        #self.textData = ""
        #self.BitMap = BitMap
        #self.WordFrequency = {}
        #self.splitter = re.compile(r'\W+', re.I)
        self.splitter = re.compile(r'\s+', re.I)
        #self.badWords = re.compile(r'.*\\*\/*_*\d+.*\\*\/*_*.*', re.I)
        self.DigitWord = re.compile(r'\b\d+\b', re.I)
        self.AlphaNumericWord = re.compile(r'\w+', re.I)
        #self.doubleSlashes = re.compile(r'\\*', re.I)
        self.tagType = ""
        self.REUTERSTOPICS = ""
        self.LEWISSPLIT = ""
        self.CGISPLIT = ""
        self.NEWID = ""
        self.DATE = ""
        self.MKNOTE = ""
        self.TOPICS = ""
        self.PLACES = ""
        self.UNKNOWN = ""
        self.AUTHOR = ""
        self.DATELINE = ""
        self.TITLE = ""
        self.TOPICS = ""
        self.PLACES = ""
        self.PEOPLE = ""
        self.ORGS = ""
        self.EXCHANGES = ""
        self.COMPANIES = ""
        self.TEXTTYPE = ""

        self.DateHandled = False
        self.InTagDate = False
        self.MknoteHandled = False

        self.InTagMknote = False
        self.InTagTitle = False
        self.InTagDateline = False
        self.InTagBody = False
        self.InTagTopics = False
        self.InTagPlaces = False
        self.InTagPeople = False
        self.InTagOrgs = False
        self.InTagExchanges = False
        self.InTagCompanies = False
        self.InTagAuthor = False
        self.InTagUnknown = False
	def porter_stemmer(self, words_list):
		p = PorterStemmer()
		return_list = []
		for i in range(len(words_list)):
			if words_list[i].isalpha():
				return_list.append(p.stem(words_list[i], 0, len(words_list[i]) - 1))
			else:
				return_list.append(words_list[i])
		return return_list
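Almost every snippet on this page calls the classic Porter stemmer API, in which stem(word, i, j) stems the substring word[i:j+1] and returns the stemmed result. A minimal sketch of that call pattern (assuming the stand-alone PorterStemmer module these examples import behaves like the reference Porter implementation):

from PorterStemmer import PorterStemmer

p = PorterStemmer()
# Stem the whole word by passing the full index range; "running" stems to "run".
stemmed = p.stem("running", 0, len("running") - 1)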
Example #3
def TokenStem(document):
    steemer = PorterStemmer()
    returner = []
    document = document.lower().split(' ')
    for word in document:
        if (word not in stopwords.words('english')):
            word = re.sub('[^A-Za-z0-9]+', '', word)
            word = steemer.stem(word, 0, len(word) - 1)
            returner.append(word)
    return returner
	def clear(self, dataset, patter, replace=' ', join=' '):
		corpus = []
		ps = PorterStemmer()
		stop_words = set(stopwords.words(self.lang))
		for e in dataset.values:
			review = re.sub(patter, replace, e)
			review = review.lower().split()
			review = [ps.stem(word) for word in review if word not in stop_words]
			review = join.join(review)
			corpus.append(review)
		return corpus
 def porter_stemmer(self, words_list):
     p = PorterStemmer()
     return_list = []
     for i in range(len(words_list)):
         if words_list[i].isalpha():
             return_list.append(
                 p.stem(words_list[i], 0,
                        len(words_list[i]) - 1))
         else:
             return_list.append(words_list[i])
     return return_list
Example #6
    def __init__(self, win, startTime, rootPath):
        import HTMLParser
        self.win = win
        self.StartTime = startTime
        self.rootPath = rootPath
        self.DocID = 0
        self.WordID = 0
        self.StemmedWordID = 0
        self.DirCount = 0
        self.FilesCount = 0
        self.WordCount = 0
        self.StemmedWordCount = 0
        self.ElapsedTime = ""
        self.ParseStatus = "Indexing in Progress..."
        self.KeyColumnNames = ""
        self.UseStemmer = False
        self.Stemmer = None
        #self.SetupTextCatDB()
        #DBFunctions.SetupTextCatTables(Globals.TextCatFileName)
        DBFunctions.SetupSqliteIndexTables(Globals.TextCatFileName)

        self.EventStart = time.time()
        if Globals.Stemmer == "Porter Stemmer":
            self.Stemmer = PorterStemmer()

        self.FileScanStartTime = time.time()
        self.fout = None
Example #7
    def boolean_search(self, text):
        results = []
        count = 0
        # PUT YOUR CODE HERE
        PS = PorterStemmer.PorterStemmer()
        stemmed_Search =[]
        words = text.split()
        for word in words:
            stemmed_Search.append(PS.stem(word, 0, len(word)-1))
            #print(stemmed_Search)
        for word in words:
            if(re.search('AND', word)):#if AND is in the query
                results+=self.computeAND(words[count-1], words[count])
                return results
            if(re.search('OR', word)):#if OR is in the query
                #print(words[count - 1])
                #print(words[count])
                results += self.computeOR(words[count-1], words[count])
                return results

        if(self._inverted_index.__contains__(stemmed_Search[0])):#for the cases without and/or
            #print(stemmed_Search[0])
            #print(self._inverted_index['footbal'])
            results = self._inverted_index[stemmed_Search[0]]

        return results
Example #8
    def boolean_search(self, text):
        results = []
        actualResults = ""
        stem = PorterStemmer.PorterStemmer()
        words = text.split()
        if words.__len__() == 1:
            results = self._inverted_index[stem.stem(words[0], 0,
                                                     len(words[0]) - 1)]
        else:
            results1 = self._inverted_index[stem.stem(words[0], 0,
                                                      len(words[0]) - 1)]
            #print (results1)
            results2 = self._inverted_index[stem.stem(words[2], 0,
                                                      len(words[2]) - 1)]
            #print(results2)

            if words[1] == "AND":
                results = (set(results1) & set(results2))
            if words[1] == "OR":
                results = set(results1).union(results2)
        # PUT YOUR CODE HERE

        for thingy in results:
            actualResults += str(thingy)
        #print (actualResults)
        return actualResults
def stemm(line):
    p = PorterStemmer()  #Here the porter stemmer is initialized.
    line += " "
    line1 = ""
    element = ''
    for c in line:
        if c.isalpha():
            element += c.lower()
        else:
            if element:
                element = p.stem(element, 0, len(element) - 1)
                line1 += element
                line1 += " "
                element = ''

    return line1
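A quick usage sketch of stemm (the input text is hypothetical; exact stems depend on the PorterStemmer implementation imported by this snippet):

print(stemm("Stemming reduces related words"))
# With the reference Porter rules this prints roughly: "stem reduc relat word "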
Example #10
def get_stems():
    """
    Returns the array of the filtered stems according to the conditions mentioned in the paper
    @return: stemarray
    """
    stemarray = []
    p = ps.PorterStemmer()
    infile = open("./part-of-speech.txt", 'r')
    while 1:
        output = ''
        word = ''
        line = infile.readline()
        line = line.split('\t')[0]
        if line == '':
            break
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
        if len(output) > 2 and output not in stemarray:
            stemarray.append(output)
    infile.close()
    return stemarray
 def __init__(self, win, startTime):
     import HTMLParser
     self.win = win
     self.StartTime = startTime
     self.DocID = 0
     self.WordID = 0
     self.StemmedWordID = 0
     self.DirCount = 0
     self.FilesCount = 0
     self.WordCount = 0
     self.StemmedWordCount = 0
     self.ElapsedTime = ""
     self.ParseStatus = "Indexing in Progress..."
     self.KeyColumnNames = ""
     self.UseStemmer = False
     self.Stemmer = None
     #self.SetupTextCatDB()
     DBFunctions.SetupTextCatTables(Globals.TextCatFileName)
     
     """
     self.timerStatus = wx.Timer(id=wx.NewId(), owner=self)
     self.Bind(wx.EVT_TIMER, self.OnTimerStatusTimer,
           id=self.timerStatus.GetId())
     """
     self.EventStart = time.time()
     self.splitter = re.compile(r'\W*')
     #self.DigitWord = re.compile(r'[a-z]*\d+[a-z]*', re.I)
     if Globals.Stemmer == "Porter Stemmer":
         self.Stemmer = PorterStemmer()
         #self.UseStemmer = True
     self.htmlParser = HTMLParser.HTMLParser(self.Stemmer)   
     self.textParser = TextParser.TextParser(self.Stemmer)
     """
Example #12
 def stemming(self, tokens):
     stemmed_tokens = []
     porter = PorterStemmer.PorterStemmer()
     for i in range(len(tokens)):
         stemmed_tokens.append(porter.stem(tokens[i], 0,
                                           len(tokens[i]) - 1))
     return stemmed_tokens
 def __init__(self, path=None, fn=None):
     fin = open(path + fn, 'rbU')
     self.stopwords = set()
     for f in fin:
         token = filter(lambda x: x in string.ascii_letters, f)
         stemmed_token = stemmer.stem(token, 0, len(token) - 1)
         self.stopwords.add(token)
         self.stopwords.add(stemmed_token)
Example #14
 def stemming(self, tokens):
     stemmed_tokens = []
     PS = PorterStemmer.PorterStemmer()
     for word in tokens:
         stemmed_tokens.append(PS.stem(word, 0, len(word)-1))
     # PUT YOUR CODE HERE
     #print(stemmed_tokens)
     return stemmed_tokens
Example #15
 def stemming(self, tokens):
     stemmed_tokens = []
     steemer = PorterStemmer.PorterStemmer()
     tempSteemed = ""
     for i in tokens:
         tempSteemed = steemer.stem(i, 0, len(i) - 1)
         stemmed_tokens.append(tempSteemed)
     return stemmed_tokens
 def stemming(self, tokens):
     stemmed_tokens = []
     stemmer = PorterStemmer.PorterStemmer()
     for token in tokens:
         stemmed_token = stemmer.stem(token, 0, len(token) - 1)
         # print(stemmed_token)
         stemmed_tokens.append(stemmed_token)
     return stemmed_tokens
 def __init__(self,path=None,fn=None) :
         fin=open(path+fn,'rbU')
         self.stopwords=set()
         for f in fin :
                 token=filter(lambda x : x in string.ascii_letters, f)
                 stemmed_token=stemmer.stem(token,0,len(token)-1)
                 self.stopwords.add(token)
                 self.stopwords.add(stemmed_token)
 def stemming(self, tokens):
     stemmed_tokens = []
     stemmer = PorterStemmer.PorterStemmer()
     for i in tokens:
         stemmed_tokens.append(stemmer.stem(i, 0, len(i) - 1))
     return stemmed_tokens
    def Rocchio(self, invertedFile, documentsList, relevantDocs):

        p = PorterStemmer.PorterStemmer()

        weights = {}
        for term in invertedFile.iterkeys():
            sterm = term
            if STEM_IN_ROCCHIO:
                sterm = p.stem(term.lower(), 0, len(term) - 1)
            weights[
                sterm] = 0.0  #initialize weight vector for each key in inverted file
        print ''

        relevantDocsTFWeights = {}

        # ------------------------------------- #
        # Compute relevantDocsTFWeights and nonrelevantDocsTFWeights vectors
        for docId in relevantDocs:
            doc = documentsList[docId]
            for term in doc["tfVector"]:
                sterm = term
                if STEM_IN_ROCCHIO:
                    sterm = p.stem(term.lower(), 0, len(term) - 1)

                if sterm in relevantDocsTFWeights:
                    relevantDocsTFWeights[sterm] = relevantDocsTFWeights[
                        sterm] + doc["tfVector"][term]
                else:
                    relevantDocsTFWeights[sterm] = doc["tfVector"][term]

        # ------------------------------------- #
        # Compute Rocchio vector
        for term in invertedFile.iterkeys():
            idf = math.log(
                float(len(documentsList)) /
                float(len(invertedFile[term].keys())), 10)

            sterm = term
            if STEM_IN_ROCCHIO:
                sterm = p.stem(term.lower(), 0, len(term) - 1)

            # Terms 2 and 3 of Rocchio algorithm
            for docId in invertedFile[term].iterkeys():
                if documentsList[docId]['IsRelevant'] == 1:
                    # Term 2: Relevant documents weights normalized and given BETA weight
                    weights[sterm] = weights[sterm] + constants.BETA * idf * (
                        relevantDocsTFWeights[sterm] / len(relevantDocs))

            # Term 1 of Rocchio, query terms
            if term in self.query:
                self.query[term] = constants.BETA * self.query[term] + weights[
                    sterm]  #build new query vector of weights
            elif weights[sterm] > 0:
                self.query[term] = weights[sterm]

        with open('output_lucene_after_relevance_feedback.txt', 'w') as file:
            file.write(pickle.dumps(self.query))
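For reference, the Rocchio snippet above folds the relevance-feedback terms into a single pass over the inverted file and weights everything with BETA. A minimal sketch of the textbook Rocchio update it approximates, on plain dict-based term vectors (rocchio_update and the alpha/beta/gamma defaults here are illustrative, not the project's constants module):

def rocchio_update(query_vec, relevant_vecs, nonrelevant_vecs,
                   alpha=1.0, beta=0.75, gamma=0.15):
    # Term 1: keep the original query weights, scaled by alpha.
    new_query = {t: alpha * w for t, w in query_vec.items()}
    # Term 2: add the centroid of the relevant document vectors, scaled by beta.
    for vec in relevant_vecs:
        for t, w in vec.items():
            new_query[t] = new_query.get(t, 0.0) + beta * w / len(relevant_vecs)
    # Term 3: subtract the centroid of the non-relevant vectors, scaled by gamma.
    for vec in nonrelevant_vecs:
        for t, w in vec.items():
            new_query[t] = new_query.get(t, 0.0) - gamma * w / len(nonrelevant_vecs)
    return new_query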
Example #20
def stemWords(list_of_tokens):
    p = PorterStemmer.PorterStemmer()  # instance of a Porter Stemmer
    stemmed_list = []
    for token in list_of_tokens:
        if token.isalpha():
            stemmed_list.append(p.stem(token.lower(), 0, len(token) - 1))
        else:  # if a non-alphabetical character exists, no stemming!
            stemmed_list.append(token.lower())
    return stemmed_list
Example #21
def getTopK(q, docIDList, pageRank, cache, zoneCache, normDict, zoneNorm,
            btree, zoneBtree):
    scores = dict()

    q = q.strip()

    # remove AND and OR
    regex1 = re.compile("AND")
    regex2 = re.compile("OR")
    q = re.sub(regex1, "", q)
    q = re.sub(regex2, "", q)

    if isWrappedInQuotes(q):
        q = q.strip('"')
    listOfTerms = re.split('[^a-zA-Z0-9*]+', q.lower())
    if listOfTerms[0] == '':
        listOfTerms.pop(0)
    if listOfTerms[-1] == '':
        listOfTerms.pop()

    stopwordedTermList = []
    for t in listOfTerms:
        if not t in stopWordSet:
            stopwordedTermList.append(t)

    docScores = dict()
    zoneScores = dict()

    # compute the scores in the "normal" index
    for term in stopwordedTermList:
        if (term.find("*") > -1):
            docScores = updateDocScoresWildcard(term, docScores, docIDList,
                                                cache, normDict, btree)
        else:
            docScores = updateDocScoresRegular(term, docScores, docIDList,
                                               cache, normDict)

    # compute the scores in the "zone" index, which indexes the top section
    # of each wikipedia page
    for term in stopwordedTermList:
        if (term.find("*") > -1):
            zoneScores = updateDocScoresWildcard(term, zoneScores, docIDList,
                                                 zoneCache, zoneNorm,
                                                 zoneBtree)
        else:
            zoneScores = updateDocScoresRegular(term, zoneScores, docIDList,
                                                zoneCache, zoneNorm)

    # stem the query words before passing them in to the final ranking function
    queryTerms = []
    for term in stopwordedTermList:
        queryTerms.append(PorterStemmer.stemWord(pstemmer, term))

    return computeFinalRanking(queryTerms, docIDList, docScores, zoneScores,
                               pageRank, cache, zoneCache)
Example #22
    def __init__(self):

        f = formatter.NullFormatter()  # formatter.AbstractFormatter(formatter.DumbWriter())
        #htmllib.HTMLParser.__init__(self, f)
        sgmllib.SGMLParser.__init__(self, f)
        self.SqliteDB = SqliteDatabase(Globals.DBName)

        self.Stemmer = PorterStemmer()

        #self.textData = ""
        #self.BitMap = BitMap
        #self.WordFrequency = {}
        self.splitter = re.compile(r'\W+', re.I)
        #self.splitter = re.compile(r'\s+', re.I)
        #self.badWords = re.compile(r'.*\\*\/*_*\d+.*\\*\/*_*.*', re.I)
        #self.DigitWord = re.compile(r'\b\d+\b', re.I)
        self.DigitWord = re.compile(r'[a-z]*\d+[a-z]*', re.I)
        self.AlphaNumericWord = re.compile(r'[a-z]*\W+[a-z]*', re.I)
        self.AlphabeticWord = re.compile(r'[a-z]+')
        #self.doubleSlashes = re.compile(r'\\*', re.I)
        self.BodyData = ""
Example #23
def stem_text(text):
    stemmer = ps.PorterStemmer()
    output = ''
    word = ''
    for c in text:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += stemmer.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    output += stemmer.stem(word, 0, len(word) - 1)
    return output
Example #24
	def __init__(self):
		"""
		Constructor
		"""
		pat = r'["\.,:;?!\(\)\[\]\<\>{}' + r"']"
		self.__rexPunct = re.compile(pat)
		self.__rexSpace = re.compile(r'\s+')
		self.__repPunctStr = r' '

		cf = ConfigFile()
		self.__stopWordsVocab = Vocab()
		stopList = cf.GetConfig("STOPLIST")
		self.__stopWordsVocab.Read(stopList)

		self.__ptStm = PorterStemmer()
Example #25
 def content_terms_generate(self, content):
     content_dict = dict()
     # Generate a query term list from the query.
     stemmer = ps.PorterStemmer()
     content_terms = []
     term_list = content.strip().lower().split()
     # Filter stop words without mutating the list while iterating over it.
     term_list = [word for word in term_list if word not in self.stop_words]
     for term in term_list:
         content_terms.append(stemmer.stem(term, 0, len(term)-1))
     # Modify the query term list to a dictionary, then to a vector (Pandas Series).
     for term in content_terms:
         content_dict[term] = 1 if term not in content_dict.keys() else content_dict[term]+1
     return content_dict
Example #26
def tokenizeFreeText(s):
    listOfTerms = re.split('[^a-z0-9]+', s.lower())
    if listOfTerms[0] == '':
        listOfTerms.pop(0)
    if listOfTerms[len(listOfTerms) - 1] == '':
        listOfTerms.pop()

    newTermList = []

    for term in listOfTerms:
        if not term in stopWordSet:
            term = PorterStemmer.stemWord(pstemmer, term)
            newTermList.append(term)

    return newTermList
Example #27
def add_page_to_index_re(index, url, content):
    i = 0
    # It is not a good idea to use regular expressions to parse HTML;
    # this is just a quick and dirty result.
    # To parse HTML pages in practice you should use a DOM parser
    # (see the sketch after this example).
    regex = re.compile('(?<!script)[>](?![\s\#\'-<]).+?[<]')
    p = PorterStemmer.PorterStemmer()

    for words in regex.findall(content):
        word_list = split_string(words, """ ,"!-.()<>[]{};:?!-=`&""")
        for word in word_list:
            #word = stem(word,p)
            if len(word) > 2:

                add_to_index(index, word, url)

    return i
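As the comment above says, a real parser beats a regex for HTML. A minimal sketch of that alternative using Python 3's standard-library html.parser (TextExtractor and extract_text are hypothetical helpers, not part of the original project):

from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    """Collect visible text, skipping <script> and <style> content."""
    def __init__(self):
        super().__init__()
        self.chunks = []
        self._skip = False

    def handle_starttag(self, tag, attrs):
        if tag in ("script", "style"):
            self._skip = True

    def handle_endtag(self, tag):
        if tag in ("script", "style"):
            self._skip = False

    def handle_data(self, data):
        if not self._skip and data.strip():
            self.chunks.append(data.strip())

def extract_text(content):
    parser = TextExtractor()
    parser.feed(content)
    return " ".join(parser.chunks)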
Example #28
def preProcessString(s):
    isWrapped = isWrappedInQuotes(s)
    s = s.strip()
    s = s.strip('"')

    # produce nice spacing for boolean expressions
    s = s.replace("(", " ( ")
    s = s.replace(")", " ) ")

    listOfTerms = re.split('[^a-zA-Z0-9*()]+', s)
    if listOfTerms[0] == '':
        listOfTerms.pop(0)
    if listOfTerms[len(listOfTerms) - 1] == '':
        listOfTerms.pop()

    newTermList = []

    for term in listOfTerms:

        if (term == "AND" or term == "OR" or term == "(" or term == ")"):
            newTermList.append(term)
        elif term.find("*") > -1:
            newTermList.append(term.lower())
        elif not term in stopWordSet:
            term = PorterStemmer.stemWord(pstemmer, term.lower())
            newTermList.append(term)

    # ignore consecutive ANDs or ORs
    lastWasBool = False
    newTermList2 = []
    for term in newTermList:
        if (term == "AND" or term == "OR"):
            if not lastWasBool:
                newTermList2.append(term)
            lastWasBool = True
        else:
            lastWasBool = False
            newTermList2.append(term)

    toReturn = ' '.join(newTermList2)
    if (isWrapped):
        toReturn = '"' + toReturn + '"'
    return toReturn
Example #29
def buildIndexDictionary(listOfTerms, bigDict):

    # build the inverted index
    index = 0
    for term in listOfTerms:
        if term in stopWordSet:
            continue
        term = PorterStemmer.stemWord(pstemmer, term)
        if term in bigDict:
            littleDict = bigDict[term]
            if docID in littleDict:
                littleDict[docID].append(index)
            else:
                littleDict[docID] = [index]
        else:
            bigDict[term] = dict({docID: [index]})
        index += 1

    return bigDict
Example #30
def readInTitles(titleFile):

    for t in titleFile:
        fields = t.split("\t")

        docID = int(fields[0])

        # whether the header is stub-length
        stub1 = int(fields[1])
        if stub1 == 0:
            stubsHeader[docID] = False
        else:
            stubsHeader[docID] = True

        # whether the body of the article is stub-length
        stub2 = int(fields[2])
        if stub2 == 0:
            stubsBody[docID] = False
        else:
            stubsBody[docID] = True

        titleString = fields[3].lower()

        rawTitles[docID] = titleString

        listOfTerms = re.split('[^a-z0-9*()]+', titleString)
        if listOfTerms[0] == '':
            listOfTerms.pop(0)
        if listOfTerms[len(listOfTerms) - 1] == '':
            listOfTerms.pop()

        newListOfTerms = []
        for t in listOfTerms:
            if not t in stopWordSet:
                newListOfTerms.append(t)

        titleTermSet = set()
        for term in newListOfTerms:
            titleTermSet.add(PorterStemmer.stemWord(pstemmer, term))

        titles[docID] = titleTermSet
def removeCommonPluralsN7(fileName):
	try:
		#remove plurals that are normally used in plural form
		#clothes, lots, pants
		#Also, we need to look at plurals that have suffixes
		CommonPlurals=["CLOTHES", "LOTS", "PANTS", "SCISSORS","SHORTS", "TROUSERS","TONGS","PLIERS","GLASSES","STAIRS"]
		SecondExamplar=["BLOCKS", "GRAPES", "SHOES"]
		LexicalCriteria=["N1","N2","N3","N4","N5","N6","N7","N8","N9","N10","N11","V1","V2","V8","V10","V11","V14","V17","Q1","Q2","Q4","Q8","Q9","S5","S10"]
		ps=PorterStemmer.PorterStemmer()
		examplesToDelete=[]
		for i in range(len(FileList[fileName]["N7"])):
			if FileList[fileName]["N7"][i][1] in CommonPlurals:
				examplesToDelete.append(FileList[fileName]["N7"][i])
		for i in examplesToDelete:
			FileList[fileName]["N7"].remove(i)
		if len(FileList[fileName]["N7"])==1:
			if FileList[fileName]["N7"][0][1] in SecondExamplar:
				FileList[fileName]["N7"].pop(0)
		for lc in LexicalCriteria:
			l2=[]
			examplesToDelete=[]
			for i in range(len(FileList[fileName][lc])):
				present=False
				for l1 in l2:
					if ps.stem(l1.lower(),0,len(l1)-1)==ps.stem(FileList[fileName][lc][i][1].lower(),0,len(FileList[fileName][lc][i][1].lower())-1):
						present=True
						break
				if present==True:
					examplesToDelete.append(FileList[fileName][lc][i])
				else:
					l2.append(FileList[fileName][lc][i][1])
			FileList[fileName][lc].reverse()
			for l1 in examplesToDelete:
				FileList[fileName][lc].remove(l1)
				FileList[fileName][lc].reverse()
	
		return
	except:
		print "Error occured while trying to remove plurals for N7"
Example #32
def updateDocScoresRegular(term, docScores, docIDList, cache, normDict):
    term = PorterStemmer.stemWord(pstemmer, term)

    if not term in cache:
        return docScores

    thetuple = cache[term]

    # maps from docID to position list
    dictionary = thetuple[0]

    # inverse document frequency score
    idf = thetuple[1]

    for ID in dictionary.keys():
        if ID in docScores and docIDList:
            docScores[ID] += (len(dictionary[ID]) * idf) / normDict[ID]

        elif ID in docIDList:
            docScores[ID] = (len(dictionary[ID]) * idf) / normDict[ID]

    return docScores
    def text_snippet(self, terms, start, length):
        """
        Return a snippet from pos start to end with highlighted terms
            start - the "word" position (as opposed to characater position)
            length - how many words to include


        """

        start_found = False
        new_start = 0
        new_end = 0
        pos = start

        for term in self.text.split(" "):
            pos = pos - 1

            if not start_found:
                new_start = new_start + 1
            else: 
                new_end = new_end + 1

            if not start_found and pos <= 0:
                pos = length
                start_found = True
            elif pos <= 0:
                break
        new_end = new_start + new_end
        snippet = " ".join(self.text.split(" ")[new_start:new_end])

        for term in terms:
            p = PorterStemmer.PorterStemmer()
            term = p.stem(term, 0,len(term)-1)
            snippet = re.sub('(?i)([\s.,=?!:@<>()\"-;\'&_\\{\\}\\|\\[\\]\\\\]' + \
                             re.escape(term) + \
                             "[^\s.,=?!:@<>()\"-;\'&_\\{\\}\\|\\[\\]\\\\]*)",
                             '\033[94m\\1\033[0m', snippet) 

        return snippet
 def __init__(self):
     self.stopList = set(self.readFile(os.path.join("..", "data", "english.stop")))
     self.stemmer = PorterStemmer()
Example #35
class Tokenizer:
	"""
	Tokenizer class
	"""

	__instance = None
	

	@staticmethod
	def GetInstance():
		if Tokenizer.__instance is None:
			Tokenizer.__instance = Tokenizer()
			print("Tokenzier instance initilized")

		return Tokenizer.__instance

	@staticmethod
	def Split(sen):
		tkz = Tokenizer.GetInstance()
		return tkz.__split(sen)

	@staticmethod
	def ProcessToken(tok, \
									 isToLower			= True, \
									 isUseStemmer		= True, \
									 isAlphaNumOnly = False, \
									 isRmStopWords	= False):
		"""
		Process the token using the given settings:
		convert to lower? use stemmer? keep only alphanum chars?
		remove stop words?
		"""

		tkz = Tokenizer.GetInstance()
		return tkz.__processToken(tok, isToLower, isUseStemmer,\
															isAlphaNumOnly, isRmStopWords)

	def __init__(self):
		"""
		Constructor
		"""
		pat = r'["\.,:;?!\(\)\[\]\<\>{}' + r"']"
		self.__rexPunct = re.compile(pat)
		self.__rexSpace = re.compile(r'\s+')
		self.__repPunctStr = r' '

		cf = ConfigFile()
		self.__stopWordsVocab = Vocab()
		stopList = cf.GetConfig("STOPLIST")
		self.__stopWordsVocab.Read(stopList)

		self.__ptStm = PorterStemmer()



	def __processToken(self, tok, isToLower, isUseStemmer, \
										 isAlphaNumOnly, isRmStopWords):
		"""
		Process token, according to the configuration setting
		i.e. lower? stemmer? etc.
		"""

		tok = tok.strip()
		if tok == '':
			return None

		isAllNonASCII = True
		findPos = False
		findPercent = False
		
		lenTok = len(tok)
		for i in range(lenTok, 0, -1):
			idx = i - 1
			ch = tok[idx]
			ordVal = ord(ch)
			
			if ordVal < 128:
				isAllNonASCII = False

			replaceCh = ''
			doReplace = False
			if ordVal <= 32:
				# Special char! need to wipe out
				replaceCh = ''
				doReplace = True

			if ch == "'":
				replaceCh = "\'"
				doReplace = True

			if ch == '%':
				replaceCh = "_PERCENT_"
				doReplace = True

			# Doing replace
			if doReplace:
				tok = tok[:idx] + replaceCh + tok[idx + 1:]
			
		if isAllNonASCII:
			return None

		if isRmStopWords:
			if self.__stopWordsVocab.IsVocabWord(tok):
				#print("REMOVED: " + tok)
				return None

		if isToLower:
			tok = tok.lower()

		if isUseStemmer:
			tok = self.__ptStm.stem(tok, 0, len(tok) - 1)

		if isAlphaNumOnly:
			if not tok.isalnum():
				return None

		return tok


	def __isAllNonASCII(self, string):
		return all(ord(c) >= 128 for c in string)
		
	def __split(self, sen):
		"""
		Split the given sentence. Return a list of tokens.
		"""

		# First, replace punctuation
		sen = self.__rexPunct.sub(self.__repPunctStr, sen)
		#print sen

		# Split
		sp = self.__rexSpace.split(sen)

		# Process contiguous spaces
		lenSp = len(sp)

		for i in range(lenSp, 0, -1):
			idx = i - 1
			c = sp[idx].strip()
			if c == '' or c == None:
				del sp[idx]

		return sp
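A hypothetical usage sketch of the Tokenizer above (it assumes ConfigFile resolves a valid STOPLIST file; ProcessToken returns None for empty or all-non-ASCII tokens, so those are dropped):

raw_tokens = Tokenizer.Split("Indexing improved recall, didn't it?")
processed = [t for t in (Tokenizer.ProcessToken(tok) for tok in raw_tokens) if t]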
class TweetFeaturizer:
    WORD_REGEX = "([\'A-Za-z0-9\@\#]+)" # note: includes @ and #

    def __init__(self):
        self.stopList = set(self.readFile(os.path.join("..", "data", "english.stop")))
        self.stemmer = PorterStemmer()

    def readFile(self, fileName):
        contents = []
        f = open(fileName)
        for line in f:
            contents.append(line)
        f.close()
        result = '\n'.join(contents).split()
        return result

    def filterStopWords(self, words):
        filtered = []
        for word in words:
            if not word.lower() in self.stopList and word.strip() != '':
                filtered.append(word)
        return filtered

    def tokenizeSentence(self, sentence):
        words = []
        matches = re.findall(self.WORD_REGEX, sentence)
        for match in matches:
            words.append(match)
        return words

    def stemTokens(self, tokens):
        return [self.stemmer.stem(xx) for xx in tokens]

    def extractBigrams(self, sentence):
        # The sentence argument should be an
        # ordered array of tokens.
        bigrams = []
        for index in range(len(sentence) - 1):
            bigrams.append(sentence[index] + " " + sentence[index + 1])
        return bigrams

    def featurizeTweet(self, tweet):
        features = []

        tokens = self.tokenizeSentence(tweet)
        filteredTokens = self.filterStopWords(tokens)
        ##stemmedTokens = self.stemTokens(filteredTokens)
        features += filteredTokens

        ## These are experimental (should experiment to see which actually improve performance)

        if tweet.find("@") != -1:
            features.append("CONTAINS_@MENTION")
        if tweet.find("#") != -1:
            features.append("CONTAINS_HASHTAG")

        features.append(str(len(tweet)) + "_CHARACTERS")
        features.append(str(len(tokens)) + "_WORDS")

        bigrams = self.extractBigrams(tokens)
        features.append(bigrams)

        ## potentially add more features

        return features
 def stem_tokens(self,txt) :
         return [stemmer.stem(x,0,len(x)-1) for x in self.tokenize(txt)]
Example #38
By Siamak Faridani
1/10/2012

call it by: 
python main.py input.txt

"""
from PorterStemmer import *
import time


if __name__ == '__main__':
    print "Starting..."
    start = time.clock()
    wordsseen = {}
    p = PorterStemmer()
    if len(sys.argv) > 1:
        for f in sys.argv[1:]:
            infile = open(f, 'r')
            outfile = open("output.txt", "w")
            while 1:
                output = ''
                word = ''
                line = infile.readline()
                if line == '':
                    break
                for c in line:
                    if c.isalpha():
                        word += c.lower()