Python Indexer.getStemmed Examples

Programming Language: Python

Namespace/Package Name: indexer

Class/Type: Indexer

Method/Function: getStemmed

Examples at hotexamples.com: 1

Python Indexer.getStemmed - 1 example found. This is the top-rated real-world Python example of indexer.Indexer.getStemmed extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

add_new_doc(30)

Indexer(30)

create_index(6)

create_unigram_index(3)

calculate_idf(3)

LoadIndexes(3)

close(3)

dump(3)

coords_to_indices(2)

indices_to_coords(2)

calculationSummerize(2)

add_idf_to_dictionary(2)

add_document(2)

LoadDict(2)

fix_inverted_index(2)

finish(2)

evaluate_input(1)

execute(1)

create_save_indexer_with_relevant_docs(1)

entities_and_small_big(1)

directory(1)

delete_dict_after_saving(1)

create_indexer(1)

create_dirs(1)

create_bulk_index_string(1)

finish_index(1)

CreatInvertedIndex(1)

finish_indexing(1)

get_num_spatial_nodes(1)

tokenize(1)

set_idx_fields(1)

process(1)

keys(1)

isStopword(1)

ignore_extensions(1)

get__lda__(1)

fit(1)

getStemmed(1)

getOr(1)

getAnd(1)

get(1)

generate_local_index(1)

create_block(1)

generate_global_index(1)

compute_tf(1)

createIndex(1)

add_square_Wij(1)

bp_index(1)

batch_get_feat_stacked(1)

after_indexing(1)

Example #1

Show file

File: searcher.py Project: cacheyourdreams/comp4321

class Searcher:
    """Answer keyword and quoted-phrase queries against the index database.

    A query is normalised with the indexer (stop-word removal and stemming),
    the matching inverted-index rows are loaded into per-document tf*idf
    vectors, documents missing any quoted phrase are dropped, and the rest are
    ranked by ``cosSim``.
    """

    # Anything that is not alphanumeric, a dash or an underscore separates words.
    _TERM_SPLIT_PATTERN = r"[^A-Za-z0-9\-_]+"

    def __init__(self):
        self.myIndexer = Indexer()
        self.dbInstance = Database()

    def main(self, argv):
        """Run one search from command-line arguments and print each result.

        argv -- argument list without the program name; expects ``-s "query"``.
        Exits with status 2 when the options are invalid or the query is empty.
        """
        usage = "Usage: searcher.py -s \"search query\""
        try:
            opts, args = getopt.getopt(argv, "s:", [])
        except getopt.GetoptError as err:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(err)
            print(usage)
            sys.exit(2)

        query = ""
        for opt, arg in opts:
            if opt == '-s':
                query = arg
        if query == "":
            print(usage)
            sys.exit(2)

        for d in self.getSearchResults(query):
            print(d)

    def getSearchResults(self, query):
        """Return the ranked results for a raw query string.

        Returns ``{}`` when the query has no usable terms (kept for backward
        compatibility); otherwise the list produced by ``sortDocumentVectors``.
        """
        searchTerms, phrases_words = self.parseQuery(query)
        if len(searchTerms) == 0:
            return {}

        mapping, queryWordIds = self.getWordIds(searchTerms)

        phrases = []
        for phrase_words in phrases_words:
            phraseWordIds = []
            for word in phrase_words:
                if word not in mapping:
                    # A quoted phrase containing a word that is not in the
                    # keyword table cannot occur in any indexed document.
                    # Dropping the word instead would match a different,
                    # shorter phrase (or crash on an empty one).
                    return []
                phraseWordIds.append(mapping[word])
            phrases.append(phraseWordIds)

        documentVectors = self.getDocumentVectors(searchTerms, phrases)
        return self.sortDocumentVectors(queryWordIds, documentVectors)

    def sortDocumentVectors(self, queryWordIds, documentVectors):
        """Score every document and return them best first.

        Returns a list of ``(url, record)`` tuples where record is
        ``[rank, title, modified, size, {keyword: freq}, parents, children]``
        and ``modified`` is formatted with ``"%A %d, %B %Y"``.
        """
        documents = dict()
        for doc, entry in documentVectors.items():
            sim = self.cosSim(queryWordIds, entry[1], entry[0])
            documents[doc] = [
                sim,
                entry[2],
                entry[3].strftime("%A %d, %B %Y"),
                entry[0],
                entry[4],
                entry[5],
                entry[6],
            ]
        # Rank on the score alone: comparing whole records would fall through
        # to comparing the keyword dicts whenever the leading fields tie.
        return sorted(documents.items(), key=lambda item: item[1][0], reverse=True)

    def getDocumentVectors(self, terms, phrases):
        """Build the sparse tf*idf vectors of all documents matching ``terms``.

        terms   -- normalised search words (bound as SQL parameters).
        phrases -- list of phrases, each a list of word ids that must appear
                   at consecutive positions in a document.

        Returns a dict keyed by document URL whose values are
        ``[document_size, {word_id: weight}, title, modified,
        {word: term_frequency}, parents, children, document_id,
        document_chars]``.
        """
        if not terms:
            # The query below always contains one placeholder, so it cannot
            # be executed without at least one term.
            return dict()

        self.dbInstance.query("SELECT COUNT(*) FROM Documents;")
        N = self.dbInstance.fetchOne()[0]

        # Inverted index entries for every search word. Column order:
        # 0 word_id, 1 document_id, 2 term_frequency, 3 document_url,
        # 4 document_frequency, 5 document_size, 6 max_tf, 7 document_title,
        # 8 in_title, 9 modified, 10 fetched, 11 word, 12 document_chars
        sql_select = "SELECT InvertedIndex.word_id, InvertedIndex.document_id, InvertedIndex.term_frequency, document_url, document_frequency, document_size, max_tf, document_title, InvertedIndex.in_title, modified, fetched, word, document_chars FROM InvertedIndex LEFT JOIN KeyWords on InvertedIndex.word_id=KeyWords.word_id LEFT JOIN Documents ON Documents.document_id = InvertedIndex.document_id  WHERE InvertedIndex.word_id IN (SELECT word_id FROM KeyWords WHERE word=%s"
        sql_select = sql_select + " OR word = %s" * (len(terms) - 1) + ");"
        self.dbInstance.query(sql_select, terms)
        rows = self.dbInstance.fetchAll()

        documentVectors = dict()
        for row in rows:
            url = row[3]
            if url not in documentVectors:
                # First row seen for this document: create its entry.
                parents, children = self._fetchLinks(row[1], url)
                documentVectors[url] = [row[5], dict(), row[7], row[9], dict(), parents, children, row[1], row[12]]
            entry = documentVectors[url]
            # Normalised tf*idf: tf * log2(N / df) / max_tf
            val = (float(row[2]) * log(N / float(row[4]), 2)) / float(row[6])
            # Boost the weight when the word appears in the document title.
            val = val * (1000 if row[8] == 1 else 1)
            entry[1][row[0]] = val
            entry[4][row[11]] = row[2]

        if len(phrases) > 0:
            lookup = self._loadPositionLookup(terms)
            matching = dict()
            for url, entry in documentVectors.items():
                # A document without any position rows cannot contain a phrase.
                positions = lookup.get(entry[7])
                if positions is None:
                    continue
                if all(self._containsPhrase(positions, phrase) for phrase in phrases):
                    matching[url] = entry
            documentVectors = matching

        return documentVectors

    def _fetchLinks(self, documentId, documentUrl):
        """Return ``(parents, children)`` URL lists for one document."""
        sql_select_links = "SELECT document_url, child_url FROM Links LEFT JOIN Documents on Documents.document_id = Links.parent_id WHERE Links.parent_id = %s OR Links.child_url = %s;"
        self.dbInstance.query(sql_select_links, (documentId, documentUrl))
        parents = []
        children = []
        for link in self.dbInstance.fetchAll():
            if link[1] == documentUrl:
                # The link points at this document, so its source is a parent.
                parents.append(link[0])
            else:
                children.append(link[1])
        return parents, children

    def _loadPositionLookup(self, terms):
        """Return ``{document_id: ({word_id: [positions]}, {position: word_id})}``."""
        sql_select = "SELECT IndexPositions.document_id, position, IndexPositions.word_id FROM IndexPositions LEFT JOIN KeyWords on KeyWords.word_id = IndexPositions.word_id WHERE word=%s"
        sql_select = sql_select + " OR word = %s" * (len(terms) - 1) + ";"
        self.dbInstance.query(sql_select, terms)

        lookup = dict()
        for r in self.dbInstance.fetchAll():
            wordPositions, positionWords = lookup.setdefault(r[0], (dict(), dict()))
            wordPositions.setdefault(r[2], []).append(r[1])
            positionWords[r[1]] = r[2]
        return lookup

    def _containsPhrase(self, positions, phrase):
        """Tell whether the word ids in ``phrase`` occur at consecutive positions.

        positions -- one document's ``(word -> positions, position -> word)``
                     pair as built by ``_loadPositionLookup``.
        An empty phrase never matches.
        """
        wordPositions, positionWords = positions
        if not phrase or phrase[0] not in wordPositions:
            return False
        # Start from every occurrence of the first word, then keep only the
        # occurrences that are followed by the next word of the phrase.
        candidates = wordPositions[phrase[0]]
        for wordId in phrase[1:]:
            candidates = [pos + 1 for pos in candidates if positionWords.get(pos + 1) == wordId]
            if not candidates:
                return False
        return True

    def getWordIds(self, words):
        """Look up the keyword ids of ``words``.

        Returns ``(mapping, wordIds)``: a dict from word to id and the list of
        ids, both covering only the words present in the KeyWords table.
        Both are empty when ``words`` is empty.
        """
        mapping = {}
        wordIds = []
        if len(words) == 0:
            # Same shape as the normal return so callers can always unpack.
            return (mapping, wordIds)

        sql_select = "SELECT word, word_id FROM KeyWords WHERE word=%s"
        sql_select = sql_select + " OR word = %s" * (len(words) - 1) + ";"
        self.dbInstance.query(sql_select, words)
        for row in self.dbInstance.fetchAll():
            wordIds.append(row[1])
            mapping[row[0]] = row[1]
        return (mapping, wordIds)

    def _normalizeTerms(self, text):
        """Split ``text`` into lowercase, stop-word-free, stemmed terms."""
        terms = []
        for raw in re.split(self._TERM_SPLIT_PATTERN, text):
            word = raw.lower()
            if self.myIndexer.isStopword(word):
                continue
            stemmed = self.myIndexer.getStemmed(word)
            if len(stemmed) > 0:
                terms.append(stemmed.lower())
        return terms

    def parseQuery(self, query):
        """Split a raw query into search terms and quoted phrases.

        Returns ``(searchTerms, phrases)`` where ``searchTerms`` lists every
        normalised word of the query and ``phrases`` holds one list of
        normalised words per non-empty double-quoted section. Both go through
        the same normalisation so that phrase words can be found among the
        search terms.
        """
        searchTerms = self._normalizeTerms(query)

        phrases = []
        for quoted in re.findall("\"([^\"]+)\"", query):
            phrase = self._normalizeTerms(quoted)
            if len(phrase) > 0:
                phrases.append(phrase)

        return (searchTerms, phrases)

    def cosSim(self, queryTerms, docVector, docLength):
        """Score a document vector against the query word ids.

        Every query term has weight 1, so the score is the sum of the
        document's weights for the query terms divided by
        ``docLength * len(queryTerms)``. Returns 0.0 when that divisor is 0.
        """
        denominator = docLength * len(queryTerms)
        if denominator == 0:
            return 0.0
        dotProduct = 0
        for qt in queryTerms:
            if qt in docVector:
                dotProduct = dotProduct + docVector[qt]
        return dotProduct / float(denominator)