class Searcher:
    """Answer keyword and quoted-phrase queries against the indexed documents.

    Uses an ``Indexer`` for stop-word detection and stemming of the query
    and a ``Database`` (``query`` / ``fetchOne`` / ``fetchAll``) to read the
    ``KeyWords``, ``InvertedIndex``, ``IndexPositions``, ``Documents`` and
    ``Links`` tables.
    """

    #anything that is not alphanumeric, a dash or an underscore separates terms
    _TERM_SPLIT_PATTERN = r"[^A-Za-z0-9\-_]+"
    #multiplier applied to a term weight when the term occurs in the title
    _TITLE_BOOST = 1000

    def __init__(self):
        self.myIndexer = Indexer()
        self.dbInstance = Database()

    def main(self, argv):
        """Command line entry point: ``searcher.py -s "search query"``.

        Prints one line per result. Exits with status 2 when the options
        cannot be parsed or when no query was given.
        """
        usage = "Usage: searcher.py -s \"search query\""
        #get command line arguments
        try:
            opts, args = getopt.getopt(argv, "s:", [])
        except getopt.GetoptError as err:
            #single-argument print(...) behaves the same on Python 2 and 3
            print(err)
            print(usage)
            sys.exit(2)

        query = ""
        for opt, arg in opts:
            if opt == '-s':
                query = arg
        if query == "":
            print(usage)
            sys.exit(2)

        for d in self.getSearchResults(query):
            print(d)

    def getSearchResults(self, query):
        """Return the documents matching ``query``, best match first.

        The result is the list produced by ``sortDocumentVectors``. When the
        query contains no usable search term an empty dict is returned (kept
        as-is for backward compatibility; it iterates like an empty list).
        """
        searchTerms, phraseWords = self.parseQuery(query)
        if not searchTerms:
            return {}

        mapping, queryWordIds = self.getWordIds(searchTerms)

        #translate every quoted phrase into the word ids of its known words
        phrases = []
        for words in phraseWords:
            phraseWordIds = [mapping[w] for w in words if w in mapping]
            #a phrase with no known word cannot be matched by position, so
            #drop it instead of handing an empty phrase to the matcher
            if phraseWordIds:
                phrases.append(phraseWordIds)

        documentVectors = self.getDocumentVectors(searchTerms, phrases)
        return self.sortDocumentVectors(queryWordIds, documentVectors)

    def sortDocumentVectors(self, queryWordIds, documentVectors):
        """Score every document and return them sorted in descending order.

        ``documentVectors`` is the mapping built by ``getDocumentVectors``.
        Each returned item ``r`` is a ``(url, info)`` pair where:

        * ``r[0]``    = url
        * ``r[1][0]`` = rank (value of ``cosSim``)
        * ``r[1][1]`` = title
        * ``r[1][2]`` = modified, formatted with ``"%A %d, %B %Y"``
        * ``r[1][3]`` = size
        * ``r[1][4]`` = dict{keyword: freq}
        * ``r[1][5]`` = array{parents}
        * ``r[1][6]`` = array{children}
        """
        documents = dict()
        for url, entry in documentVectors.items():
            size, vector, title, modified, keywordFreqs, parents, children = entry[:7]
            sim = self.cosSim(queryWordIds, vector, size)
            documents[url] = [sim, title, modified.strftime("%A %d, %B %Y"),
                              size, keywordFreqs, parents, children]
        #the whole info list is the sort key, so the rank is compared first
        return sorted(documents.items(), key=operator.itemgetter(1), reverse=True)

    def getDocumentVectors(self, terms, phrases):
        """Build the weighted term vector of every document containing a term.

        ``terms`` are the normalised query words; ``phrases`` is a list of
        word-id lists that must each occur consecutively in a document.

        Returns a dict keyed by document url whose values are lists:
        ``[document_size, {word_id: weight}, title, modified,
        {word: term_frequency}, parents, children, document_id,
        document_chars]``. When ``phrases`` is non-empty only documents
        containing every phrase are kept. Empty ``terms`` gives ``{}``.
        """
        #the queries below need at least one placeholder value
        if not terms:
            return {}

        self.dbInstance.query("SELECT COUNT(*) FROM Documents;")
        N = self.dbInstance.fetchOne()[0]

        #get inverted index entries for each word in the search term
        sql_select = ("SELECT InvertedIndex.word_id, InvertedIndex.document_id, "
                      "InvertedIndex.term_frequency, document_url, document_frequency, "
                      "document_size, max_tf, document_title, InvertedIndex.in_title, "
                      "modified, fetched, word, document_chars "
                      "FROM InvertedIndex "
                      "LEFT JOIN KeyWords on InvertedIndex.word_id=KeyWords.word_id "
                      "LEFT JOIN Documents ON Documents.document_id = InvertedIndex.document_id "
                      "WHERE InvertedIndex.word_id IN (SELECT word_id FROM KeyWords WHERE "
                      + self._wordFilter(len(terms)) + ");")
        self.dbInstance.query(sql_select, terms)
        rows = self.dbInstance.fetchAll()

        documentVectors = dict()
        for row in rows:
            (wordId, docId, termFreq, url, docFreq, docSize, maxTf, title,
             inTitle, modified, _fetched, word, docChars) = row

            #first row seen for this document: create its entry
            if url not in documentVectors:
                parents, children = self._fetchLinks(docId, url)
                documentVectors[url] = [docSize, dict(), title, modified, dict(),
                                        parents, children, docId, docChars]
            entry = documentVectors[url]

            #obtain normalised tf*idf value
            weight = (float(termFreq) * log(N / float(docFreq), 2)) / float(maxTf)
            #give a boost to the weight if it appears in the document title
            if inTitle == 1:
                weight = weight * self._TITLE_BOOST

            entry[1][wordId] = weight
            entry[4][word] = termFreq

        #Now do phrase matching (empty phrases impose no constraint)
        phrases = [phrase for phrase in phrases if phrase]
        if not phrases:
            return documentVectors

        lookup = self._buildPositionLookup(terms)
        matching = dict()
        for url, entry in documentVectors.items():
            positions = lookup.get(entry[7])
            #no recorded positions for this document: it cannot hold a phrase
            if positions is None:
                continue
            wordPositions, positionWords = positions
            if all(self._phraseOccurs(wordPositions, positionWords, phrase)
                   for phrase in phrases):
                matching[url] = entry
        return matching

    def _wordFilter(self, count):
        """Return ``word=%s`` followed by ``count - 1`` `` OR word = %s`` parts."""
        return "word=%s" + " OR word = %s" * (count - 1)

    def _fetchLinks(self, docId, url):
        """Return ``(parents, children)`` url lists for one document."""
        sql_select_links = ("SELECT document_url, child_url FROM Links "
                            "LEFT JOIN Documents on Documents.document_id = Links.parent_id "
                            "WHERE Links.parent_id = %s OR Links.child_url = %s;")
        self.dbInstance.query(sql_select_links, (docId, url))
        parents = []
        children = []
        for parentUrl, childUrl in self.dbInstance.fetchAll():
            #a link pointing at this document comes from one of its parents
            if childUrl == url:
                parents.append(parentUrl)
            else:
                children.append(childUrl)
        return (parents, children)

    def _buildPositionLookup(self, terms):
        """Map document_id -> ({word_id: [positions]}, {position: word_id})."""
        sql_select = ("SELECT IndexPositions.document_id, position, IndexPositions.word_id "
                      "FROM IndexPositions "
                      "LEFT JOIN KeyWords on KeyWords.word_id = IndexPositions.word_id "
                      "WHERE " + self._wordFilter(len(terms)) + ";")
        self.dbInstance.query(sql_select, terms)
        lookup = dict()
        for docId, position, wordId in self.dbInstance.fetchAll():
            if docId not in lookup:
                lookup[docId] = (dict(), dict())
            wordPositions, positionWords = lookup[docId]
            #word -> position
            wordPositions.setdefault(wordId, []).append(position)
            #position -> word
            positionWords[position] = wordId
        return lookup

    def _phraseOccurs(self, wordPositions, positionWords, phrase):
        """Tell whether the word ids of ``phrase`` occur at consecutive positions."""
        if phrase[0] not in wordPositions:
            return False
        #start with all locations of the first word in the phrase
        candidates = wordPositions[phrase[0]]
        for wordId in phrase[1:]:
            #keep the matches whose following position holds the next word
            candidates = [pos + 1 for pos in candidates
                          if (pos + 1) in positionWords
                          and positionWords[pos + 1] == wordId]
            if not candidates:
                return False
        return True

    def getWordIds(self, words):
        """Look up the ids of ``words`` in the ``KeyWords`` table.

        Returns ``(mapping, wordIds)`` where ``mapping`` is ``{word: word_id}``
        and ``wordIds`` lists the ids in row order. Words missing from the
        table are simply absent. Empty input returns ``({}, [])`` without
        touching the database.
        """
        mapping = {}
        wordIds = []

        #sanity check: same return shape as the normal path
        if len(words) == 0:
            return (mapping, wordIds)

        #select the word ids corresponding to the given words from the database
        sql_select = ("SELECT word, word_id FROM KeyWords WHERE "
                      + self._wordFilter(len(words)) + ";")
        self.dbInstance.query(sql_select, words)
        for word, wordId in self.dbInstance.fetchAll():
            wordIds.append(wordId)
            mapping[word] = wordId
        return (mapping, wordIds)

    def parseQuery(self, query):
        """Split ``query`` into normalised search terms and quoted phrases.

        Returns ``(searchTerms, phrases)``: ``searchTerms`` holds every term
        of the query (including those inside quotes); ``phrases`` holds one
        list of terms per double-quoted section that still has terms after
        normalisation. Both use the same normalisation so that phrase terms
        always have a matching search term.
        """
        searchTerms = self._normalizeTerms(query)

        #get terms between double quotes:
        phrases = []
        for quoted in re.findall("\"([^\"]+)\"", query):
            phrase = self._normalizeTerms(quoted)
            if phrase:
                phrases.append(phrase)
        return (searchTerms, phrases)

    def _normalizeTerms(self, text):
        """Return the lowercased, stemmed, non-stop-word terms of ``text``."""
        terms = []
        #select only alphanumeric words (allow dashes and underscores as well)
        for token in re.split(self._TERM_SPLIT_PATTERN, text):
            #NOTE(review): lowercasing first assumes the indexer works on
            #lowercase words - confirm against Indexer
            word = token.lower()
            #get the indexer to do stemming and stop word removal for us
            if self.myIndexer.isStopword(word):
                continue
            stemmed = self.myIndexer.getStemmed(word)
            if len(stemmed) > 0:
                terms.append(stemmed.lower())
        return terms

    def cosSim(self, queryTerms, docVector, docLength):
        """Score a document vector against the query word ids.

        Every query term has weight 1, so the score is the sum of the
        document weights of the query terms divided by
        ``docLength * len(queryTerms)``. Returns 0 when that divisor is 0.
        """
        divisor = docLength * len(queryTerms)
        #an empty query or a zero-size document cannot be scored
        if divisor == 0:
            return 0
        #just treat each term as weight 1, so all terms in query are equally important
        dotProduct = sum(docVector[qt] for qt in queryTerms if qt in docVector)
        return dotProduct / divisor