def query(db, text):
    """
    Performs the given query on the database and prints a timing summary.

    The query text is split into terms with tokenize(). Every term that is
    present in db['terms'] contributes the doc_id list stored at index 2 of
    its entry. The ids are de-duplicated, the stored term table of one
    matching document is printed as a sample, and finally the number of
    matches and the elapsed wall-clock time are printed.

    Args:
        db: database object exposing the 'terms' and 'document_terms'
            tables through item access.
        text: the raw query string.

    Returns:
        None; results are only printed.
    """
    start = time.time()

    # Clean our query and find our terms so we can perform our query
    terms = tokenize(text)

    # The term table is only read here, so fetch it once instead of on
    # every loop iteration.
    term_table = db['terms']

    doc_ids = []
    for term in terms:
        if term in term_table:
            doc_ids.extend(term_table[term][2])

    # A document can match several terms; keep each id only once.
    doc_ids = list(set(doc_ids))

    # Only show a sample when something matched: indexing an empty result
    # list would raise IndexError and hide the summary line below.
    if doc_ids:
        print(db['document_terms'][doc_ids[0]])

    end = time.time()

    print('Found %d matching documents in %f seconds' % (
        len(doc_ids), end - start
    ))
def index(db):
    """
    Builds the term index for every document contained in the database.

    For each (doc_id, data) pair yielded by db['documents'] the title and
    text are tokenized, the per-document term frequencies are recorded in
    db['document_terms'][doc_id], and the list of doc_ids containing each
    term is collected. After all documents have been scanned,
    determine_tfidf() is called with the collected lists and db['terms'] is
    rewritten in one pass.

    Args:
        db: database object exposing the 'documents', 'document_terms' and
            'terms' tables through item access.

    Returns:
        None; progress and a final summary are printed.
    """
    # To keep things fast the term -> [doc_id, ...] lists live in memory
    # while scanning; the term table is updated in one go at the end.
    postings = {}

    for doc_id, data in db['documents']:
        # Title terms first, then the body terms.
        tokens = tokenize(data[u'title'])
        tokens.extend(tokenize(data[u'text']))

        # Make sure the document has an entry in the per-document table
        # before we start counting into it.
        if doc_id not in db['document_terms']:
            db['document_terms'][doc_id] = {}
        frequencies = db['document_terms'][doc_id]

        for token in tokens:
            if token not in postings:
                # First time this run touches the term: reuse the doc_id
                # list already stored for it, or register it with defaults.
                if token in db['terms']:
                    postings[token] = db['terms'][token][2]
                else:
                    # entry layout: count, idf, [doc_id, ...]
                    db['terms'][token] = [0, 0.0, []]
                    postings[token] = []

            # Each doc_id is listed at most once per term.
            doc_list = postings[token]
            if doc_id not in doc_list:
                doc_list.append(doc_id)

            if token not in frequencies:
                # entry layout: frequency, tf, tf*idf
                frequencies[token] = [0, 0.0, 0.0]
            frequencies[token][0] += 1

        # Write the updated counts back to the database.
        db['document_terms'][doc_id] = frequencies

        print('Indexed Document %s with %d terms.' % (
            doc_id, len(frequencies)
        ))

    # Second pass: now that every document is indexed, determine the tf*idf.
    idf_table = determine_tfidf(db, postings)

    # Update the term table; the stored count is the number of doc_ids
    # collected for the term.
    total = 0
    for token in postings:
        doc_list = postings[token]
        total += len(doc_list)
        db['terms'][token] = [len(doc_list), idf_table[token], doc_list]

    print('%d terms and %d occurences in %d documents' % (
        len(db['terms']), total, len(db['documents'])
    ))