Example #1
import time


def query(db, text):

    """
    Performs the given query on the database and reports the matches
    """

    start = time.time()

    #   Normalize the query text into terms so we can look them up
    terms = tokenize(text)

    #   Collect the doc_ids of every document containing each term
    doc_ids = []
    for term in terms:
        if term in db['terms']:
            doc_ids.extend(db['terms'][term][2])

    #   De-duplicate doc_ids that matched more than one term
    doc_ids = list(set(doc_ids))

    #   Debug output: show the term table for the first match, if any
    if doc_ids:
        print(db['document_terms'][doc_ids[0]])

    end = time.time()
    print('Found %d matching documents in %f seconds' % (
        len(doc_ids),
        end - start
    ))
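
Both examples call a tokenize helper that is not shown here. A minimal
sketch, assuming it simply lowercases the text and splits on
non-alphanumeric characters (the real helper may also strip stop words or
apply stemming):

import re


def tokenize(text):

    """
    Assumed stand-in for the tokenize helper the examples rely on; splits
    text into lowercase alphanumeric terms
    """

    return [term for term in re.split(r'\W+', text.lower()) if term]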
Example #2
def index(db):

    """
    Tokenizes and indexes all documents contained in the database
    """

    #   To make things faster, we will keep the terms stored in memory.
    #   Then, when we are done, we will update the term database all at once.
    term_index = {}

    #   Walk every (doc_id, data) pair in the document store
    for doc_id, data in db['documents'].items():
        
        #   Get list of terms
        terms = tokenize(data['title'])
        terms.extend(tokenize(data['text']))

        #   Make sure we have the structure for the document in each
        #   database.
        if doc_id not in db['document_terms']:
            db['document_terms'][doc_id] = {}
        
        doc_term = db['document_terms'][doc_id]

        #   Now build our datastructure of term->frequency for each
        #   article. Also, make sure the term is included in our term table
        for term in terms:
            
            #   If the term has not already been loaded from the database
            #   then check and see if it exists in the db. If it does not,
            #   then set the defaults. Otherwise, pull the list of doc_ids
            #   for that term from the database.
            if term not in term_index:
                if term not in db['terms']:
                    #   total term occurrences, idf, [doc_id, ...]
                    db['terms'][term] = [0, 0.0, []]
                    term_index[term] = []
                else:
                    term_index[term] = db['terms'][term][2]

            #   Record each doc_id only once per term to keep the postings list small
            if doc_id not in term_index[term]:
                term_index[term].append(doc_id)

            if term not in doc_term:
                #   frequency, tf, tf*idf
                doc_term[term] = [0, 0.0, 0.0]
            
            doc_term[term][0] += 1
            
        db['document_terms'][doc_id] = doc_term
        print('Indexed Document %s with %d terms.' % (
            doc_id,
            len(doc_term)
        ))
        
    #   Now that we have indexed our documents, it is time to do our second
    #   pass and determine the tf*idf
    idf_table = determine_tfidf(db, term_index)

    #   Update our term database now.
    occurrences = 0
    for term, value in term_index.items():
        occurrences += len(value)
        db['terms'][term] = [len(value), idf_table[term], value]

    print('%d terms and %d occurrences in %d documents' % (
        len(db['terms']),
        occurrences,
        len(db['documents'])
    ))
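
index delegates its second pass to a determine_tfidf helper that is also
not shown. A plausible sketch, assuming the standard idf = log(N / df)
weighting and the [frequency, tf, tf*idf] layout used for doc_term above;
the name, the exact weighting, and the update of db['document_terms'] are
assumptions, not the original implementation:

import math


def determine_tfidf(db, term_index):

    """
    Assumed second pass: build an idf table from document frequencies and
    fill in the tf and tf*idf slots of each document's term entries
    """

    total_docs = len(db['documents'])

    #   idf = log(N / df), where df is the number of documents containing
    #   the term (the length of its doc_id list)
    idf_table = {
        term: math.log(total_docs / len(doc_ids))
        for term, doc_ids in term_index.items()
    }

    for doc_id, doc_term in db['document_terms'].items():
        #   tf: raw frequency normalized by the document's total term count
        doc_length = sum(entry[0] for entry in doc_term.values()) or 1
        for term, entry in doc_term.items():
            entry[1] = entry[0] / doc_length
            entry[2] = entry[1] * idf_table[term]

    return idf_table

Returning the idf table lets index persist it into db['terms'] alongside
the document-frequency counts, matching the [count, idf, [doc_id, ...]]
layout it writes at the end.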