Python tokenize Examples

Programming Language: Python

Namespace/Package Name: sanitize

Method/Function: tokenize

Examples at hotexamples.com: 2

Python tokenize - 2 examples found. These are the top-rated real-world Python examples of sanitize.tokenize, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: query.py Project: ianlivingstone/CSCI4141

def query(db, text):

    """
    Performs the given query on the database and prints a summary.

    The text is tokenized into terms, and the doc ids stored at index 2
    of every matching db['terms'] entry are merged into a duplicate-free
    list. When at least one document matched, the db['document_terms']
    entry of one matching document is printed. A line with the number of
    matching documents and the elapsed time is always printed.

    db   -- mapping-like store exposing 'terms' and 'document_terms'
    text -- query string handed to tokenize()

    Returns None; all results are written to standard output.
    """

    start = time.time()

    #   Clean our query and find our terms so we can perform our query
    terms = tokenize(text)

    doc_ids = []
    for term in terms:
        if term in db['terms']:
            doc_ids.extend(db['terms'][term][2])

    #   Several query terms can point at the same document; keep each
    #   doc id only once. The resulting order is arbitrary.
    doc_ids = list(set(doc_ids))

    #   Only show a sample when something matched: indexing an empty
    #   list would raise IndexError and the summary below would never
    #   be printed.
    if doc_ids:
        print(db['document_terms'][doc_ids[0]])

    end = time.time()
    print('Found %d matching documents in %f seconds' % (
        len(doc_ids),
        end - start
    ))

Example #2

Show file

File: index.py Project: ianlivingstone/CSCI4141

def index(db):

    """
    Tokenizes and indexes all documents contained in the database.

    For every (doc_id, data) pair yielded by db['documents'] the title
    and text are tokenized, the per-document term frequencies are stored
    in db['document_terms'][doc_id], and the doc id is recorded for each
    term. Afterwards determine_tfidf() is called and each db['terms']
    entry is rewritten as [number of doc ids, idf, [doc_id, ...]].

    db -- mapping-like store exposing 'documents', 'document_terms'
          and 'terms'

    Returns None; progress and a final summary are printed.
    """

    #   term -> [doc_id, ...]. Kept in memory while indexing and flushed
    #   to db['terms'] in a single pass at the end.
    postings = {}

    for doc_id, data in db['documents']:

        #   Title tokens first, followed by the body tokens
        tokens = tokenize(data[u'title'])
        tokens.extend(tokenize(data[u'text']))

        #   Create the per-document table on first sight of the document
        if doc_id not in db['document_terms']:
            db['document_terms'][doc_id] = {}

        #   NOTE(review): an already stored table is reused here, so
        #   indexing the same document twice looks like it would add to
        #   the old frequencies -- confirm whether that is intended.
        frequencies = db['document_terms'][doc_id]

        for token in tokens:

            #   First time this run that we meet the token: reuse the
            #   doc id list stored in the database when there is one,
            #   otherwise register the term with its default entry.
            if token not in postings:
                if token in db['terms']:
                    postings[token] = db['terms'][token][2]
                else:
                    #   total term occurences, idf, [doc_id,...]
                    db['terms'][token] = [0, 0.0, []]
                    postings[token] = []

            #   A document is listed at most once per term
            seen_in = postings[token]
            if doc_id not in seen_in:
                seen_in.append(doc_id)

            #   frequency, tf, tf*idf
            if token not in frequencies:
                frequencies[token] = [0, 0.0, 0.0]

            stats = frequencies[token]
            stats[0] += 1

        #   Write the table back explicitly after updating it
        db['document_terms'][doc_id] = frequencies
        print('Indexed Document %s with %d terms.' % (
            doc_id,
            len(frequencies)
        ))

    #   Second pass: every document is indexed, so the tf*idf values can
    #   be determined now
    idf_table = determine_tfidf(db, postings)

    #   Flush the in-memory postings into the term database
    total = 0
    for token, doc_ids in postings.iteritems():
        count = len(doc_ids)
        total += count
        db['terms'][token] = [count, idf_table[token], doc_ids]

    print('%d terms and %d occurences in %d documents' % (
        len(db['terms']),
        total,
        len(db['documents'])
    ))