Ejemplo n.º 1
0
Archivo: load.py Proyecto: apenwarr/ekb
def _calc_related_matrix():
    """Recompute the RelatedDocs table from the stored word weights.

    All existing RelatedDocs rows are deleted first.  The word weights of
    every document returned by Doc.search() are then read from WordWeights,
    and for each ordered pair of distinct documents the products of the
    weights of the words they share are summed.  Each sum is written to
    RelatedDocs as a (from_doc, to_doc, weight) row.
    """
    print('Deleting all relatedweights')
    db.run('delete from RelatedDocs')

    print('Reading word weights')
    docs = list(Doc.search())
    docwords = {}
    for doc in docs:
        echo('.')  # one dot per document (presumably a progress marker)
        weights = {}
        docwords[doc] = weights
        rows = db.run('select word,weight from WordWeights '
                      '  where docid=?', doc.id)
        for word, weight in rows:
            weights[word] = weight
    print('')

    print('Calculating related documents')
    correlations = {}
    for doc in docs:
        echo('.')
        own_weights = docwords[doc]
        scores = {}
        correlations[doc] = scores
        for other in docs:
            if other == doc:
                continue
            other_weights = docwords[other]
            # Words missing from the other document contribute zero.
            scores[other] = sum(other_weights.get(word, 0) * weight
                                for word, weight in own_weights.items())
    print('')

    print('Saving correlations')
    for doc, scores in correlations.items():
        for other, score in scores.items():
            db.run('insert or replace into RelatedDocs '
                   '  (from_doc, to_doc, weight) '
                   '  values (?,?,?)', doc.id, other.id, score)
Ejemplo n.º 2
0
Archivo: load.py Proyecto: apenwarr/ekb
def _calc_word_frequencies():
    """Rebuild the WordWeights and Words tables from every document.

    Both tables are emptied first.  For each document from Doc.search() the
    title (taken twice), filename, expanded text and tags are joined into one
    string, split into lower-cased words, and counted.  Every distinct word
    gets a WordWeights row whose weight is the square root of its share of
    the document's words.  Once all documents are processed, the overall
    count of each word is written to Words.
    """
    print('Deleting all wordweights')
    db.run('delete from WordWeights')
    db.run('delete from Words')

    totals = {}
    for doc in Doc.search():
        print(' %s' % doc.filename)
        # The title appears twice so that its words count double.
        pieces = [doc.title, doc.title, doc.filename,
                  doc.expanded_text(lambda x: x, headerdepth=1,
                                    expandbooks=1)]
        pieces.extend(doc.tags)
        fulltext = join(' ', pieces)
        matches = re.findall(r"(\w+(?:[.'#%@]\w+)?)", fulltext)
        words = [match.lower() for match in matches]
        # Float so the per-word share below is a true division.
        total = float(len(words))
        echo('   %d total words' % total)

        wordcounts = {}
        for w in words:
            if w in wordcounts:
                wordcounts[w] += 1
            else:
                wordcounts[w] = 1
        echo(', %d unique' % len(wordcounts))

        new = 0
        for w, count in wordcounts.items():
            if w not in totals:
                new += 1
            totals[w] = totals.get(w, 0) + count
            db.run('insert into WordWeights (docid, word, weight) '
                   '  values (?,?,?)', doc.id, w, (count / total) ** .5)
        echo(', %d new\n' % new)

    print(' %d total unique words' % len(totals))
    print('Saving words')
    for word, count in totals.items():
        db.run('insert into Words (word, total) values (?,?)', word, count)
Ejemplo n.º 3
0
def _tagdocs(search):
    """Find the documents that carry the tag `search`.

    Returns a `(tag, docids)` pair.  `docids` lists the Tags.docid value of
    every matching row, in the order the query yields them (the query sorts
    by Docs.title).  `tag` is the tag value from the last row, or None when
    the query returns no rows.
    """
    rows = db.run('select Tags.tag,Tags.docid '
                  '  from Tags '
                  '  join Docs on Docs.id = Tags.docid '
                  '  where tag=?'
                  '  order by Docs.title ',
                  search)
    found_tag = None
    docids = []
    for found_tag, docid in rows:
        docids.append(docid)
    return found_tag, docids