def _calc_related_matrix():
    """Rebuild the RelatedDocs table from the weights stored in WordWeights.

    All existing RelatedDocs rows are deleted first.  For every ordered pair
    of distinct documents, the stored weight is the sum, over the words of
    the first document, of that word's weight multiplied by the second
    document's weight for the same word (0 when the second document does not
    have the word).  Progress is reported through print and echo.

    Doc objects are used as dictionary keys, so they must be hashable.
    """
    print('Deleting all relatedweights')
    db.run('delete from RelatedDocs')

    # Phase 1: load each document's {word: weight} mapping.
    print('Reading word weights')
    docs = list(Doc.search())
    docwords = {}
    for doc in docs:
        echo('.')
        weights = {}
        docwords[doc] = weights
        rows = db.run('select word,weight from WordWeights '
                      ' where docid=?', doc.id)
        for word, weight in rows:
            weights[word] = weight
    print('')  # newline that ends the row of progress dots

    # Phase 2: score every ordered pair of distinct documents.
    print('Calculating related documents')
    correlations = {}
    for doc in docs:
        echo('.')
        scores = {}
        correlations[doc] = scores
        # Taken once per document and reused for every partner document.
        own_pairs = docwords[doc].items()
        for other in docs:
            if other == doc:
                continue
            other_weights = docwords[other]
            scores[other] = sum(other_weights.get(word, 0) * weight
                                for word, weight in own_pairs)
    print('')  # newline that ends the row of progress dots

    # Phase 3: write one row per (from_doc, to_doc) pair.
    print('Saving correlations')
    for doc, scores in correlations.items():
        for other, weight in scores.items():
            db.run('insert or replace into RelatedDocs '
                   ' (from_doc, to_doc, weight) '
                   ' values (?,?,?)', doc.id, other.id, weight)
def _calc_word_frequencies():
    """Rebuild the WordWeights and Words tables from every document's text.

    Both tables are emptied first.  For each document returned by
    Doc.search(), the title (twice), the filename, the expanded text and the
    tags are joined with spaces, split into lower-cased words, and counted.
    Each (document, word) pair is stored in WordWeights with weight
    (count / words in that document) ** .5.  After all documents are
    processed, Words receives each word's count summed over all documents.
    Progress is reported through print and echo.
    """
    print('Deleting all wordweights')
    db.run('delete from WordWeights')
    db.run('delete from Words')

    totals = {}  # word -> occurrences summed over every document
    for doc in Doc.search():
        print(' %s' % doc.filename)

        # The title is listed twice so that its words count double.
        textbits = [
            doc.title,
            doc.title,
            doc.filename,
            doc.expanded_text(lambda x: x, headerdepth=1, expandbooks=1),
        ]
        textbits.extend(doc.tags)
        fulltext = join(' ', textbits)

        matches = re.findall(r"(\w+(?:[.'#%@]\w+)?)", fulltext)
        words = [match.lower() for match in matches]
        # Kept as a float so the weight below uses true division.
        total = float(len(words))
        echo(' %d total words' % total)

        wordcounts = {}
        for word in words:
            if word in wordcounts:
                wordcounts[word] += 1
            else:
                wordcounts[word] = 1
        echo(', %d unique' % len(wordcounts))

        new = 0  # words not seen in any earlier document
        for word, count in wordcounts.items():
            if word in totals:
                totals[word] += count
            else:
                totals[word] = count
                new += 1
            db.run('insert into WordWeights (docid, word, weight) '
                   ' values (?,?,?)', doc.id, word, (count / total) ** .5)
        echo(', %d new\n' % new)

    print(' %d total unique words' % len(totals))
    print('Saving words')
    for word, count in totals.items():
        db.run('insert into Words (word, total) values (?,?)', word, count)
def _tagdocs(search):
    """Look up the documents carrying a tag.

    search is bound to the `tag=?` placeholder of the query.

    Returns a (tag, docs) tuple.  tag is the Tags.tag value of the last row
    the query returned, or None when it returned no rows.  docs is the list
    of Tags.docid values in the order returned (the query orders by
    Docs.title).
    """
    rows = db.run('select Tags.tag,Tags.docid '
                  ' from Tags '
                  ' join Docs on Docs.id = Tags.docid '
                  ' where tag=?'
                  ' order by Docs.title ', search)
    pairs = [(stored_tag, docid) for stored_tag, docid in rows]
    if not pairs:
        return None, []
    return pairs[-1][0], [docid for _, docid in pairs]