def _calc_related_matrix():
    print 'Deleting all relatedweights'
    db.run('delete from RelatedDocs')

    print 'Reading word weights'
    docs = list(Doc.search())
    docwords = {}
    for doc in docs:
        echo('.')
        l = docwords[doc] = {}
        for word,weight in db.run('select word,weight from WordWeights '
                                  ' where docid=?', doc.id):
            l[word] = weight
    print

    print 'Calculating related documents'
    correlations = {}
    for doc in docs:
        echo('.')
        l = correlations[doc] = {}
        for doc2 in docs:
            if doc2 == doc:
                continue
            bits = (docwords[doc2].get(word, 0)*weight
                    for word,weight in docwords[doc].iteritems())
            l[doc2] = sum(bits)
    print

    print 'Saving correlations'
    for doc in correlations:
        #print '%s:' % doc.filename
        for doc2,weight in correlations[doc].items():
            db.run('insert or replace into RelatedDocs '
                   ' (from_doc, to_doc, weight) '
                   ' values (?,?,?)', doc.id, doc2.id, weight)
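# A minimal sketch (not part of the app) of the relatedness score computed
# above: each document is reduced to a sparse {word: weight} vector, and the
# correlation between two documents is the dot product of those vectors.
# The word weights below are made up for illustration.
def _example_relatedness():
    a = {'backup': 0.5, 'restore': 0.3}
    b = {'backup': 0.4, 'disk': 0.2}
    # mirrors the "bits" generator in _calc_related_matrix()
    return sum(b.get(word, 0)*weight for word,weight in a.iteritems())
    # -> 0.2, since only 'backup' appears in both vectors (0.5 * 0.4)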
def edit(req, id, docname):
    docid = atoi(id)
    doc = Doc.try_get(id=docid)
    if not doc:
        raise Http404("Document #%d (%s) does not exist." % (docid, id))
    doc.use_latest()
    page = doc.get_edit_url()
    dict = {}
    dict['alltags'] = _alltags()
    dict['alldocs'] = Doc
    dict['menuitems'] = [
        ('/kb/', 'Knowledgebase'),
    ]
    if len(doc.tags) > 0:
        t = doc.tags[0]
        dict['menuitems'].append(('/kb/%s' % t, t))
    dict['menuitems'].append((doc.get_url(), 'KB%d' % doc.id))
    dict['menuitems'].append((doc.get_edit_url(), '-Edit-'))
    dict['page'] = page
    dict['title'] = doc.title
    dict['tags'] = join(', ', doc.tags)
    dict['uploadurl'] = doc.get_upload_url()
    dict['text'] = doc.text
    return render_to_response('ekb/edit.html', dict)
def _calc_word_frequencies():
    print 'Deleting all wordweights'
    db.run('delete from WordWeights')
    db.run('delete from Words')

    totals = {}
    for doc in Doc.search():
        print ' %s' % doc.filename
        textbits = [doc.title,
                    doc.title,  # title gets bonus points
                    doc.filename,
                    doc.expanded_text(lambda x: x,
                                      headerdepth=1, expandbooks=1)]
        textbits += doc.tags
        fulltext = join(' ', textbits)
        words = [w.lower()
                 for w in re.findall(r"(\w+(?:[.'#%@]\w+)?)", fulltext)]
        total = len(words)*1.0
        wordcounts = {}
        echo('  %d total words' % total)
        for w in words:
            wordcounts[w] = wordcounts.get(w, 0) + 1
        echo(', %d unique' % len(wordcounts))
        new = 0
        for w,count in wordcounts.iteritems():
            if not w in totals:
                totals[w] = 0
                new += 1
            totals[w] += count
            db.run('insert into WordWeights (docid, word, weight) '
                   ' values (?,?,?)', doc.id, w, (count/total)**.5)
        echo(', %d new\n' % new)

    print ' %d total unique words' % len(totals)
    print 'Saving words'
    for word,count in totals.iteritems():
        db.run('insert into Words (word, total) values (?,?)', word, count)
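# A minimal sketch of the weight formula used above: weight = (count/total)**.5,
# the square root of a word's relative frequency within one document. The
# square root damps the advantage of words repeated many times. The sample
# text is hypothetical.
def _example_word_weight():
    words = 'disk backup disk'.split()
    total = len(words)*1.0
    wordcounts = {}
    for w in words:
        wordcounts[w] = wordcounts.get(w, 0) + 1
    return dict((w, (count/total)**.5)
                for w,count in wordcounts.iteritems())
    # -> {'disk': 0.816..., 'backup': 0.577...}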
def _load_docs():
    seen = {}
    titlemap = {}
    for doc in Doc.search():
        if not os.path.exists(os.path.join(DOCDIR, doc.pathname)):
            print 'Deleting old document: %r %r' % (DOCDIR, doc.pathname)
            doc.delete()
        else:
            titlemap[doc.title] = doc

    print 'Loading all from "%s"' % DOCDIR
    for (dirpath, dirnames, filenames) in os.walk(DOCDIR):
        assert(dirpath.startswith(DOCDIR))
        for basename in filenames:
            fullpath = os.path.join(dirpath, basename)
            dirfile = fullpath[len(DOCDIR):]
            if (basename[-1] == '~'
                or basename[0] == '.'
                or fullpath.find('/.') >= 0
                or basename == 'Makefile'):
                continue
            echo(" %s" % fullpath)
            if basename in seen:
                raise KeyError('Duplicate basename "%s"' % basename)
            seen[basename] = 1
            title = basename
            (title, tags, mtime, text) = parse_doc(dirfile)
            print "  (tags=%s)" % repr(tags)
            while (title in titlemap
                   and titlemap[title].filename != basename):
                print ('WARNING: Duplicate title:\n  "%s"\n  "%s"'
                       % (basename, titlemap[title].filename))
                title += " [duplicate]"
            d = Doc.create(basename, dirfile, title)
            titlemap[title] = d
            d.use_latest()  # FIXME: lame: this parses a second time
            d.title = title
            d.save()
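# A minimal sketch of the duplicate-title handling in _load_docs() above:
# when a new file claims a title that another file already owns, the title
# gets " [duplicate]" appended until it is unique. For simplicity this
# hypothetical version maps titles straight to filenames instead of Doc
# objects.
def _example_title_dedup():
    titlemap = {'Backups': 'backups.md'}
    title, basename = 'Backups', 'backups2.md'
    while title in titlemap and titlemap[title] != basename:
        title += ' [duplicate]'
    return title  # -> 'Backups [duplicate]'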
def save(req, id, docname):
    if not req.POST:
        return HttpResponse('Error: you must use POST to save pages.',
                            status=500)
    while docname.startswith('/'):
        docname = docname[1:]
    title = req.REQUEST.get('title-text', 'Untitled').replace('\n', ' ')
    tags = req.REQUEST.get('tags-text', '').replace('\n', ' ')
    text = req.REQUEST.get('markdown-text', '').strip()
    docid = atoi(id)
    if docid:
        doc = Doc.try_get(id=docid)
    else:
        doc = Doc.create(docname, docname, title)
    if not doc:
        raise Http404("Document #%d (%s) does not exist." % (docid, id))
    redir_url = doc.get_url()  # this function is uncallable after delete()
    if not text:
        _try_delete(doc)
    else:
        xtitle = title
        di = 0
        while 1:
            if di > 1:
                xtitle = '%s [dup#%d]' % (title, di)
            elif di == 1:
                xtitle = '%s [dup]' % title
            try:
                _try_save(doc, xtitle, tags, text)
            except IntegrityError:
                if di < 16:
                    di += 1
                    continue
                else:
                    raise
            break
    return HttpResponseRedirect(redir_url)
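# The retry loop in save() above renames on IntegrityError (a title
# collision) rather than failing: it tries the title as-is, then
# "title [dup]", then "title [dup#2]" ... "title [dup#16]" before giving up.
# A minimal sketch of the name sequence it walks, with no database involved:
def _example_dup_titles(title):
    yield title
    yield '%s [dup]' % title
    for di in xrange(2, 17):
        yield '%s [dup#%d]' % (title, di)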
def new(req, docname):
    doc = Doc.try_get(filename=docname)
    if doc:
        raise Http404("Document #%d (%s) already exists."
                      % (doc.id, docname))
    del doc
    page = Doc.get_new_url(docname)
    dict = {}
    dict['alltags'] = _alltags()
    dict['alldocs'] = Doc
    dict['menuitems'] = [
        ('/kb/', 'Knowledgebase'),
    ]
    dict['menuitems'].append((Doc.get_new_url(docname), docname))
    dict['page'] = page
    dict['title'] = ''
    dict['tags'] = ''
    dict['uploadurl'] = Doc.get_upload_url()
    dict['text'] = ''
    return render_to_response('ekb/edit.html', dict)
def pdf(req, id, docname):
    urlexpander = lambda url: _pdf_url(req, url)
    docid = atoi(id)
    doc = Doc.try_get(id=docid)
    if not doc:
        raise Http404("Document #%d (%s) does not exist." % (docid, id))

    # adds a non-breaking space at the end to avoid Figure X captions
    markdownfix = re.compile(r"!\[\]\((?P<name> .*)\)", re.VERBOSE)
    mdfx = NamedTemporaryFile()
    name = mdfx.name
    mdfname = name + '.mdown'
    mdf = open(mdfname, 'w')
    mdf.write(markdownfix.sub(r'![](\g<name>)\\ ',
                              doc.expanded_text(urlexpander,
                                                headerdepth=1,
                                                expandbooks=1)
                              .encode('utf-8')))
    mdf.flush()

    p = Popen(args=['pandoc', '-f', 'markdown', '-t', 'latex', mdfname],
              stdout=PIPE)
    latex = p.stdout.read()
    latex = re.sub(r'\\includegraphics{(.*?)}',
                   r'\\resizebox{4in}{!}{\\includegraphics{\1}}',
                   latex)
    p.wait()

    ltname = name + '.latex'
    pdname = name + '.pdf'
    ltf = open(ltname, 'w')
    ltf.write(_texfix(req, doc, latex))
    ltf.flush()
    #mdf.close()

    print 'Latex file: %s' % ltname
    # we have to do this twice so that the TOC is generated correctly
    for d in [1,2]:
        p = Popen(args=['pdflatex', '-interaction', 'batchmode', ltname],
                  cwd=dirname(ltname))
        p.wait()

    pd = open(pdname, 'rb')  # PDF output is binary
    #os.unlink(pdname)
    #os.unlink(ltname)
    return HttpResponse(pd, "application/pdf")
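# A rough sketch of the pipeline pdf() runs above, written as shell commands
# (filenames are hypothetical; pdflatex runs twice so the table of contents
# is picked up on the second pass):
#
#   pandoc -f markdown -t latex /tmp/tmpXXXX.mdown > /tmp/tmpXXXX.latex
#   pdflatex -interaction batchmode /tmp/tmpXXXX.latex
#   pdflatex -interaction batchmode /tmp/tmpXXXX.latex
#
# plus two text rewrites along the way: bare "![](img)" references get a
# trailing "\ " so LaTeX doesn't emit numbered Figure captions, and every
# \includegraphics is wrapped in \resizebox{4in}{!}{...}.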
def show(req, search = None):
    urlexpander = lambda url: _html_url(req, url)
    qsearch = req.REQUEST.get('q', '')
    if not search:
        search = qsearch
    dict = {}
    dict['alltags'] = _alltags()
    dict['alldocs'] = Doc
    dict['menuitems'] = [
        ('/kb/', 'Knowledgebase'),
    ]
    doc = Doc.try_get(id=atoi(search))
    if doc:
        search = qsearch  # the old search was really a docid
    tag,tagdocs = _tagdocs(search)
    print 'tds: %r %r %r' % (search, tag, tagdocs)
    if search:
        dict['urlappend'] = '?q=%s' % search
    want_words = search.lower().split()
    if search:
        if tag:
            dict['menuitems'].append(('/kb/%s' % search, tag))
        else:
            dict['menuitems'].append(('/kb/%s' % search, '"%s"' % search))
    if tag:
        h = HtmlHighlighter([], '')
    else:
        h = HtmlHighlighter(want_words, 'u')
    dict['search'] = search

    if doc:
        # View the specific article they requested.
        doc.use_latest()
        pagebase = doc.get_url()
        page = pagebase + dict.get('urlappend', '')
        if req.path != pagebase and req.path != urllib.unquote(pagebase):
            return HttpResponsePermanentRedirect(page)
        dict['page'] = page
        if not tag and not search and doc.tags:
            t = doc.tags[0]
            dict['menuitems'].append(('/kb/%s' % t, t))
        dict['menuitems'].append((page, 'KB%d' % doc.id))
        dict['title'] = doc.title
        dict['when'] = nicedate(doc.mtime)
        dict['tags'] = doc.tags
        dict['editurl'] = doc.get_edit_url()
        dict['pdfurl'] = doc.get_pdf_url()
        dict['text'] = h.highlight(doc.expanded_text(urlexpander,
                                                     headerdepth=3,
                                                     expandbooks=0),
                                   markdown.markdown)
        dict['reference_parents'] = list(doc.reference_parents())
        dict['similar'] = doc.similar(max=4)
        dict['dissimilar'] = doc.dissimilar(max=4)
        if tag:
            dict['search'] = ''
        return render_to_response('ekb/view.html', dict)
    else:
        # Search for matching articles
        page = '/kb/%s' % search
        dict['page'] = page
        if tag:
            # the search term is actually the name of a tag
            f = tagdocs
            dict['skip_tags'] = 1
            dict['title'] = 'Category: %s' % tag
            dict['search'] = ''
        elif search:
            # the search term is just a search term
            dict['title'] = 'Search: "%s"' % search
            words = []
            docids = list(db.selectcol('select docid from WordWeights '
                                       ' where word=?', want_words[0]))
            for word in want_words[1:]:
                if not docids:  # no remaining matches
                    break
                docids = list(db.selectcol('select docid from WordWeights '
                                           ' where word=? and docid in (%s)'
                                           % _marks(docids),
                                           word, *docids))
            l = want_words + docids
            docweights = db.select('select avg(weight)*count(weight), docid '
                                   '  from WordWeights '
                                   '  where word in (%s) and docid in (%s) '
                                   '  group by docid '
                                   % (_marks(want_words), _marks(docids)),
                                   *l)
            f = []
            for weight,docid in sorted(docweights):
                if weight > 0.0:
                    f.append(docid)
        else:
            # there is no search term; toplevel index
            dict['title'] = 'Knowledgebase'
            return render_to_response('ekb/kb.html', dict)

        dict['docs'] = []
        for docid in f:
            d = Doc(docid)
            d.autosummary = autosummarize(d.expanded_text(urlexpander,
                                                          headerdepth=1,
                                                          expandbooks=1),
                                          want_words, h.highlight)
            dict['docs'].append(d)
        return render_to_response('ekb/search.html', dict)
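# A minimal sketch of the multi-word narrowing used in show() above: the
# candidate docid set starts as the documents matching the first word, then
# each further word keeps only the docids that also match it (the SQL
# "docid in (...)" clause). Plain-set equivalent with hypothetical data:
def _example_word_intersection():
    hits = {'backup': set([1, 2, 3]),
            'disk':   set([2, 3, 4])}
    want_words = ['backup', 'disk']
    docids = hits[want_words[0]]
    for word in want_words[1:]:
        if not docids:
            break  # no remaining matches
        docids = docids & hits.get(word, set())
    return docids  # -> set([2, 3])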