Example #1
File: load.py  Project: apenwarr/ekb
def _calc_related_matrix():
    print 'Deleting all relatedweights'
    db.run('delete from RelatedDocs')
    
    print 'Reading word weights'
    docs = list(Doc.search())
    docwords = {}
    for doc in docs:
        echo('.')
        l = docwords[doc] = {}
        for word,weight in db.run('select word,weight from WordWeights '
                                  '  where docid=?', doc.id):
            l[word] = weight
    print
    
    print 'Calculating related documents'
    correlations = {}
    for doc in docs:
        echo('.')
        l = correlations[doc] = {}
        for doc2 in docs:
            if doc2==doc: continue
            bits = (docwords[doc2].get(word,0)*weight
                      for word,weight in docwords[doc].iteritems())
            l[doc2] = sum(bits)
    print
    
    print 'Saving correlations'
    for doc in correlations:
        #print '%s:' % doc.filename
        for doc2,weight in correlations[doc].items():
            db.run('insert or replace into RelatedDocs '
                   '  (from_doc, to_doc, weight) '
                   '  values (?,?,?)', doc.id, doc2.id, weight)
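
The correlation step above is a sparse dot product between each pair of per-document word-weight maps, so it is O(n²) in the number of documents. A minimal standalone sketch of the same similarity measure (hypothetical data, no db module needed):

def dot_product(weights_a, weights_b):
    # Multiply weights for words both documents contain; words missing
    # from weights_b contribute zero, like docwords[doc2].get(word, 0).
    return sum(w * weights_b.get(word, 0)
               for word, w in weights_a.items())

# Hypothetical word-weight maps for two documents:
a = {'backup': 0.5, 'restore': 0.25, 'disk': 0.125}
b = {'backup': 0.25, 'disk': 0.5}
print(dot_product(a, b))  # 0.5*0.25 + 0.125*0.5 = 0.1875
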
Example #2
def edit(req, id, docname):
    docid = atoi(id)
    doc = Doc.try_get(id=docid)
    if not doc:
        raise Http404("Document #%d (%s) does not exist." % (docid, id))

    doc.use_latest()
    page = doc.get_edit_url()
        
    dict = {}
    dict['alltags'] = _alltags()
    dict['alldocs'] = Doc
    dict['menuitems'] = [
        ('/kb/', 'Knowledgebase'),
    ]
    if len(doc.tags) > 0:
        t = doc.tags[0]
        dict['menuitems'].append(('/kb/%s' % t, t))
    dict['menuitems'].append((doc.get_url(), 'KB%d' % doc.id))
    dict['menuitems'].append((doc.get_edit_url(), '-Edit-'))
    dict['page'] = page
    dict['title'] = doc.title
    dict['tags'] = join(', ', doc.tags)
    dict['uploadurl'] = doc.get_upload_url()
    dict['text'] = doc.text

    return render_to_response('ekb/edit.html', dict)
Example #3
File: load.py  Project: apenwarr/ekb
def _calc_word_frequencies():
    print 'Deleting all wordweights'
    db.run('delete from WordWeights')
    db.run('delete from Words')
    
    totals = {}
    for doc in Doc.search():
        print ' %s' % doc.filename
        textbits = [doc.title, doc.title,  # title gets bonus points
                    doc.filename, doc.expanded_text(lambda x: x, headerdepth=1,
                                                    expandbooks=1)]
        textbits += doc.tags
        fulltext = join(' ', textbits)
        words = [w.lower() for w in re.findall(r"(\w+(?:[.'#%@]\w+)?)",
                                               fulltext)]
        total = len(words)*1.0
        wordcounts = {}
        echo('   %d total words' % total)
        for w in words:
            wordcounts[w] = wordcounts.get(w, 0) + 1
        echo(', %d unique' % len(wordcounts.keys()))
        new = 0
        for w,count in wordcounts.iteritems():
            if not w in totals:
                totals[w] = 0
                new += 1
            totals[w] += count
            db.run('insert into WordWeights (docid, word, weight) '
                   '  values (?,?,?)', doc.id, w, (count/total)**.5)
        echo(', %d new\n' % new)
    print ' %d total unique words' % len(totals)
    print 'Saving words'
    for word,count in totals.iteritems():
        db.run('insert into Words (word, total) values (?,?)', word, count)
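
The weight stored per word is (count/total)**.5, a square-root-damped term frequency: words that appear often in a document score higher, but with diminishing returns. A self-contained sketch of the same weighting (word_weights is a hypothetical helper; the simplified \w+ regex is an assumption, since the loader above also matches forms like "foo.bar"):

import re

def word_weights(text):
    # Damped term frequency, as in _calc_word_frequencies() above:
    # weight = sqrt(count / total words in the document).
    words = [w.lower() for w in re.findall(r'\w+', text)]
    total = float(len(words))
    counts = {}
    for w in words:
        counts[w] = counts.get(w, 0) + 1
    return dict((w, (c / total) ** 0.5) for w, c in counts.items())

print(word_weights('tape backup tape'))
# e.g. {'tape': 0.816..., 'backup': 0.577...} (key order may vary)
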
Example #4
File: load.py  Project: apenwarr/ekb
def _load_docs():
    seen = {}
    
    titlemap = {}
    for doc in Doc.search():
        if not os.path.exists(os.path.join(DOCDIR, doc.pathname)):
            print 'Deleting old document: %r %r' % (DOCDIR, doc.pathname)
            doc.delete()
        else:
            titlemap[doc.title] = doc
            
    print 'Loading all from "%s"' % DOCDIR
    for (dirpath, dirnames, filenames) in os.walk(DOCDIR):
        assert(dirpath.startswith(DOCDIR))
        for basename in filenames:
            fullpath = os.path.join(dirpath, basename)
            dirfile = fullpath[len(DOCDIR):]
            if (basename[-1] == '~'
                    or basename[0] == '.' or fullpath.find('/.') >= 0
                    or basename == 'Makefile'):
                continue
            echo("  %s" % fullpath)

            if basename in seen:
                raise KeyError('Duplicate basename "%s"' % basename)
            seen[basename] = 1
                
            (title, tags, mtime, text) = parse_doc(dirfile)
            print " (tags=%s)" % repr(tags)

            while title in titlemap and titlemap[title].filename != basename:
                print ('WARNING: Duplicate title:\n  "%s"\n  "%s"'
                       % (basename, titlemap[title].filename))
                title += " [duplicate]"

            d = Doc.create(basename, dirfile, title)
            titlemap[title] = d
            d.use_latest()  # FIXME: lame: this parses a second time
            d.title = title
            d.save()
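
The walk above skips editor backup files, dotfiles, anything under a dot-directory, and Makefiles. A hedged sketch of just that filter as a standalone generator (interesting_files is a hypothetical name, not project code):

import os

def interesting_files(root):
    # Same skip rules as _load_docs(): trailing '~', leading '.',
    # a '/.' component anywhere in the path, or a Makefile.
    for dirpath, dirnames, filenames in os.walk(root):
        for basename in filenames:
            fullpath = os.path.join(dirpath, basename)
            if (basename.endswith('~') or basename.startswith('.')
                    or '/.' in fullpath or basename == 'Makefile'):
                continue
            yield fullpath
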
Example #5
def save(req, id, docname):
    if not req.POST:
        return HttpResponse('Error: you must use POST to save pages.',
                            status=500)
    while docname.startswith('/'):
        docname = docname[1:]
    title = req.REQUEST.get('title-text', 'Untitled').replace('\n', ' ')
    tags  = req.REQUEST.get('tags-text', '').replace('\n', ' ')
    text  = req.REQUEST.get('markdown-text', '').strip()

    docid = atoi(id)
    if docid:
        doc = Doc.try_get(id=docid)
    else:
        doc = Doc.create(docname, docname, title)
                         
    if not doc:
        raise Http404("Document #%d (%s) does not exist." % (docid, id))
    redir_url = doc.get_url()  # this function is uncallable after delete()
    if not text:
        _try_delete(doc)
    else:
        xtitle = title
        di = 0
        while True:
            if di > 1:
                xtitle = '%s [dup#%d]' % (title, di)
            elif di == 1:
                xtitle = '%s [dup]' % title
            try:
                _try_save(doc, xtitle, tags, text)
            except IntegrityError:
                if di < 16:
                    di += 1
                    continue
                else:
                    raise
            break
    return HttpResponseRedirect(redir_url)
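
The retry loop above resolves title collisions by appending a suffix and retrying the save whenever the database raises IntegrityError, giving up after 16 attempts. The suffix sequence it generates, as a small sketch (unique_title is a hypothetical helper, not project code):

def unique_title(title, di):
    # Title used on the di'th retry: 'X', 'X [dup]', 'X [dup#2]', ...
    if di == 0:
        return title
    elif di == 1:
        return '%s [dup]' % title
    else:
        return '%s [dup#%d]' % (title, di)

for di in range(4):
    print(unique_title('Backups', di))
# Backups / Backups [dup] / Backups [dup#2] / Backups [dup#3]
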
Example #6
def new(req, docname):
    doc = Doc.try_get(filename=docname)
    if doc:
        raise Http404("Document #%d (%s) already exists." % (doc.id, docname))

    page = Doc.get_new_url(docname)
        
    dict = {}
    dict['alltags'] = _alltags()
    dict['alldocs'] = Doc
    dict['menuitems'] = [
        ('/kb/', 'Knowledgebase'),
    ]
    dict['menuitems'].append((Doc.get_new_url(docname), docname))
    dict['page'] = page
    dict['title'] = ''
    dict['tags'] = ''
    dict['uploadurl'] = Doc.get_upload_url()
    dict['text'] = ''

    return render_to_response('ekb/edit.html', dict)
Example #7
def pdf(req, id, docname):
    urlexpander = lambda url: _pdf_url(req, url)
    docid = atoi(id)
    doc = Doc.try_get(id=docid)
    if not doc:
        raise Http404("Document #%d (%s) does not exist." % (docid, id))
    else:
        # Append a non-breaking space after bare images to avoid
        # auto-numbered "Figure X" captions in the LaTeX output.
        markdownfix = re.compile(r"!\[\]\((?P<name> .*)\)", re.VERBOSE)
        mdfx = NamedTemporaryFile()
        name = mdfx.name

        mdfname = name + '.mdown'
        mdf = open(mdfname, 'w')
        mdf.write(markdownfix.sub(r'![](\g<name>)\\ ',
                  doc.expanded_text(urlexpander, headerdepth=1, expandbooks=1)
                  .encode('utf-8')))
        mdf.flush()

        p = Popen(args = ['pandoc',
                          '-f', 'markdown',
                          '-t', 'latex',
                          mdfname],
                  stdout=PIPE)
        latex = p.stdout.read()
        latex = re.sub(r'\\includegraphics{(.*?)}',
                       r'\\resizebox{4in}{!}{\\includegraphics{\1}}',
                       latex)
        p.wait()
        
        ltname = name + '.latex'
        pdname = name + '.pdf'
        ltf = open(ltname, 'w')
        ltf.write(_texfix(req, doc, latex))
        ltf.flush()
        #mdf.close()
        print 'Latex file: %s' % ltname
        for d in [1,2]:
            # we have to do this twice so that the TOC is generated correctly
            p = Popen(args = ['pdflatex', '-interaction', 'batchmode', ltname],
                      cwd = dirname(ltname))
            p.wait()
        pd = open(pdname, 'rb')
        #os.unlink(pdname)
        #os.unlink(ltname)
        return HttpResponse(pd, "application/pdf")
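
The pipeline here is markdown -> pandoc -> LaTeX -> pdflatex, with pdflatex run twice so the table of contents picks up page numbers from the first pass. A minimal sketch of the pandoc step on its own (markdown_to_latex is a hypothetical helper; assumes pandoc is on $PATH, same flags as above):

from subprocess import Popen, PIPE

def markdown_to_latex(md_path):
    # Convert a markdown file to a LaTeX fragment on stdout.
    p = Popen(['pandoc', '-f', 'markdown', '-t', 'latex', md_path],
              stdout=PIPE)
    latex = p.stdout.read()
    p.wait()
    return latex
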
Example #8
def show(req, search = None):
    urlexpander = lambda url: _html_url(req, url)
    qsearch = req.REQUEST.get('q', '')
    if not search:
        search = qsearch

    dict = {}
    dict['alltags'] = _alltags()
    dict['alldocs'] = Doc
    dict['menuitems'] = [
        ('/kb/', 'Knowledgebase'),
    ]

    doc = Doc.try_get(id=atoi(search))
    if doc: search = qsearch  # the old search was really a docid
    tag,tagdocs = _tagdocs(search)
    print 'tds: %r %r %r' % (search, tag, tagdocs)

    if search:
        dict['urlappend'] = '?q=%s' % search
    want_words = search.lower().split()

    if search:
        if tag:
            dict['menuitems'].append(('/kb/%s' % search, tag))
        else:
            dict['menuitems'].append(('/kb/%s' % search, '"%s"' % search))

    if tag:
        h = HtmlHighlighter([], '')
    else:
        h = HtmlHighlighter(want_words, 'u')

    dict['search'] = search
        
    if doc:
        # View the specific article they requested.
        doc.use_latest()
        pagebase = doc.get_url()
        page = pagebase + dict.get('urlappend', '')
        if req.path != pagebase and req.path != urllib.unquote(pagebase):
            return HttpResponsePermanentRedirect(page)
        dict['page'] = page
        if not tag and not search and doc.tags:
            t = doc.tags[0]
            dict['menuitems'].append(('/kb/%s' % t, t))
        dict['menuitems'].append((page, 'KB%d' % doc.id))
        dict['title'] = doc.title
        dict['when'] = nicedate(doc.mtime)
        dict['tags'] = doc.tags
        dict['editurl'] = doc.get_edit_url()
        dict['pdfurl'] = doc.get_pdf_url()
        dict['text'] = h.highlight(doc.expanded_text(urlexpander,
                                                     headerdepth=3,
                                                     expandbooks=0),
                                   markdown.markdown)
        dict['reference_parents'] = list(doc.reference_parents())
        dict['similar'] = doc.similar(max=4)
        dict['dissimilar'] = doc.dissimilar(max=4)
        if tag:
            dict['search'] = ''
        return render_to_response('ekb/view.html', dict)
    else:
        # Search for matching articles
        page = '/kb/%s' % search
        dict['page'] = page

        if tag:
            # the search term is actually the name of a tag
            f = tagdocs
            dict['skip_tags'] = 1
            dict['title'] = 'Category: %s' % tag
            dict['search'] = ''
        elif search:
            # the search term is just a search term
            dict['title'] = 'Search: "%s"' % search
            words = []
            docids = list(db.selectcol('select docid from WordWeights '
                                       '  where word=?', want_words[0]))
            for word in want_words[1:]:
                if not docids:
                    # no remaining matches
                    break
                docids = list(db.selectcol('select docid from WordWeights '
                                           '  where word=? and docid in (%s)'
                                           % _marks(docids),
                                           word, *docids))
            l = want_words + docids
            docweights = db.select('select avg(weight)*count(weight), docid '
                                   '  from WordWeights '
                                   '  where word in (%s) and docid in (%s) '
                                   '  group by docid '
                                   % (_marks(want_words), _marks(docids)),
                                   *l)
            f = []
            for weight,docid in sorted(docweights):
                if weight > 0.0:
                    f.append(docid)
        else:
            # there is no search term; toplevel index
            dict['title'] = 'Knowledgebase'
            return render_to_response('ekb/kb.html', dict)

        dict['docs'] = []
        for docid in f:
            d = Doc(docid)
            d.autosummary = autosummarize(d.expanded_text(urlexpander,
                                                          headerdepth=1,
                                                          expandbooks=1),
                                          want_words, h.highlight)
            dict['docs'].append(d)
                
        return render_to_response('ekb/search.html', dict)
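
The multi-word search above narrows the candidate set one word at a time: start with the docids matching the first word, keep only those that also match each later word, then rank the survivors by avg(weight)*count(weight). A pure-Python sketch of the narrowing step (index, mapping word -> set of docids, is an assumed stand-in for the WordWeights table):

def narrow(index, want_words):
    # Intersect the docid sets for each search word, bailing out
    # early once nothing matches (the 'no remaining matches' break
    # in show() above).
    docids = set(index.get(want_words[0], ()))
    for word in want_words[1:]:
        if not docids:
            break
        docids &= set(index.get(word, ()))
    return docids

index = {'tape': {1, 2, 3}, 'backup': {2, 3}, 'restore': {3}}
print(sorted(narrow(index, ['tape', 'backup'])))  # [2, 3]
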