Example #1
def get_doc_page(request, alg_db, doc_title, docid, docloc, doc_cutoff=10, topic_cutoff=10, alg=''):
    """
    return the document page to the user with related terms and topics and the document text
    TODO limit the length of the document returned to first XY bytes
    """

    myrelations = relations(alg_db)
    doc = Document(docid, doc_title)

    topics = myrelations.get_top_related_topics(doc, topic_cutoff)
    piearray = get_js_doc_topic_pie_array(topics)
    # related topics
    topic_keys = topics.keys()
    topic_keys.sort(lambda x, y: -cmp(topics[x], topics[y]))
    leftcol = {'piearray':piearray, 'data':topic_keys[:topic_cutoff], 'webname':'topics'}

    # related documents
    docs = myrelations.get_top_related_docs(doc, doc_cutoff)
    doc_keys = docs.keys()
    doc_keys.sort(lambda x, y: -cmp(docs[x], docs[y]))
    rightcol = {'data':doc_keys[:doc_cutoff], 'webname':'documents'}

    try:
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title))),'r')
    except IOError:
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title)) + '.txt'),'r') # TODO fix hack
    midcol = {'doc':gen_clean_text(doc_text_file)}
    
    return render_to_response("three-column-vis.html", {'leftcol':leftcol,
        'rightcol':rightcol, 'midcol':midcol, 'title':doc.title}, context_instance=RequestContext(request))
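For context, a view with this signature is normally wired up through a URLconf that supplies the non-request arguments. The sketch below uses the older Django URLconf style (patterns and string view paths), consistent with the render_to_response/context_instance usage above; the module path, URL pattern, and the alg_db/docloc values are illustrative assumptions, not taken from the project.

# urls.py -- illustrative sketch only; module path and argument values are assumptions
from django.conf.urls import patterns, url

urlpatterns = patterns('',
    url(r'^doc/(?P<doc_title>[^/]+)/(?P<docid>\d+)/$',
        'browser.views.get_doc_page',          # hypothetical module path
        {'alg_db': '/path/to/alg.db',          # hypothetical analysis database
         'docloc': '/path/to/doc/texts',       # hypothetical folder of document text files
         'doc_cutoff': 10, 'topic_cutoff': 10}),
)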
Example #2
def get_doc_text(docloc, title_wID, numbytes=500):
    """
    Obtain the text of the document without any surrounding html
    """
    doc_title = " ".join(title_wID.split('-')[0:-1])
    try:
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title))),'r')
    except IOError:
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title)) + '.txt'),'r') # TODO fix hack
    txt = doc_text_file.read(numbytes)
    doc_text_file.close()
    doc_text = escape(remove_non_ascii(txt))
    doc_text += "...<br /> <div style=\"text-align:center; margin-top: 10px;\"> <input type=\"button\" name=\"b1\" value=\"View full document\" onclick=\"openlw('" + title_wID + "')\" /> </div>"
    return HttpResponse(doc_text)
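To illustrate how this helper behaves: title_wID packs the hyphen-separated title together with a trailing document id; everything before the last '-' is rejoined with spaces, re-slugified, and used to locate the text file, and only the first numbytes bytes are returned. A minimal usage sketch (Python 2; the folder and title_wID are made-up values):

# 'a-sample-document-42' -> doc_title 'a sample document' -> file 'a-sample-document'(.txt)
response = get_doc_text('/path/to/doc/texts', 'a-sample-document-42', numbytes=500)
print response.content[:80]  # escaped text snippet, followed by the "View full document" button markup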
Example #3
    def get_safe_title(self):
        safe_title = slugify(self.title)
        return safe_title
Example #4
    def get_safe_title(self):
        return slugify(unicode(self.title))
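The two variants above differ only in the unicode() coercion: the second makes the text conversion explicit before handing the title to slugify, which can matter under Python 2 if self.title is a byte string or not a string at all; for a plain unicode title both return the same slug. A minimal sketch (Python 2; the Doc class and the title value are illustrative, and Django's slugify is assumed, as in the examples above):

from django.template.defaultfilters import slugify  # assumed import, matching the examples above

class Doc(object):
    def __init__(self, title):
        self.title = title

    def get_safe_title(self):
        # coerce to unicode first so non-unicode titles are converted explicitly
        return slugify(unicode(self.title))

print Doc(u'Topic Modeling: A Review').get_safe_title()  # -> topic-modeling-a-review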
Example #5
    def parse_folder(self, folder):
        """
        parses the various datatypes in the folder and writes the lda-c format to file
        """
        
        # obtain list of all pdfs (TODO add heterogeneous file types)
        pdflist = os.popen("find %s -name '*.pdf' -type f" % folder) 
        pdflist = pdflist.readlines()
        pdflist = map(lambda x: x.strip(), pdflist)
        self.pdf_list.extend(pdflist)
        toparsetexts = []
        if len(pdflist):
            print '--- beginning pdf to text conversion ---'
            for pdf in pdflist:
                doctitle = self._obtain_clean_title(pdf)
                txtname = self.textdir + '/%s.txt' % doctitle
                cmd = 'pdftotext %s %s' % (pdf, txtname) # TODO: figure out and print which documents did not convert
                os.system(cmd)
                toparsetexts.append(txtname)
                self.rawtextfiles.append(txtname)
            print '--- finished pdf to text conversion ---'
                           
        print '--- adding text to corpus ---'
        # add textual data
        txtlist = os.popen("find %s -name '*.txt' -type f" % folder)  # add text files included in folder 
        txtlist = map(lambda x: x.strip(), txtlist) 
        for txtf in txtlist:
            doctitle = self._obtain_clean_title(txtf)
            txtname = self.textdir + '/%s.txt' % doctitle 
            # os.system does not raise IOError; check the return code of the ln command instead
            if os.system('ln -s %s %s' % (txtf, txtname)) != 0:
                print 'Warning: will not include %s, could not link text file' % txtf
                continue
                
            toparsetexts.append(txtname)
            self.rawtextfiles.append(txtname) # TODO: fix code repetition with parsing pdfs
            
        # now add all of the new texts to the corpus
        
        cfile = self.open_corpus()
        if self.usepara: # make a directory for each of the individual paragraphs
            if not os.path.exists(self.paradir): 
                os.makedirs(self.paradir)
        else:     # make a link to the textdir with the same name as the individual paragraph directory
            if not os.path.exists(self.paradir):
                os.system('ln -s %s %s' % (self.textdir, self.paradir))

        # initialize the database to keep track of term-doc occurrences
        dbase = db(self.corpus_db)
        if not self.parsed_data:
            dbase.add_table('term_doc_pair(id INTEGER PRIMARY KEY, term INTEGER, doc INTEGER)')
            if self.make_stem_db:
                dbase.add_table('termid_to_prestem(id INTEGER PRIMARY KEY, prestem VARCHAR)')
            
        # add the data to the corpus
        for tfile in toparsetexts:
            title = tfile.split('/')[-1].split('.')[0].replace('-',' ')
            wordcounts = dict() 
            prestem_dic = dict() 
            try:
                infile = open(tfile,'r')
            except IOError:
                print 'WARNING: could not find %s, will not include' % tfile
                continue
            useparanum = 1
            totparanum = 1
            for paraline in infile:
                totparanum += 1
                words = paraline.split()
                for wrd in words:
                    wrd = self.parse_word(wrd)
                    if wrd=='':
                        continue
                    else:
                        prestem = wrd 
                        if self.dostem:
                            wrd = stem(wrd)
                        if wordcounts.has_key(wrd):
                            wordcounts[wrd] += 1
                        else:
                            wordcounts[wrd] = 1     
                            # keep track of the unstemmed forms of new words for later reference. TODO: this currently keeps the unstemmed form of the first encounter of a stemmed word; perhaps make this more general?
                            if self.make_stem_db and not self.vocab.has_key(wrd): 
                                prestem_dic[wrd] = prestem
                                 
                if self.usepara:
                    if sum(wordcounts.values()) > self.minwords:
                        self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                        usetitle = title + ' [P%d]' % useparanum
                        self.titles.append(usetitle)    
                        if not isinstance(usetitle, unicode):
                            usetitle = unicode(usetitle)                               
                        self.write_document(os.path.join(self.paradir, slugify(usetitle)),paraline)
                        useparanum += 1  
                    wordcounts = dict()
                    prestem_dic = dict() 
            infile.close()
            if not self.usepara:
                if sum(wordcounts.values()) > self.minwords: 
                    self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                    self.titles.append(title)
        cfile.close()
        dbase.commit()
        if not self.parsed_data:
            dbase.add_index('term_doc_pair_idx1 ON term_doc_pair(term)')
            dbase.add_index('term_doc_pair_idx2 ON term_doc_pair(doc)')
            dbase.commit()
        print '--- finished adding text to corpus ---'
        print
        self.parsed_data = True
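The corpus file written through write_doc_line above is in LDA-C format, where each document is one line of the form 'N id1:cnt1 id2:cnt2 ...' with N the number of distinct terms. The sketch below shows the core of such a writer under that assumption; the real method also records term-document pairs in the database and handles the prestem bookkeeping, which this simplified stand-in omits.

def write_doc_line_sketch(cfile, wordcounts, vocab):
    """
    Append one document to an LDA-C corpus file: '<num distinct terms> <termid>:<count> ...'
    vocab maps term -> integer id and is grown as new terms appear.
    (Simplified stand-in for write_doc_line; database bookkeeping is omitted.)
    """
    pairs = []
    for term, count in wordcounts.items():
        if term not in vocab:
            vocab[term] = len(vocab)  # assign the next unused id to a new term
        pairs.append('%d:%d' % (vocab[term], count))
    cfile.write('%d %s\n' % (len(pairs), ' '.join(pairs)))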
Example #6
    def collect_arxiv_data(self, authors=None, cats=None):
        """
        Collect pdf data from arXiv with specified authors and category
        @param authors: The authors to be searched, separate authors with ' OR ' , note: author queries are exact
        e.g. 'Michael I. Jordan OR Michael Jordan OR David Blei OR David M. Blei', searches for the publications of the two authors with various spellings.
        @param cats: category restrictions
        """
        # TODO handle possible errors in data collection

        # extract params from form
        qry = 'http://export.arxiv.org/api/query?search_query='
        if cats:
            cats = map(lambda x: "cat:" + x, cats)
            if len(cats) > 1:
                cats = '%28' + '+OR+'.join(cats) + '%29'
            else:
                cats = cats[0]
            qry += cats
        if authors:
            authors = authors.lower().split(' or ')
            authors = map(lambda x: '%22' + x.replace(' ', '+') + '%22', authors)
            authors = map(lambda x: "au:" + x, authors)
            authors = '+OR+'.join(authors)
            authors = '%28' + authors.replace(' ','+') + '%29'
            if cats:
                qry += "+AND+"
            qry += authors

        qry += '&max_results=150' # ONLINE LIMITATION, remove for standalone or set to 2000
        print qry
        req = urllib2.urlopen(qry, timeout=10)
        soup = BeautifulSoup(req.read())

        titles = soup.findAll('title')
        titles = titles[1:] # skip the query title
        titles = map(lambda x: x.text, titles)
        pdf_links = soup.findAll('link', attrs={'title': 'pdf'})
        pdf_urls = map(lambda x: x['href'], pdf_links)

        print 'downloading: %s, %i' % (authors, len(pdf_urls))
        print titles
        print len(pdf_urls)

        # randomly grab the urls so we don't have all articles from one author in the online version (i.e. with limitations)
        ct = 0
        for urlnum in random.sample(range(len(pdf_urls)), len(pdf_urls)):
            if self._stream_to_file(urllib2.urlopen(pdf_urls[urlnum], timeout=8), os.path.join(self.data_folder, slugify(titles[urlnum]) + '.pdf')):
                ct += 1
        print '\n$$$$\nAdded %i files from arXiv, total downloaded content at %0.2f Mb\n$$$$\n' % (ct, self.tot_dl)
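To make the query construction concrete, the fragment below replays just the URL-building logic above for one pair of author spellings and a single category; the inputs are illustrative and only the string manipulation is taken from the method.

# Standalone sketch of the query string built above (Python 2); inputs are made-up values.
authors = 'David Blei OR David M. Blei'
cats = ['stat.ML']

qry = 'http://export.arxiv.org/api/query?search_query='
cat_terms = ['cat:' + c for c in cats]
qry += cat_terms[0] if len(cat_terms) == 1 else '%28' + '+OR+'.join(cat_terms) + '%29'

auth_terms = ['au:%22' + a.replace(' ', '+') + '%22' for a in authors.lower().split(' or ')]
qry += '+AND+%28' + '+OR+'.join(auth_terms) + '%29'
qry += '&max_results=150'
print qry
# http://export.arxiv.org/api/query?search_query=cat:stat.ML+AND+%28au:%22david+blei%22+OR+au:%22david+m.+blei%22%29&max_results=150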