def get_doc_page(request, alg_db, doc_title, docid, docloc, doc_cutoff=10, topic_cutoff=10, alg=''):
    """
    Return the document page to the user with related terms and topics and the document text.
    TODO: limit the length of the document returned to the first XY bytes.
    """
    myrelations = relations(alg_db)
    doc = Document(docid, doc_title)
    topics = myrelations.get_top_related_topics(doc, topic_cutoff)
    piearray = get_js_doc_topic_pie_array(topics)

    # related topics
    topic_keys = topics.keys()
    topic_keys.sort(lambda x, y: -cmp(topics[x], topics[y]))
    leftcol = {'piearray': piearray, 'data': topic_keys[:topic_cutoff], 'webname': 'topics'}

    # related documents
    docs = myrelations.get_top_related_docs(doc, doc_cutoff)
    doc_keys = docs.keys()
    doc_keys.sort(lambda x, y: -cmp(docs[x], docs[y]))
    rightcol = {'data': doc_keys[:doc_cutoff], 'webname': 'documents'}

    # document text
    try:
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title))), 'r')
    except IOError:  # TODO fix hack
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title)) + '.txt'), 'r')
    midcol = {'doc': gen_clean_text(doc_text_file)}

    return render_to_response("three-column-vis.html",
                              {'leftcol': leftcol, 'rightcol': rightcol,
                               'midcol': midcol, 'title': doc.title},
                              context_instance=RequestContext(request))
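# Note: the cmp-based sorts above order keys by descending relevance score; an
# equivalent and arguably clearer idiom (which also survives a Python 3 port, where
# cmp-style sorts are gone) would be:
#   topic_keys = sorted(topics, key=topics.get, reverse=True)
#   doc_keys = sorted(docs, key=docs.get, reverse=True)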
def get_doc_text(docloc, title_wID, numbytes=500):
    """
    Obtain the text of the document without any surrounding html.
    """
    doc_title = " ".join(title_wID.split('-')[0:-1])
    try:
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title))), 'r')
    except IOError:  # TODO fix hack
        doc_text_file = open(os.path.join(docloc, slugify(unicode(doc_title)) + '.txt'), 'r')
    txt = doc_text_file.read(numbytes)
    doc_text_file.close()
    doc_text = escape(remove_non_ascii(txt))
    doc_text += ("...<br /> <div style=\"text-align:center; margin-top: 10px;\"> "
                 "<input type=\"button\" name=\"b1\" value=\"View full document\" "
                 "onclick=\"openlw('" + title_wID + "')\" /> </div>")
    return HttpResponse(doc_text)
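# A minimal sketch (not part of the original module) that would factor out the
# "try the bare slug, then fall back to slug + '.txt'" hack flagged with TODOs in
# get_doc_page and get_doc_text above; the helper name open_doc_text is hypothetical.
def open_doc_text(docloc, doc_title):
    """Open a document's text file, falling back to a '.txt' extension."""
    path = os.path.join(docloc, slugify(unicode(doc_title)))
    try:
        return open(path, 'r')
    except IOError:
        return open(path + '.txt', 'r')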
def get_safe_title(self):
    """ Return a url/filename-safe version of the document title. """
    return slugify(unicode(self.title))
def parse_folder(self, folder):
    """
    Parse the various datatypes in the folder and write the lda-c format to file.
    """
    # obtain list of all pdfs (TODO add heterogeneous file types)
    pdflist = os.popen("find %s -name '*.pdf' -type f" % folder)
    pdflist = pdflist.readlines()
    pdflist = map(lambda x: x.strip(), pdflist)
    self.pdf_list.extend(pdflist)
    toparsetexts = []
    if len(pdflist):
        print '--- beginning pdf to text conversion ---'
        for pdf in pdflist:
            doctitle = self._obtain_clean_title(pdf)
            txtname = self.textdir + '/%s.txt' % doctitle
            cmd = 'pdftotext %s %s' % (pdf, txtname)
            # TODO: figure out and print which documents did not convert
            os.system(cmd)
            toparsetexts.append(txtname)
            self.rawtextfiles.append(txtname)
        print '--- finished pdf to text conversion ---'

    print '--- adding text to corpus ---'
    # add text files included in the folder
    txtlist = os.popen("find %s -name '*.txt' -type f" % folder)
    txtlist = map(lambda x: x.strip(), txtlist)
    for txtf in txtlist:
        doctitle = self._obtain_clean_title(txtf)
        txtname = self.textdir + '/%s.txt' % doctitle
        try:
            os.system('ln -s %s %s' % (txtf, txtname))
        except IOError:
            print 'Warning: will not include %s, could not parse text file' % txtf
            continue
        toparsetexts.append(txtname)
        self.rawtextfiles.append(txtname)  # TODO: fix code repetition with parsing pdfs

    # now add all of the new texts to the corpus
    cfile = self.open_corpus()
    if self.usepara:
        # make a directory for each of the individual paragraphs
        if not os.path.exists(self.paradir):
            os.makedirs(self.paradir)
    else:
        # make a link to the textdir with the same name as the individual paragraph directory
        if not os.path.exists(self.paradir):
            os.system('ln -s %s %s' % (self.textdir, self.paradir))

    # initialize the database to keep track of term-doc occurrences
    dbase = db(self.corpus_db)
    if not self.parsed_data:
        dbase.add_table('term_doc_pair(id INTEGER PRIMARY KEY, term INTEGER, doc INTEGER)')
        if self.make_stem_db:
            dbase.add_table('termid_to_prestem(id INTEGER PRIMARY KEY, prestem VARCHAR)')

    # add the data to the corpus
    for tfile in toparsetexts:
        title = tfile.split('/')[-1].split('.')[0].replace('-', ' ')
        wordcounts = dict()
        prestem_dic = dict()
        try:
            infile = open(tfile, 'r')
        except IOError:
            print 'WARNING: could not find %s, will not include' % tfile
            continue
        useparanum = 1
        totparanum = 1
        for paraline in infile:
            totparanum += 1
            words = paraline.split()
            for wrd in words:
                wrd = self.parse_word(wrd)
                if wrd == '':
                    continue
                else:
                    prestem = wrd
                    if self.dostem:
                        wrd = stem(wrd)
                    if wordcounts.has_key(wrd):
                        wordcounts[wrd] += 1
                    else:
                        wordcounts[wrd] = 1
                    # keep track of the unstemmed forms of new words for later reference
                    # TODO: this currently keeps the unstemmed form of the first encounter
                    # of a stemmed word; perhaps make more general?
                    if self.make_stem_db and not self.vocab.has_key(wrd):
                        prestem_dic[wrd] = prestem
            if self.usepara:
                if sum(wordcounts.values()) > self.minwords:
                    self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                    usetitle = title + ' [P%d]' % useparanum
                    self.titles.append(usetitle)
                    if not isinstance(usetitle, unicode):
                        usetitle = unicode(usetitle)
                    self.write_document(os.path.join(self.paradir, slugify(usetitle)), paraline)
                    useparanum += 1
                wordcounts = dict()
                prestem_dic = dict()
        infile.close()
        if not self.usepara:
            if sum(wordcounts.values()) > self.minwords:
                self.write_doc_line(cfile, wordcounts, dbase, prestem_dic)
                self.titles.append(title)
    cfile.close()
    dbase.commit()
    if not self.parsed_data:
        dbase.add_index('term_doc_pair_idx1 ON term_doc_pair(term)')
        dbase.add_index('term_doc_pair_idx2 ON term_doc_pair(doc)')
        dbase.commit()
    print '--- finished adding text to corpus ---'
    print
    self.parsed_data = True
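# For reference, a minimal sketch of the lda-c style line that a corpus writer such as
# write_doc_line is expected to produce: "<num_unique_terms> <term_id>:<count> ...".
# This helper is illustrative only and not part of the original code; vocab_ids is a
# hypothetical mapping from term string to integer id.
def format_ldac_line(wordcounts, vocab_ids):
    pairs = ['%d:%d' % (vocab_ids[wrd], cnt) for wrd, cnt in wordcounts.items()]
    return '%d %s' % (len(wordcounts), ' '.join(pairs))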
def collect_arxiv_data(self, authors=None, cats=None):
    """
    Collect pdf data from arXiv with the specified authors and categories.
    @param authors: the authors to be searched; separate authors with ' OR '.
        Note: author queries are exact, e.g. 'Michael I. Jordan OR Michael Jordan OR
        David Blei OR David M. Blei' searches for the publications of the two authors
        under their various spellings.
    @param cats: category restrictions
    """
    # TODO handle possible errors in data collection
    # extract params from form
    qry = 'http://export.arxiv.org/api/query?search_query='
    if cats:
        cats = map(lambda x: "cat:" + x, cats)
        if len(cats) > 1:
            cats = '%28' + '+OR+'.join(cats) + '%29'
        else:
            cats = cats[0]
        qry += cats
    if authors:
        authors = authors.lower().split(' or ')
        authors = map(lambda x: '%22' + x.replace(' ', '+') + '%22', authors)
        authors = map(lambda x: "au:" + x, authors)
        authors = '+OR+'.join(authors)
        authors = '%28' + authors.replace(' ', '+') + '%29'
        if cats:
            qry += "+AND+"
        qry += authors
    qry += '&max_results=150'  # ONLINE LIMITATION, remove for standalone or set to 2000
    print qry

    req = urllib2.urlopen(qry, timeout=10)
    soup = BeautifulSoup(req.read())
    titles = soup.findAll('title')
    titles = titles[1:]  # skip the query title
    titles = map(lambda x: x.text, titles)
    pdf_links = soup.findAll('link', attrs={'title': 'pdf'})
    pdf_urls = map(lambda x: x['href'], pdf_links)
    print 'downloading: %s, %i' % (authors, len(pdf_urls))
    print titles
    print len(pdf_urls)

    # randomly sample the urls so we don't download all articles from one author
    # in the online version (i.e. with limitations)
    ct = 0
    for urlnum in random.sample(range(len(pdf_urls)), len(pdf_urls)):
        if self._stream_to_file(urllib2.urlopen(pdf_urls[urlnum], timeout=8),
                                os.path.join(self.data_folder, slugify(titles[urlnum]) + '.pdf')):
            ct += 1
    print '\n$$$$\nAdded %i files from arXiv, total downloaded content at %0.2f Mb\n$$$$\n' % (ct, self.tot_dl)
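# Usage sketch (the harvester instance below is hypothetical; values are illustrative).
# With authors='David Blei' and cats=['stat.ML'], the code above builds the query:
#   http://export.arxiv.org/api/query?search_query=cat:stat.ML+AND+%28au:%22david+blei%22%29&max_results=150
# harvester.collect_arxiv_data(authors='David Blei', cats=['stat.ML'])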