def demo2():
    """
    Open NIPS publications whose title contains the word "deep".

    Scenario: you unexpectedly became very interested in Deep Belief Networks.
    As a first stab at some background reading, you want to:
    1. Find all NIPS publications with "deep" in the title (case-insensitive)
    2. Open them in the browser (at most the first 5 matches)

    Pre-requisites:
     - Assumes 'pubs_nips' exists. This can be obtained by running
       nips_download_parse.py or by downloading it from site.
       (https://sites.google.com/site/researchpooler/home)

    Side-effects:
     - will use os call to open a pdf with default program
    """
    max_open = 5  # upper bound on how many pdfs get opened at once

    print("loading the NIPS publications dataset...")
    pubs = loadPubs('pubs_nips')

    # Collect the pdf urls of publications with "deep" in the title.
    # Entries without a 'pdf' url are skipped rather than raising KeyError.
    urls = [pub['pdf'] for pub in pubs
            if 'pdf' in pub and 'deep' in pub['title'].lower()]

    if len(urls) > max_open:
        # The list is truncated (not sampled), so the message says "first".
        print("oops too many (%d) results! Only opening the first %d."
              % (len(urls), max_open))
        urls = urls[:max_open]

    openPDFs(urls)
def demo1():
    """
    Compare your MNIST results with the literature.

    Scenario: you wrote an algorithm and benchmarked it on the MNIST dataset,
    and you are wondering how your results compare with published ones:
    1. Finds all publications that mention mnist
    2. Prints out their titles
    3. Opens the three latest publications that mention it at least twice

    Pre-requisites:
     - Assumes 'pubs_nips' exists and that pdf text is present.
       This can be obtained by running nips_download_parse.py and then
       nips_add_pdftext.py, or by downloading it from site
       (https://sites.google.com/site/researchpooler/home)

    Side-effects:
     - will use os call to open a pdf with default program
    """
    print("loading the NIPS publications dataset...")
    pubs = loadPubs('pubs_nips')

    # keep only the papers whose word counts contain 'mnist'; papers without
    # a 'pdf_text' entry are treated as having no words at all
    mentioning = []
    for pub in pubs:
        if 'mnist' in pub.get('pdf_text', {}):
            mentioning.append(pub)

    print("titles of papers that mention MNIST dataset:")
    for pub in mentioning:
        print(pub['title'])
    print("total of %d publications mention MNIST." % (len(mentioning),))

    # (year, pdf url) pairs for papers with more than one mention,
    # ordered from the most recent year downwards
    ranked = sorted(
        ((pub['year'], pub['pdf'])
         for pub in mentioning
         if pub['pdf_text']['mnist'] > 1),
        reverse=True)

    # open the 3 latest in browser
    print("opening the top 3...")
    openPDFs([url for _, url in ranked[:3]])
def demo3():
    """
    Find publications similar to a paper fetched from the web.

    Scenario: you found a cool paper online and want to find similar papers:
    1. Download and parse the pdf
    2. Compare to text of all publications in pubs_ database
    3. Open the top 3 matches in browser

    (but note that current matching alg is very basic and could be much
    improved)

    Pre-requisites:
     - Assumes 'pubs_nips' exists and contains pdf text inside
       (under key 'pdf_text'). This can be obtained by running
       nips_download_parse.py and then nips_add_pdftext.py or by downloading
       it from site. (https://sites.google.com/site/researchpooler/home)

    Side-effects:
     - will use os call to open a pdf with default program
    """
    # here is a random pdf from Andrew's website
    url = 'http://ai.stanford.edu/~ang/papers/icml11-DeepEnergyModels.pdf'

    # fetch the pdf, turn its text into a bag of words, and wrap that in a
    # dummy publication dict so it can be compared against the database
    print("downloading %s..." % (url,))
    query = {'pdf_text': stringToWordDictionary(convertPDF(url))}

    print("loading database...")
    pubs = loadPubs('pubs_nips')
    print("computing similarities. (may take while with current implementation)")
    scores = publicationSimilarityNaive(pubs, query)

    # (score, index) pairs, best first; negative scores are left out
    ranked = sorted(
        ((score, idx) for idx, score in enumerate(scores) if score >= 0),
        reverse=True)

    # display up to 50 matches (slicing clamps to the list length)
    for score, idx in ranked[:50]:
        print("%.2f is similarity to %s." % (score, pubs[idx]['title']))

    # open the top 3 in browser
    print("opening the top 3...")
    openPDFs([pubs[idx]['pdf'] for _, idx in ranked[:3]])
def demo3():
    """
    Look up the publications most similar to a paper found online.

    Steps:
    1. Download and parse the pdf
    2. Compare to text of all publications in pubs_ database
    3. Open the top 3 matches in browser

    (but note that current matching alg is very basic and could be much
    improved)

    Pre-requisites:
     - Assumes 'pubs_nips' exists and contains pdf text inside
       (under key 'pdf_text'). This can be obtained by running
       nips_download_parse.py and then nips_add_pdftext.py or by downloading
       it from site. (https://sites.google.com/site/researchpooler/home)

    Side-effects:
     - will use os call to open a pdf with default program
    """
    # fetch this pdf from website, parse it, and make a publication dict
    # from it; here is a random pdf from Andrew's website
    paper_url = 'http://ai.stanford.edu/~ang/papers/icml11-DeepEnergyModels.pdf'
    print("downloading %s..." % (paper_url,))
    raw_text = convertPDF(paper_url)
    word_counts = stringToWordDictionary(raw_text)
    probe = {'pdf_text': word_counts}

    # calculate similarities to our publications
    print("loading database...")
    pubs = loadPubs('pubs_nips')
    print("computing similarities. (may take while with current implementation)")
    scores = publicationSimilarityNaive(pubs, probe)

    # pair every non-negative score with the index of its publication and
    # order the pairs from highest to lowest
    matches = []
    for position, similarity in enumerate(scores):
        if similarity >= 0:
            matches.append((similarity, position))
    matches.sort(reverse=True)

    # display top 50 matches (fewer if there are not that many)
    for similarity, position in matches[:50]:
        print("%.2f is similarity to %s." % (similarity, pubs[position]['title']))

    # open the top 3 in browser
    print("opening the top 3...")
    best_urls = [pubs[position]['pdf'] for _, position in matches[:3]]
    openPDFs(best_urls)
def demo1():
    """
    List the NIPS papers that mention MNIST and open the most recent ones.

    Scenario: you wrote an algorithm and benchmarked it on the MNIST dataset.
    You are wondering how your results compare with those in the literature:
    1. Finds all publications that mention mnist
    2. Prints out their titles
    3. Opens the three latest publications that mention it at least twice

    Pre-requisites:
     - Assumes 'pubs_nips' exists and that pdf text is present.
       This can be obtained by running nips_download_parse.py and then
       nips_add_pdftext.py, or by downloading it from site
       (https://sites.google.com/site/researchpooler/home)

    Side-effects:
     - will use os call to open a pdf with default program
    """
    print("loading the NIPS publications dataset...")
    all_pubs = loadPubs('pubs_nips')

    # get all papers that mention mnist (no 'pdf_text' means no mention)
    hits = [pub for pub in all_pubs if 'mnist' in pub.get('pdf_text', {})]

    print("titles of papers that mention MNIST dataset:")
    for hit in hits:
        print(hit['title'])
    print("total of %d publications mention MNIST." % (len(hits),))

    # collect (year, pdf url) for papers with at least two mentions, then
    # order them newest-first
    by_year = []
    for hit in hits:
        if hit['pdf_text']['mnist'] > 1:
            by_year.append((hit['year'], hit['pdf']))
    by_year.sort(reverse=True)

    # open the 3 latest in browser
    print("opening the top 3...")
    latest_urls = [pdf_url for _, pdf_url in by_year[:3]]
    openPDFs(latest_urls)
""" Standalone helper script. Load nips pubs_ file, and adds to every paper its word counts under key 'pdf_text'. The PDF for each paper is downloaded from NIPS site. """ from repool_util import loadPubs, savePubs, stringToWordDictionary from pdf_read import convertPDF pubs_all = loadPubs('pubs_nips') print 'loaded pubs with %d entries.' % (len(pubs_all),) #possibly place restrictions on pubs to process here pubs = pubs_all for i,p in enumerate(pubs): #if the pdf url does not exist, in future this could possibly use google #search to try to look up a link for the pdf first. if p.has_key('pdf') and not p.has_key('pdf_text'): # try to open the PDF from downloaded location processed = False try: floc = p['pdf'].index('NIPS') fname = p['pdf'][floc:] txt = convertPDF('downloads/'+fname) processed = True print 'found %s in file!' % (p['title'],) except: