Esempio n. 1
0
def demo2():
    """
    You unexpectedly became very interested in Deep Belief Networks. As a first
    stab at some background reading, you want to:
    1. Find all NIPS publications with Deep in title
    2. open them in the browser
    
    Pre-requisites:
    - Assumes 'pubs_nips' exists. This can be obtained by running 
      nips_download_parse.py or by downloading it from site.
      (https://sites.google.com/site/researchpooler/home)
    
    Side-effects:
    - will use os call to open a pdf with default program
    """
    
    print "loading the NIPS publications dataset..."
    pubs = loadPubs('pubs_nips')
    
    # get urls that correspond to publications with deep in title
    p = [x['pdf'] for x in pubs if 'deep' in x['title'].lower()]
    
    if len(p)>5:
        print "oops too many (%d) results! Only opening random 5." % (len(p),)
        p=p[:5]
        
    openPDFs(p)
Esempio n. 2
0
def demo1():
    """
    You wrote an algorithm and benchmarked it on the MNIST dataset. You are 
    wondering how your results compare with those in the literature:
    1. Finds all publications that mention mnist
    2. Print out their titles
    3. Open the three latest publications that mention it at least twice
    
    Pre-requisites:
    - Assumes 'pubs_nips' exists and that pdf text is present. 
      This can be obtained by running 
      nips_download_parse.py and then nips_add_pdftext.py, or by downloading it 
      from site (https://sites.google.com/site/researchpooler/home)
    
    Side-effects:
    - will use os call to open a pdf with default program
    """
    
    print "loading the NIPS publications dataset..."
    pubs = loadPubs('pubs_nips')
    
    # get all papers that mention mnist
    p = [x for x in pubs if 'mnist' in x.get('pdf_text',{})]
    print "titles of papers that mention MNIST dataset:"
    for x in p:
        print x['title']
    print "total of %d publications mention MNIST." %(len(p),)
    
    # sort by number of occurences
    occ = [(x['year'], x['pdf']) for i,x in enumerate(p) if x['pdf_text']['mnist']>1]
    occ.sort(reverse = True)
    
    # open the top 3 latest in browser
    print "opening the top 3..."
    openPDFs([x for year,x in occ[:3]])
Esempio n. 3
0
def demo3():
    """
    You found a cool paper online and you want to find similar papers:
    1. Download and parse the pdf
    2. Compare to text of all publications in pubs_ database
    3. Open the top 3 matches in browser (but note that current matching alg is
                                          very basic and could be much improved)
    
    Pre-requisites:
    - Assumes 'pubs_nips' exists and contains pdf text inside 
      (under key 'pdf_text'). This can be obtained by running 
      nips_download_parse.py and then nips_add_pdftext.py 
      or by downloading it from site.
      (https://sites.google.com/site/researchpooler/home)
    
    Side-effects:
    - will use os call to open a pdf with default program
    """

    # fetch this pdf from website, parse it, and make a publication dict from it
    # here is a random pdf from Andrew's website
    url = 'http://ai.stanford.edu/~ang/papers/icml11-DeepEnergyModels.pdf'
    print "downloading %s..." % (url, )
    text = convertPDF(url)  #extract the text
    bow = stringToWordDictionary(
        text)  #extract the bag of words representation
    p = {'pdf_text': bow}  #create a dummy publication dict

    # calculate similarities to our publications
    print "loading database..."
    pubs = loadPubs('pubs_nips')
    print "computing similarities. (may take while with current implementation)"
    scores = publicationSimilarityNaive(pubs, p)

    # find highest scoring pubs
    lst = [(s, i) for i, s in enumerate(scores) if s >= 0]
    lst.sort(reverse=True)

    # display top 50 matches
    m = min(50, len(lst))
    for s, i in lst[:m]:
        print "%.2f is similarity to %s." % (s, pubs[i]['title'])

    #open the top 3 in browser
    print "opening the top 3..."
    openPDFs([pubs[i]['pdf'] for s, i in lst[:3]])
Esempio n. 4
0
def demo3():
    """
    You found a cool paper online and you want to find similar papers:
    1. Download and parse the pdf
    2. Compare to text of all publications in pubs_ database
    3. Open the top 3 matches in browser (but note that current matching alg is
                                          very basic and could be much improved)
    
    Pre-requisites:
    - Assumes 'pubs_nips' exists and contains pdf text inside 
      (under key 'pdf_text'). This can be obtained by running 
      nips_download_parse.py and then nips_add_pdftext.py 
      or by downloading it from site.
      (https://sites.google.com/site/researchpooler/home)
    
    Side-effects:
    - will use os call to open a pdf with default program
    """
    
    # fetch this pdf from website, parse it, and make a publication dict from it
    # here is a random pdf from Andrew's website
    url = 'http://ai.stanford.edu/~ang/papers/icml11-DeepEnergyModels.pdf'
    print "downloading %s..." % (url,)
    text = convertPDF(url) #extract the text
    bow = stringToWordDictionary(text) #extract the bag of words representation
    p = {'pdf_text' : bow} #create a dummy publication dict
    
    # calculate similarities to our publications
    print "loading database..."
    pubs = loadPubs('pubs_nips')
    print "computing similarities. (may take while with current implementation)"
    scores = publicationSimilarityNaive(pubs, p)
    
    # find highest scoring pubs
    lst = [(s, i) for i,s in enumerate(scores) if s>=0]
    lst.sort(reverse = True)
    
    # display top 50 matches
    m = min(50, len(lst))
    for s, i in lst[:m]:
        print "%.2f is similarity to %s." % (s, pubs[i]['title'])
    
    #open the top 3 in browser
    print "opening the top 3..."
    openPDFs([pubs[i]['pdf'] for s,i in lst[:3]])
Esempio n. 5
0
def demo1():
    """
    You wrote an algorithm and benchmarked it on the MNIST dataset. You are 
    wondering how your results compare with those in the literature:
    1. Finds all publications that mention mnist
    2. Print out their titles
    3. Open the three latest publications that mention it at least twice
    
    Pre-requisites:
    - Assumes 'pubs_nips' exists and that pdf text is present. 
      This can be obtained by running 
      nips_download_parse.py and then nips_add_pdftext.py, or by downloading it 
      from site (https://sites.google.com/site/researchpooler/home)
    
    Side-effects:
    - will use os call to open a pdf with default program
    """

    print "loading the NIPS publications dataset..."
    pubs = loadPubs('pubs_nips')

    # get all papers that mention mnist
    p = [x for x in pubs if 'mnist' in x.get('pdf_text', {})]
    print "titles of papers that mention MNIST dataset:"
    for x in p:
        print x['title']
    print "total of %d publications mention MNIST." % (len(p), )

    # sort by number of occurences
    occ = [(x['year'], x['pdf']) for i, x in enumerate(p)
           if x['pdf_text']['mnist'] > 1]
    occ.sort(reverse=True)

    # open the top 3 latest in browser
    print "opening the top 3..."
    openPDFs([x for year, x in occ[:3]])
"""
Standalone helper script.

Load nips pubs_ file, and adds to every paper its word counts under key 
'pdf_text'. The PDF for each paper is downloaded from NIPS site.
"""

from repool_util import loadPubs, savePubs, stringToWordDictionary
from pdf_read import convertPDF

# Load the previously scraped NIPS publication records from disk.
pubs_all = loadPubs('pubs_nips')
print 'loaded pubs with %d entries.' % (len(pubs_all),)

# possibly place restrictions on pubs to process here
# (currently processes the full set)
pubs = pubs_all

for i,p in enumerate(pubs):
    
    #if the pdf url does not exist, in future this could possibly use google
    #search to try to look up a link for the pdf first.
    if p.has_key('pdf') and not p.has_key('pdf_text'):
        
        # try to open the PDF from downloaded location
        processed = False
        try:
            floc = p['pdf'].index('NIPS')
            fname = p['pdf'][floc:]
            txt = convertPDF('downloads/'+fname)
            processed = True
            print 'found %s in file!' % (p['title'],)
        except: