# NOTE(review): the line below is a whitespace-collapsed fragment of a Python 2
# script (print statements). The `try:`/loop headers that enclose its first
# statements lie before this chunk and the original indentation is lost, so the
# nesting cannot be recovered from here. As written, everything after the first
# '#' on the line would be parsed as a comment; restore the line breaks from the
# upstream file before running it.
#
# What the visible statements do, in order:
#   1. Slice p['pdf'] from the first occurrence of 'NIPS' and pass
#      'downloads/' + that suffix to convertPDF (presumably a previously
#      downloaded local copy -- confirm). On success `processed` is set to True;
#      any failure is swallowed by a bare `except: pass`.
#   2. If `processed` is still false, call convertPDF(p['pdf']) directly; on
#      failure print an error naming p['pdf'] and 'skipping...'.
#   3. If `processed`, store stringToWordDictionary(txt) under p['pdf_text'];
#      a failure there only prints a message.
#   4. Print a progress line computed from i and len(pubs), and call
#      savePubs('pubs_nips', pubs_all). Whether these run per item or once
#      after the loop cannot be told from the collapsed text -- TODO confirm.
#
# Review notes:
#   - The bare `except:` clauses also catch KeyboardInterrupt/SystemExit;
#     `except Exception:` would keep the best-effort behaviour while letting
#     the user interrupt a long run.
#   - The first message indexes p['title'] directly while the second uses
#     p.get('title', ...). For an entry without a title the first print raises
#     KeyError after `processed = True` was already set, and the error is
#     silently swallowed -- presumably unintended, verify.
floc = p['pdf'].index('NIPS') fname = p['pdf'][floc:] txt = convertPDF('downloads/'+fname) processed = True print 'found %s in file!' % (p['title'],) except: pass if not processed: # download the PDF and convert to text try: print 'downloading pdf for [%s] and parsing...' % (p.get('title', 'an un-titled paper')) txt = convertPDF(p['pdf']) processed = True print 'processed from url!' except: print 'error: unable to open download the pdf from %s' % (p['pdf'],) print 'skipping...' if processed: # convert to bag of words and store try: p['pdf_text'] = stringToWordDictionary(txt) except: print 'was unable to convert text to bag of words. Skipped.' print '%d/%d = %.2f%% done.' % (i+1, len(pubs), 100*(i+1.0)/len(pubs)) savePubs('pubs_nips', pubs_all)
# NOTE(review): the line below is a whitespace-collapsed fragment of a Python 2
# script (print statements, dict.has_key). The loops that enclose its first
# statements start outside this chunk and the original indentation is lost, so
# the nesting cannot be recovered from here. As written, everything after the
# first '#' on the line would be parsed as a comment; restore the line breaks
# from the upstream file before running it.
#
# What the visible statements do, in order:
#   1. Store the whitespace-stripped entries of author_list as
#      new_pub['authors'].
#   2. Append a message to `warnings` when new_pub has no 'authors' key, and
#      another when it has no 'title' key.
#   3. Set new_pub['venue'] and new_pub['year'], then append new_pub to pubs.
#   4. Print how many publications were added for `year`
#      (len(pubs) - old_count).
#   5. Print every collected warning, or "No warnings generated." when the
#      list is empty.
#   6. Print the total count and call savePubs("pubs_nips", pubs); the inline
#      comment describes the output as a pickle (savePubs is defined elsewhere
#      -- confirm).
#
# Review notes:
#   - The inline TODO says the validation/append chunk is duplicated elsewhere
#     in the script; whether it sits at the same nesting level as the
#     'authors' assignment cannot be told from here -- TODO confirm before
#     restructuring.
#   - dict.has_key() exists only in Python 2; `key in new_pub` is equivalent
#     and also works on Python 3.
#   - This fragment appears twice in a row in this file, differing only in
#     spacing.
new_pub['authors'] = [x.strip() for x in author_list] # I hate myself a little for this # TODO LATER_MAYBE: CODE CHUNK DUPLICATION if not new_pub.has_key('authors'): warnings.append("oh oh no authors for publication... ") if not new_pub.has_key('title'): warnings.append("oh oh no title for publication... ") new_pub['venue'] = venue new_pub['year']= year pubs.append(new_pub) print "read in %d publications for year %d." % (len(pubs) - old_count, year) # show warnings, if any were generated if len(warnings)>0: print "%d warnings:" % (len(warnings),) for x in warnings: print x else: print "No warnings generated." # finally, save pickle as output print "read in a total of %d publications." % (len(pubs),) fname = "pubs_nips" print "saving pickle in %s" % (fname,) savePubs(fname, pubs) print "all done."
# NOTE(review): the line below is a whitespace-collapsed fragment of a Python 2
# script (print statements, dict.has_key). It repeats the fragment on the
# preceding line of this file with only spacing differences. The enclosing
# loops start outside this chunk and the original indentation is lost, so the
# nesting cannot be recovered from here; restore the line breaks from the
# upstream file before running it.
#
# What the visible statements do, in order:
#   1. Store the whitespace-stripped entries of author_list as
#      new_pub['authors'].
#   2. Append a message to `warnings` when new_pub has no 'authors' key, and
#      another when it has no 'title' key.
#   3. Set new_pub['venue'] and new_pub['year'], then append new_pub to pubs.
#   4. Print how many publications were added for `year`
#      (len(pubs) - old_count).
#   5. Print every collected warning, or "No warnings generated." when the
#      list is empty.
#   6. Print the total count and call savePubs("pubs_nips", pubs); the inline
#      comment describes the output as a pickle (savePubs is defined elsewhere
#      -- confirm).
#
# Review notes:
#   - The inline TODO says the validation/append chunk is duplicated elsewhere
#     in the script; its nesting relative to the 'authors' assignment cannot
#     be told from here -- TODO confirm before restructuring.
#   - dict.has_key() exists only in Python 2; `key in new_pub` is equivalent
#     and also works on Python 3.
new_pub['authors'] = [x.strip() for x in author_list] # I hate myself a little for this # TODO LATER_MAYBE: CODE CHUNK DUPLICATION if not new_pub.has_key('authors'): warnings.append("oh oh no authors for publication... ") if not new_pub.has_key('title'): warnings.append("oh oh no title for publication... ") new_pub['venue'] = venue new_pub['year'] = year pubs.append(new_pub) print "read in %d publications for year %d." % (len(pubs) - old_count, year) # show warnings, if any were generated if len(warnings) > 0: print "%d warnings:" % (len(warnings), ) for x in warnings: print x else: print "No warnings generated." # finally, save pickle as output print "read in a total of %d publications." % (len(pubs), ) fname = "pubs_nips" print "saving pickle in %s" % (fname, ) savePubs(fname, pubs) print "all done."