import sys

from DB import DB
from URL import URL

# One-shot seeding script: store a single CiteSeerX summary URL
# (e.g. http://citeseerx.ist.psu.edu/viewdoc/summary?cid=16057) in the
# 'link' table so the crawler script can pick it up later.
db = DB('citeseerx.db')
db.create_tables()

if len(sys.argv) == 2:
    url = URL(sys.argv[1])
    url.open()
    # DOI acts as the key for a link row; get_doi()/get_url() are project
    # helpers on URL -- semantics assumed from usage here.
    db.insert('link', {'doi': url.get_doi(), 'url': url.get_url()})
else:
    print('Please supply proper URL.')
    # Exit non-zero so shell callers can detect the usage error.
    sys.exit(1)
from URL import URL
from DB import DB
from bs4 import BeautifulSoup

# Crawler loop: drain the queue of unprocessed links, recording redirect
# targets and scraping title/abstract metadata from each CiteSeerX
# summary page.
db = DB('citeseerx.db')
count = 0  # number of links processed in this run

while db.count_unpr():
    count += 1
    link = db.get_unpr()
    print(link)

    url = URL(link)
    url.open()
    # Mark this link as processed (status code 2 -- meaning assumed;
    # confirm against DB.update_link()).
    db.update_link(url.get_doi(), 2)

    # If the request was redirected, record the final URL under the same
    # DOI so future runs do not refetch the stale address.
    if not db.exists('link', url.get_doi()) and url.redirect_occured():
        db.insert('link', {
            'doi': url.get_doi(),
            'url': url.get_redirect_url(),
        })

    if not db.exists('metadata', url.get_doi()):
        html = url.fetch()
        soup = BeautifulSoup(html, "html.parser")
        # First text node of the first <h2> is taken as the paper title.
        title = soup.find('h2').findAll(text=True)[0]
        abstract_div = soup.find("div", {"id": "abstract"})
        for tag in abstract_div:
            # NOTE(review): iterating the div yields NavigableStrings too;
            # in bs4 their .name is None, so only real <p> tags match.
            if tag.name == 'p':
                abstract = tag.findAll(text=True)