Example #1
0
import sys
from DB import DB
from URL import URL

db = DB('citeseerx.db')
db.create_tables()
# db.del_all()

# http://citeseerx.ist.psu.edu/viewdoc/summary?cid=16057
if len(sys.argv) == 2:
    url = URL(sys.argv[1])
    url.open()
    db.insert('link', {'doi': url.get_doi(), 'url': url.get_url()})
else:
    print 'Please supply proper URL.'
Example #2
0
from URL import URL
from DB import DB
from bs4 import BeautifulSoup

# Crawler pass: drain the queue of unprocessed links, following redirects
# and scraping title/abstract metadata from each summary page.
db = DB('citeseerx.db')

count = 0
# Loop while the DB still reports unprocessed link rows.
# NOTE(review): the loop body continues beyond this excerpt (the scraped
# `abstract` is never stored within the visible lines).
while db.count_unpr():
    # url = URL('http://citeseerx.ist.psu.edu/viewdoc/summary?cid=4320')
    count = count + 1
    url = db.get_unpr()  # presumably the next unprocessed URL string -- verify against DB
    print url
    url = URL(url)  # rebinds `url` from the raw string to a URL helper object
    url.open()
    # NOTE(review): 2 looks like a "processed" status code -- confirm its meaning.
    db.update_link(url.get_doi(), 2)

    # If opening the URL redirected, store the redirect target under the same
    # DOI -- but only when no 'link' row exists for that DOI yet.
    if (not db.exists('link', url.get_doi()) and url.redirect_occured()):
        db.insert('link', {
            'doi': url.get_doi(),
            'url': url.get_redirect_url()
        })

    # Scrape the page only if metadata for this DOI was not stored before.
    if (not db.exists('metadata', url.get_doi())):
        html = url.fetch()
        # extract abstract
        soup = BeautifulSoup(html, "html.parser")
        # First text node of the first <h2> is taken as the paper title.
        title = soup.find('h2').findAll(text=True)[0]
        abstract_div = soup.find("div", {"id": "abstract"})
        # NOTE(review): find() returns None when no abstract div exists, and
        # iterating a Tag also yields string children whose .name is not 'p' --
        # confirm every summary page actually contains this div.
        for tag in abstract_div:
            if tag.name == 'p':
                abstract = tag.findAll(text=True)