Example #1
def add_feed_items(scraper_module, feedhandler, feed_url):
    """Add feed items to database.."""

    # should be smarter here, e.g. use If-Modified-Since
    feed = feedparser.parse(feed_url, agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50')

    should_scrape = scraper_factory.should_scrape[scraper_module]

    for item in feed['items']:
        base_article = {}

        # feedhandler can be a single tag name or a list of candidate tags
        if isinstance(feedhandler, list):
          for feedhandler_ in feedhandler:
            if feedhandler_ in item:
              item_url = item[feedhandler_]
              break
          else:
            # no candidate tag was present in this feed item
            raise Exception("Feed tag(s) not valid.")
        else:
          item_url = item[feedhandler]

        check_exists = db_store.view('index/sources', key=item_url, include_docs='false')

        # The view request can fail transiently, so retry it once
        try:
          check_exists.rows
        except Exception:
          check_exists = db_store.view('index/sources', key=item_url, include_docs='false')

        if not check_exists.rows:
          if should_scrape:
            scrape_journal.delay(item_url, base_article=base_article)
          else:
            scrape_rss.delay(scraper_module, item)
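The "should be smarter here" comment above refers to HTTP conditional requests. feedparser supports this directly: the etag and modified values from a previous parse can be passed back into feedparser.parse, and a server that honours them answers with status 304 so an unchanged feed can be skipped. A minimal sketch of that pattern; the feed_cache dict used to remember the validators per feed is a hypothetical stand-in, not part of the original module:

import feedparser

feed_cache = {}  # hypothetical: maps feed_url -> (etag, modified) from the last fetch

def fetch_feed_if_changed(feed_url):
    etag, modified = feed_cache.get(feed_url, (None, None))
    feed = feedparser.parse(feed_url, etag=etag, modified=modified)

    # HTTP 304 means the feed is unchanged since the last fetch
    if getattr(feed, 'status', None) == 304:
        return None

    # Remember whatever validators the server handed back for next time
    feed_cache[feed_url] = (getattr(feed, 'etag', None),
                            getattr(feed, 'modified', None))
    return feed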
Example #2
def scrape_doi(doi, doc_id=None):
    records_doi = db_store.view('index/ids', key='doi:' + doi, include_docs='true').rows

    url = resolve_doi(doi)
    records_source = db_store.view('index/sources', key=url, include_docs='true').rows
    
    # Scrape if explicitly asked via doc_id, or if the doi / source url
    # isn't already in the database
    if doc_id is not None or not (records_doi and records_source):
        if doc_id:
          article = db_store[doc_id]
          rev_id = article.rev
        else:
          article = {}
           
        try:
          scraped_article = resolve_and_scrape(url)

          # If nothing has raised by this point, clear the current article and replace it with the scraped version
          article.clear()
          article.update(scraped_article)

          # Add the id and revision back in since we just cleared the doc. Awkward.
          if doc_id:
            article['_id'] = doc_id
            article['_rev'] = rev_id
        except Exception, e:
          # Make a doc to remember to rescrape later
          article['error'] = str(e)
          article['rescrape'] = True
          article['source_urls'] = [url]

        if article:
          doc_id, _ = db_store.save(article)
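The clear-then-restore step that the "Awkward." comment flags (and which appears again in scrape_journal below) can be avoided by copying CouchDB's identity fields aside before replacing the document body. A small illustrative helper, not part of the original module:

def replace_doc_contents(article, scraped_article):
    # Keep CouchDB's identity fields, swap everything else for the new scrape
    identity = dict((k, article[k]) for k in ('_id', '_rev') if k in article)
    article.clear()
    article.update(scraped_article)
    article.update(identity)
    return article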
Example #3
def check_source(url):
  rows = db_store.view('index/sources', key=url).rows
  
  # True when the url has not been seen before
  return len(rows) == 0
Example #4
def scrape_rss(scraper_module, item):
  s = scrapers.module_names[scraper_module]
  d = s.scrape_rss(item)

  if 'journal' in d:
    d['journal_id'] = resolve_journal(d['journal'])

  if check_source(d['source_urls'][0]):
    doc_id, _ = db_store.save(d)
  else:
    print "Already got this one"
    rows = db_store.view('index/sources', key=d['source_urls'][0], include_docs='true').rows
    article = rows[0].doc
    doc_id = article.id

  return doc_id
Example #5
def scrape_journal(url, doc_id=None, base_article=None):
    """Find the paper in the database and then add or merge as
    necessary."""
    # Avoid a mutable default argument; fall back to an empty dict
    if base_article is None:
        base_article = {}

    # TODO: Make sure that if doc_id is not None, it does actually
    # refer to a document in the database.

    # Always scrape if we're given a doc_id; otherwise only scrape
    # source urls that aren't already in the database
    if doc_id is not None or check_source(url):
        if doc_id:
          article = db_store[doc_id]
          rev_id = article.rev
        else:
          article = {}
           
        scraped_article = resolve_and_scrape(url)

        # clear the current article and save it
        article.clear()
        article.update(base_article)
        article.update(scraped_article)

        # Add the id and revision back in since we just cleared the
        # doc. Awkward.
        if doc_id:
          article['_id'] = doc_id
          article['_rev'] = rev_id

        # If we haven't explicitly asked for the article to be scraped
        # by providing a doc_id, then check that it hasn't been
        # inadvertently scraped already before we go
        if doc_id is not None or check_source(article['source_urls'][-1]):
            doc_id, _ = db_store.save(article)
    else:
        # we've already scraped this url. there should only be one
        # such doc.
        rows = db_store.view('index/sources', key=url, include_docs='true').rows
        article = rows[0].doc
        doc_id = article.id

    resolve_merges()

    return doc_id
Example #6
from akorn.celery.couch import db_journals, db_store

for journal_id in db_journals:
    rows = db_store.view("index/journal_id", key=journal_id).rows

    if len(rows) == 0:
        journal = db_journals[journal_id]
        try:
            print "No articles for {}".format(journal["name"])
        except Exception:
            # fall back to dumping the raw journal doc if its name can't be printed
            print journal
Example #7
def rescrape_articles():
  records = db_store.view('rescrape/rescrape', include_docs='true').rows

  for record in records:
    # error docs created by the scrapers store their urls in the source_urls list
    scrape_journal.delay(record.doc['source_urls'][0], record.doc.id)
Example #8
from akorn.celery.couch import db_store, db_journals

print "Hello"

journal_id_map = {}

def make_journal(journal_name):
  doc = {'name': journal_name,
         'aliases': [journal_name],}

  doc_id, doc_rev = db_journals.save(doc)

  journal_id_map[journal_name] = doc_id

  return doc_id

for row in db_store.view('missing/journal_id', include_docs=True).rows:
  doc = row.doc
  if 'journal' in doc:
    print doc['journal']

    if doc['journal'] in journal_id_map:
      doc['journal_id'] = journal_id_map[doc['journal']]
      print "Re-using"
    else:
      doc['journal_id'] = make_journal(doc['journal'])
      print "Making new"

    db_store.save(doc)