Example #1
def scrape_doi(doi, doc_id=None):
    records_doi = db_store.view('index/ids', key='doi:' + doi, include_docs=True).rows

    url = resolve_doi(doi)
    records_source = db_store.view('index/sources', key=url, include_docs=True).rows
    
    if doc_id is not None or not (records_doi and records_source):
        # Rescrape an existing doc, or neither the DOI nor the source url is in the db yet
        if doc_id:
          article = db_store[doc_id]
          rev_id = article.rev
        else:
          article = {}
           
        try:
          scraped_article = resolve_and_scrape(url)

          # If the scrape raised no exception, clear the current article and fill it with the scraped data
          article.clear()
          article.update(scraped_article)

          # Add the id and revision back in since we just cleared the doc. Awkward.
          if doc_id:
            article['_id'] = doc_id
            article['_rev'] = rev_id
        except Exception as e:
          # Make a doc to remember to rescrape later
          article['error'] = str(e)
          article['rescrape'] = True
          article['source_urls'] = [url]

        if article:
          doc_id, _ = db_store.save(article)
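
For orientation, a minimal sketch of how scrape_doi might be invoked; the DOI and doc id below are made up and only illustrate the two calling modes (fresh scrape vs. forced rescrape of an existing doc).

# Hypothetical usage of scrape_doi (the DOI is illustrative only)
scrape_doi('10.1000/example.doi')

# Rescrape an existing document by passing its CouchDB id; the saved doc
# keeps its _id/_rev, so the save updates the record rather than creating a new one
scrape_doi('10.1000/example.doi', doc_id='existing-couchdb-doc-id')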
Example #2
def scrape_rss(scraper_module, item):
  s = scrapers.module_names[scraper_module]
  d = s.scrape_rss(item)

  if 'journal' in d:
    d['journal_id'] = resolve_journal(d['journal'])

  if check_source(d['source_urls'][0]):
    doc_id, _ = db_store.save(d)
  else:
    print "Already got this one"
    rows = db_store.view('index/sources', key=d['source_urls'][0], include_docs=True).rows
    article = rows[0].doc
    doc_id = article.id

  return doc_id
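
check_source isn't shown on this page, but from the way it's used here (scrape only when it returns True, otherwise fall back to looking the article up through the index/sources view) it presumably reports whether a source url still needs scraping. A rough sketch under that assumption:

def check_source(url):
    # Assumed behaviour: True when no document in db_store already lists
    # this url as a source, i.e. the url has not been scraped yet
    rows = db_store.view('index/sources', key=url).rows
    return len(rows) == 0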
Example #3
def scrape_journal(url, doc_id=None, base_article={}):
    """Find the paper in the database and then add or merge as
    necessary."""

    # TODO: Make sure that if doc_id is not None, it does actually
    # refer to a document in the database.

    # Scrape if the url hasn't already been scraped; always scrape when
    # we're given an explicit doc_id.
    if doc_id is not None or check_source(url):
        # Rescraping an existing doc, or the source url isn't in the db yet
        if doc_id:
          article = db_store[doc_id]
          rev_id = article.rev
        else:
          article = {}
           
        scraped_article = resolve_and_scrape(url)

        # clear the current article and save it
        article.clear()
        article.update(base_article)
        article.update(scraped_article)

        # Add the id and revision back in since we just cleared the
        # doc. Awkward.
        if doc_id:
          article['_id'] = doc_id
          article['_rev'] = rev_id

        # If we haven't explicitly asked for the article to be scraped
        # by providing a doc_id, then check that it hasn't been
        # inadvertently scraped already before we save it
        if doc_id is not None or check_source(article['source_urls'][-1]):
            doc_id, _ = db_store.save(article)
    else:
        # we've already scraped this url. there should only be one
        # such doc.
        rows = db_store.view('index/sources', key=url, include_docs=True).rows
        article = rows[0].doc
        doc_id = article.id

    resolve_merges()

    return doc_id
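
A minimal usage sketch for scrape_journal; the url is made up and the two calls just show the two paths through the function (fresh scrape vs. forced rescrape of an existing doc).

# Hypothetical calls (the url is illustrative only)
doc_id = scrape_journal('http://example.org/some/article')

# Force a rescrape of the same document later on, keeping its _id/_rev
doc_id = scrape_journal('http://example.org/some/article', doc_id=doc_id)
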
from akorn.celery.couch import db_store, db_journals

print "Hello"

journal_id_map = {}

def make_journal(journal_name):
  doc = {'name': journal_name,
         'aliases': [journal_name],}

  doc_id, doc_rev = db_journals.save(doc)

  journal_id_map[journal_name] = doc_id

  return doc_id

for row in db_store.view('missing/journal_id', include_docs=True).rows:
  doc = row.doc
  if 'journal' in doc:
    print doc['journal']

    if doc['journal'] in journal_id_map:
      doc['journal_id'] = journal_id_map[doc['journal']]
      print "Re-using"
    else:
      doc['journal_id'] = make_journal(doc['journal'])
      print "Making new"

    db_store.save(doc)
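
The missing/journal_id view queried above isn't shown on this page; presumably it's a CouchDB design document whose map function emits every doc that still lacks a journal_id. A guess at what it might look like, written as it would be pushed with couchdb-python (the map body itself is JavaScript, as CouchDB requires):

design_doc = {
    '_id': '_design/missing',
    'views': {
        'journal_id': {
            # Assumed map function: emit any doc without a journal_id
            'map': """
                function (doc) {
                  if (!doc.journal_id) {
                    emit(doc._id, null);
                  }
                }
            """,
        },
    },
}

# db_store.save(design_doc)  # push it once, if the design doc doesn't already exist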