def add_new_scans(args):
    """Add the scans published on a single day to that month's batch.

    ``args`` is an optional sequence; when non-empty, its first element is
    read as a ``YYYY-MM-DD`` date string.  When ``args`` is empty, the day
    before today is used instead.

    The matching identifiers are appended to the batch named
    ``new-scans-YYYYMM``, which is created if ``Batch.find`` returns nothing.
    """
    if not args:
        # No explicit date supplied: default to yesterday.
        date = datetime.date.today() - datetime.timedelta(days=1)
    else:
        year, month, day = args[0].split("-")
        date = datetime.date(int(year), int(month), int(day))

    # Adjacent string literals are concatenated into one query string.
    # The last clause selects rows whose publicdate falls within `date`.
    query = (
        "SELECT identifier FROM metadata"
        " WHERE repub_state=4"
        " AND mediatype='texts'"
        " AND collection NOT LIKE $c1"
        " AND collection NOT LIKE $c2"
        " AND publicdate >= $date and publicdate < ($date::date + INTERVAL '1' DAY)::text"
    )

    # Values bound to the $-placeholders in the query above.
    bind_vars = {
        "c1": 'opensource%',
        "c2": '%additional_collections%',
        "date": date.isoformat(),
    }

    rows = get_ia_db().query(query, vars=bind_vars)
    identifiers = [row.identifier for row in rows]

    # One batch per calendar month, e.g. "new-scans-201301".
    name = "new-scans-%04d%02d" % (date.year, date.month)
    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch.add_items(identifiers)
def __init__(self):
    """Set up empty caches, bootstrap infogami and open the database handles."""
    LegacyDataProvider.__init__(self)

    # cache for redirects
    self.redirect_cache = {}
    self.edition_keys_of_works_cache = {}

    # cache for documents
    self.metadata_cache = {}
    self.cache = {}

    # NOTE(review): the imports are kept function-local and in this order;
    # presumably process_stats must only be imported after infogami has been
    # set up -- confirm before hoisting them to module level.
    import infogami
    from infogami.utils import delegate

    infogami._setup()
    delegate.fakeload()

    from openlibrary.solr.process_stats import get_db, get_ia_db

    self.db = get_db()
    self.ia_db = get_ia_db()
def add_new_scans(args):
    """Add recently scanned items updated on a single day to that month's batch.

    ``args`` is an optional sequence; when non-empty, its first element is
    read as a ``YYYY-MM-DD`` date string.  When ``args`` is empty, the day
    before today is used instead.

    Items are selected when they were updated on that date and their
    scandate is later than 60 days before it.  The identifiers are appended
    to the batch named ``new-scans-YYYYMM``, which is created if
    ``Batch.find`` returns nothing.
    """
    if not args:
        # No explicit date supplied: default to yesterday.
        date = datetime.date.today() - datetime.timedelta(days=1)
    else:
        year, month, day = args[0].split("-")
        date = datetime.date(int(year), int(month), int(day))

    # Only consider scans made in roughly the last 2 months (60 days).
    min_scandate = date - datetime.timedelta(days=60)

    # Find all scans which are updated/added on the given date
    # and have been scanned at most 2 months ago.
    # NOTE(review): the lower bound on `updated` is exclusive (`>`), so a row
    # stamped exactly at the start of the day would not match -- confirm this
    # is intended.
    query = (
        "SELECT identifier FROM metadata"
        " WHERE repub_state=4"
        " AND mediatype='texts'"
        " AND scancenter IS NOT NULL"
        " AND collection NOT LIKE $c1"
        " AND collection NOT LIKE $c2"
        " AND (curatestate IS NULL OR curatestate != 'dark')"
        " AND lower(format) LIKE '%%pdf%%' AND lower(format) LIKE '%%marc%%'"
        " AND scandate is NOT NULL AND scandate > $min_scandate"
        " AND updated > $date AND updated < ($date::date + INTERVAL '1' DAY)"
    )

    # Values bound to the $-placeholders in the query above.
    bind_vars = {
        "c1": '%opensource%',
        "c2": '%additional_collections%',
        "date": date.isoformat(),
        "min_scandate": min_scandate.strftime("%Y%m%d"),
    }

    rows = get_ia_db().query(query, vars=bind_vars)
    identifiers = [row.identifier for row in rows]

    # One batch per calendar month, e.g. "new-scans-201301".
    name = "new-scans-%04d%02d" % (date.year, date.month)
    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch.add_items(identifiers)
def add_new_scans(args):
    """Queue the scans updated on one day into the monthly "new-scans" batch.

    ``args`` may be empty, in which case yesterday's date is used; otherwise
    ``args[0]`` is parsed as a ``YYYY-MM-DD`` date string.

    The query keeps items with a scancenter and a scandate later than 60
    days before the target date, whose format mentions both "pdf" and "marc",
    that are not curated as 'dark', and that were updated on the target date.
    Their identifiers are added to the batch ``new-scans-YYYYMM``, which is
    created when ``Batch.find`` returns nothing.
    """
    if args:
        year_text, month_text, day_text = args[0].split("-")
        target = datetime.date(int(year_text), int(month_text), int(day_text))
    else:
        # yesterday
        one_day = datetime.timedelta(days=1)
        target = datetime.date.today() - one_day

    oldest_scan = target - datetime.timedelta(60)  # 2 months ago

    # Find all scans which are updated/added on the given date
    # and have been scanned at most 2 months ago.
    # Each clause carries its own leading space, so a plain join is enough.
    clauses = [
        "SELECT identifier FROM metadata",
        " WHERE repub_state=4",
        " AND mediatype='texts'",
        " AND scancenter IS NOT NULL",
        " AND collection NOT LIKE $c1",
        " AND collection NOT LIKE $c2",
        " AND (curatestate IS NULL OR curatestate != 'dark')",
        " AND lower(format) LIKE '%%pdf%%' AND lower(format) LIKE '%%marc%%'",
        " AND scandate is NOT NULL AND scandate > $min_scandate",
        " AND updated > $date AND updated < ($date::date + INTERVAL '1' DAY)",
    ]
    sql = "".join(clauses)

    found = get_ia_db().query(
        sql,
        vars={
            "c1": '%opensource%',
            "c2": '%additional_collections%',
            "date": target.isoformat(),
            "min_scandate": oldest_scan.strftime("%Y%m%d"),
        },
    )
    new_items = [record.identifier for record in found]

    # Reuse the month's batch when it already exists, otherwise create it.
    monthly_name = "new-scans-%04d%02d" % (target.year, target.month)
    monthly_batch = Batch.find(monthly_name)
    if not monthly_batch:
        monthly_batch = Batch.new(monthly_name)
    monthly_batch.add_items(new_items)