Exemple #1
0
def add_new_scans(args):
    """Add the scans published on one day to that month's batch.

    ``args`` is a sequence.  When it is non-empty, its first element is a
    dash-separated ``year-month-day`` string selecting the day; otherwise
    the day before ``datetime.date.today()`` is used.

    The identifiers returned by the query are added to the batch named
    ``new-scans-YYYYMM``.  ``Batch.find`` is tried first and ``Batch.new``
    is used when it returns a falsy value.
    """
    if not args:
        # No explicit date given: default to yesterday.
        when = datetime.date.today() - datetime.timedelta(days=1)
    else:
        year, month, day = args[0].split("-")
        when = datetime.date(int(year), int(month), int(day))

    # Rows with repub_state 4 and mediatype 'texts', outside the two
    # excluded collection patterns, whose publicdate falls within `when`.
    sql = "".join((
        "SELECT identifier FROM metadata",
        " WHERE repub_state=4",
        "   AND mediatype='texts'",
        "   AND collection NOT LIKE $c1",
        "   AND collection NOT LIKE $c2",
        "   AND publicdate >= $date and publicdate < ($date::date + INTERVAL '1' DAY)::text",
    ))

    db = get_ia_db()
    params = {
        "c1": 'opensource%',
        "c2": '%additional_collections%',
        "date": when.isoformat(),
    }
    identifiers = [record.identifier for record in db.query(sql, vars=params)]

    name = "new-scans-%04d%02d" % (when.year, when.month)
    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch.add_items(identifiers)
Exemple #2
0
    def __init__(self):
        """Initialise the empty caches and obtain the two database handles."""
        LegacyDataProvider.__init__(self)

        # All caches start out empty.
        self.edition_keys_of_works_cache = dict()
        self.redirect_cache = dict()    # cache for redirects
        self.metadata_cache = dict()
        self.cache = dict()             # cache for documents

        # NOTE(review): infogami is set up before process_stats is imported;
        # presumably that import depends on it -- keep this ordering.
        import infogami
        from infogami.utils import delegate

        infogami._setup()
        delegate.fakeload()

        from openlibrary.solr.process_stats import get_db, get_ia_db
        self.db = get_db()
        self.ia_db = get_ia_db()
Exemple #3
0
    def __init__(self):
        """Initialise the empty caches and obtain the two database handles."""
        LegacyDataProvider.__init__(self)

        # All caches start out empty.
        self.edition_keys_of_works_cache = dict()
        self.redirect_cache = dict()    # cache for redirects
        self.metadata_cache = dict()
        self.cache = dict()             # cache for documents

        # NOTE(review): infogami is set up before process_stats is imported;
        # presumably that import depends on it -- keep this ordering.
        import infogami
        from infogami.utils import delegate

        infogami._setup()
        delegate.fakeload()

        from openlibrary.solr.process_stats import get_db, get_ia_db
        self.db = get_db()
        self.ia_db = get_ia_db()
def add_new_scans(args):
    """Add recently scanned items updated on one day to that month's batch.

    ``args`` is a sequence.  When it is non-empty, its first element is a
    dash-separated ``year-month-day`` string selecting the day; otherwise
    the day before ``datetime.date.today()`` is used.

    The identifiers returned by the query are added to the batch named
    ``new-scans-YYYYMM``.  ``Batch.find`` is tried first and ``Batch.new``
    is used when it returns a falsy value.
    """
    if not args:
        # No explicit date given: default to yesterday.
        when = datetime.date.today() - datetime.timedelta(days=1)
    else:
        year, month, day = args[0].split("-")
        when = datetime.date(int(year), int(month), int(day))

    # Select scans updated/added during `when` whose scandate is later
    # than the cutoff computed below (60 days before `when`).
    sql = "".join((
        "SELECT identifier FROM metadata",
        " WHERE repub_state=4",
        "   AND mediatype='texts'",
        "   AND scancenter IS NOT NULL",
        "   AND collection NOT LIKE $c1",
        "   AND collection NOT LIKE $c2",
        "   AND (curatestate IS NULL OR curatestate != 'dark')",
        "   AND lower(format) LIKE '%%pdf%%' AND lower(format) LIKE '%%marc%%'",
        "   AND scandate is NOT NULL AND scandate > $min_scandate",
        "   AND updated > $date AND updated < ($date::date + INTERVAL '1' DAY)",
    ))

    cutoff = when - datetime.timedelta(60)  # roughly two months back
    db = get_ia_db()
    params = {
        "c1": '%opensource%',
        "c2": '%additional_collections%',
        "date": when.isoformat(),
        "min_scandate": cutoff.strftime("%Y%m%d"),
    }
    identifiers = [record.identifier for record in db.query(sql, vars=params)]

    name = "new-scans-%04d%02d" % (when.year, when.month)
    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch.add_items(identifiers)
def add_new_scans(args):
    """Queue the scans updated on a single day into the monthly batch.

    If ``args`` is non-empty, ``args[0]`` is split on ``-`` into year,
    month and day to pick the target day.  If it is empty, the target day
    is the one before ``datetime.date.today()``.

    Identifiers matching the query are passed to ``add_items`` on the
    batch called ``new-scans-YYYYMM``, which is looked up with
    ``Batch.find`` and created with ``Batch.new`` when the lookup result
    is falsy.
    """
    if args:
        y, m, d = args[0].split("-")
        target = datetime.date(int(y), int(m), int(d))
    else:
        # Fall back to yesterday when no date was supplied.
        one_day = datetime.timedelta(days=1)
        target = datetime.date.today() - one_day

    # Scans updated/added on the target day that were scanned no more
    # than 60 days before it.
    statement = (
        "SELECT identifier FROM metadata"
        " WHERE repub_state=4"
        "   AND mediatype='texts'"
        "   AND scancenter IS NOT NULL"
        "   AND collection NOT LIKE $c1"
        "   AND collection NOT LIKE $c2"
        "   AND (curatestate IS NULL OR curatestate != 'dark')"
        "   AND lower(format) LIKE '%%pdf%%' AND lower(format) LIKE '%%marc%%'"
        "   AND scandate is NOT NULL AND scandate > $min_scandate"
        "   AND updated > $date AND updated < ($date::date + INTERVAL '1' DAY)"
    )

    oldest_scan = target - datetime.timedelta(60)  # about 2 months earlier
    ia_db = get_ia_db()
    bindings = {
        "c1": '%opensource%',
        "c2": '%additional_collections%',
        "date": target.isoformat(),
        "min_scandate": oldest_scan.strftime("%Y%m%d"),
    }
    rows = ia_db.query(statement, vars=bindings)
    found = [row.identifier for row in rows]

    label = "new-scans-%04d%02d" % (target.year, target.month)
    existing = Batch.find(label)
    batch = existing if existing else Batch.new(label)
    batch.add_items(found)