Example no. 1
def old_dates_update():
    """
    This function updates data by pulling 5 old dates from the past year.
    When run once each day, the entire past year will update every 73 days.
    """

    # pick 5 oldest dates from past year
    date_sql = """
        SELECT * FROM doi_queue_paperbuzz_dates 
        WHERE id > now() - interval '1 year' 
        ORDER BY finished ASC NULLS FIRST LIMIT 5;
    """
    dates = db.session.execute(date_sql)

    # set values to null so they are picked up by importer
    for date in dates:
        id_date = date[0]
        run_sql(
            db,
            """update doi_queue_paperbuzz_dates 
                       set enqueued=NULL, finished=NULL, started=NULL, dyno=NULL 
                       where id = '{id_date}'""".format(id_date=id_date),
        )

    # run update script
    os.system("python doi_queue.py --dates --run")
Example no. 2
def kick(job_type):
    q = """update {table_name} set started=null, finished=null
          where finished is null
          and id in (select id from {table_name} where started is not null)""".format(
        table_name=table_name(job_type)
    )
    run_sql(db, q)
    print_status(job_type)
Example no. 3
    def worker_run(self, **kwargs):
        single_obj_id = kwargs.get("id", None)
        chunk = kwargs.get("chunk", 1)
        queue_table = "doi_queue_dates"
        run_class = DateRange
        run_method = "scroll_through_all_dois"

        if not single_obj_id:
            text_query_pattern = """WITH picked_from_queue AS (
                       SELECT *
                       FROM   {queue_table}
                       WHERE  started is null
                       ORDER BY rand
                   LIMIT  {chunk}
                   FOR UPDATE SKIP LOCKED
                   )
                UPDATE {queue_table} queue_rows_to_update
                SET    started=now()
                FROM   picked_from_queue
                WHERE picked_from_queue.id = queue_rows_to_update.id
                RETURNING picked_from_queue.*;"""
            text_query = text_query_pattern.format(
                chunk=chunk,
                queue_table=queue_table
            )
            logger.info(u"the queue query is:\n{}".format(text_query))

        index = 0
        start_time = time()
        while True:
            new_loop_start_time = time()
            if single_obj_id:
                objects = [run_class.query.filter(run_class.id == single_obj_id).first()]
            else:
                # logger.info(u"looking for new jobs")
                objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
                # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(new_loop_start_time)))

            if not objects:
                # logger.info(u"sleeping for 5 seconds, then going again")
                sleep(5)
                continue

            object_ids = [obj.id for obj in objects]
            self.update_fn(run_class, run_method, objects, index=index)

            if object_ids:
                object_ids_str = u",".join([u"'{}'".format(str(id).replace(u"'", u"''")) for id in object_ids])
                object_ids_str = object_ids_str.replace(u"%", u"%%")  #sql escaping
                run_sql(db, u"update {queue_table} set finished=now() where id in ({ids})".format(
                    queue_table=queue_table, ids=object_ids_str))

            # finished is set in update_fn
            index += 1
            if single_obj_id:
                return
            else:
                self.print_update(new_loop_start_time, chunk, chunk, start_time, index)
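The CTE in this worker is the Postgres FOR UPDATE SKIP LOCKED queue pattern: each worker atomically claims a chunk of unstarted rows, and rows already locked by a concurrent worker are skipped rather than waited on. A minimal standalone sketch of the same pattern, assuming a hypothetical my_queue table (id, rand, started, finished) and a plain psycopg2 connection in place of the run_sql/db helpers used here:

import psycopg2

CLAIM_SQL = """
    WITH picked_from_queue AS (
        SELECT id
        FROM   my_queue              -- hypothetical queue table
        WHERE  started IS NULL
        ORDER  BY rand
        LIMIT  %(chunk)s
        FOR UPDATE SKIP LOCKED       -- skip rows another transaction has locked
    )
    UPDATE my_queue q
    SET    started = now()
    FROM   picked_from_queue p
    WHERE  p.id = q.id
    RETURNING q.id;
"""

def claim_chunk(conn, chunk=10):
    # atomically mark up to `chunk` unstarted rows as started and return their ids
    with conn.cursor() as cur:
        cur.execute(CLAIM_SQL, {"chunk": chunk})
        ids = [row[0] for row in cur.fetchall()]
    conn.commit()  # committing releases the row locks taken by FOR UPDATE
    return ids

# usage sketch:
# conn = psycopg2.connect("postgresql://localhost/queue_demo")
# ids = claim_chunk(conn, chunk=10)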
Example no. 4
    def process_failed_import(self):
        run_sql(
            db,
            """update doi_queue_paperbuzz_dates
               set enqueued=NULL, finished=NULL, started=NULL, dyno=NULL
               where id = '{id_date}'""".format(id_date=self.id),
        )
        raise requests.exceptions.ReadTimeout
Example no. 5
    def worker_run(self, **kwargs):
        run_class = PdfUrl

        single_url = kwargs.get("id", None)
        chunk_size = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", None)

        if limit is None:
            limit = float("inf")

        if single_url:
            objects = [run_class.query.filter(run_class.url == single_url).first()]
            check_pdf_urls(objects)
        else:
            index = 0
            num_updated = 0
            start_time = time()

            while num_updated < limit:
                new_loop_start_time = time()

                objects = self.fetch_queue_chunk(chunk_size)

                if not objects:
                    sleep(5)
                    continue

                check_pdf_urls(objects)

                object_ids = [obj.url for obj in objects]
                object_ids_str = u",".join([u"'{}'".format(oid.replace(u"'", u"''")) for oid in object_ids])
                object_ids_str = object_ids_str.replace(u"%", u"%%")  # sql escaping

                sql_command = u"""
                    update {queue_table} q
                    set
                        finished = now(),
                        started = null,
                        retry_interval = least(
                            case when is_pdf then '2 hours' else 4 * coalesce(retry_interval, '2 hours') end,
                            '2 months'
                        ),
                        retry_at = now() + case when is_pdf then '2 months' else coalesce(retry_interval, '2 hours') end
                    from
                        pdf_url
                    where
                        pdf_url.url = q.url
                        and q.url in ({ids})
                    """.format(
                        queue_table=self.table_name(None), ids=object_ids_str
                    )
                run_sql(db, sql_command)

                index += 1
                num_updated += len(objects)
                self.print_update(new_loop_start_time, chunk_size, limit, start_time, index)
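The retry_interval/retry_at arithmetic above is capped exponential backoff. Both SET expressions read the row's old retry_interval, so a URL that fails the PDF check is retried after its old interval while the stored interval quadruples, and a working PDF resets to a 2-hour interval with a recheck two months out. A small sketch of the failure schedule, approximating Postgres's calendar-aware '2 months' as 60 days:

from datetime import timedelta

TWO_HOURS = timedelta(hours=2)
TWO_MONTHS = timedelta(days=60)  # approximation of the interval '2 months' cap

def next_retry(old_interval, is_pdf):
    # mirrors the UPDATE above: both expressions see the row's *old* retry_interval
    if is_pdf:
        return TWO_HOURS, TWO_MONTHS             # reset interval; recheck in ~2 months
    old = old_interval or TWO_HOURS
    return min(4 * old, TWO_MONTHS), old         # quadruple (capped); retry after the old interval

interval = None
for attempt in range(6):
    interval, wait = next_retry(interval, is_pdf=False)
    print("attempt", attempt, "retry in", wait)
# retry in 2h, 8h, 32h, 128h, 512h, then capped near 60 days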
Example no. 6
def add_dois_to_queue_from_file(filename, job_type):
    start = time()

    command = """psql `heroku config:get DATABASE_URL`?ssl=true -c "\copy {table_name} (id) FROM '{filename}' WITH CSV DELIMITER E'|';" """.format(
        table_name=table_name(job_type), filename=filename)
    call(command, shell=True)

    q = "update {} set id=lower(id)".format(table_name(job_type))
    run_sql(db, q)

    logger.info(u"add_dois_to_queue_from_file done in {} seconds".format(elapsed(start, 1)))
    print_status(job_type)
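A usage sketch for the loader above: the input file holds one DOI per line (the E'|' delimiter just needs to be a character that never occurs in an id), and the \copy runs through psql against the Heroku DATABASE_URL. The "hybrid" job_type below is a guess at a valid argument for table_name:

# write a small input file, one DOI per line; mixed case is fine
# because the function lowercases ids after the load
with open("dois.txt", "w") as f:
    f.write("10.1000/xyz123\n")
    f.write("10.1234/ABC.456\n")

add_dois_to_queue_from_file("dois.txt", "hybrid")  # "hybrid" is a hypothetical job_type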
Example no. 8
    def worker_run(self, **kwargs):
        run_class = Pub

        single_obj_id = kwargs.get("id", None)
        chunk_size = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", None)

        if limit is None:
            limit = float("inf")

        if single_obj_id:
            single_obj_id = clean_doi(single_obj_id)
            objects = [run_class.query.filter(run_class.id == single_obj_id).first()]
            extract_pub_pdf_urls(objects)
        else:
            index = 0
            num_updated = 0
            start_time = time()

            while num_updated < limit:
                new_loop_start_time = time()

                objects = self.fetch_queue_chunk(chunk_size)

                if not objects:
                    sleep(5)
                    continue

                object_ids = [obj.id for obj in objects]
                extract_pub_pdf_urls(objects)

                object_ids_str = u",".join([u"'{}'".format(oid.replace(u"'", u"''")) for oid in object_ids])
                object_ids_str = object_ids_str.replace(u"%", u"%%")  # sql escaping

                sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                    queue_table=self.table_name(None), ids=object_ids_str
                )
                run_sql(db, sql_command)

                index += 1
                num_updated += len(objects)
                self.print_update(new_loop_start_time, chunk_size, limit, start_time, index)
Example no. 9
    def worker_run(self, **kwargs):
        run_class = PdfUrl

        single_url = kwargs.get("id", None)
        chunk_size = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", None)

        if limit is None:
            limit = float("inf")

        if single_url:
            objects = [run_class.query.filter(run_class.url == single_url).first()]
            check_pdf_urls(objects)
        else:
            index = 0
            num_updated = 0
            start_time = time()

            while num_updated < limit:
                new_loop_start_time = time()

                objects = self.fetch_queue_chunk(chunk_size)

                if not objects:
                    sleep(5)
                    continue

                check_pdf_urls(objects)

                object_ids = [obj.url for obj in objects]
                object_ids_str = u",".join([u"'{}'".format(oid.replace(u"'", u"''")) for oid in object_ids])
                object_ids_str = object_ids_str.replace(u"%", u"%%")  # sql escaping

                sql_command = u"update {queue_table} set finished=now(), started=null where url in ({ids})".format(
                    queue_table=self.table_name(None), ids=object_ids_str
                )
                run_sql(db, sql_command)

                index += 1
                num_updated += len(objects)
                self.print_update(new_loop_start_time, chunk_size, limit, start_time, index)
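The two-step escaping these workers share is worth spelling out: single quotes are doubled to form valid SQL string literals, and '%' is doubled on the assumption that run_sql passes the string through a DB-API layer that would otherwise read % as a parameter placeholder. A standalone illustration:

ids = ["10.1000/a'b", "10.1000/100%"]

# double single quotes for the SQL literal...
ids_str = ",".join("'{}'".format(i.replace("'", "''")) for i in ids)
# ...and double % so the driver doesn't treat it as a placeholder
ids_str = ids_str.replace("%", "%%")

print(ids_str)  # '10.1000/a''b','10.1000/100%%'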
Example no. 10
def add_dois_to_queue_from_query(where, job_type):
    logger.info(u"adding all dois, this may take a while")
    start = time()

    table_name = "doi_queue"

    # run_sql(db, "drop table {} cascade".format(table_name(job_type)))
    # create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started, null::text as dyno from crossref)".format(
    #     table_name(job_type))
    create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started from pub);".format(
        table_name)

    if where:
        create_table_command = create_table_command.replace(
            "from pub)", "from pub where {})".format(where))
    run_sql(db, create_table_command)

    # indexes and autovacuum settings, run one statement at a time now that the table exists
    post_create_commands = """
        alter table {table_name} alter column rand set default random();
        CREATE INDEX {table_name}_id_idx ON {table_name} USING btree (id);
        CREATE INDEX {table_name}_finished_null_rand_idx on {table_name} (rand) where finished is null;
        CREATE INDEX {table_name}_started_null_rand_idx ON {table_name} USING btree (rand, started) WHERE started is null;
        -- from https://lob.com/blog/supercharge-your-postgresql-performance
        -- vacuums and analyzes every ten million rows
        ALTER TABLE {table_name} SET (autovacuum_vacuum_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_vacuum_threshold = 10000000);
        ALTER TABLE {table_name} SET (autovacuum_analyze_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_analyze_threshold = 10000000);
        """.format(table_name=table_name)
    for command in post_create_commands.split(";"):
        run_sql(db, command)

    command = """create or replace view export_queue as
     SELECT id AS doi,
        updated AS updated,
        response_jsonb->>'evidence' AS evidence,
        response_jsonb->>'oa_status' AS oa_color,
        response_jsonb->>'free_fulltext_url' AS best_open_url,
        response_jsonb->>'year' AS year,
        response_jsonb->>'found_hybrid' AS found_hybrid,
        response_jsonb->>'found_green' AS found_green,
        response_jsonb->>'error' AS error,
        response_jsonb->>'is_boai_license' AS is_boai_license,
        replace(api->'_source'->>'journal', '
    ', '') AS journal,
        replace(api->'_source'->>'publisher', '
    ', '') AS publisher,
        api->'_source'->>'title' AS title,
        api->'_source'->>'subject' AS subject,
        response_jsonb->>'license' AS license
       FROM pub where id in (select id from {table_name})""".format(
        table_name=table_name)

    # if job_type:
    #     command_with_hybrid = command.replace("response_jsonb", "response_with_hybrid").replace("export_queue", "export_queue_with_hybrid")
    run_sql(db, command)

    # they are already lowercased
    logger.info(u"add_dois_to_queue_from_query done in {} seconds".format(
        elapsed(start, 1)))
    print_status(job_type)
Example no. 11
def daily_update():
    # get date we want to add to database
    today = datetime.today()
    new_date = (today - timedelta(days=1)).strftime("%Y-%m-%d")

    # check if date already exists, if not then add it
    ts = new_date + " 00:00:00.000000"
    existing_record = db.session.execute(
        "select * from doi_queue_paperbuzz_dates where id = :val", {"val": ts}
    )

    if existing_record.first() is None:
        q = """insert into doi_queue_paperbuzz_dates (select s as id, random() as rand, 
                false as enqueued, null::timestamp as finished, null::timestamp 
                as started, null::text as dyno FROM generate_series
                ('{start}'::timestamp, '{end}'::timestamp, '1 day'::interval) s);""".format(
            start=new_date, end=new_date
        )
        run_sql(db, q)

    # run update script
    os.system("python doi_queue.py --dates --run")
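Because start and end are the same date, the generate_series in the INSERT above emits exactly one row, so daily_update queues a single row for yesterday. A quick standalone check of that behavior, assuming the same SQLAlchemy db session used here:

from sqlalchemy import text

rows = db.session.execute(text(
    "SELECT s FROM generate_series("
    "'2020-01-01'::timestamp, '2020-01-01'::timestamp, '1 day'::interval) s"
)).fetchall()
print(rows)  # [(datetime.datetime(2020, 1, 1, 0, 0),)]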
Example no. 13
    def reset_enqueued(self, job_type):
        q = u"update {} set started=null, finished=null".format(
            self.table_name(job_type))
        run_sql(db, q)
Example no. 14
    def kick(self, job_type):
        q = u"""update {table_name} set started=null, finished=null
              where finished is null""".format(
            table_name=self.table_name(job_type))
        run_sql(db, q)
        self.print_status(job_type)
Example no. 16
def reset_enqueued(job_type):
    q = u"update {} set started=null, finished=null".format(table_name(job_type))
    run_sql(db, q)
Example no. 17
    def worker_run(self, **kwargs):
        single_obj_id = kwargs.get("id", None)
        chunk = kwargs.get("chunk", 10)
        limit = kwargs.get("limit", 10)
        queue_table = "pmh_record"
        run_class = PmhRecord
        run_method = "mint_pages"

        if single_obj_id:
            limit = 1
        else:
            if not limit:
                limit = 1000
            text_query_pattern = """WITH picked_from_queue AS (
                       SELECT *
                       --FROM   {queue_table}
                       from (select * from pmh_record where repo_id='digitallibrary.amnh.org/oai/request') s
                       WHERE  started is null
                       -- ORDER BY rand
                   LIMIT  {chunk}
                   FOR UPDATE SKIP LOCKED
                   )
                UPDATE {queue_table} queue_rows_to_update
                SET    started=now()
                FROM   picked_from_queue
                WHERE picked_from_queue.id = queue_rows_to_update.id
                RETURNING picked_from_queue.*;"""
            text_query = text_query_pattern.format(limit=limit,
                                                   chunk=chunk,
                                                   queue_table=queue_table)
            logger.info(u"the queue query is:\n{}".format(text_query))

        index = 0
        start_time = time()
        while True:
            new_loop_start_time = time()
            if single_obj_id:
                objects = [
                    run_class.query.filter(
                        run_class.id == single_obj_id).first()
                ]
            else:
                # logger.info(u"looking for new jobs")
                objects = run_class.query.from_statement(
                    text(text_query)).execution_options(autocommit=True).all()
                # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(new_loop_start_time)))

            if not objects:
                # logger.info(u"sleeping for 5 seconds, then going again")
                sleep(5)
                continue

            object_ids = [obj.id for obj in objects]
            self.update_fn(run_class, run_method, objects, index=index)

            object_ids_str = u",".join(
                [u"'{}'".format(id.replace(u"'", u"''")) for id in object_ids])
            object_ids_str = object_ids_str.replace(u"%", u"%%")  #sql escaping
            run_sql(
                db,
                u"update {queue_table} set finished=now() where id in ({ids})".
                format(queue_table=queue_table, ids=object_ids_str))

            # finished is set in update_fn
            index += 1
            if single_obj_id:
                return
            else:
                self.print_update(new_loop_start_time, chunk, limit,
                                  start_time, index)
Example no. 18
    def worker_run(self, **kwargs):
        single_obj_id = kwargs.get("id", None)
        chunk = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", 10)
        run_class = Pub
        run_method = kwargs.get("method")

        if single_obj_id:
            limit = 1
            queue_table = None
        elif run_method == "refresh":
            queue_table = "pub_refresh_queue"
            if not limit:
                limit = 1000
            text_query_pattern = """
                with refresh_queue as (
                    select id
                    from {queue_table}
                    where started is null
                    order by
                        priority desc,
                        finished nulls first,
                        started,
                        rand
                    limit {chunk}
                    for update skip locked
                )
                update {queue_table} queue_rows_to_update
                set started = now()
                from refresh_queue
                where refresh_queue.id = queue_rows_to_update.id
                returning refresh_queue.id;"""
            text_query = text_query_pattern.format(
                chunk=chunk,
                queue_table=queue_table
            )
            logger.info(u"the queue query is:\n{}".format(text_query))
        else:
            queue_table = "pub_queue"
            if not limit:
                limit = 1000
            text_query_pattern = """WITH update_pub_queue AS (
                       SELECT id
                       FROM   {queue_table}
                       WHERE  started is null
                       order by finished asc
                       nulls first
                   LIMIT  {chunk}
                   FOR UPDATE SKIP LOCKED
                   )
                UPDATE {queue_table} queue_rows_to_update
                SET    started=now()
                FROM   update_pub_queue
                WHERE update_pub_queue.id = queue_rows_to_update.id
                RETURNING update_pub_queue.id;"""
            text_query = text_query_pattern.format(
                limit=limit,
                chunk=chunk,
                queue_table=queue_table
            )
            logger.info(u"the queue query is:\n{}".format(text_query))
        index = 0
        start_time = time()
        while True:
            new_loop_start_time = time()
            if single_obj_id:
                single_obj_id = clean_doi(single_obj_id)
                objects = [run_class.query.filter(run_class.id == single_obj_id).first()]
            else:
                logger.info(u"looking for new jobs")

                job_time = time()
                row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall()
                object_ids = [row[0] for row in row_list]
                logger.info(u"got ids, took {} seconds".format(elapsed(job_time)))

                job_time = time()
                q = db.session.query(Pub).options(orm.undefer('*')).filter(Pub.id.in_(object_ids))
                objects = q.all()
                logger.info(u"got pub objects in {} seconds".format(elapsed(job_time)))

                # shuffle them, otherwise they come back sorted in doi order
                random.shuffle(objects)

                # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all()

                # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
                # id_rows =  db.engine.execute(text(text_query)).fetchall()
                # ids = [row[0] for row in id_rows]
                #
                # job_time = time()
                # objects = run_class.query.filter(run_class.id.in_(ids)).all()

                # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time)))

            if not objects:
                # logger.info(u"sleeping for 5 seconds, then going again")
                sleep(5)
                continue

            object_ids = [obj.id for obj in objects]
            self.update_fn(run_class, run_method, objects, index=index)

            # logger.info(u"finished update_fn")
            if queue_table:
                object_ids_str = u",".join([u"'{}'".format(id.replace(u"'", u"''")) for id in object_ids])
                object_ids_str = object_ids_str.replace(u"%", u"%%")  #sql escaping
                sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                    queue_table=queue_table, ids=object_ids_str)
                # logger.info(u"sql command to update finished is: {}".format(sql_command))
                run_sql(db, sql_command)
                # logger.info(u"finished run_sql")

            # finished is set in update_fn
            index += 1
            if single_obj_id:
                return
            else:
                self.print_update(new_loop_start_time, chunk, limit, start_time, index)
Example no. 19
def truncate(job_type):
    q = "truncate table {}".format(table_name(job_type))
    run_sql(db, q)
Example no. 20
def kick(job_type):
    q = u"""update {table_name} set started=null, finished=null
          where finished is null""".format(
          table_name=table_name(job_type))
    run_sql(db, q)
    print_status(job_type)
Example no. 21
    def truncate(self, job_type):
        q = "truncate table {}".format(self.table_name(job_type))
        run_sql(db, q)
Example no. 23
    def run(self, **kwargs):
        single_obj_id = kwargs.get("id", None)
        limit = kwargs.get("limit", 0)
        chunk = kwargs.get("chunk", self.chunk)
        after = kwargs.get("after", None)
        queue_table = "doi_queue"

        if single_obj_id:
            limit = 1
        else:
            if not limit:
                limit = 1000
            ## based on http://dba.stackexchange.com/a/69497
            if self.action_table == "base":
                text_query_pattern = """WITH selected AS (
                           SELECT *
                           FROM   {table}
                           WHERE  queue != '{queue_name}' and {where}
                       LIMIT  {chunk}
                       FOR UPDATE SKIP LOCKED
                       )
                    UPDATE {table} records_to_update
                    SET    queue='{queue_name}'
                    FROM   selected
                    WHERE selected.id = records_to_update.id
                    RETURNING records_to_update.id;"""
                text_query = text_query_pattern.format(
                    table=self.action_table,
                    where=self.where,
                    chunk=chunk,
                    queue_name=self.queue_name)
            else:
                my_dyno_name = os.getenv("DYNO", "unknown")
                if kwargs.get("hybrid", False) or "hybrid" in my_dyno_name:
                    queue_table += "_with_hybrid"
                elif kwargs.get("dates", False) or "dates" in my_dyno_name:
                    queue_table += "_dates"

                text_query_pattern = """WITH picked_from_queue AS (
                           SELECT *
                           FROM   {queue_table}
                           WHERE  started is null
                           ORDER BY rand
                       LIMIT  {chunk}
                       FOR UPDATE SKIP LOCKED
                       )
                    UPDATE {queue_table} doi_queue_rows_to_update
                    SET    started=now()
                    FROM   picked_from_queue
                    WHERE picked_from_queue.id = doi_queue_rows_to_update.id
                    RETURNING doi_queue_rows_to_update.id;"""
                text_query = text_query_pattern.format(
                    chunk=chunk,
                    queue_table=queue_table
                )
            logger.info(u"the queue query is:\n{}".format(text_query))

        index = 0

        start_time = time()
        while True:
            new_loop_start_time = time()
            if single_obj_id:
                object_ids = [single_obj_id]
            else:
                # logger.info(u"looking for new jobs")
                row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall()
                object_ids = [row[0] for row in row_list]
                # logger.info(u"finished get-new-ids query in {} seconds".format(elapsed(new_loop_start_time)))

            if not object_ids:
                # logger.info(u"sleeping for 5 seconds, then going again")
                sleep(5)
                continue

            update_fn_args = [self.cls, self.method, object_ids]

            shortcut_data = None
            if self.shortcut_fn_per_chunk:
                shortcut_data_start = time()
                logger.info(u"Getting shortcut data...")
                shortcut_data = self.shortcut_fn_per_chunk()
                logger.info(u"Got shortcut data in {} seconds".format(
                    elapsed(shortcut_data_start)))

            update_fn(*update_fn_args, index=index, shortcut_data=shortcut_data)

            try:
                ids_escaped = [id.replace(u"'", u"''") for id in object_ids]
            except TypeError:
                ids_escaped = object_ids
            object_ids_str = u",".join([u"'{}'".format(id) for id in ids_escaped])
            object_ids_str = object_ids_str.replace(u"%", u"%%")  #sql escaping
            run_sql(db, u"update {queue_table} set finished=now() where id in ({ids})".format(
                queue_table=queue_table, ids=object_ids_str))

            index += 1

            if single_obj_id:
                return
            else:
                num_items = limit  # assume we have to process the full limit
                num_jobs_remaining = num_items - (index * chunk)
                try:
                    jobs_per_hour_this_chunk = chunk / float(elapsed(new_loop_start_time) / 3600)
                    predicted_mins_to_finish = round(
                        (num_jobs_remaining / float(jobs_per_hour_this_chunk)) * 60,
                        1
                    )
                    logger.info(u"\n\nWe're doing {} jobs per hour. At this rate, if we had to do everything up to limit, done in {}min".format(
                        int(jobs_per_hour_this_chunk),
                        predicted_mins_to_finish
                    ))
                    logger.info(u"\t{} seconds this loop, {} chunks in {} seconds, {} seconds/chunk average\n".format(
                        elapsed(new_loop_start_time),
                        index,
                        elapsed(start_time),
                        round(elapsed(start_time)/float(index), 1)
                    ))
                except ZeroDivisionError:
                    # logger.info(u"not printing status because divide by zero")
                    logger.info(u"."),