Example No. 1
# datetime is from the standard library; get_api_for_one_doi, build_new_pub,
# and add_new_pubs are helpers defined elsewhere in this app
import datetime


def add_pubs_from_dois(dois):
    new_pubs = []
    for doi in dois:
        crossref_api = get_api_for_one_doi(doi)
        new_pub = build_new_pub(doi, crossref_api)

        # hack: backdate "updated" so the refresh job picks this pub up soon
        new_pub.updated = datetime.datetime(1042, 1, 1)

        new_pubs.append(new_pub)

    added_pubs = add_new_pubs(new_pubs)
    return added_pubs
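
This helper fetches Crossref metadata for each DOI and stores the results as new pub records. A minimal usage sketch follows, assuming the app-level helpers above are importable; the DOIs here are arbitrary placeholders, not real records:

# usage sketch: placeholder DOIs, not real records
dois_to_add = [
    "10.1234/example.doi.1",
    "10.1234/example.doi.2",
]
added = add_pubs_from_dois(dois_to_add)
print("added {} pubs".format(len(added)))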
Example No. 2
# stdlib and third-party imports used below; clean_doi, build_new_pub,
# add_new_pubs, elapsed, and logger are helpers defined elsewhere in this app
import datetime
from time import time
from urllib import quote  # Python 2; on Python 3 use urllib.parse.quote

import requests


def get_new_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000):
    # Crossref's "polite pool" needs a mailto in the User-Agent,
    # see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
    root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    # but if you want all changes, use "indexed" rather than "created", as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}"
    # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    while has_more_responses:

        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # the query is much faster when no last date is specified, even one far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            resp = None
            # stop paging rather than retrying the same URL forever
            has_more_responses = False

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack: backdate "updated" so the refresh job picks this pub up soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                if len(pubs_this_chunk) >= 100:
                    added_pubs = add_new_pubs(pubs_this_chunk)
                    logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    # if added_pubs:
                    #     id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in added_pubs[0:5]]
                    #     logger.info(u"last few ids were {}".format(id_links))

                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = add_new_pubs(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
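
For reference, here is the Crossref deep-paging pattern the function above implements, isolated into a standalone sketch: start with cursor=* and keep passing back message["next-cursor"] until a page comes back empty. Only requests is used; the date and mailto values are placeholders:

import requests


def iter_crossref_works(from_created_date, mailto, rows=100):
    # yields one Crossref work dict at a time, following the deep-paging cursor
    headers = {"Accept": "application/json",
               "User-Agent": "mailto:{}".format(mailto)}
    cursor = "*"
    while True:
        resp = requests.get(
            "https://api.crossref.org/works",
            params={
                "filter": "from-created-date:{}".format(from_created_date),
                "rows": rows,
                "cursor": cursor,
            },
            headers=headers,
        )
        resp.raise_for_status()
        message = resp.json()["message"]
        if not message["items"]:
            return
        for item in message["items"]:
            yield item
        cursor = message.get("next-cursor")
        if not cursor:
            return

# e.g. for work in iter_crossref_works("2016-04-01", "[email protected]"): ...

Passing the cursor through params lets requests handle the URL encoding, which the original function does by hand with quote().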