def reported_noncompliant_url_fragments(dirty_doi):
    if not dirty_doi:
        return []

    lookup_normalized = {}
    for (doi_key, fragment_list) in lookup_raw.iteritems():
        lookup_normalized[clean_doi(doi_key)] = [noncompliant_url_fragment.lower() for noncompliant_url_fragment in fragment_list]

    return lookup_normalized.get(clean_doi(dirty_doi), [])
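All of these snippets lean on a clean_doi helper (one of them imports it from the project's util module). Its implementation is not shown here, so as orientation only, here is a minimal sketch of what such a normalizer might look like; the regex, the return_none_if_error behavior, and the exception name are assumptions inferred from how the callers use it, not the project's actual code.

import re

class NoDoiException(Exception):
    pass

def clean_doi_sketch(dirty_doi, return_none_if_error=False):
    # Illustrative stand-in, not the real util.clean_doi: pull a "10.xxxx/..."
    # identifier out of arbitrary input and lowercase it.
    match = None
    if dirty_doi:
        match = re.search(r"(10\.\d{4,9}/\S+)", dirty_doi.strip().lower())
    if not match:
        if return_none_if_error:
            return None
        raise NoDoiException(u"no valid DOI found in {!r}".format(dirty_doi))
    return match.group(1)

# e.g. clean_doi_sketch("https://doi.org/10.1234/ABC.DEF") -> "10.1234/abc.def"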
Example no. 3
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    clean_dois = [
        c for c in
        [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list] if c
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(clean_dois))
    rows = q.all()

    pub_responses = [row[0] for row in rows if row[0]]

    pub_dois = [r['doi'] for r in pub_responses]
    missing_dois = [
        d for d in dirty_dois_list
        if clean_doi(d, return_none_if_error=True) not in pub_dois
    ]
    placeholder_responses = [
        pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois
    ]

    responses = pub_responses + placeholder_responses

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")

    # save csv
    csv_dicts = [
        pub.csv_dict_from_response_dict(my_dict) for my_dict in responses
    ]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f,
                                       fieldnames=fieldnames,
                                       dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address, "Your Unpaywall results",
                         "simple_query_tool", {"profile": {}},
                         ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
Example no. 4
    def set_content_url(self, input):
        has_doi = False
        if input.startswith("10."):
            has_doi = True
        elif self.content_url:
            if self.content_url.startswith(
                    "http") and "doi.org/10." in self.content_url:
                has_doi = True
                return
        elif input.startswith("http") and "doi.org/10." in input:
            has_doi = True
        elif self.extract_doi(input):
            has_doi = True

        if not has_doi:
            return

        input = self.extract_doi(input)

        # print "has_doi", has_doi, input[0:10]

        try:
            doi = clean_doi(input)
        except Exception:
            print("no doi found for {}".format(input))
            return

        doi_url = "https://doi.org/{}".format(doi)
        self.content_url = doi_url
Example no. 5
    def is_bronze(self):
        if self.display_evidence == 'open (via free pdf)':
            return True

        if is_doi_url(self.best_url):
            return clean_doi(self.best_url) == self.doi and not (self.is_gold or self.is_hybrid)

        return False
Example no. 6
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    clean_dois = [c for c in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list] if c]

    q = db.session.query(pub.Pub.response_jsonb).filter(pub.Pub.id.in_(clean_dois))
    rows = q.all()

    pub_responses = [row[0] for row in rows]

    pub_dois = [r['doi'] for r in pub_responses]
    missing_dois = [d for d in dirty_dois_list if clean_doi(d, return_none_if_error=True) not in pub_dois]
    placeholder_responses = [pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois]

    responses = pub_responses + placeholder_responses

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")


    # save csv
    csv_dicts = [pub.csv_dict_from_response_dict(my_dict) for my_dict in responses]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f, fieldnames=fieldnames, dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address,
                 "Your Unpaywall results",
                 "simple_query_tool",
                 {"profile": {}},
                 ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
Example no. 7
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns == "DOI":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:
        # try url
        try:
            id_string = str(orcid_product_dict['url']['value'].encode('utf-8')).lower()
            if is_doi_url(id_string):
                doi = clean_doi(id_string)  # throws error unless valid DOI
        except (TypeError, NoDoiException):
            doi = None
    return doi
Example no. 8
    def populate(self, pmh_input_record):
        self.updated = datetime.datetime.utcnow().isoformat()
        self.id = pmh_input_record.header.identifier
        self.api_raw = pmh_input_record.raw
        self.record_timestamp = pmh_input_record.header.datestamp
        self.title = oai_tag_match("title", pmh_input_record)
        self.authors = oai_tag_match("creator",
                                     pmh_input_record,
                                     return_list=True)
        self.relations = oai_tag_match("relation",
                                       pmh_input_record,
                                       return_list=True)
        self.oa = oai_tag_match("oa", pmh_input_record)
        self.license = oai_tag_match("rights", pmh_input_record)
        self.sources = oai_tag_match("collname",
                                     pmh_input_record,
                                     return_list=True)
        identifier_matches = oai_tag_match("identifier",
                                           pmh_input_record,
                                           return_list=True)
        self.urls = self.get_good_urls(identifier_matches)
        if not self.urls:
            self.urls = self.get_good_urls(self.relations)

        possible_dois = []

        if self.relations:
            possible_dois += [
                s for s in self.relations if s and '/*ref*/' not in s
            ]
        if identifier_matches:
            possible_dois += [s for s in identifier_matches if s]

        if possible_dois:
            for possible_doi in possible_dois:
                if (is_doi_url(possible_doi)
                        or possible_doi.startswith(u"doi:")
                        or re.findall(ur"10\.\d", possible_doi)):
                    try:
                        doi_candidate = clean_doi(possible_doi)

                        skip_these_doi_snippets = [
                            u'10.17605/osf.io',
                            u'10.14279/depositonce',
                            u'/(issn)',
                            u'10.17169/refubium',
                        ]
                        for doi_snippet in skip_these_doi_snippets:
                            if doi_snippet in doi_candidate:
                                doi_candidate = None
                                break

                        if doi_candidate:
                            self.doi = doi_candidate
                    except NoDoiException:
                        pass
Example no. 9
def post_gs_cache(**kwargs):
    my_doi = clean_doi(kwargs["doi"])
    q = Gs.query.filter(Gs.doi == my_doi,
                        Gs.landing_page_url == kwargs["landing_page_url"])
    my_gs = q.first()
    if not my_gs:
        my_gs = Gs(**kwargs)
        db.session.add(my_gs)
        safe_commit(db)
    return my_gs
Example no. 10
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns == "DOI":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:
        # try url
        try:
            id_string = str(
                orcid_product_dict['url']['value'].encode('utf-8')).lower()
            if is_doi_url(id_string):
                doi = clean_doi(id_string)  # throws error unless valid DOI
        except (TypeError, NoDoiException):
            doi = None
    return doi
Example no. 11
def get_doi_endpoint(doi):
    my_doi = Doi(clean_doi(doi))
    if my_doi.is_cached_not_expired():
        # responses with many events are cached in the database
        response = my_doi.cached_response()
    else:
        my_doi.get()
        response = my_doi.to_dict()
        my_doi.save_to_cache(response)
    return jsonify(response)
Example no. 12
    def is_bronze(self):
        if self.best_url and not (self.is_gold
                                  or self.is_green) and not self.has_license:
            return True

        if is_doi_url(self.best_url):
            return (clean_doi(self.best_url, return_none_if_error=True)
                    == self.doi and not (self.is_gold or self.is_hybrid))

        return False
Example no. 13
def get_gs_cache(dirty_doi):
    my_doi = clean_doi(dirty_doi)

    # return the best one we've got, so null urls are last
    my_gs = Gs.query.filter(Gs.doi == my_doi).order_by(
        Gs.landing_page_url.desc().nullslast()).first()

    # if my_gs:
    #     my_gs.num_hits +=1
    #     safe_commit(db)
    return my_gs
Example no. 14
    def is_hybrid(self):
        # import pdb; pdb.set_trace()

        if self.display_evidence and self.display_evidence.startswith("open"):
            return True

        if is_doi_url(self.best_url):
            if self.is_gold:
                return False
            if clean_doi(self.best_url) == self.doi:
                return True
        return False
Example no. 15
def get_doi_from_biblio_dict(orcid_product_dict):
    doi = None
    for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
        if ns.lower() == "doi":
            try:
                doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass

    if not doi:

        # try url
        for (ns, nid) in get_identifiers_from_biblio_dict(orcid_product_dict):
            try:
                if is_doi_url(nid):
                    doi = clean_doi(nid)  # throws error unless valid DOI
            except (TypeError, NoDoiException):
                pass


    return doi
Example no. 16
    def reset_vars(self):
        if self.id and self.id.startswith("10."):
            self.id = clean_doi(self.id)

        self.license = None
        self.free_metadata_url = None
        self.free_pdf_url = None
        self.fulltext_url = None
        self.oa_color = None
        self.evidence = None
        self.open_locations = []
        self.closed_urls = []
        self.session_id = None
        self.version = None
Example no. 17
def lookup_product(**biblio):
    my_pub = None
    if "doi" in biblio and biblio["doi"]:
        doi = clean_doi(biblio["doi"])
        my_pub = Pub.query.get(doi)
        if my_pub:
            logger.info(u"found {} in pub db table!".format(my_pub.id))
            my_pub.reset_vars()
        else:
            raise NoDoiException
        #     my_pub = Crossref(**biblio)
        #     logger.info(u"didn't find {} in crossref db table".format(my_pub))

    return my_pub
Example no. 18
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    if first:
        base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    number_added = 0

    while has_more_responses:
        has_more_responses = False

        start_time = time()
        url = base_url.format(
            first=first,
            last=last,
            rows=chunk_size,
            next_cursor=next_cursor)
        logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds.  url: {}".format(elapsed(start_time, 2), url))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return number_added

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)
            if resp_data["items"] and len(resp_data["items"]) == chunk_size:
                has_more_responses = True

        dois_from_api = [clean_doi(api_raw["DOI"]) for api_raw in resp_data["items"]]
        added_pubs = add_new_pubs_from_dois(dois_from_api)
        if dois_from_api:
            logger.info(u"got {} dois from api".format(len(dois_from_api)))
        if added_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(
                first, len(added_pubs), added_pubs[-2:]))

        number_added += len(added_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2)))

    return number_added
Example no. 19
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers={"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    if first:
        base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    number_added = 0

    while has_more_responses:
        has_more_responses = False

        start_time = time()
        url = base_url.format(
            first=first,
            last=last,
            rows=chunk_size,
            next_cursor=next_cursor)
        logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds.  url: {}".format(elapsed(start_time, 2), url))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return number_added

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)
            if resp_data["items"] and len(resp_data["items"]) == chunk_size:
                has_more_responses = True

        dois_from_api = [clean_doi(api_raw["DOI"]) for api_raw in resp_data["items"]]
        added_pubs = add_new_pubs_from_dois(dois_from_api)
        if dois_from_api:
            logger.info(u"got {} dois from api".format(len(dois_from_api)))
        if added_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(
                first, len(added_pubs), added_pubs[-2:]))

        number_added += len(added_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2)))

    return number_added
Example no. 20
    def strip_junk_from_end_of_doi(self, doi):
        doi = re.sub(r"\s+", "", doi)
        if '">' in doi:
            doi = doi.split('">')[0]
        if "</a>" in doi:
            doi = doi.split("</a>")[0]
        doi = doi.strip(",")  # has to be first, because comma would be last item on line
        doi = doi.strip(".")  # has to be near first, because period would be last item on line
        doi = doi.strip("'")
        doi = doi.strip('"')
        doi = doi.strip("}")
        doi = clean_doi(doi).lower()
        return doi
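For context, a hypothetical before/after for the helper above; the input value is invented for illustration, and the final form assumes clean_doi lowercases the identifier and keeps only the DOI itself.

raw = '10.1234/ABC.DEF123</a>., '
# strip_junk_from_end_of_doi would remove the whitespace, cut at the '</a>'
# markup (which also drops the trailing '., '), then hand the remainder to
# clean_doi, ending up with something like '10.1234/abc.def123'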
Example no. 21
def simple_query_tool():
    body = request.json
    return_type = body.get("return_type", "csv")
    dirty_dois_list = body["dois"]

    clean_dois = [
        clean_doi(dirty_doi, return_none_if_error=True)
        for dirty_doi in dirty_dois_list
    ]
    clean_dois = [doi for doi in clean_dois if doi]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(clean_dois))
    rows = q.all()
    pub_responses = [row[0] for row in rows]

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in pub_responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")

    # save csv
    csv_dicts = [
        pub.csv_dict_from_response_dict(my_dict) for my_dict in pub_responses
    ]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f,
                                       fieldnames=fieldnames,
                                       dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address, "Your Unpaywall results",
                         "simple_query_tool", {"profile": {}},
                         ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    # @todo make sure in the return dict that there is a row for every doi
    # even those not in our db
    return jsonify({"got it": email_address, "dois": clean_dois})
Example no. 22
def run_update(parsed_args):
    update = update_registry.get(parsed_args.fn)

    start = time()

    # convenience method for handling a doi
    if parsed_args.doi:
        from pub import Pub
        from util import clean_doi

        my_pub = db.session.query(Pub).filter(Pub.id==clean_doi(parsed_args.doi)).first()
        parsed_args.id = my_pub.id
        logger.info(u"Got database hit for this doi: {}".format(my_pub.id))

    update.run(**vars(parsed_args))

    db.session.remove()
    logger.info(u"finished update in {} secconds".format(elapsed(start)))
Example no. 23
    def __init__(self, **kwargs):
        self.request_kwargs = kwargs
        self.base_dcoa = None
        self.repo_urls = {"urls": []}
        self.license_string = ""

        self.id = shortuuid.uuid()[0:10]
        self.created = datetime.datetime.utcnow()
        self.updated = datetime.datetime.utcnow()

        for (k, v) in kwargs.iteritems():
            if v:
                value = v.strip()
                setattr(self, k, value)

        if self.doi:
            self.doi = clean_doi(self.doi)
            self.url = u"http://doi.org/{}".format(self.doi)
Example no. 24
    def populate(self, pmh_input_record):
        self.updated = datetime.datetime.utcnow().isoformat()
        self.id = pmh_input_record.header.identifier
        self.api_raw = pmh_input_record.raw
        self.record_timestamp = pmh_input_record.header.datestamp
        self.title = oai_tag_match("title", pmh_input_record)
        self.authors = oai_tag_match("creator",
                                     pmh_input_record,
                                     return_list=True)
        self.relations = oai_tag_match("relation",
                                       pmh_input_record,
                                       return_list=True)
        self.oa = oai_tag_match("oa", pmh_input_record)
        self.license = oai_tag_match("rights", pmh_input_record)
        self.sources = oai_tag_match("collname",
                                     pmh_input_record,
                                     return_list=True)
        identifier_matches = oai_tag_match("identifier",
                                           pmh_input_record,
                                           return_list=True)
        self.urls = self.get_good_urls(identifier_matches)
        if not self.urls:
            self.urls = self.get_good_urls(self.relations)

        possible_dois = []
        if identifier_matches:
            possible_dois += [s for s in identifier_matches if s]
        if self.relations:
            possible_dois += [s for s in self.relations if s]
        if possible_dois:
            for possible_doi in possible_dois:
                if (is_doi_url(possible_doi)
                        or possible_doi.startswith(u"doi:")
                        or re.findall(u"10\./d", possible_doi)):
                    try:
                        self.doi = clean_doi(possible_doi)
                        dont_use_these_doi_snippets = [u"10.17605/osf.io"]
                        for doi_snippet in dont_use_these_doi_snippets:
                            if self.doi and doi_snippet in self.doi:
                                self.doi = None
                    except NoDoiException:
                        pass

        self.doi = self._doi_override_by_id().get(self.id, self.doi)
Example no. 25
def run(parsed_args, job_type):
    start = time()
    if job_type in ("normal", "hybrid"):
        update = update_registry.get("WeeklyStats." + process_name(job_type))
        if parsed_args.doi:
            parsed_args.id = clean_doi(parsed_args.doi)
            parsed_args.doi = None
    else:
        update = update_registry.get("DateRange.get_events")

    update.run(**vars(parsed_args))

    logger.info("finished update in {} seconds".format(elapsed(start)))

    if job_type in ("normal", "hybrid"):
        from event import CedEvent

        my_event = CedEvent.query.get(parsed_args.id)
        pprint(my_event)
Example no. 26
def run_update(parsed_args):
    update = update_registry.get(parsed_args.fn)

    start = time()

    # convenience method for handling a doi
    if parsed_args.doi:
        from pub import Pub
        from util import clean_doi

        my_pub = db.session.query(Pub).filter(
            Pub.id == clean_doi(parsed_args.doi)).first()
        parsed_args.id = my_pub.id
        logger.info(u"Got database hit for this doi: {}".format(my_pub.id))

    update.run(**vars(parsed_args))

    db.session.remove()
    logger.info(u"finished update in {} secconds".format(elapsed(start)))
Example no. 27
    def worker_run(self, **kwargs):
        run_class = Pub

        single_obj_id = kwargs.get("id", None)
        chunk_size = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", None)

        if limit is None:
            limit = float("inf")

        if single_obj_id:
            single_obj_id = clean_doi(single_obj_id)
            objects = [run_class.query.filter(run_class.id == single_obj_id).first()]
            extract_pub_pdf_urls(objects)
        else:
            index = 0
            num_updated = 0
            start_time = time()

            while num_updated < limit:
                new_loop_start_time = time()

                objects = self.fetch_queue_chunk(chunk_size)

                if not objects:
                    sleep(5)
                    continue

                object_ids = [obj.id for obj in objects]
                extract_pub_pdf_urls(objects)

                object_ids_str = u",".join([u"'{}'".format(oid.replace(u"'", u"''")) for oid in object_ids])
                object_ids_str = object_ids_str.replace(u"%", u"%%")  # sql escaping

                sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                    queue_table=self.table_name(None), ids=object_ids_str
                )
                run_sql(db, sql_command)

                index += 1
                num_updated += len(objects)
                self.print_update(new_loop_start_time, chunk_size, limit, start_time, index)
Example no. 28
def run(parsed_args, job_type):
    start = time()
    if job_type in ("normal", "hybrid"):
        update = update_registry.get("Pub."+process_name(job_type))
        if parsed_args.doi:
            parsed_args.id = clean_doi(parsed_args.doi)
            parsed_args.doi = None
    else:
        update = update_registry.get("DateRange.get_unpaywall_events")
        # update = update_registry.get("DateRange.get_pmh_events")

    update.run(**vars(parsed_args))

    logger.info(u"finished update in {} seconds".format(elapsed(start)))

    resp = None
    if job_type in ("normal", "hybrid"):
        my_pub = Pub.query.get(parsed_args.id)
        resp = my_pub.response_jsonb
        pprint(resp)

    return resp
Example no. 29
def get_pub_by_doi(my_doi):
    my_clean_doi = clean_doi(my_doi)
    # print my_clean_doi

    query = db.session.query(PubDoi).filter(
        PubDoi.doi == my_clean_doi).options(orm.undefer_group('full'))
    # print query
    my_pub = query.first()
    # print my_pub
    if not my_pub:
        abort_json(
            404, u"'{}' is an invalid doi.  See https://doi.org/{}".format(
                my_clean_doi, my_clean_doi))

    my_pub_list = PubList(pubs=[my_pub])
    my_pub_list.set_dandelions()
    my_pub_list.set_pictures()
    results = my_pub_list.to_dict_serp_list()
    return jsonify({
        "results": my_pub_list.to_dict_serp_list(),
        "annotations": my_pub_list.to_dict_annotation_metadata(),
    })
Example no. 30
def run(parsed_args, job_type):
    start = time()
    if job_type in ("normal", "hybrid"):
        update = update_registry.get("Pub." + process_name(job_type))
        if parsed_args.doi:
            parsed_args.id = clean_doi(parsed_args.doi)
            parsed_args.doi = None
    else:
        update = update_registry.get("DateRange.get_unpaywall_events")
        # update = update_registry.get("DateRange.get_pmh_events")

    update.run(**vars(parsed_args))

    logger.info(u"finished update in {} seconds".format(elapsed(start)))

    resp = None
    if job_type in ("normal", "hybrid"):
        my_pub = Pub.query.get(parsed_args.id)
        resp = my_pub.response_jsonb
        pprint(resp)

    return resp
Example no. 31
    def worker_run(self, **kwargs):
        single_obj_id = kwargs.get("id", None)
        chunk = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", 10)
        run_class = Pub
        run_method = kwargs.get("method")

        if single_obj_id:
            limit = 1
            queue_table = None
        elif run_method == "refresh":
            queue_table = "pub_refresh_queue"
            if not limit:
                limit = 1000
            text_query_pattern = """
                with refresh_queue as (
                    select id
                    from {queue_table}
                    where started is null
                    order by
                        priority desc,
                        finished nulls first,
                        started,
                        rand
                    limit {chunk}
                    for update skip locked
                )
                update {queue_table} queue_rows_to_update
                set started = now()
                from refresh_queue
                where refresh_queue.id = queue_rows_to_update.id
                returning refresh_queue.id;"""
            text_query = text_query_pattern.format(
                chunk=chunk,
                queue_table=queue_table
            )
            logger.info(u"the queue query is:\n{}".format(text_query))
        else:
            queue_table = "pub_queue"
            if not limit:
                limit = 1000
            text_query_pattern = """WITH update_pub_queue AS (
                       SELECT id
                       FROM   {queue_table}
                       WHERE  started is null
                       order by finished asc
                       nulls first
                   LIMIT  {chunk}
                   FOR UPDATE SKIP LOCKED
                   )
                UPDATE {queue_table} queue_rows_to_update
                SET    started=now()
                FROM   update_pub_queue
                WHERE update_pub_queue.id = queue_rows_to_update.id
                RETURNING update_pub_queue.id;"""
            text_query = text_query_pattern.format(
                limit=limit,
                chunk=chunk,
                queue_table=queue_table
            )
            logger.info(u"the queue query is:\n{}".format(text_query))
        index = 0
        start_time = time()
        while True:
            new_loop_start_time = time()
            if single_obj_id:
                single_obj_id = clean_doi(single_obj_id)
                objects = [run_class.query.filter(run_class.id == single_obj_id).first()]
            else:
                logger.info(u"looking for new jobs")

                job_time = time()
                row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall()
                object_ids = [row[0] for row in row_list]
                logger.info(u"got ids, took {} seconds".format(elapsed(job_time)))

                job_time = time()
                q = db.session.query(Pub).options(orm.undefer('*')).filter(Pub.id.in_(object_ids))
                objects = q.all()
                logger.info(u"got pub objects in {} seconds".format(elapsed(job_time)))

                # shuffle them or they sort by doi order
                random.shuffle(objects)

                # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all()

                # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
                # id_rows =  db.engine.execute(text(text_query)).fetchall()
                # ids = [row[0] for row in id_rows]
                #
                # job_time = time()
                # objects = run_class.query.filter(run_class.id.in_(ids)).all()

                # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time)))

            if not objects:
                # logger.info(u"sleeping for 5 seconds, then going again")
                sleep(5)
                continue

            object_ids = [obj.id for obj in objects]
            self.update_fn(run_class, run_method, objects, index=index)

            # logger.info(u"finished update_fn")
            if queue_table:
                object_ids_str = u",".join([u"'{}'".format(id.replace(u"'", u"''")) for id in object_ids])
                object_ids_str = object_ids_str.replace(u"%", u"%%")  #sql escaping
                sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                    queue_table=queue_table, ids=object_ids_str)
                # logger.info(u"sql command to update finished is: {}".format(sql_command))
                run_sql(db, sql_command)
                # logger.info(u"finished run_sql")

            # finished is set in update_fn
            index += 1
            if single_obj_id:
                return
            else:
                self.print_update(new_loop_start_time, chunk, limit, start_time, index)
Example no. 32
def run_through_dois(filename=None, reverse=None, loggly=False):
    total_start = time()
    i = 0
    output_dicts = []
    fh = open(filename, "r")

    lines = fh.readlines()

    if reverse:
        logger.info(u"reverse!")
        lines.reverse()
        i = -1 * len(lines)

    dois = []
    for line in lines:
        dois.append(line.strip())

        # line = line.replace('"', '')
        # if u"," in line:
        #     split_line = line.split(",")
        #     if loggly:
        #         dois.append(split_line[1])
        #     else:
        #         dois.append(split_line[0])
        # else:
        #     dois.append(line.strip())

    # deduplicate, preserving order
    duplicated_dois = dois
    dois = []
    for doi in duplicated_dois:
        if doi not in dois:
            dois.append(doi)

    logger.info(u"length of deduped doi list: {}".format(len(dois)))

    for doi in dois:

        try:
            my_doi = clean_doi(doi)
        except NoDoiException:
            logger.info(u"bad doi: {}".format(doi))
            continue

        if not my_doi:
            logger.info(u"bad doi: {}".format(doi))
            continue

        my_pub = Oab.query.get(my_doi)
        if not my_pub:
            my_pub = Oab()
            db.session.add(my_pub)
        my_pub.id = my_doi
        my_doi_url = "http://doi.org/{}".format(my_doi)
        my_doi_url_encoded = urllib.quote_plus(my_doi_url)
        api_url = "https://api.openaccessbutton.org/availability?url={}".format(my_doi_url_encoded)
        headers = {"content-type": "application/json"}
        r = requests.get(api_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success with oab! with {}".format(my_doi))
            # logger.info(r.json())
            my_pub.api = r.json()
            flag_modified(my_pub, "api")
        else:
            logger.info(u"problem with oab, status_code {}".format(r.status_code))

        dissemin_url = "http://dissem.in/api/{}".format(my_doi)
        r = requests.get(dissemin_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success! with dissemin! with {}".format(my_doi))
            # logger.info(r.json())
            my_pub.dissemin = r.json()
            flag_modified(my_pub, "dissemin")
        else:
            logger.info(u"problem with dissemin, status_code {}".format(r.status_code))

        safe_commit(db)
        i += 1

    logger.info(u"finished {} in {} seconds".format(i, elapsed(total_start, 2)))

    fh.close()
Example no. 33
    def save_new_dois(self, rows=1000):
        headers = {
            "Accept": "application/json",
            "User-Agent": "impactstory.org"
        }
        base_url_with_last = "http://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&cursor={next_cursor}"
        # but if want all changes, use "indexed" not "created" as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates

        next_cursor = "*"
        has_more_responses = True
        num_so_far = 0
        num_between_commits = 0

        while has_more_responses:
            start_time = time()
            url = base_url_with_last.format(first=self.first_day,
                                            last=self.last_day,
                                            rows=rows,
                                            next_cursor=next_cursor)
            # logger.info(u"calling url: {}".format(url))

            resp = requests.get(url, headers=headers)
            logger.info(u"getting crossref response took {} seconds".format(
                elapsed(start_time, 2)))
            if resp.status_code != 200:
                logger.info(u"error in crossref call, status_code = {}".format(
                    resp.status_code))
                return

            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                doi = clean_doi(api_raw["DOI"])
                my_pub = Pub(id=doi, crossref_api_raw_new=api_raw)
                # my_pub.title = my_pub.crossref_title
                # my_pub.normalized_title = normalize_title(my_pub.title)

                # my_pub.update()
                db.session.merge(my_pub)
                num_between_commits += 1
                num_so_far += 1

                if num_between_commits > 100:
                    # logger.info(u"committing")
                    start_commit = time()
                    safe_commit(db)
                    logger.info(u"committing done in {} seconds".format(
                        elapsed(start_commit, 2)))
                    num_between_commits = 0

            # logger.info(u"at bottom of loop, got {} records".format(len(resp_data["items"])))

        # make sure to get the last ones
        logger.info(u"done everything, saving last ones")
        safe_commit(db)
        return num_so_far
Example no. 34
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    clean_dois = [c for c in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list] if c]

    q = db.session.query(pub.Pub.response_jsonb).filter(pub.Pub.id.in_(clean_dois))
    rows = q.all()

    pub_responses = [row[0] for row in rows if row[0]]

    pub_dois = [r['doi'] for r in pub_responses]
    missing_dois = [d for d in dirty_dois_list if clean_doi(d, return_none_if_error=True) not in pub_dois]
    placeholder_responses = [pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois]

    responses = pub_responses + placeholder_responses

    formats = body.get("formats", []) or ["jsonl", "csv"]
    files = []

    if "jsonl" in formats:
        # save jsonl
        with open("output.jsonl", 'wb') as f:
            for response_jsonb in responses:
                f.write(json.dumps(response_jsonb, sort_keys=True))
                f.write("\n")
        files.append("output.jsonl")

    csv_dicts = [pub.csv_dict_from_response_dict(my_dict) for my_dict in responses]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]

    if "csv" in formats:
        # save csv
        with open("output.csv", 'wb') as f:
            writer = unicodecsv.DictWriter(f, fieldnames=fieldnames, dialect='excel')
            writer.writeheader()
            for my_dict in csv_dicts:
                writer.writerow(my_dict)
        files.append("output.csv")

    if "xlsx" in formats:
        book = Workbook()
        sheet = book.worksheets[0]
        sheet.title = "results"

        for col_idx, field_name in enumerate(fieldnames):
            sheet.cell(column=col_idx+1, row=1, value=field_name)

        for row_idx, row in enumerate(csv_dicts):
            for col_idx, field_name in enumerate(fieldnames):
                sheet.cell(column=col_idx+1, row=row_idx+2, value=row[field_name])

        book.save(filename="output.xlsx")
        files.append("output.xlsx")

    # prep email
    email_address = body["email"]
    email = create_email(email_address,
                 "Your Unpaywall results",
                 "simple_query_tool",
                 {"profile": {}},
                 files)
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
Example no. 35
def get_new_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers={"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
    root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    # but if want all changes, use "indexed" not "created" as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}"
    # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    while has_more_responses:

        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # query is much faster if don't have a last specified, even if it is far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                if len(pubs_this_chunk) >= 100:
                    added_pubs = add_new_pubs(pubs_this_chunk)
                    logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    # if new_pubs:
                    #     id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in new_pubs[0:5]]
                    #     logger.info(u"last few ids were {}".format(id_links))

                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = add_new_pubs(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
Example no. 36
def fulltext_search_title(original_query, query_entities, oa_only, full=True):

    start_time = time()
    original_query_escaped = original_query.replace("'", "''")
    original_query_with_ands = ' & '.join(original_query_escaped.split(" "))
    query_to_use = u"({})".format(original_query_with_ands)


    if oa_only:
        oa_clause = u" and is_oa=True "
    else:
        oa_clause = " "

    dois = []
    rows = []
    search_done = False

    if is_doi(original_query):
        dois = [clean_doi(original_query)]
        search_done = True

    # if "from_" in original_query and "to_" in original_query:
    #     print u"getting recent query"
    #     matches = re.findall("from_(\d{4}.\d{2}.\d{2})_to_(\d{4}.\d{2}.\d{2})", original_query)
    #     from_date = matches[0][0].replace("_", "-")
    #     to_date = matches[0][1].replace("_", "-")
    #     query_string = u"""
    #         select pmid, 0.05*COALESCE(num_events, 0.0)::float as rank
    #         from search_recent_hits_mv
    #         where published_date > :from_date ::timestamp and published_date < :to_date ::timestamp
    #         and num_events is not null
    #         {oa_clause}
    #         order by num_events desc
    #         limit 100 """.format(oa_clause=oa_clause)
    #     rows = db.engine.execute(sql.text(query_string), from_date=from_date, to_date=to_date).fetchall()
    #     print "done getting query getting pmids"

    if not search_done and query_entities and len(query_entities)==1:
        query_entity = query_entities[0]
        query_entity = query_entity.replace("(", " ")
        query_entity = query_entity.replace(")", " ")
        query_entity = query_entity.replace("&", " ")

        print u"have query_entities"

        query_string = u"""
            select doi 
            from search_title_dandelion_simple_mv
            where title=:query_entity 
            and num_events >= 3
            {oa_clause}
            order by num_events desc 
            limit 120""".format(oa_clause=oa_clause)

        rows = db.engine.execute(sql.text(query_string), query_entity=query_entity).fetchall()
        print "done getting query getting dois"
        original_query_escaped = query_entity.replace("'", "''")
        original_query_with_ands = ' & '.join(original_query_escaped.split(" "))
        query_to_use = u"({})".format(original_query_with_ands)


    if rows:
        dois = [row[0] for row in rows]
        print "len dois", len(dois)

    if not search_done and len(dois) < 25:
        print "len(dois) < 25, in fulltext_search_title"

    # if True: # debug
    #     print "doing full text search anyway"

        # need to do the full search
        print "len(dois) < 25, in fulltext_search_title"
        original_query_escaped = original_query.replace("'", "''")
        original_query_escaped = original_query_escaped.replace("&", "")
        original_query_escaped = original_query_escaped.replace("(", " ")
        original_query_escaped = original_query_escaped.replace(")", " ")
        original_query_with_ands = ' & '.join([w for w in original_query_escaped.split(" ") if w and w != " "])
        query_to_use = u"({})".format(original_query_with_ands)

        if query_entities:
            entities_escaped = []
            for query_entity in query_entities:
                print query_entity
                entity_escaped = query_entity
                entity_escaped = entity_escaped.replace("'", "''")
                entity_escaped = entity_escaped.replace("&", "")
                entity_escaped = entity_escaped.replace("(", "")
                entity_escaped = entity_escaped.replace(")", "")
                entity_escaped = u" & ".join(entity_escaped.split(u" "))
                entities_escaped += [entity_escaped]
                print "entities_escaped", entities_escaped
            entity_with_ands = u' & '.join(entities_escaped)
            print "entity_with_ands", entity_with_ands
            query_to_use += u" | ({})".format(entity_with_ands)

        # get rid of bad characters
        query_to_use = query_to_use.replace("!", "")

        print u"starting query for {}".format(query_to_use)

        query_string = u"""
            select
            doi,
            (ts_rank_cd(to_tsvector('english', article_title), to_tsquery(:query), 1) + 0.05*COALESCE(num_events,0.0)) AS rank
            FROM ricks_gtr_sort_results
            WHERE  
            to_tsvector('english', article_title) @@  to_tsquery(:query)
            and doi is not null 
            {oa_clause}
            order by rank desc
            limit 120;
            """.format(oa_clause=oa_clause)


        # print query_string


        rows = db.engine.execute(sql.text(query_string), query=query_to_use).fetchall()
        print "done getting query of sort data"

        # print rows
        dois = [row[0] for row in rows]

    time_for_dois = elapsed(start_time, 3)
    print u"done query for dois and sort data: got {} dois".format(len(dois))

    time_for_pubs_start_time = time()

    my_pubs_filtered = []
    if dois:
        if full:
            query_string = u"""
                select pmid,
                    doi,
                    article_title,
                    journal_title,
                    pub_types,
                    abstract_length,
                    is_oa,
                    num_events,
                    num_news_events,
                    (ts_rank_cd(to_tsvector('english', article_title), to_tsquery(:query), 1) + 0.05*COALESCE(num_events,0.0)) AS rank
                    from ricks_gtr_sort_results
                    where doi in ({dois_string})
                """.format(dois_string=u",".join([u"'{}'".format(str(d)) for d in dois]))
            # print query_string
            rows = db.engine.execute(sql.text(query_string), query=query_to_use, dois=dois).fetchall()
            print "done getting sort data"
            # print rows

            # print rows
            my_pubs_filtered = []
            for row in rows:
                my_dict = {
                    "pmid": row[0],
                    "doi": row[1],
                    "article_title": row[2],
                    "journal_title": row[3],
                    "pub_types": row[4],
                    "abstract_length": row[5],
                    "is_oa": row[6],
                    "num_events": row[7],
                    "num_news_events": row[8],
                    "score": row[9],
                    "query": query_to_use,
                    "query_entities": query_entities
                     }
                my_dict["adjusted_score"] = adjusted_score(my_dict)
                my_pubs_filtered.append(my_dict)

            # my_pubs = db.session.query(Pub).filter(Pub.pmid.in_(pmids)).options(orm.undefer_group('full')).all()
            # my_pubs = db.session.query(Pub).filter(Pub.pmid.in_(pmids)).\
            #     options(orm.raiseload(Pub.authors)).\
            #     options(orm.raiseload(Pub.dandelion_lookup)).\
            #     options(orm.raiseload(Pub.doi_lookup)).\
            #     all()
        else:
            my_pubs = db.session.query(Pub).filter(Pub.doi.in_(dois)).\
                options(orm.raiseload(Pub.authors)).\
                options(orm.raiseload(Pub.dandelion_lookup)).\
                options(orm.raiseload(Pub.doi_lookup)).\
                all()
            my_pubs_filtered = [p for p in my_pubs if not p.suppress]

    print "done query for my_pubs"


    time_for_pubs = elapsed(time_for_pubs_start_time, 3)

    return (my_pubs_filtered, time_for_dois, time_for_pubs)
Example no. 37
    def clean_doi(self):
        if not self.doi:
            return None
        return clean_doi(self.doi)
Example no. 38
    def __init__(self, **kwargs):
        self.updated = datetime.datetime.utcnow()
        if "doi" in kwargs:
            kwargs["doi"] = clean_doi(kwargs["doi"])
        super(Chorus, self).__init__(**kwargs)
Example no. 39
def get_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000, get_updates=False):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    if get_updates:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first},until-index-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first}&rows={chunk}&cursor={next_cursor}"
    else:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    insert_pub_fn = add_pubs_or_update_crossref if get_updates else add_new_pubs

    while has_more_responses:
        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # query is much faster if don't have a last specified, even if it is far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                if len(pubs_this_chunk) >= 100:
                    added_pubs = insert_pub_fn(pubs_this_chunk)
                    logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = insert_pub_fn(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
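
For reference, the cursor-based deep paging that the loop above performs against the Crossref API can be reduced to a few lines. A minimal sketch assuming only the public Crossref REST /works endpoint and the requests library; iter_crossref_works and the mailto placeholder are illustrative names, not part of the code above.

import requests

def iter_crossref_works(from_date, rows=100, mailto="[email protected]"):
    # Walk the Crossref /works endpoint with cursor-based deep paging.
    base_url = "https://api.crossref.org/works"
    headers = {"Accept": "application/json",
               "User-Agent": "mailto:{}".format(mailto)}
    cursor = "*"
    while True:
        params = {"filter": "from-created-date:{}".format(from_date),
                  "rows": rows,
                  "cursor": cursor}
        # requests url-encodes the cursor, so no explicit quote() is needed here
        resp = requests.get(base_url, params=params, headers=headers)
        resp.raise_for_status()
        message = resp.json()["message"]
        items = message.get("items", [])
        for item in items:
            yield item
        cursor = message.get("next-cursor")
        if not items or not cursor:
            break

Each yielded item is a raw Crossref work record, the same shape the harvester above feeds through clean_doi and build_new_pub.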
Esempio n. 40
0
def get_overrides_dict():
    override_dict = defaultdict(dict)

    # cindy wu example
    override_dict["10.1038/nature21360"] = {
        "pdf_url": "https://arxiv.org/pdf/1703.01424.pdf",
        "version": "submittedVersion"
    }

    # example from twitter
    override_dict["10.1021/acs.jproteome.5b00852"] = {
        "pdf_url": "http://pubs.acs.org/doi/pdfplus/10.1021/acs.jproteome.5b00852",
        "host_type_set": "publisher",
        "version": "publishedVersion"
    }

    # have the unpaywall example go straight to the PDF, not the metadata page
    override_dict["10.1098/rspa.1998.0160"] = {
        "pdf_url": "https://arxiv.org/pdf/quant-ph/9706064.pdf",
        "version": "submittedVersion"
    }

    # missed, not in BASE, from Maha Bali in email
    override_dict["10.1080/13562517.2014.867620"] = {
        "pdf_url": "http://dar.aucegypt.edu/bitstream/handle/10526/4363/Final%20Maha%20Bali%20TiHE-PoD-Empowering_Sept30-13.pdf",
        "version": "submittedVersion"
    }

    # otherwise links to a figshare match that only has the data, not the article
    override_dict["10.1126/science.aaf3777"] = {}

    # otherwise links to a metadata page that doesn't have the PDF because a copy has to be requested: https://openresearch-repository.anu.edu.au/handle/1885/103608
    override_dict["10.1126/science.aad2622"] = {
        "pdf_url": "https://lra.le.ac.uk/bitstream/2381/38048/6/Waters%20et%20al%20draft_post%20review_v2_clean%20copy.pdf",
        "version": "submittedVersion"
    }

    # otherwise led to http://www.researchonline.mq.edu.au/vital/access/services/Download/mq:39727/DS01 and an authorization error; note this empty entry replaces the pdf_url set for the same DOI just above
    override_dict["10.1126/science.aad2622"] = {}

    # else goes here: http://www.it-c.dk/people/schmidt/papers/complexity.pdf
    override_dict["10.1007/978-1-84800-068-1_9"] = {}

    # otherwise led to https://dea.lib.unideb.hu/dea/bitstream/handle/2437/200488/file_up_KMBT36220140226131332.pdf;jsessionid=FDA9F1A60ACA567330A8B945208E3CA4?sequence=1
    override_dict["10.1007/978-3-211-77280-5"] = {}

    # otherwise led to publisher page but isn't open
    override_dict["10.1016/j.renene.2015.04.017"] = {}

    # override old-style webpage
    override_dict["10.1210/jc.2016-2141"] = {
        "pdf_url": "https://academic.oup.com/jcem/article-lookup/doi/10.1210/jc.2016-2141",
        "host_type_set": "publisher",
        "version": "publishedVersion",
    }

    # not indexing this location yet, from @rickypo
    override_dict["10.1207/s15327957pspr0203_4"] = {
        "pdf_url": "http://www2.psych.ubc.ca/~schaller/528Readings/Kerr1998.pdf",
        "version": "submittedVersion"
    }

    # mentioned in world bank as good unpaywall example
    override_dict["10.3386/w23298"] = {
        "pdf_url": "https://economics.mit.edu/files/12774",
        "version": "submittedVersion"
    }

    # from email, has bad citeseerx cached version
    override_dict["10.1007/bf02693740"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.536.6939&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email, has bad citeseerx cached version
    override_dict["10.1126/science.1150952"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.168.3796&rep=rep1&type=pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email, has bad citeseerx cached version
    override_dict["10.1515/eqc.2007.295"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.543.7752&rep=rep1&type=pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1038/nature21377"] = {
        "pdf_url": "http://eprints.whiterose.ac.uk/112179/1/ppnature21377_Dodd_for%20Symplectic.pdf",
        "version": "submittedVersion"
    }

    # from email
    override_dict["10.1016/j.gtc.2016.09.007"] = {
        "pdf_url": "https://cora.ucc.ie/bitstream/handle/10468/3544/Quigley_Chapter.pdf?sequence=1&isAllowed=y",
        "version": "acceptedVersion"
    }

    # stephen hawking's thesis
    override_dict["10.17863/cam.11283"] = {
        "pdf_url": "https://www.repository.cam.ac.uk/bitstream/handle/1810/251038/PR-PHD-05437_CUDL2017-reduced.pdf?sequence=15&isAllowed=y",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1152/advan.00040.2005"] = {
        "pdf_url": "https://www.physiology.org/doi/pdf/10.1152/advan.00040.2005",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1016/j.chemosphere.2014.07.047"] = {
        "pdf_url": "https://manuscript.elsevier.com/S0045653514009102/pdf/S0045653514009102.pdf",
        "version": "submittedVersion"
    }

    # from email
    override_dict["10.4324/9780203900956"] = {}

    # from email
    override_dict["10.3810/psm.2010.04.1767"] = {
        "pdf_url": "http://cupola.gettysburg.edu/cgi/viewcontent.cgi?article=1014&context=healthfac",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1016/S0140-6736(17)33308-1"] = {
        "pdf_url": "https://www.rug.nl/research/portal/files/64097453/Author_s_version_Gonadotrophins_versus_clomiphene_citrate_with_or_without_intrauterine_insemination_in_women.pdf",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1093/joclec/nhy009"] = {
        "pdf_url": "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3126848",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1038/s41477-017-0019-3"] = {
        "pdf_url": "https://www.repository.cam.ac.uk/bitstream/handle/1810/270235/3383_1_merged_1502805167.pdf?sequence=1&isAllowed=y",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1029/wr015i006p01633"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.475.497&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email, zenodo
    override_dict["10.1080/01650521.2018.1460931"] = {
        "metadata_url": "https://zenodo.org/record/1236622",
        "host_type_set": "repository",
        "version": "acceptedVersion"
    }

    # from email
    override_dict["10.3928/01477447-20150804-53"] = {}

    # from twitter
    override_dict["10.1103/physreva.97.013421"] = {
        "pdf_url": "https://arxiv.org/pdf/1711.10074.pdf",
        "version": "submittedVersion"
    }

    # from email
    override_dict["10.1016/j.amjmed.2005.09.031"] = {
        "pdf_url": "https://www.amjmed.com/article/S0002-9343(05)00885-5/pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1080/15348458.2017.1327816"] = {}

    # from chorus
    override_dict["10.1103/physrevd.94.052011"] = {
        "pdf_url": "https://link.aps.org/accepted/10.1103/PhysRevD.94.052011",
        "version": "acceptedVersion",
    }
    override_dict["10.1063/1.4962501"] = {
        "pdf_url": "https://aip.scitation.org/doi/am-pdf/10.1063/1.4962501",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email, broken citeseer link
    override_dict["10.2202/1949-6605.1908"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.535.9289&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1561/1500000012"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.174.8814&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1137/s0036142902418680"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.144.7627&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1088/1741-2552/aab4e4"] = {
        "pdf_url": "http://iopscience.iop.org/article/10.1088/1741-2552/aab4e4/pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1145/1031607.1031615"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.540.8125&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1007/s11227-016-1779-7"] = {
        "pdf_url": "https://hcl.ucd.ie/system/files/TJS-Hasanov-2016.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1016/s0020-0190(03)00351-x"] = {
        "pdf_url": "https://kam.mff.cuni.cz/~kolman/papers/noteb.ps",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1002/14651858.cd001704.pub4"] = {
        "pdf_url": "https://core.ac.uk/download/pdf/9440822.pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1016/j.tetlet.2015.04.131"] = {
        "pdf_url": "https://www.sciencedirect.com/sdfe/pdf/download/read/aam/noindex/pii/S0040403915007881",
        "version": "acceptedVersion",
        "host_type_set": "publisher"
    }

    # from email
    override_dict["10.1016/j.nima.2016.04.104"] = {
        "pdf_url": "http://cds.cern.ch/record/2239750/files/1-s2.0-S0168900216303400-main.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1016/s1470-2045(15)00444-1"] = {
        "pdf_url": "https://www.statsarecool.com/data/uploads/journal-articles/who_declares_reds_meat_carcinogeniclancet_oct_2015.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1056/NEJM199406233302502"] = {
        "pdf_url": "https://www.nejm.org/doi/full/10.1056/NEJM199406233302502",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # from email
    override_dict["10.1056/NEJMra1201534"] = {
        "pdf_url": "https://www.nejm.org/doi/pdf/10.1056/NEJMra1201534",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # from email
    override_dict["10.1016/j.cmet.2018.03.012"] = {
        "pdf_url": "https://www.biorxiv.org/content/biorxiv/early/2018/01/15/245332.full.pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1093/sf/65.1.1"] = {
        "pdf_url": "https://faculty.washington.edu/charles/new%20PUBS/A52.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1088/1751-8121/aabd9c"] = {}

    # from email
    override_dict["10.1017/CBO9781139173728.002"] = {}

    # from email
    override_dict["10.2174/97816810846711170101"] = {}

    # from email
    override_dict["10.1177/1354066196002003001"] = {}

    # from email
    override_dict["10.1093/bioinformatics/bty721"] = {}

    # from email
    override_dict["10.1088/1361-6528/aac7a4"] = {}

    # from email
    override_dict["10.1088/1361-6528/aac645"] = {}

    # from email
    override_dict["10.1111/1748-8583.12159"] = {}

    # from email
    override_dict["10.1042/BJ20080963"] = {}

    # from email
    override_dict["10.1136/bmj.j5007"] = {}

    # from email
    override_dict["10.1016/j.phrs.2017.12.007"] = {}

    # from email
    override_dict["10.4324/9781315770185"] = {}

    # from email
    override_dict["10.1108/PIJPSM-02-2016-0019"] = {}

    # from email
    override_dict["10.1016/j.ejca.2017.07.015"] = {}

    # from email
    override_dict["10.1080/14655187.2017.1469322"] = {}

    # from email
    override_dict["10.1080/02684527.2017.1407549"] = {}

    # from email
    override_dict["10.1093/jat/bky025"] = {}

    # from email
    override_dict["10.1016/j.midw.2009.07.004"] = {}

    # from email
    override_dict["10.1177/247553031521a00105"] = {}

    # from email
    override_dict["10.1002/0471445428"] = {}

    # from email
    override_dict["10.1007/978-3-642-31232-8"] = {}

    # ticket 267
    override_dict["10.1016/j.anucene.2014.08.021"] = {}

    # ticket 199
    # pdf has embedded password protection
    override_dict["10.22381/rcp1720184"] = {}

    # ticket 256
    # journal in doaj but article not available
    override_dict["10.1016/j.mattod.2018.03.001"] = {}

    # ticket 277
    # pmh record with spurious title: oai:works.swarthmore.edu:fac-psychology-1039
    override_dict["10.1016/j.actpsy.2010.01.009"] = {}

    # ticket 280
    # green scrape gets overexcited about a .doc link
    override_dict["10.1108/09596111211217932"] = {}

    # ticket 279
    # match to wrong pdf, currently suppressed incorrectly by bad pdf check
    override_dict["10.1238/physica.topical.102a00059"] = {}

    # from email
    override_dict["10.1016/S0022-1996(00)00093-3"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.475.3874&rep=rep1&type=pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1177/088840649401700203"] = {
        "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1014.8577&rep=rep1&type=pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.7326/L18-0139"] = {
        "pdf_url": "http://annals.org/data/journals/aim/936928/aime201804170-l180139.pdf",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # from email
    override_dict["10.1007/978-3-319-48881-3_55"] = {
        "pdf_url": "http://liu.diva-portal.org/smash/get/diva2:1063949/FULLTEXT01.pdf",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1109/ICCVW.2015.86"] = {
        "pdf_url": "http://liu.diva-portal.org/smash/get/diva2:917646/FULLTEXT01",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1126/science.aap9559"] = {
        "pdf_url": "http://vermontcomplexsystems.org/share/papershredder/vosoughi2018a.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1109/tpds.2012.97"] = {
        "pdf_url": "https://www.cnsr.ictas.vt.edu/publication/06171175.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # ticket 261
    # crossref metadata points to wrong article
    override_dict["10.4149/BLL_2013_058"] = {
        "pdf_url": "http://www.elis.sk/download_file.php?product_id=3759&session_id=lnkeo437s8hv5t0r28g6ku93b0",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # ticket 200
    # we forgot to say the magic word
    override_dict["10.1007/s11465-016-0392-z"] = {
        "pdf_url": "https://cora.ucc.ie/bitstream/10468/4112/1/2986.pdf?&isAllowed=y",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # callers rely on the doi keys being lowercase/canonical, so normalize them with clean_doi
    response = {}
    for k, v in override_dict.iteritems():
        response[clean_doi(k)] = v

    return response
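
Because the keys are normalized with clean_doi before being returned, lookups have to be normalized the same way. A minimal sketch of a lookup helper; override_for is a hypothetical name, and the NoDoiException handling mirrors how clean_doi is used elsewhere in this codebase.

overrides_by_doi = get_overrides_dict()

def override_for(dirty_doi):
    # An entry of {} means "suppress whatever location was scraped";
    # a populated entry supplies the corrected pdf_url / version / host_type_set.
    try:
        return overrides_by_doi.get(clean_doi(dirty_doi))
    except NoDoiException:
        return None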
Esempio n. 41
0
    def call_pmh_endpoint(self,
                          first=None,
                          last=None,
                          chunk_size=10,
                          scrape=False):

        args = {}
        args['metadataPrefix'] = 'oai_dc'

        if "citeseerx" in self.pmh_url:
            proxy_url = os.getenv("STATIC_IP_PROXY")
            proxies = {"https": proxy_url, "http": proxy_url}
        else:
            proxies = {}

        my_sickle = MySickle(self.pmh_url, proxies=proxies, timeout=120)
        logger.info(u"connected to sickle with {} {}".format(
            self.pmh_url, proxies))

        args['from'] = first
        if last:
            args["until"] = last

        records_to_save = []

        logger.info(u"calling ListRecords with {} {}".format(
            self.pmh_url, args))
        try:
            pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
            logger.info(u"got pmh_records with {} {}".format(
                self.pmh_url, args))
            pmh_input_record = safe_get_next_record(pmh_records)
        except Exception as e:
            logger.info(u"no records with {} {}".format(self.pmh_url, args))
            # logger.exception(u"no records with {} {}".format(self.pmh_url, args))
            pmh_input_record = None

        while pmh_input_record:

            my_pmh_record = pmh_record.PmhRecord()

            my_pmh_record.id = pmh_input_record.header.identifier
            my_pmh_record.api_raw = pmh_input_record.raw
            my_pmh_record.record_timestamp = pmh_input_record.header.datestamp
            my_pmh_record.title = oai_tag_match("title", pmh_input_record)
            my_pmh_record.authors = oai_tag_match("creator",
                                                  pmh_input_record,
                                                  return_list=True)
            my_pmh_record.oa = oai_tag_match("oa", pmh_input_record)
            my_pmh_record.urls = oai_tag_match("identifier",
                                               pmh_input_record,
                                               return_list=True)
            for fulltext_url in my_pmh_record.urls:
                if fulltext_url and (is_doi_url(fulltext_url)
                                     or fulltext_url.startswith(u"doi:")
                                     or re.findall(u"10\.", fulltext_url)):
                    try:
                        my_pmh_record.doi = clean_doi(fulltext_url)
                    except NoDoiException:
                        pass

            my_pmh_record.license = oai_tag_match("rights", pmh_input_record)
            my_pmh_record.relations = oai_tag_match("relation",
                                                    pmh_input_record,
                                                    return_list=True)
            my_pmh_record.sources = oai_tag_match("collname",
                                                  pmh_input_record,
                                                  return_list=True)
            my_pmh_record.source = self.id

            if is_complete(my_pmh_record):
                db.session.merge(my_pmh_record)
                my_pages = my_pmh_record.mint_pages()
                logger.info(u"made {} pages for id {}".format(
                    len(my_pages), my_pmh_record.id))
                for my_page in my_pages:
                    if scrape:
                        logger.info(u"scraping pages")
                        my_page.scrape()
                    db.session.merge(my_page)
                records_to_save.append(my_pmh_record)
                # logger.info(u":")
                logger.info(u"my_pmh_record {}".format(
                    my_pmh_record.get_good_urls()))
            else:
                logger.info(u"not complete")

            if len(records_to_save) >= chunk_size:
                last_record = records_to_save[-1]
                logger.info(u"last record saved: {} for {}".format(
                    last_record.id, self.id))
                safe_commit(db)
                records_to_save = []

            pmh_input_record = safe_get_next_record(pmh_records)

        # make sure to get the last ones
        if records_to_save:
            last_record = records_to_save[-1]
            logger.info(
                u"saving {} last ones, last record saved: {} for {}".format(
                    len(records_to_save), last_record.id, self.id))
            safe_commit(db)
        logger.info(u"done everything for {}".format(self.id))
Esempio n. 42
0
    def worker_run(self, **kwargs):
        single_obj_id = kwargs.get("id", None)
        chunk = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", 10)
        run_class = Pub
        run_method = kwargs.get("method")

        if single_obj_id:
            limit = 1
            queue_table = None
        elif run_method == "refresh":
            queue_table = "pub_refresh_queue"
            if not limit:
                limit = 1000
            text_query_pattern = """
                with refresh_queue as (
                    select id
                    from {queue_table}
                    where started is null
                    order by
                        priority desc,
                        finished nulls first,
                        started,
                        rand
                    limit {chunk}
                    for update skip locked
                )
                update {queue_table} queue_rows_to_update
                set started = now()
                from refresh_queue
                where refresh_queue.id = queue_rows_to_update.id
                returning refresh_queue.id;"""
            text_query = text_query_pattern.format(chunk=chunk,
                                                   queue_table=queue_table)
            logger.info(u"the queue query is:\n{}".format(text_query))
        else:
            queue_table = "pub_queue"
            if not limit:
                limit = 1000
            text_query_pattern = """WITH update_pub_queue AS (
                       SELECT id
                       FROM   {queue_table}
                       WHERE  started is null
                       order by finished asc
                       nulls first
                   LIMIT  {chunk}
                   FOR UPDATE SKIP LOCKED
                   )
                UPDATE {queue_table} queue_rows_to_update
                SET    started=now()
                FROM   update_pub_queue
                WHERE update_pub_queue.id = queue_rows_to_update.id
                RETURNING update_pub_queue.id;"""
            text_query = text_query_pattern.format(limit=limit,
                                                   chunk=chunk,
                                                   queue_table=queue_table)
            logger.info(u"the queue query is:\n{}".format(text_query))
        index = 0
        start_time = time()
        while True:
            new_loop_start_time = time()
            if single_obj_id:
                single_obj_id = clean_doi(single_obj_id)
                objects = [
                    run_class.query.filter(
                        run_class.id == single_obj_id).first()
                ]
            else:
                logger.info(u"looking for new jobs")

                job_time = time()
                row_list = db.engine.execute(
                    text(text_query).execution_options(
                        autocommit=True)).fetchall()
                object_ids = [row[0] for row in row_list]
                logger.info(u"got ids, took {} seconds".format(
                    elapsed(job_time)))

                job_time = time()
                q = db.session.query(Pub).options(orm.undefer('*')).filter(
                    Pub.id.in_(object_ids))
                objects = q.all()
                logger.info(u"got pub objects in {} seconds".format(
                    elapsed(job_time)))

                # shuffle them, otherwise they come back sorted in DOI order
                random.shuffle(objects)

                # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all()

                # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
                # id_rows =  db.engine.execute(text(text_query)).fetchall()
                # ids = [row[0] for row in id_rows]
                #
                # job_time = time()
                # objects = run_class.query.filter(run_class.id.in_(ids)).all()

                # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time)))

            if not objects:
                # logger.info(u"sleeping for 5 seconds, then going again")
                sleep(5)
                continue

            object_ids = [obj.id for obj in objects]
            self.update_fn(run_class, run_method, objects, index=index)

            # logger.info(u"finished update_fn")
            if queue_table:
                object_ids_str = u",".join([
                    u"'{}'".format(id.replace(u"'", u"''"))
                    for id in object_ids
                ])
                object_ids_str = object_ids_str.replace(u"%",
                                                        u"%%")  #sql escaping
                sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                    queue_table=queue_table, ids=object_ids_str)
                # logger.info(u"sql command to update finished is: {}".format(sql_command))
                run_sql(db, sql_command)
                # logger.info(u"finished run_sql")

            # finished is set in update_fn
            index += 1
            if single_obj_id:
                return
            else:
                self.print_update(new_loop_start_time, chunk, limit,
                                  start_time, index)
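
Both queue queries above lean on Postgres's FOR UPDATE SKIP LOCKED so that concurrent workers claim disjoint batches instead of blocking each other. A minimal sketch of that claim-then-finish cycle against the pub_refresh_queue table, assuming a SQLAlchemy engine and psycopg2's list-to-ARRAY adaptation for the any(:ids) parameter; claim_batch and mark_finished are illustrative names.

from sqlalchemy import text

CLAIM_SQL = text("""
    with claimed as (
        select id
        from pub_refresh_queue
        where started is null
        order by priority desc
        limit :chunk
        for update skip locked
    )
    update pub_refresh_queue q
    set started = now()
    from claimed
    where claimed.id = q.id
    returning claimed.id;
""")

FINISH_SQL = text("""
    update pub_refresh_queue
    set finished = now(), started = null
    where id = any(:ids);
""")

def claim_batch(engine, chunk=100):
    # Atomically mark a batch as started and return the claimed ids.
    with engine.begin() as conn:
        return [row[0] for row in conn.execute(CLAIM_SQL, {"chunk": chunk})]

def mark_finished(engine, ids):
    # Release the rows so they re-enter the queue for a later refresh pass.
    if ids:
        with engine.begin() as conn:
            conn.execute(FINISH_SQL, {"ids": ids})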
Esempio n. 43
0
def get_chorus_data(starting_offset=0, agency_id=None):
    requests_session = requests.Session()
    retries = Retry(total=10,
                    backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id:
            if int(agency["Agency_Id"]) != int(agency_id):
                print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"])
                continue
        if starting_offset:
            offset = starting_offset
        else:
            offset = 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))
        url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
        limit = 50
        total_results = None
        while total_results is None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit)
            print url
            try:
                r = requests_session.get(url, timeout=360)  # wait for 3 minutes
            except Exception as e:
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8")))
                r = None

            print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1))
            offset += limit

            if r:
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))


                items = data["items"]
                new_objects = []
                for item in items:
                    if item["DOI"]:
                        doi = clean_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

                ids_already_in_db = [id_tuple[0] for id_tuple in db.session.query(Chorus.id).filter(Chorus.id.in_([obj.id for obj in new_objects])).all()]
                objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")

            logger.info(u"sleeping for 2 seconds")
            sleep(2)
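
When the custom DelayedAdapter is not needed, the same retry-plus-offset-paging pattern can be written with the stock HTTPAdapter. A minimal sketch; fetch_all_items is an illustrative name and the URL template mirrors the Chorus call above.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

CHORUS_URL_TEMPLATE = ("https://api.chorusaccess.org/v1.1/agencies/{agency_id}"
                       "/histories/current?category=publicly_accessible"
                       "&limit={limit}&offset={offset}")

def fetch_all_items(agency_id, limit=50):
    # Retry transient 5xx responses, then page by offset until total_results is exhausted.
    session = requests.Session()
    retries = Retry(total=10, backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    offset = 0
    total_results = None
    items = []
    while total_results is None or offset < total_results:
        url = CHORUS_URL_TEMPLATE.format(agency_id=agency_id, limit=limit, offset=offset)
        resp = session.get(url, timeout=180)
        resp.raise_for_status()
        data = resp.json()
        total_results = data["total_results"]
        items.extend(data["items"])
        offset += limit
    return items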