Example #1
def cache_api_response(my_saved_object):
    entity_term = my_saved_object.entity_title
    entity_term = entity_term.replace(u" ", u"_")

    url = u"https://gtr-api.herokuapp.com/search/{}?automated=true&nocache=true".format(
        entity_term)
    r = requests.get(url)
    print r
    print url
    my_saved_object.api_response = r.json()
    flag_modified(my_saved_object, "api_response"
                  )  # required, to force sqlalchemy to update because jsonb

    url = u"https://gtr-api.herokuapp.com/search/{}?oa=true&automated=true&nocache=true".format(
        entity_term)
    r = requests.get(url)
    print r
    print url
    my_saved_object.api_response_oa_only = r.json()
    flag_modified(my_saved_object, "api_response_oa_only"
                  )  # required, to force sqlalchemy to update because jsonb

    my_saved_object.collected = datetime.datetime.utcnow()
    db.session.merge(my_saved_object)
    safe_commit(db)
    print ".",

    return
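Every example on this page funnels its writes through a safe_commit(db) helper that the listing itself never shows. Judging from the call sites (callers treat the return value as a success flag and print "COMMIT fail" when it is falsy, and Example #11 shows the manual rollback pattern it wraps), a minimal sketch of such a helper might look like this; the actual implementation in these projects may differ.

def safe_commit(db):
    # hypothetical sketch: commit the current session, roll back on failure,
    # and return a boolean so callers can log "COMMIT fail" themselves
    try:
        db.session.commit()
        return True
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        db.session.rollback()
    return False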
Example #2
def package_create(jusp_id, institution_id, package_type):

    jisc_package_id = u"package-jiscels{}".format(jusp_id)
    package_id = u"package-n8els_{}_{}".format(jusp_id,
                                               package_type.replace(" ", ""))
    package_name = u"Elsevier n8 ({})".format(package_type)
    scenario_id = u"scenario-n8els_{}_{}".format(jusp_id,
                                                 package_type.replace(" ", ""))
    scenario_name = u"n8 ({})".format(package_type)

    my_package = Package.query.get(package_id)
    if not my_package:
        print u"package {} doesn't exist, making".format(package_id)
        my_package = Package(package_id=package_id,
                             publisher="Elsevier",
                             package_name=package_name,
                             created=datetime.datetime.utcnow().isoformat(),
                             institution_id=institution_id,
                             is_demo=False,
                             currency="GBP")
        db.session.add(my_package)
        print my_package
        safe_commit(db)

        if package_type == "own pta":
            copy_into_n8_package(old_package_id=jisc_package_id,
                                 new_package_id=package_id,
                                 copy_perpetual_access=True)
        elif package_type == "group pta":
            copy_into_n8_package(old_package_id=jisc_package_id,
                                 new_package_id=package_id,
                                 copy_perpetual_access=False)
        elif package_type == "uk pta":
            copy_into_n8_package(old_package_id=jisc_package_id,
                                 new_package_id=package_id,
                                 copy_perpetual_access=False)

    my_scenario = SavedScenario.query.get(scenario_id)
    if not my_scenario:
        print u"scenario {} doesn't exist, making".format(scenario_id)
        my_scenario = SavedScenario(False, scenario_id, None)
        my_scenario.package_id = package_id
        my_scenario.created = datetime.datetime.utcnow().isoformat()
        db.session.add(my_scenario)
        safe_commit(db)

    print "updating settings, including big deal cost from jisc package"
    big_deal_price = get_sql_answer(
        db,
        "select big_deal_cost from jump_account_package where package_id = '{}';"
        .format(jisc_package_id))

    dict_to_save = my_scenario.to_dict_saved_from_db()
    dict_to_save["name"] = scenario_name
    dict_to_save["configs"]["cost_bigdeal"] = big_deal_price
    dict_to_save["configs"]["cost_bigdeal_increase"] = 2
    dict_to_save["configs"]["include_social_networks"] = True  # set to true
    dict_to_save["configs"]["weight_authorship"] = 0  # 100
    dict_to_save["configs"]["weight_citation"] = 0  # 10
    save_raw_scenario_to_db(scenario_id, dict_to_save, None)
Example #3
def check_pdf_urls(pdf_urls):
    for url in pdf_urls:
        make_transient(url)

    # free up the connection while doing net IO
    safe_commit(db)
    db.engine.dispose()

    req_pool = get_request_pool()

    checked_pdf_urls = req_pool.map(get_pdf_url_status, pdf_urls, chunksize=1)
    req_pool.close()
    req_pool.join()

    row_dicts = [x.__dict__ for x in checked_pdf_urls]
    for row_dict in row_dicts:
        row_dict.pop('_sa_instance_state')

    db.session.bulk_update_mappings(PdfUrl, row_dicts)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
Example #4
def add_new_log_temp_profile(my_temp_person, request=None):
    if LogTempProfile.query.get(my_temp_person.orcid_id):
        return

    new_log = LogTempProfile(my_temp_person, request)
    db.session.add(new_log)
    safe_commit(db)
Example #5
    def set_coauthors(self):
        # commit first, to make sure fresh session etc
        safe_commit(db)

        # now go for it
        print u"running coauthors for {}".format(self.orcid_id)
        coauthor_orcid_id_query = u"""select distinct orcid_id
                    from product
                    where doi in
                      (select doi from product where orcid_id='{}')""".format(
            self.orcid_id)
        rows = db.engine.execute(text(coauthor_orcid_id_query))
        orcid_ids = [row[0] for row in rows]

        coauthors = Person.query.filter(Person.orcid_id.in_(orcid_ids)).all()
        resp = {}
        for coauthor in coauthors:
            if coauthor.id != self.id:
                resp[coauthor.orcid_id] = {
                    "name": coauthor.full_name,
                    "id": coauthor.id,
                    "orcid_id": coauthor.orcid_id,
                    "openness_perc": coauthor.display_openness_perc,
                    "engagement_perc": coauthor.display_engagement_perc,
                    "buzz_perc": coauthor.display_buzz_perc
                }
        self.coauthors = resp
Example #6
def check_pdf_urls(pdf_urls):
    for url in pdf_urls:
        make_transient(url)

    # free up the connection while doing net IO
    safe_commit(db)
    db.engine.dispose()

    req_pool = get_request_pool()

    checked_pdf_urls = req_pool.map(get_pdf_url_status, pdf_urls, chunksize=1)
    req_pool.close()
    req_pool.join()

    row_dicts = [x.__dict__ for x in checked_pdf_urls]
    for row_dict in row_dicts:
        row_dict.pop('_sa_instance_state')

    db.session.bulk_update_mappings(PdfUrl, row_dicts)

    start_time = time()
    commit_success = safe_commit(db)
    if not commit_success:
        logger.info(u"COMMIT fail")
    logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
Example #7
def add_new_log(my_temp_person, request=None):
    if LogTempProfile.query.get(my_temp_person.orcid_id):
        return

    new_log = LogTempProfile(my_temp_person, request)
    db.session.add(new_log)
    safe_commit(db)
Example #8
def from_bq_overwrite_data(db_tablename, bq_tablename):
    temp_data_filename = 'data_export.csv'

    column_names = from_bq_to_local_file(temp_data_filename,
                                         bq_tablename,
                                         header=False)
    print "column_names", column_names
    print "\n"

    cursor = db.session.connection().connection.cursor()

    cursor.execute(u"truncate {};".format(db_tablename))

    # replace quoted tabs with just a tab, because the quote is there by mistake
    # temp_data_cleaned_filename = 'data_export_cleaned.csv'

    # o = open(temp_data_cleaned_filename,"w")
    # data = open(temp_data_filename).read()
    # o.write(re.sub("\t", "|", re.sub("|"," ", data)))
    # o.close()

    with open(temp_data_filename, "rb") as f:
        cursor.copy_from(f,
                         db_tablename,
                         sep='\t',
                         columns=column_names,
                         null="")

    # this commit is necessary
    safe_commit(db)
Example #9
def add_endpoint(my_request):

    if not my_request.pmh_url:
        return None

    endpoint_with_this_id = Endpoint.query.filter(
        Endpoint.repo_request_id == my_request.id).first()
    if endpoint_with_this_id:
        print u"one already matches {}".format(my_request.id)
        return None

    raw_endpoint = my_request.pmh_url
    clean_endpoint = raw_endpoint.strip()
    clean_endpoint = clean_endpoint.strip("?")
    clean_endpoint = re.sub(u"\?verb=.*$", "", clean_endpoint, flags=re.IGNORECASE)
    print u"raw endpoint is {}, clean endpoint is {}".format(
        raw_endpoint, clean_endpoint)

    matching_endpoint = Endpoint()
    matching_endpoint.pmh_url = clean_endpoint

    repo_matches = my_request.matching_repositories()
    if repo_matches:
        matching_repo = repo_matches[0]
        print u"yay! for {} {} matches repository {}".format(
            my_request.institution_name, my_request.repo_name, matching_repo)
    else:
        print u"no matching repository for {}: {}".format(
            my_request.institution_name, my_request.repo_name)
        matching_repo = Repository()

    # overwrite stuff with request
    matching_repo.institution_name = my_request.institution_name
    matching_repo.repository_name = my_request.repo_name
    matching_repo.home_page = my_request.repo_home_page
    matching_endpoint.repo_unique_id = matching_repo.id
    matching_endpoint.email = my_request.email
    matching_endpoint.repo_request_id = my_request.id
    matching_endpoint.ready_to_run = True
    matching_endpoint.set_identify_and_initial_query()

    db.session.merge(matching_endpoint)
    db.session.merge(matching_repo)
    print u"added {} {}".format(matching_endpoint, matching_repo)
    print u"see at url http://unpaywall.org/sources/repository/{}".format(
        matching_endpoint.id)
    safe_commit(db)
    print "saved"

    print "now sending email"
    # get the endpoint again, so it gets with all the meta info etc
    matching_endpoint = Endpoint.query.get(matching_endpoint.id)
    matching_endpoint.contacted_text = "automated welcome email"
    matching_endpoint.contacted = datetime.datetime.utcnow().isoformat()
    safe_commit(db)
    send_announcement_email(matching_endpoint)

    print "email sent"

    return matching_endpoint
Example #10
def add_pubs_or_update_crossref(pubs):
    if not pubs:
        return []

    pubs_by_id = dict((p.id, p) for p in pubs)

    existing_pub_ids = set([
        id_tuple[0] for id_tuple in db.session.query(Pub.id).filter(
            Pub.id.in_(list(pubs_by_id.keys()))).all()
    ])

    pubs_to_add = [p for p in pubs if p.id not in existing_pub_ids]
    pubs_to_update = [p for p in pubs if p.id in existing_pub_ids]

    if pubs_to_add:
        logger.info("adding {} pubs".format(len(pubs_to_add)))
        db.session.add_all(pubs_to_add)

    if pubs_to_update:
        row_dicts = [{
            'id': p.id,
            'crossref_api_raw_new': p.crossref_api_raw_new
        } for p in pubs_to_update]
        logger.info("updating {} pubs".format(len(pubs_to_update)))
        db.session.bulk_update_mappings(Pub, row_dicts)

    safe_commit(db)
    return pubs_to_add
Example #11
def commit_repo(repo):
    try:
        db.session.commit()
    except DataError:
        print "error committing repo, rolling back and setting save error for ", repo
        db.session.rollback()
        repo.set_save_error()
        safe_commit(db)
Example #12
def get_pub_from_biblio(biblio, run_with_hybrid=False, skip_all_hybrid=False):
    my_pub = lookup_product(**biblio)
    if run_with_hybrid:
        my_pub.run_with_hybrid()
        safe_commit(db)
    else:
        my_pub.recalculate()

    return my_pub
Example #13
def add_endpoint(my_request):

    if not my_request.pmh_url:
        return None

    endpoint_with_this_id = Endpoint.query.filter(Endpoint.repo_request_id==my_request.id).first()
    if endpoint_with_this_id:
        print u"one already matches {}".format(my_request.id)
        return None

    raw_endpoint = my_request.pmh_url
    clean_endpoint = raw_endpoint.strip()
    clean_endpoint = clean_endpoint.strip("?")
    clean_endpoint = re.sub(u"\?verb=.*$", "", clean_endpoint, flags=re.IGNORECASE)
    print u"raw endpoint is {}, clean endpoint is {}".format(raw_endpoint, clean_endpoint)

    matching_endpoint = Endpoint()
    matching_endpoint.pmh_url = clean_endpoint

    repo_matches = my_request.matching_repositories()
    if repo_matches:
        matching_repo = repo_matches[0]
        print u"yay! for {} {} matches repository {}".format(
            my_request.institution_name, my_request.repo_name, matching_repo)
    else:
        print u"no matching repository for {}: {}".format(
            my_request.institution_name, my_request.repo_name)
        matching_repo = Repository()

    # overwrite stuff with request
    matching_repo.institution_name = my_request.institution_name
    matching_repo.repository_name = my_request.repo_name
    matching_repo.home_page = my_request.repo_home_page
    matching_endpoint.repo_unique_id = matching_repo.id
    matching_endpoint.email = my_request.email
    matching_endpoint.repo_request_id = my_request.id
    matching_endpoint.ready_to_run = True
    matching_endpoint.set_identify_and_initial_query()

    db.session.merge(matching_endpoint)
    db.session.merge(matching_repo)
    print u"added {} {}".format(matching_endpoint, matching_repo)
    print u"see at url http://unpaywall.org/sources/repository/{}".format(matching_endpoint.id)
    safe_commit(db)
    print "saved"

    print "now sending email"
    # get the endpoint again, so it gets with all the meta info etc
    matching_endpoint = Endpoint.query.get(matching_endpoint.id)
    matching_endpoint.contacted_text = "automated welcome email"
    matching_endpoint.contacted = datetime.datetime.utcnow().isoformat()
    safe_commit(db)
    send_announcement_email(matching_endpoint)

    print "email sent"

    return matching_endpoint
Example #14
    def maint(self, **kwargs):
        # endpoints = Endpoint.query.filter(Endpoint.harvest_identify_response==None, Endpoint.error==None).all()
        endpoints = Endpoint.query.filter(
            Endpoint.repo_request_id != None).all()
        shuffle(endpoints)
        for my_endpoint in endpoints:
            my_endpoint.run_diagnostics()
            logger.info(u"my_endpoint: {}".format(my_endpoint))
            db.session.merge(my_endpoint)
            safe_commit(db)
Example #15
def post_gs_cache(**kwargs):
    my_doi = clean_doi(kwargs["doi"])
    q = Gs.query.filter(Gs.doi == my_doi,
                        Gs.landing_page_url == kwargs["landing_page_url"])
    my_gs = q.first()
    if not my_gs:
        my_gs = Gs(**kwargs)
        db.session.add(my_gs)
        safe_commit(db)
    return my_gs
Example #16
def update_refsets():
    from models.person import Person

    print u"getting the badge percentile refsets...."

    # only get out the badge objects
    q = db.session.query(Person).options(
             Load(Person).load_only("campaign", "orcid_id"))
    q = q.options(orm.noload('*'))
    q = q.options(orm.subqueryload("badges"))

    # limit to just what we want for the refset
    q = refine_refset_query(q)

    # and do the get
    rows = q.all()

    print u"query finished, now set the values in the lists"
    refset_list_dict = defaultdict(list)
    for person in rows:
        for badge in person.badges:
            # print "BADGE", badge
            # handle the nones below, with the zeros
            if badge.value != None:
                refset_list_dict[badge.name].append(badge.value)

    num_in_refset = num_people_in_refset()

    for name, unsorted_values in refset_list_dict.iteritems():
        print u"refreshing refset {}".format(name)

        assigner = get_badge_assigner(name)
        if assigner.pad_percentiles_with_zeros:
            # pad with zeros for all the people who didn't get the badge
            unsorted_values.extend([0] * (num_in_refset - len(unsorted_values)))

        # now sort
        refset_list_dict[name] = sorted(unsorted_values)

        # now pick out the cutoffs, minimum value at each of 100
        cutoffs = []
        for sublist in chunk_into_n_sublists(refset_list_dict[name], 100):
            sublist_values = sublist
            if sublist_values:
                cutoffs.append(min(sublist_values))

        this_badge_refset = Refset(name=name, cutoffs=cutoffs)
        print u"saving refset {} with cutoffs {}".format(name, cutoffs)

        db.session.merge(this_badge_refset)


    # and finally save it all

    safe_commit(db)
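update_refsets relies on chunk_into_n_sublists(values, 100) to turn each sorted refset into percentile buckets, but that helper isn't shown here. Assuming it splits a list into n contiguous, roughly equal sublists (empty sublists are fine, since the caller skips them), a sketch could be:

def chunk_into_n_sublists(lst, n):
    # hypothetical helper: split lst into exactly n contiguous sublists of
    # near-equal length; some may be empty when len(lst) < n
    k, m = divmod(len(lst), n)
    return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]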
Example #17
def get_pub_from_biblio(biblio, force_refresh=False):
    my_pub = lookup_product_in_db(**biblio)
    if not my_pub:
        my_pub = build_publication(**biblio)

    if force_refresh or not my_pub.evidence:
        my_pub.refresh()
        db.session.merge(my_pub)
        safe_commit(db)

    return my_pub
Example #18
def get_pubs_from_biblio(biblios, force_refresh=False):
    threads = []
    returned_pubs = []
    for biblio in biblios:
        process = Thread(target=thread_result_wrapper,
                         args=[get_pub_from_biblio, (biblio, force_refresh), returned_pubs])
        process.start()
        threads.append(process)
    for process in threads:
        process.join(timeout=30)

    safe_commit(db)
    return returned_pubs
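The threads in get_pubs_from_biblio route their work through a thread_result_wrapper helper that isn't part of this listing. Since Thread itself discards return values, the wrapper presumably just collects them into the shared list; a minimal sketch under that assumption:

def thread_result_wrapper(func, args, results):
    # hypothetical helper: call func with the given args tuple and append
    # the return value to a shared results list owned by the caller
    results.append(func(*args))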
Example #19
def modify_profile_endpoint(orcid_id):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()

    product_id = request.json["product"]["id"]
    my_product = next(my_product for my_product in my_person.products if my_product.id==product_id)
    url = request.json["product"]["fulltext_url"]
    my_product.set_oa_from_user_supplied_fulltext_url(url)

    my_person.recalculate_openness()

    safe_commit(db)

    return json_resp(my_person.to_dict())
Example #20
    def maint(self, **kwargs):
        if parsed_args.id:
            endpoints = Endpoint.query.filter(Endpoint.id == parsed_args.id).all()
        else:
            # endpoints = Endpoint.query.filter(Endpoint.harvest_identify_response==None, Endpoint.error==None).all()
            endpoints = Endpoint.query.filter(Endpoint.harvest_identify_response == None).all()
            shuffle(endpoints)

        for my_endpoint in endpoints:
            my_endpoint.run_diagnostics()
            db.session.merge(my_endpoint)
            safe_commit(db)
            logger.info(u"merged and committed my_endpoint: {}".format(my_endpoint))
Example #21
    def maint(self, **kwargs):
        if parsed_args.id:
            endpoints = Endpoint.query.filter(Endpoint.id == parsed_args.id).all()
        else:
            # endpoints = Endpoint.query.filter(Endpoint.harvest_identify_response==None, Endpoint.error==None).all()
            endpoints = Endpoint.query.filter(Endpoint.harvest_identify_response == None).all()
            shuffle(endpoints)

        for my_endpoint in endpoints:
            my_endpoint.run_diagnostics()
            db.session.merge(my_endpoint)
            safe_commit(db)
            logger.info(u"merged and committed my_endpoint: {}".format(my_endpoint))
Example #22
def modify_profile_endpoint(orcid_id):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()

    product_id = request.json["product"]["id"]
    my_product = next(my_product for my_product in my_person.products if my_product.id==product_id)
    url = request.json["product"]["fulltext_url"]
    my_product.set_oa_from_user_supplied_fulltext_url(url)

    my_person.recalculate_openness()

    safe_commit(db)

    return json_resp(my_person.to_dict())
Example #23
def get_or_make_person(**kwargs):
    res = None

    if 'name' in kwargs and kwargs["name"] == "UNKNOWN":
        # pypi sets unknown people to have the name "UNKNOWN"
        # we don't want to make tons of these, it's just one 'person'.
        res = db.session.query(Person).filter(Person.name == "UNKNOWN").first()

    if 'name' in kwargs and kwargs["name"] == "ORPHANED":
        # cran sets this when the maintainer is gone.
        # we don't want to make tons of these, it's just one 'person'.
        res = db.session.query(Person).filter(
            Person.name == "ORPHANED").first()

    if res is not None:
        return res

    or_filters = []

    if "github_login" in kwargs and kwargs["github_login"]:
        or_filters.append(Person.github_login == kwargs["github_login"])

    elif "email" in kwargs and kwargs["email"]:
        or_filters.append(Person.email == kwargs["email"])

    elif "name" in kwargs and kwargs["name"]:
        incoming_parsed_name = HumanName(kwargs["name"])
        dict_for_matching = {
            "first": incoming_parsed_name.first,
            "last": incoming_parsed_name.last
        }
        or_filters.append(Person.parsed_name.contains(dict_for_matching))

    if or_filters:
        query = db.session.query(Person).filter(or_(*or_filters))
        persons = query.all()
        res = find_best_match(persons, **kwargs)

    if res is not None:
        return res
    else:
        print u"minting a new person using {}".format(kwargs)

        new_person = force_make_person(**kwargs)
        #need this commit to handle matching people added previously in this chunk
        db.session.add(new_person)
        safe_commit(db)
        return new_person
Example #24
def add_all_new_packages(package_class):

    all_current_package_id_rows = db.session.query(package_class.id).all()
    all_current_package_ids = [row[0] for row in all_current_package_id_rows]

    all_names = package_class.get_all_live_package_names()

    for package_name in all_names:
        new_package = package_class(project_name=package_name)
        if new_package.id not in all_current_package_ids:
            print "\n\nadded new package:", new_package.id
            # new_package.refresh()
            db.session.add(new_package)
            safe_commit(db)

    print len(all_names)
Example #25
def delete_person(orcid_id):
    Person.query.filter_by(orcid_id=orcid_id).delete()
    badge.Badge.query.filter_by(orcid_id=orcid_id).delete()
    product.Product.query.filter_by(orcid_id=orcid_id).delete()
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
Example #26
def set_person_email(orcid_id, email, high_priority=False):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.email = email
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
Example #27
    def add_pmh_record(self, **kwargs):
        endpoint_id = kwargs.get("id", None)
        record_id = kwargs.get("recordid")
        my_repo = Endpoint.query.get(endpoint_id)
        print "my_repo", my_repo
        my_pmh_record = my_repo.get_pmh_record(record_id)
        my_pmh_record.mint_pages()

        # for my_page in my_pmh_record.pages:
        #     print "my_page", my_page
        #     my_page.scrape()
        my_pmh_record.delete_old_record()
        db.session.merge(my_pmh_record)
        # print my_pmh_record.pages

        safe_commit(db)
Example #28
def set_person_email(orcid_id, email, high_priority=False):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.email = email
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
Example #29
def pull_from_orcid(orcid_id, high_priority=False):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.refresh(refsets, high_priority=high_priority)
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
Example #30
def save_openness_log(my_person):
    # make a new log
    new_openness_log = LogOpenness()
    new_openness_log.set_openness_columns(my_person)

    # see if we already have a log the same as this.  if so, nothing to do, return.
    q = LogOpenness.query.filter_by(orcid_id=my_person.orcid_id).order_by(
        LogOpenness.created.desc())
    most_recent_log = q.first()
    if most_recent_log:
        if new_openness_log.has_same_openness(most_recent_log):
            print u"no new openness to log for {}".format(my_person.orcid_id)
            return

    # nope!  is worth logging.  finish adding attributes and store in db
    new_openness_log.id = shortuuid.uuid()[0:10]
    new_openness_log.created = datetime.datetime.utcnow().isoformat()
    new_openness_log.orcid_id = my_person.orcid_id
    db.session.add(new_openness_log)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on new_openness_log {}".format(
            new_openness_log.orcid_id)
    print u"logged new openness for {}".format(my_person.orcid_id)
    return
Example #31
def link_twitter(orcid_id, twitter_creds):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.twitter_creds = twitter_creds


    oauth = OAuth1Session(
        os.getenv('TWITTER_CONSUMER_KEY'),
        client_secret=os.getenv('TWITTER_CONSUMER_SECRET'),
        resource_owner_key=twitter_creds["oauth_token"],
        resource_owner_secret=twitter_creds["oauth_token_secret"]
    )
    url = "https://api.twitter.com/1.1/account/verify_credentials.json?include_email=true"

    r = oauth.get(url)
    full_twitter_profile = r.json()
    # print "we got this back from Twitter!", full_twitter_profile

    full_twitter_profile.update(twitter_creds)
    my_person.twitter_creds = full_twitter_profile
    if my_person.email is None:
        my_person.email = full_twitter_profile["email"]

    my_person.twitter = full_twitter_profile["screen_name"]

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
    return my_person
Example #32
def load_campaign(filename, campaign=None, limit=None):

    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")
        print "found {} ORCID lines".format(len(lines))

    print len(lines)

    if limit:
        lines = lines[:limit]

    total_start = time()
    row_num = 0
    for line in lines:
        row_num += 1

        # can have # as comments
        if line.startswith("#"):
            print "skipping comment line"
            continue

        loop_start = time()
        email = None
        twitter = None

        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(
                    line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            print u"row {}, already have person {}, skipping".format(
                row_num, orcid_id)
        else:
            print u"row {}, making person {}".format(row_num, orcid_id)
            my_person = make_person(orcid_id, high_priority=False)
            my_person.campaign = campaign
            my_person.email = email
            my_person.twitter = twitter
            db.session.merge(my_person)
            commit_success = safe_commit(db)
            if not commit_success:
                print u"COMMIT fail on {}".format(my_person.orcid_id)

        print "row {}: finished {} in {}s\n".format(row_num, orcid_id,
                                                    elapsed(loop_start))

    print "finished load_campaign on {} profiles in {}s\n".format(
        len(lines), elapsed(total_start))
Example #33
    def _copy_raw_to_s3(self,
                        filename,
                        package_id,
                        num_rows=None,
                        error=None,
                        error_details=None):
        if u"." in filename:
            suffix = u".{}".format(filename.split(u".")[-1])
        else:
            suffix = u""

        object_name = "{}_{}{}".format(package_id, self.file_type_label(),
                                       suffix)
        bucket_name = self._raw_s3_bucket()

        s3_client.upload_file(filename, bucket_name, object_name)

        with get_db_cursor() as cursor:
            command = "delete from jump_raw_file_upload_object where package_id = '{}' and file = '{}'".format(
                package_id, self.file_type_label())
            cursor.execute(command)

        if error and not error_details:
            error_details_dict = {
                "no_useable_rows": "No usable rows found.",
                "error_reading_file": "Error reading this file. Try opening this file, save in .xlsx format, and upload that.",
            }
            error_details = error_details_dict.get(
                error,
                "Error processing file. Please email this file to [email protected] so the Unsub team can look into the problem."
            )

        new_object = RawFileUploadObject(package_id=package_id,
                                         file=self.file_type_label(),
                                         bucket_name=bucket_name,
                                         object_name=object_name,
                                         num_rows=num_rows,
                                         error=error,
                                         error_details=error_details)

        db.session.add(new_object)
        safe_commit(db)

        return "s3://{}/{}".format(bucket_name, object_name)
Example #34
def refresh_profile(orcid_id, high_priority=False):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.refresh(high_priority=high_priority)
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
    return my_person
Example #35
def update_refsets():
    print u"getting the badge percentile refsets...."
    refset_list_dict = defaultdict(list)
    q = db.session.query(
        Badge.name,
        Badge.value,
    )
    q = q.filter(Badge.value != None)
    rows = q.all()

    print u"query finished, now set the values in the lists"
    for row in rows:
        if row[1]:
            refset_list_dict[row[0]].append(row[1])

    num_in_refset = num_people_in_db()

    for name, unsorted_values in refset_list_dict.iteritems():
        print u"refreshing refset {}".format(name)

        assigner = get_badge_assigner(name)
        if assigner.pad_percentiles_with_zeros:
            # pad with zeros for all the people who didn't get the badge
            unsorted_values.extend([0] * (num_in_refset - len(unsorted_values)))

        # now sort
        # for testing!!!
        refset_list_dict[name] = sorted(unsorted_values)
        # refset_list_dict[name] = sorted(unsorted_values[0:200])

        # now pick out the cutoffs, minimum value at each of 100
        cutoffs = []
        for sublist in chunk_into_n_sublists(refset_list_dict[name], 100):
            sublist_values = sublist
            if sublist_values:
                cutoffs.append(min(sublist_values))

        this_badge_refset = Refset(name=name, cutoffs=cutoffs)
        print u"saving refset {} with cutoffs {}".format(name, cutoffs)

        db.session.merge(this_badge_refset)


    # and finally save it all
    safe_commit(db)
Example #36
    def set_scores(self):
        self.pagerank = 0
        self.num_downloads = 0
        self.num_citations = 0

        for pp in self.get_person_packages():
            # only count up academic packages

            if pp.package.is_academic:
                # only count up impact for packages in our main language
                if pp.package.language == self.main_language:
                    if pp.person_package_pagerank:
                        self.pagerank += pp.person_package_pagerank
                    if pp.person_package_num_downloads:
                        self.num_downloads += pp.person_package_num_downloads
                    if pp.person_package_num_citations:
                        self.num_citations += pp.person_package_num_citations
        safe_commit(db)
Example #37
def make_person(orcid_id, high_priority=False):
    my_person = Person(orcid_id=orcid_id)
    db.session.add(my_person)
    print u"\nmade new person for {}".format(orcid_id)
    my_person.refresh(refsets, high_priority=high_priority)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
    return my_person
Example #38
    def add_pmh_record(self, **kwargs):
        endpoint_id = kwargs.get("id", None)
        record_id = kwargs.get("recordid")
        my_repo = Endpoint.query.get(endpoint_id)
        print "my_repo", my_repo
        my_pmh_record = my_repo.get_pmh_record(record_id)
        print "my_pmh_record", my_pmh_record

        my_pmh_record.mint_pages()

        # for my_page in my_pmh_record.pages:
        #     print "my_page", my_page
        #     my_page.scrape()

        db.session.merge(my_pmh_record)
        # print my_pmh_record.pages

        safe_commit(db)
Example #39
    def worker_run(self, **kwargs):
        chunk_size = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", None)
        queue_no = kwargs.get("queue", 0)

        if limit is None:
            limit = float("inf")

        index = 0
        num_updated = 0
        start_time = time()

        while num_updated < limit:
            new_loop_start_time = time()

            objects = self.fetch_queue_chunk(chunk_size, queue_no)

            if not objects:
                sleep(5)
                continue

            for o in objects:
                o.refresh()

            finish_batch_text = u'''
                update {queue_table}
                set finished = now(), started = null, priority = null
                where id = any(:ids)'''.format(
                queue_table=self.table_name(None))

            finish_batch_command = text(finish_batch_text).bindparams(
                ids=[o.id for o in objects])

            db.session.execute(finish_batch_command)

            commit_start_time = time()
            safe_commit(db) or logger.info(u"COMMIT fail")
            logger.info(u"commit took {} seconds".format(
                elapsed(commit_start_time, 2)))

            index += 1
            num_updated += chunk_size
            self.print_update(new_loop_start_time, len(objects), limit,
                              start_time, index)
Example #40
def run(retry_apis):
    start = time()

    journal_ids = db.session.query(Journal.issn_l).filter(
        or_(
            missing_field_filter(Journal.api_raw_crossref, retry_apis),
            missing_field_filter(Journal.api_raw_issn, retry_apis),
        )).all()

    logger.info('trying to update {} journals'.format(len(journal_ids)))

    chunk_size = 50
    for i in range(0, len(journal_ids), chunk_size):
        id_chunk = journal_ids[i:i + chunk_size]
        journals = Journal.query.filter(Journal.issn_l.in_(id_chunk)).all()

        for journal in journals:
            # try all issns, issn-l first
            issns = set(journal.issns)
            issns.discard(journal.issn_l)
            issns = [journal.issn_l] + list(issns)

            if journal.api_raw_crossref is None or (
                    retry_apis and journal.api_raw_crossref == {}):
                logger.info('getting crossref api response for {}'.format(
                    journal.issn_l))
                journal.api_raw_crossref = get_first_response(
                    call_crossref_api, issns) or {}

            if journal.api_raw_issn is None or (retry_apis and
                                                journal.api_raw_issn == {}):
                logger.info('getting issn api response for {}'.format(
                    journal.issn_l))
                journal.api_raw_issn = get_first_response(
                    call_issn_api, issns) or {}

            db.session.merge(journal)

        safe_commit(db)

    db.session.remove()

    logger.info('finished update in {}'.format(
        timedelta(seconds=elapsed(start))))
Example #41
def load_campaign(filename, campaign=None, limit=None):

    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")
        print "found {} ORCID lines".format(len(lines))

    print len(lines)

    if limit:
        lines = lines[:limit]


    total_start = time()
    row_num = 0
    for line in lines:
        row_num += 1

        # can have # as comments
        if line.startswith("#"):
            print "skipping comment line"
            continue

        loop_start = time()
        email = None
        twitter = None

        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        try:
            orcid_id = clean_orcid(dirty_orcid)
        except NoOrcidException:
            try:
                print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(line)
            except UnicodeDecodeError:
                print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
            continue

        my_person = Person.query.filter_by(orcid_id=orcid_id).first()
        if my_person:
            print u"row {}, already have person {}, skipping".format(row_num, orcid_id)
        else:
            print u"row {}, making person {}".format(row_num, orcid_id)
            my_person = make_person(orcid_id, store_in_db=True)
            my_person.campaign = campaign
            my_person.email = email
            my_person.twitter = twitter
            db.session.merge(my_person)
            commit_success = safe_commit(db)
            if not commit_success:
                print u"COMMIT fail on {}".format(my_person.orcid_id)

        print "row {}: finished {} in {}s\n".format(row_num, orcid_id, elapsed(loop_start))

    print "finished load_campaign on {} profiles in {}s\n".format(len(lines), elapsed(total_start))
Example #42
def save_email(orcid_id, contents):
    email = LogEmail()
    email.id = shortuuid.uuid()[0:10]
    email.sent = datetime.datetime.utcnow().isoformat()
    email.orcid_id = orcid_id
    email.contents = dict(contents)
    db.session.add(email)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on email {}".format(email.orcid_id)
Example #43
def save_email(orcid_id, contents):
    email = Email()
    email.id = shortuuid.uuid()[0:10]
    email.sent = datetime.datetime.utcnow().isoformat()
    email.orcid_id = orcid_id
    email.contents = dict(contents)
    db.session.add(email)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on email {}".format(email.orcid_id)
Example #44
def tweeted_quickly(orcid_id):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()

    if not my_person:
            print u"returning 404: orcid profile {} does not exist".format(orcid_id)
            abort_json(404, "That ORCID profile doesn't exist")

    my_person.tweeted_quickly = True
    success = safe_commit(db)
    return json_resp({"resp": "success"})
Example #45
def delete_person(orcid_id):
    # also need delete all the badges, products
    product.Product.query.filter_by(orcid_id=orcid_id).delete()
    badge.Badge.query.filter_by(orcid_id=orcid_id).delete()

    # and now delete the person.  have to do this after deleting the stuff above.
    Person.query.filter_by(orcid_id=orcid_id).delete()

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)
Example #46
    def update_fn(self, cls, method_name, objects, index=1):

        # we are in a fork!  dispose of our engine.
        # will get a new one automatically
        # if is pooling, need to do .dispose() instead
        db.engine.dispose()

        start = time()
        num_obj_rows = len(objects)

        # logger.info(u"{pid} {repr}.{method_name}() got {num_obj_rows} objects in {elapsed} seconds".format(
        #     pid=os.getpid(),
        #     repr=cls.__name__,
        #     method_name=method_name,
        #     num_obj_rows=num_obj_rows,
        #     elapsed=elapsed(start)
        # ))

        for count, obj in enumerate(objects):
            start_time = time()

            if obj is None:
                return None

            method_to_run = getattr(obj, method_name)

            # logger.info(u"***")
            logger.info(u"*** #{count} starting {repr}.{method_name}() method".format(
                count=count + (num_obj_rows*index),
                repr=obj,
                method_name=method_name
            ))

            method_to_run()

            logger.info(u"finished {repr}.{method_name}(). took {elapsed} seconds".format(
                repr=obj,
                method_name=method_name,
                elapsed=elapsed(start_time, 4)
            ))

            # for handling the queue
            if not (method_name == "update" and obj.__class__.__name__ == "Pub"):
                obj.finished = datetime.datetime.utcnow().isoformat()
            # db.session.merge(obj)


        start_time = time()
        commit_success = safe_commit(db)
        if not commit_success:
            logger.info(u"COMMIT fail")
        logger.info(u"commit took {} seconds".format(elapsed(start_time, 2)))
        db.session.remove()  # close connection nicely
        return None  # important for if we use this on RQ
Example #47
def save_repo_request_rows(rows):

    with open('out.csv','wb') as f:

        w = csv.DictWriter(f, fieldnames=RepoRequest.list_fieldnames(), encoding='utf-8-sig')

        for row in rows[1:]:  # skip header row
            my_repo_request = RepoRequest()
            my_repo_request.set_id_seed(row[0])
            column_num = 0
            for fieldname in RepoRequest.list_fieldnames():
                if fieldname != "id":
                    setattr(my_repo_request, fieldname, row[column_num])
                    column_num += 1

            w.writerow(my_repo_request.to_dict())
            print u"adding repo request {}".format(my_repo_request)
            db.session.merge(my_repo_request)

        safe_commit(db)
Example #48
def make_person(dirty_orcid_id, high_priority=False):
    orcid_id = clean_orcid(dirty_orcid_id)
    my_person = Person(orcid_id=orcid_id)
    db.session.add(my_person)
    print u"\nin make_person: made new person for {}".format(orcid_id)
    my_person.refresh(high_priority=high_priority)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(orcid_id)

    if my_person.invalid_orcid:
        raise OrcidDoesNotExist

    return my_person
Example #49
def update_fn(cls, method_name, obj_id_list, shortcut_data=None, index=1):

    # we are in a fork!  dispose of our engine.
    # will get a new one automatically
    db.engine.dispose()

    start = time()

    q = db.session.query(cls).options(orm.undefer('*')).filter(cls.id.in_(obj_id_list))

    obj_rows = q.all()
    num_obj_rows = len(obj_rows)
    print "{repr}.{method_name}() got {num_obj_rows} objects in {elapsed}sec".format(
        repr=cls.__name__,
        method_name=method_name,
        num_obj_rows=num_obj_rows,
        elapsed=elapsed(start)
    )

    for count, obj in enumerate(obj_rows):
        start_time = time()

        if obj is None:
            return None

        method_to_run = getattr(obj, method_name)

        print u"\n***\n{count}: starting {repr}.{method_name}() method".format(
            count=count + (num_obj_rows*index),
            repr=obj,
            method_name=method_name
        )

        if shortcut_data:
            method_to_run(shortcut_data)
        else:
            method_to_run()

        print u"finished {repr}.{method_name}(). took {elapsed}sec".format(
            repr=obj,
            method_name=method_name,
            elapsed=elapsed(start_time, 4)
        )

    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail"
    db.session.remove()  # close connection nicely
    return None  # important for if we use this on RQ
Example #50
def create_person(dirty_orcid, campaign=None):

    try:
        orcid_id = clean_orcid(dirty_orcid)
    except NoOrcidException:
        print u"\n\nWARNING: no valid orcid_id in {}; skipping\n\n".format(dirty_orcid)
        raise

    my_person = add_or_overwrite_person_from_orcid_id(orcid_id, high_priority=False)

    if campaign:
        my_person.campaign = campaign
        db.session.add(my_person)
        success = safe_commit(db)
        if not success:
            print u"ERROR!  committing {}".format(my_person.orcid_id)
Example #51
def just_add_twitter(filename, limit=None, create=True):

    with open("data/" + filename, "r") as f:
        lines = f.read().split("\n")
        print "found {} ORCID lines".format(len(lines))

    if limit:
        lines = lines[:limit]

    total_start = time()
    for line in lines:

        loop_start = time()

        email = None
        twitter = None

        if "," in line:
            (dirty_orcid, email, twitter) = line.split(",")
        else:
            dirty_orcid = line

        if twitter:
            twitter = twitter.replace("@", "")
            try:
                orcid_id = clean_orcid(dirty_orcid)
            except NoOrcidException:
                try:
                    print u"\n\nWARNING: no valid orcid_id in line {}; skipping\n\n".format(line)
                except UnicodeDecodeError:
                    print u"\n\nWARNING: no valid orcid_id and line throws UnicodeDecodeError; skipping\n\n"
                continue

            my_person = Person.query.filter_by(orcid_id=orcid_id).first()
            if my_person:
                my_person.twitter = twitter
                db.session.merge(my_person)
                commit_success = safe_commit(db)
                if not commit_success:
                    print u"COMMIT fail on {}".format(orcid_id)
                print u"added twitter {} to {}".format(twitter, orcid_id)
            else:
                print u"no person found with id {}".format(orcid_id)


    print "loaded {} profiles in {}s\n".format(len(lines), elapsed(total_start))
Example #52
def create_person(dirty_orcid, campaign=None, store_in_db=False):

    try:
        orcid_id = clean_orcid(dirty_orcid)
    except NoOrcidException:
        print u"\n\nWARNING: no valid orcid_id in {}; skipping\n\n".format(dirty_orcid)
        raise

    if store_in_db:
        print u"storing in db"
        my_person = make_person(orcid_id, store_in_db=True)
        if campaign:
            my_person.campaign = campaign
            db.session.add(my_person)
            success = safe_commit(db)
            if not success:
                print u"ERROR!  committing {}".format(my_person.orcid_id)
    else:
        print u"NOT storing in db"
        my_person = make_person(orcid_id, store_in_db=False)
        print my_person
Example #53
def add_or_overwrite_person_from_orcid_id(orcid_id,
                                          high_priority=False):

    # if one already there, use it and overwrite.  else make a new one.
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    if my_person:
        db.session.merge(my_person)
        print u"\nusing already made person for {}".format(orcid_id)
    else:
        # make a person with this orcid_id
        my_person = Person(orcid_id=orcid_id)
        db.session.add(my_person)
        print u"\nmade new person for {}".format(orcid_id)

    my_person.refresh(high_priority=high_priority)

    # now write to the db
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(my_person.orcid_id)
    return my_person
Example #54
def refresh_fulltext(orcid_id):
    my_person = Person.query.filter_by(orcid_id=orcid_id).first()
    my_person.recalculate_openness()
    safe_commit(db)
    return json_resp(my_person.to_dict())
Example #55
    def get_unpaywall_events(self, rows=100):
        insights_client = geoip2.webservice.Client(os.getenv("MAXMIND_CLIENT_ID"), os.getenv("MAXMIND_API_KEY"))

        tar_gz_filename = "today-{}.tsv.gz".format(self.first_day)

        execute("rm {}".format(tar_gz_filename), check=False)  # clear it if there is already one there
        command_template = """curl --no-include -o {} -L -H "X-Papertrail-Token: {}" https://papertrailapp.com/api/v1/archives/{}/download"""

        command = command_template.format(tar_gz_filename, os.getenv("PAPERTRAIL_API_KEY"), self.first_day)
        execute(command)
        if execute("ls -lh {}".format(tar_gz_filename), check=False):
            execute("zgrep [email protected] {} > unpaywall_events.txt".format(tar_gz_filename), capture=True, check=False)

        else:
            # no file.  get the files for all the hours instead
            execute("rm unpaywall_events.txt", check=False)  # clear it if there is already one there, because appending
            for hour in range(24):
                day_with_hour = "{}-{:02d}".format(self.first_day, hour)
                command = command_template.format(tar_gz_filename, os.getenv("PAPERTRAIL_API_KEY"), day_with_hour)
                execute(command)
                execute("zgrep [email protected] {} >> unpaywall_events.txt".format(tar_gz_filename), capture=True, check=False)


        # writing into database

        fh = open("unpaywall_events.txt", "r")
        if execute("ls -lh unpaywall_events.txt", check=False):
            num_this_loop = 0
            for line in fh:
                #only keep lines that are the right kind of log lines
                if line and not (u"[email protected]" in line and
                                         u'\toadoi\t' in line and
                                         u'\theroku/router\t' in line and
                                         u'at=info method=GET path="/10' in line):
                    continue

                columns = line.split("\t")
                collected = columns[1]
                if not collected.startswith("20"):
                    # not a valid timestamp, skip this line
                    continue

                # at=info method=GET path="/[email protected]" host=api.oadoi.org request_id=7ae3022c-0dcd-44b7-ae7e-a888d8843d4f fwd="70.666.777.999" dyno=web.6 connect=1ms service=40ms status=200 bytes=774 protocol=https \n
                try:
                    doi = re.findall('path="/(.*)\[email protected]', line)[0]
                    doi = doi.lower()
                    id = re.findall('request_id=(.*?) ', line)[0]
                    ip = re.findall('fwd="(.*)"', line)[0]
                except IndexError:
                    # skip this line, it doesn't have a doi or ip or whatever, continue to next line
                    continue

                # print collected, doi, ip, id
                unpaywall_obj = UnpaywallEvent(doi=doi, ip=ip, collected=collected)
                db.session.merge(unpaywall_obj)
                insights = IpInsights.query.filter(IpInsights.ip==ip).first()
                if not insights:
                    try:
                        response_insights = insights_client.insights(ip)
                    except ValueError:
                        # this is what it throws if bad ip address
                        response_insights = None

                    if response_insights:
                        insight_dict = response_insights.raw
                        for key in ["city", "country", "continent", "registered_country"]:
                            if key in insight_dict and  "names" in insight_dict[key]:
                                insight_dict[key]["name"] = insight_dict[key]["names"]["en"]
                                del insight_dict[key]["names"]
                        for key in ["subdivisions"]:
                            if key in insight_dict:
                                my_list = []
                                for item in insight_dict[key]:
                                    if "names" in item:
                                        item["name"] = item["names"]["en"]
                                        del item["names"]
                                    my_list.append(item)
                                insight_dict[key] = my_list
                        insights = IpInsights(ip=ip, insights=insight_dict)
                        db.session.merge(insights)

                    num_this_loop += 1

                    if num_this_loop > rows:
                        logger.info(u"committing")
                        safe_commit(db)
                        num_this_loop = 0

        logger.info(u"done everything, saving last ones")
        safe_commit(db)
Example #56
def run_through_dois(filename=None, reverse=None, loggly=False):
    total_start = time()
    i = 0
    output_dicts = []
    fh = open(filename, "r")

    lines = fh.readlines()

    if reverse:
        logger.info(u"reverse!")
        lines.reverse()
        i = -1 * len(lines)

    dois = []
    for line in lines:
        dois.append(line.strip())

        # line = line.replace('"', '')
        # if u"," in line:
        #     split_line = line.split(",")
        #     if loggly:
        #         dois.append(split_line[1])
        #     else:
        #         dois.append(split_line[0])
        # else:
        #     dois.append(line.strip())

    # deduplicate, preserving order
    duplicated_dois = dois
    dois = []
    for doi in duplicated_dois:
        if doi not in dois:
            dois.append(doi)

    logger.info(u"length of deduped doi list: {}".format(len(dois)))

    for doi in dois:

        try:
            my_doi = clean_doi(doi)
        except NoDoiException:
            logger.info(u"bad doi: {}".format(doi))
            continue

        if not my_doi:
            logger.info(u"bad doi: {}".format(doi))
            continue

        my_pub = Oab.query.get(my_doi)
        if not my_pub:
            my_pub = Oab()
            db.session.add(my_pub)
        my_pub.id = my_doi
        my_doi_url = "http://doi.org/{}".format(my_doi)
        my_doi_url_encoded = urllib.quote_plus(my_doi_url)
        api_url = "https://api.openaccessbutton.org/availability?url={}".format(my_doi_url_encoded)
        headers = {"content-type": "application/json"}
        r = requests.get(api_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success with oab! with {}".format(my_doi))
            # logger.info(r.json())
            my_pub.api = r.json()
            flag_modified(my_pub, "api")
        else:
            logger.info(u"problem with oab, status_code {}".format(r.status_code))

        dissemin_url = "http://dissem.in/api/{}".format(my_doi)
        r = requests.get(dissemin_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success! with dissemin! with {}".format(my_doi))
            # logger.info(r.json())
            my_pub.dissemin = r.json()
            flag_modified(my_pub, "dissemin")
        else:
            logger.info(u"problem with dissemin, status_code {}".format(r.status_code))

        safe_commit(db)
        i += 1

    logger.info(u"finished {} in {} seconds".format(i, elapsed(total_start, 2)))

    fh.close()
Example #57
    def call_pmh_endpoint(self,
                          first=None,
                          last=None,
                          chunk_size=50,
                          scrape=False):

        start_time = time()
        records_to_save = []
        num_records_updated = 0
        loop_counter = 0
        self.error = None

        (pmh_input_record, pmh_records, error) = self.get_pmh_input_record(first, last)

        if error:
            self.error = u"error in get_pmh_input_record: {}".format(error)
            return

        while pmh_input_record:
            loop_counter += 1
            # create the record
            my_pmh_record = pmh_record.PmhRecord()

            # set its vars
            my_pmh_record.repo_id = self.id_old  # delete once endpoint_ids are all populated
            my_pmh_record.endpoint_id = self.id
            my_pmh_record.rand = random()
            my_pmh_record.populate(pmh_input_record)

            if is_complete(my_pmh_record):
                my_pages = my_pmh_record.mint_pages()
                my_pmh_record.pages = my_pages
                # logger.info(u"made {} pages for id {}: {}".format(len(my_pages), my_pmh_record.id, [p.url for p in my_pages]))
                if scrape:
                    for my_page in my_pages:
                        my_page.scrape_if_matches_pub()
                records_to_save.append(my_pmh_record)
                db.session.merge(my_pmh_record)
                # logger.info(u"my_pmh_record {}".format(my_pmh_record))
            else:
                logger.info(u"pmh record is not complete")
                # print my_pmh_record
                pass

            if len(records_to_save) >= chunk_size:
                num_records_updated += len(records_to_save)
                last_record = records_to_save[-1]
                # logger.info(u"last record saved: {} for {}".format(last_record.id, self.id))
                safe_commit(db)
                records_to_save = []

            if loop_counter % 100 == 0:
                logger.info(u"iterated through 100 more items, loop_counter={} for {}".format(loop_counter, self.id))

            pmh_input_record = self.safe_get_next_record(pmh_records)

        # make sure to get the last ones
        if records_to_save:
            num_records_updated += len(records_to_save)
            last_record = records_to_save[-1]
            logger.info(u"saving {} last ones, last record saved: {} for {}, loop_counter={}".format(
                len(records_to_save), last_record.id, self.id, loop_counter))
            safe_commit(db)
        else:
            logger.info(u"finished loop, but no records to save, loop_counter={}".format(loop_counter))

        # if num_records_updated > 0:
        if True:
            logger.info(u"updated {} PMH records for endpoint_id={}, took {} seconds".format(
                num_records_updated, self.id, elapsed(start_time, 2)))
Example #58
def set_person_claimed_at(my_person):
    my_person.claimed_at = datetime.datetime.utcnow().isoformat()
    db.session.merge(my_person)
    commit_success = safe_commit(db)
    if not commit_success:
        print u"COMMIT fail on {}".format(my_person.orcid_id)
Example #59
def get_chorus_data(starting_offset=0, agency_id=None):
    requests_session = requests.Session()
    retries = Retry(total=10,
                backoff_factor=0.5,
                status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id:
            if int(agency["Agency_Id"]) != int(agency_id):
                print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"])
                continue
        if starting_offset:
            offset = starting_offset
        else:
            offset = 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))
        url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
        limit = 50
        total_results = None
        while total_results==None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit)
            print url
            try:
                r = requests_session.get(url, timeout=360)  # wait for 3 minutes
            except Exception, e:
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8")))
                r = None

            print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1))
            offset += limit

            if r:
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))


                items = data["items"]
                new_objects = []
                for item in items:
                    if item["DOI"]:
                        doi = clean_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

                ids_already_in_db = [id_tuple[0] for id_tuple in db.session.query(Chorus.id).filter(Chorus.id.in_([obj.id for obj in new_objects])).all()]
                objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")

            logger.info(u"sleeping for 2 seconds")
            sleep(2)