Example 1
    def modelcat_abstract_to_sb(self,
                                model_catalog_id=None,
                                convert_to_html=True):
        if model_catalog_id is None:
            model_catalog_id = self.default_catalog_id

        username = input("ScienceBase User Name: ")

        try:
            sb = SbSession().loginc(username)
        except Exception as e:
            return e

        abstract_url = f'{self.pymodelcat_github_raw_path}master/{self.modelcat_abstract_file_name}'

        abstract_content = requests.get(abstract_url).text

        if convert_to_html:
            abstract_content = markdown.markdown(abstract_content)

        item_content = {"id": model_catalog_id, "body": abstract_content}

        try:
            return_content = sb.update_item(item_content)
        except Exception as e:
            return_content = e

        return return_content
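
A minimal usage sketch for the method above, assuming it belongs to the Catbuilder class shown in later examples (the method prompts for a ScienceBase user name itself and loginc then prompts for the password):

cb = Catbuilder()
result = cb.modelcat_abstract_to_sb(convert_to_html=True)
print(result)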
Example 2
 def __init__(self):
     self.sb = SbSession()
     self.params = {
         "max": 1000,
         "fields": "id"
     }
     self.acceptable_system_types = [
         None,
         "Data Release",
         "Folder",
         "Community",
         "Downloadable",
         "Mappable",
         "Map Service"
     ]
     self.acceptable_browse_categories = [
         None,
         "Physical Item",
         "Publication",
         "Data",
         "Project",
         "Image",
         "Map",
         "Data Release - In Progress",
         "Web Site",
         "Collection",
         "Software",
         "Data Release - Under Revision"
     ]
Example 3
def db_connection():
    """
    Makes a connection to the SQLite database by first getting the appropriate data file from the ScienceBase Item,
    checking to see if the file already exists in the local path, and downloading/unzipping if necessary.

    :return: sqlite3 connection to the database
    """
    sb = SbSession()

    source_item = sb.get_item(usnvc_source_item)

    source_data_file = next(
        (f for f in source_item["files"] if f["title"] == "Source Data"
         and f["contentType"] == "application/zip"), None)
    if source_data_file is None:
        return None

    probable_file_name = source_data_file["name"].replace(".zip", "")

    if os.path.exists(probable_file_name):
        return sqlite3.connect(probable_file_name)

    zip_file = sb.download_file(source_data_file["url"],
                                source_data_file["name"])

    with ZipFile(zip_file, 'r') as zip_ref:
        # Assumes the SQLite database is the first (or only) entry in the archive
        db_file = zip_ref.namelist()[0]
        zip_ref.extractall()

    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error:
        return None
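
A short usage sketch, assuming the module-level constant usnvc_source_item points at a ScienceBase item with the zipped SQLite file attached:

conn = db_connection()
if conn is not None:
    # List the tables in the downloaded database as a quick sanity check
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
    print(tables)
    conn.close()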
Example 4
 def test_ancestors_field_in_find_items(self):
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     itemsJson = sb.find_items({'parentId':self.BETA_TEST_COMMUNITY_ID, 'fields':'parentId,ancestors'})
     self.assertTrue(itemsJson['items'] is not None)
     self.assertTrue(isinstance(itemsJson['items'], list))
     self.assertTrue(len(itemsJson['items']) > 0)
     item = itemsJson['items'][0]
     self.assertTrue(isinstance(item['ancestors'], list))
     self.assertTrue(item['parentId'] is not None)
     self.assertTrue(item['parentId'] in item['ancestors'])
Example 5
    def __init__(self, authenticated=False):
        self.authenticated = authenticated
        self.sb_root_url = "https://www.sciencebase.gov/directory/people?format=json"
        self.sb_org_search_url = "https://www.sciencebase.gov/directory/organizations?format=json"
        self.sb_org_root = "https://www.sciencebase.gov/directory/organization/"
        self.sb_person_root = "https://www.sciencebase.gov/directory/person/"
        self.orcid_pattern = r"\d{4}-\d{4}-\d{4}-\w{4}"
        self.sb = SbSession()

        if authenticated:
            self.sb.login(input("User Name: "), getpass("Password: "))
Example 6
 def test_publish_unpublish_item(self):    
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     item = sb.create_item({'title': "ACL Test", 'parentId': self.BETA_TEST_COMMUNITY_ID})
     self.assertFalse(sb.has_public_read(sb.get_permissions(item['id'])))
     acls = sb.publish_item(item['id'])
     self.assertTrue(sb.has_public_read(acls))
     acls = sb.unpublish_item(item['id'])
     self.assertFalse(sb.has_public_read(acls))
Example 7
    def update_sb_person_identifiers(self, person_packages):
        '''
        Works through a list of emails and executes update operations on them to update or insert identifiers into the
        person documents in the ScienceBase Directory in order to facilitate linked data operations. The function uses
        sciencebasepy to establish an authenticated session with requests in order to issue updates to the ScienceBase
        Directory API. This comes with a username and password prompt.

        The function will raise an error if there are invalid email address strings instead of continuing. It will also
        raise an error if any of the emails result in person records that can't be retrieved with
        get_identified_sb_person().

        :param person_packages: One or more dictionaries containing the necessary identifiers to update ScienceBase
        Directory person documents. Keys include a minimum of an email address from which all other processes will be
        triggered. Keys can also include the URL form of the ScienceBase Directory ID, orcid, and wikidata_id.
        :return: Python dictionary containing lists of person documents that were updated and of those that were
        not updated (because there was nothing to update)
        '''
        if not isinstance(person_packages, list):
            person_packages = [person_packages]

        update_package = [self.identified_sb_person(person_package) for person_package in person_packages]

        update_person_records = [i[0] for i in update_package if i[1]]
        no_update_person_records = [i[0] for i in update_package if not i[1]]

        if len(update_person_records) > 0:

            try:
                sb = SbSession()
                sb.login(input("User Name: "), getpass())
            except Exception as e:
                raise ValueError(f"Something went wrong in trying to authenticate to ScienceBase: {e}")

            try:
                for person in update_person_records:
                    put_link = person["link"]["href"]

                    sb._session.headers.update({'Content-Type': 'application/json'})
                    sb._session.put(
                        put_link,
                        data=json.dumps(person),
                        headers={
                            "content-type": "application/json",
                            "accept": "application/json"
                        }
                    )
            except Exception as e:
                raise ValueError(f"Something went wrong trying to send updates to ScienceBase: {e}")

        return {
            "updatedPersonRecords": update_person_records,
            "ignoredPersonRecords": no_update_person_records,
        }
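
A sketch of calling the updater with a single person package; the keys follow the docstring above, the values are placeholders, and directory_instance stands in for whatever object exposes this method:

person_package = {
    "email": "jdoe@usgs.gov",
    "orcid": "0000-0001-2345-6789",
    "wikidata_id": "https://www.wikidata.org/wiki/Q42"
}
result = directory_instance.update_sb_person_identifiers(person_package)
print(len(result["updatedPersonRecords"]), "updated,",
      len(result["ignoredPersonRecords"]), "ignored")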
Example 8
    def __init__(self, username=None):
        self.default_catalog_id = '5e8de96182cee42d134687cc'
        self.meta_bucket_list = [
            "all", "json_response", "microdata", "json-ld", "opengraph",
            "microformat", "rdfa", "meta_content"
        ]

        if username is not None:
            self.sb = SbSession().loginc(username)
        else:
            self.sb = SbSession()

        self.sb_wl = Weblinks()
Example 9
 def test_set_permissions(self):
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     item = sb.create_item({'title': "ACL Test", 'parentId': sb.get_my_items_id()})
     acls = sb.get_permissions(item['id'])
     self.assertFalse('USER:spongebob@bikini_bottom.net' in acls['read']['acl'])
     acls['read']['acl'].append('USER:spongebob@bikini_bottom.net')
     sb.set_permissions(item['id'], acls)
     # Re-fetch permissions so the assertion checks the saved state rather than the local list
     acls = sb.get_permissions(item['id'])
     self.assertTrue('USER:spongebob@bikini_bottom.net' in acls['read']['acl'])
     
     sb.delete_item(item)
Example 10
    def __init__(self, username=None):
        self.default_catalog_id = '5e8de96182cee42d134687cc'
        self.meta_bucket_list = [
            "all", "json_response", "microdata", "json-ld", "opengraph",
            "microformat", "rdfa", "meta_content"
        ]

        if username is not None:
            self.sb = SbSession().loginc(username)
        else:
            self.sb = SbSession()

        self.sb_wl = Weblinks()

        self.pymodelcat_github_raw_path = "https://raw.githubusercontent.com/usgs-biolab/pymodelcat/"
        self.modelcat_abstract_file_name = "usgs_model_cat_abstract.md"
Example 11
    def test_relationships(self):
        sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
        item1 = sb.create_item({'title': "Project", 'parentId': sb.get_my_items_id()})
        item2 = sb.create_item({'title': "Product", 'parentId': sb.get_my_items_id()})
        result = sb.create_related_item_link(item1['id'], item2['id'])
        self.assertTrue(result['itemId'] == item1['id'])
        self.assertTrue(result['relatedItemId'] == item2['id'])

        results = sb.get_item_links(item1['id'])
        self.assertEqual(len(results), 1)

        sb.delete_item(item1)
        sb.delete_item(item2)
Example 12
    def test_upload_shapefile_individual_no_scrape(self):
        sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
        file_dir = 'test/data/FHP_Great_Lakes_Basin_boundary'
        files = ["%s/%s" % (file_dir, f) for f in listdir(file_dir) if isfile(join(file_dir, f))]

        # Updating existing item with shapefile, uploading files individually

        item = sb.create_item({'parentId': sb.get_my_items_id(), 'title':'sciencebasepy Shapefile Upload Test'})
        for f in files:
            item = sb.upload_file_to_item(item, f, scrape_file=False)
        # Delete the item before the assertions to make sure it gets deleted
        sb.delete_item(item)
        self.assertIsNotNone(item)
        self.assertIsNotNone(item['files'])
        self.assertFalse('facets' in item)
Example 13
    def test_upload_shapefile_no_scrape(self):
        sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
        file_dir = 'test/data/FHP_Great_Lakes_Basin_boundary'
        files = ["%s/%s" % (file_dir, f) for f in listdir(file_dir) if isfile(join(file_dir, f))]

        my_items_id = sb.get_my_items_id()
        # Creating new item from shapefile upload
        item = sb.upload_files_and_upsert_item({'parentId': my_items_id}, files, scrape_file=False)
        # Delete the item before the assertions to make sure it gets deleted
        sb.delete_item(item)
        self.assertIsNotNone(item)
        self.assertIsNotNone(item['files'])
        self.assertFalse('facets' in item)
Example 14
 def test_create_get_update_delete_hidden_property(self):
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     new_hidden_property = {'type': 'Note',
                            'value': 'test hidden note create'}
     hidden_property = sb.create_hidden_property(self.SB_CATALOG_ITEM_ID, new_hidden_property)
     hidden_property_id = hidden_property.get("id", None)
     self.assertTrue(hidden_property is not None)
     # print("hiddenpropid"+hidden_property_id)
     self.assertTrue(isinstance(hidden_property_id, int))
     get_hidden_property = sb.get_hidden_property(self.SB_CATALOG_ITEM_ID, str(hidden_property_id))
     self.assertTrue(get_hidden_property is not None)
     self.assertEqual(get_hidden_property.get("id", None), hidden_property_id)
     update_hidden_property = {'type': 'Note',
                    'value': 'test hidden note create'}
     update_hidden_property = sb.update_hidden_property(self.SB_CATALOG_ITEM_ID, str(hidden_property_id), update_hidden_property)
     self.assertTrue(update_hidden_property is not None)
     self.assertEqual(str(update_hidden_property.get("value")), 'test hidden note create')
     delete_hidden_property = sb.delete_hidden_property(self.SB_CATALOG_ITEM_ID, str(hidden_property_id))
     self.assertTrue(delete_hidden_property is not None)
     self.assertEqual(delete_hidden_property, True)
Example 15
class Catbuilder:
    def __init__(self, username=None):
        self.default_catalog_id = '5e8de96182cee42d134687cc'
        self.meta_bucket_list = [
            "all", "json_response", "microdata", "json-ld", "opengraph",
            "microformat", "rdfa", "meta_content"
        ]

        if username is not None:
            self.sb = SbSession().loginc(username)
        else:
            self.sb = SbSession()

        self.sb_wl = Weblinks()

    def create_model_catalog(self,
                             parent_id=None,
                             title="USGS Model Catalog",
                             body=None,
                             delete_if_exists=True):
        if parent_id is None:
            parent_id = self.sb.get_my_items_id()

        if delete_if_exists:
            existing_items = json.loads(
                self.sb.get(
                    f"https://www.sciencebase.gov/catalog/items?format=json&parentId={parent_id}&lq=title:{title}"
                ))

            if existing_items["total"] > 0:
                for item in existing_items["items"]:
                    if item["hasChildren"]:
                        self.sb.delete_items(self.sb.get_child_ids(item["id"]))
                    self.sb.delete_item(item)

        model_catalog_item = {'title': title, 'parentId': parent_id}

        if body is not None:
            model_catalog_item["body"] = body

        return self.sb.create_item(model_catalog_item)

    def get_models(self,
                   model_catalog_id=None,
                   fields='title,webLinks,contacts,tags'):
        if model_catalog_id is None:
            model_catalog_id = self.default_catalog_id

        models = list()

        items = self.sb.find_items({
            'parentId': model_catalog_id,
            'fields': fields,
            'max': 100
        })
        while items and 'items' in items:
            for item in items['items']:
                del item["link"]
                del item["relatedItems"]
                models.append(item)
            items = self.sb.next(items)

        return models

    def load_models_spreadsheet(self,
                                file_path="USGS_models_named_models.xlsx"):
        output_link_columns = [
            "Output", "Output.1", "Output.2", "Output.3", "Output.4"
        ]

        usgs_models = pd.read_excel(file_path)

        # Replace NaN with None (makes it simpler to evaluate values).
        # pd.np was removed in pandas 1.0; this assumes numpy is imported as np.
        usgs_models = usgs_models.replace({np.nan: None})

        # Put all of the output links into a list (makes it easier to process these later)
        usgs_models["output_links"] = usgs_models[
            output_link_columns].values.tolist()
        usgs_models = usgs_models.drop(columns=output_link_columns)

        # Drop any unnamed columns (blanks in the Excel file)
        usgs_models = usgs_models.drop(columns=[
            i for i in list(usgs_models.columns) if i.find("Unnamed") != -1
        ])

        return usgs_models

    def sb_party_to_contact(self, search_term):
        search_result = requests.get(
            f"https://www.sciencebase.gov/directory/people?q={search_term}&format=json&dataset=all&max=10"
        ).json()

        if search_result["total"] == 1:
            person_record = search_result["people"][0]

            sb_contact = {
                "name":
                person_record["displayName"],
                "type":
                "Contact",
                "oldPartyId":
                person_record["id"],
                "contactType":
                person_record["type"],
                "onlineResource":
                f"https://my.usgs.gov/catalog/Global/catalogParty/show/{person_record['id']}",
                "email":
                person_record["email"],
                "active":
                person_record["active"],
                "jobTitle":
                person_record["extensions"]["personExtension"]["jobTitle"],
                "firstName":
                person_record["extensions"]["personExtension"]["firstName"],
                "lastName":
                person_record["extensions"]["personExtension"]["lastName"]
            }

            if "orcId" in person_record.keys():
                sb_contact["orcId"] = person_record["orcId"]

        else:
            sb_contact = {
                "name": search_term,
                "type": "Contact",
                "email": search_term
            }

        return sb_contact

    def sb_web_link(self, url, title="Model Reference Link"):
        return {
            "type": "webLink",
            "typeLabel": "Web Link",
            "uri": url,
            "rel": "related",
            "title": title,
            "hidden": False
        }

    def build_model_documents(self, df_models=None):
        if df_models is None:
            df_models = self.load_models_spreadsheet()

        model_documents = list()

        for index, record in df_models.iterrows():
            new_model_item = {
                # model_catalog is assumed to be the catalog item created elsewhere
                # (e.g., by create_model_catalog); it is not defined in this method
                "parentId": model_catalog["id"],
                "title": record["Model Name"],
                "webLinks": list()
            }

            # Here we take the contact email addresses and use the sb_party_to_contact function
            # to look them up and make proper contacts for ScienceBase
            record_contacts = record["Contact(s)"].split(";")
            if len(record_contacts) > 0:
                new_model_item["contacts"] = [
                    self.sb_party_to_contact(contact)
                    for contact in record_contacts
                ]

            # Here we split the sometimes lists of model reference links and add them to web links
            for link in record["Link"].split(";"):
                new_model_item["webLinks"].append(self.sb_web_link(link))

            # Here we filter down to just output link values not already processed as a
            # model reference and containing an actual value
            for link in [
                    l for l in record["output_links"]
                    if l is not None and len(l.strip()) > 0 and
                    not l in [i["uri"] for i in new_model_item["webLinks"]]
            ]:
                new_model_item["webLinks"].append(
                    self.sb_web_link(link, "Model Output Data"))

            model_documents.append(new_model_item)

        return model_documents

    def model_catalog_list_out(self,
                               model_catalog_id,
                               include_contact=True,
                               include_ref_link=True,
                               include_sb_link=True,
                               return_data=None,
                               write_to_excel=True,
                               file_name="usgs_model_catalog.xlsx"):
        simple_model_list = list()
        column_list = ["Model Name"]
        if include_contact:
            column_list.append("Contact")

        if include_ref_link:
            column_list.append("Model Reference Link")

        if include_sb_link:
            column_list.append("ScienceBase Link")

        items = self.sb.find_items({
            "parentId": model_catalog_id,
            "fields": "title,webLinks,contacts"
        })
        while items and 'items' in items:
            for item in items['items']:
                simple_item = {"Model Name": item['title']}

                if include_contact:
                    simple_item["Contact"] = next(
                        (c["name"] for c in item["contacts"]), None)

                if include_ref_link:
                    simple_item["Model Reference Link"] = next(
                        (l["uri"] for l in item["webLinks"]
                         if l["title"] == "Model Reference Link"), None)

                if include_sb_link:
                    simple_item["ScienceBase Link"] = item["link"]["url"]

                simple_model_list.append(simple_item)

            items = self.sb.next(items)

        df_model_list = pd.DataFrame(simple_model_list)

        if write_to_excel:
            df_model_list.to_excel(file_name, index=False, columns=column_list)

        if return_data == "dataframe":
            return df_model_list

        if return_data == "dict":
            return simple_model_list

    def annotate_model_links(self, models=None, output_format="dataframe"):
        """Runs a list of models through a link annotation process

        :param models: List of ScienceBase Items describing models
        :param output_format: python list of dictionaries or dataframe (default)
        :return: Annotated model items in specified format
        """
        if models is None:
            models = self.get_models()

        annotated_items = list()

        for model in models:
            print([l["uri"] for l in model["webLinks"]])
            annotated_items.append(self.sb_wl.process_web_links(item=model))

        if output_format == "python":
            return annotated_items

        flattened_annotated_models = [
            self.flatten_json(i) for i in annotated_items
        ]

        return pd.DataFrame(flattened_annotated_models)

    def flatten_json(self, y):
        """ From @amirziai https://github.com/amirziai/flatten

        :return: Flattened dictionary suitable for loading to dataframe
        """
        out = {}

        def flatten(x, name=''):
            if type(x) is dict:
                for a in x:
                    flatten(x[a], name + a + '_')
            elif type(x) is list:
                i = 0
                for a in x:
                    flatten(a, name + str(i) + '_')
                    i += 1
            else:
                out[name[:-1]] = x

        flatten(y)
        return out

    def link_miner(self, annotated_item, output_type="dataframe"):
        mined_data = list()

        for link in annotated_item["webLinks"]:
            try:
                record = dict(
                    model_id=annotated_item["id"],
                    model_sb_link=
                    f'https://www.sciencebase.gov/catalog/item/{annotated_item["id"]}',
                    model_title=annotated_item["title"],
                    link_classification=link["title"],
                    link_url=link["uri"])
                record["info_type"] = "title"
                record["info_source"] = "Title Meta Tag"
                record["info_content"] = link["annotation"]["meta_content"][
                    "title"]
                mined_data.append(record)
            except:
                pass

            try:
                record = dict(
                    model_id=annotated_item["id"],
                    model_sb_link=
                    f'https://www.sciencebase.gov/catalog/item/{annotated_item["id"]}',
                    model_title=annotated_item["title"],
                    link_classification=link["title"],
                    link_url=link["uri"])
                record["info_type"] = "abstract"
                record["info_source"] = "Description Meta Tag"
                record["info_content"] = link["annotation"]["meta_content"][
                    "description"]
                mined_data.append(record)
            except:
                pass

            try:
                record = dict(
                    model_id=annotated_item["id"],
                    model_sb_link=
                    f'https://www.sciencebase.gov/catalog/item/{annotated_item["id"]}',
                    model_title=annotated_item["title"],
                    link_classification=link["title"],
                    link_url=link["uri"])
                record["info_type"] = "abstract"
                record["info_source"] = "Abstract Meta Tag"
                record["info_content"] = link["annotation"]["meta_content"][
                    "abstract"]
                mined_data.append(record)
            except:
                pass

            try:
                for prop in link["annotation"]["structured_data"]["microdata"][
                        0]["properties"].keys():
                    record = dict(
                        model_id=annotated_item["id"],
                        model_sb_link=
                        f'https://www.sciencebase.gov/catalog/item/{annotated_item["id"]}',
                        model_title=annotated_item["title"],
                        link_classification=link["title"],
                        link_url=link["uri"])
                    record["info_source"] = f"Microdata Property: {prop}"
                    record["info_type"] = prop
                    record["info_content"] = link["annotation"][
                        "structured_data"]["microdata"][0]["properties"][prop]
                    mined_data.append(record)
            except:
                pass

            try:
                for prop, value in link["annotation"]["structured_data"][
                        "opengraph"][0]["properties"]:
                    record = dict(
                        model_id=annotated_item["id"],
                        model_sb_link=
                        f'https://www.sciencebase.gov/catalog/item/{annotated_item["id"]}',
                        model_title=annotated_item["title"],
                        link_classification=link["title"],
                        link_url=link["uri"])
                    record["info_source"] = f"Microformat Property: {prop}"
                    record["info_type"] = prop
                    record["info_content"] = value
                    mined_data.append(record)
            except:
                pass

            try:
                for prop, value in link["annotation"][
                        "xml_meta_summary"].items():
                    record = dict(
                        model_id=annotated_item["id"],
                        model_sb_link=
                        f'https://www.sciencebase.gov/catalog/item/{annotated_item["id"]}',
                        model_title=annotated_item["title"],
                        link_classification=link["title"],
                        link_url=link["uri"])
                    record["info_source"] = f"XML Metadata Summary: {prop}"
                    record["info_type"] = prop
                    record["info_content"] = value
                    mined_data.append(record)
            except:
                pass

        if output_type == "dataframe":
            return pd.DataFrame(mined_data)
        else:
            return mined_data
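
A hedged end-to-end sketch of how the Catbuilder methods above fit together (the spreadsheet file name is the method default; the user name and catalog title are placeholders):

cb = Catbuilder(username="sciencebase_user@usgs.gov")
model_catalog = cb.create_model_catalog(title="USGS Model Catalog (test)")
df_models = cb.load_models_spreadsheet()
for doc in cb.build_model_documents(df_models=df_models):
    doc["parentId"] = model_catalog["id"]  # build_model_documents assumes a model_catalog in scope
    cb.sb.create_item(doc)
df_report = cb.model_catalog_list_out(model_catalog["id"], return_data="dataframe")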
Example 16
dct_provenance_s = "U.S. Geological Survey"  # could be retrieved from json response, but this works
dc_creator_sm = ["U.S. Geological Survey"]
dc_publisher_sm = ["U.S. Geological Survey"]
dc_language_s = "English"
dc_subject_sm = ["Imagery and Base Maps"]
dct_spatial_sm = [""]  # We don't use spatial keywords at UW

# fields unique to University of Wisconsin
uw_deprioritize_item_b = True  # we want topo records to appear lower in search results so they don't overwhelm other items
uw_notice_s = ""
# end University of Wisconsin

###### End of constants

# Begin processing
sb = SbSession()  # create a new session

print("Processing...")
# Send query to Sciencebase and store json response in "items"
# See https://github.com/usgs/sciencebasepy/blob/master/Searching%20ScienceBase%20with%20ScienceBasePy.ipynb for syntax guidance
items = sb.find_items({
    'ancestors': base_item_id,
    'filter': 'extentQuery={"extent":' + str(extent_id) + '}',
    'fields': fields,
    'max': maxRecords
})
print("Found %s items" % items['total'])

while items and 'items' in items:
    for item in items['items']:
        #print(item)
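        # A hedged sketch of how this loop typically continues: map each item to
        # output fields using the constants defined earlier, then page through the
        # results with sb.next(), the standard sciencebasepy pagination pattern.
        record = {
            "dc_title_s": item.get("title"),
            "dct_provenance_s": dct_provenance_s,
            "dc_creator_sm": dc_creator_sm,
            "dc_publisher_sm": dc_publisher_sm,
            "dc_language_s": dc_language_s,
            "dc_subject_sm": dc_subject_sm,
            "uw_deprioritize_item_b": uw_deprioritize_item_b
        }
        # ... write or collect `record` here ...
    items = sb.next(items)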
Example 17
class Directory:
    def __init__(self, authenticated=False):
        self.authenticated = authenticated
        self.sb_root_url = "https://www.sciencebase.gov/directory/people?format=json"
        self.sb_org_search_url = "https://www.sciencebase.gov/directory/organizations?format=json"
        self.sb_org_root = "https://www.sciencebase.gov/directory/organization/"
        self.sb_person_root = "https://www.sciencebase.gov/directory/person/"
        self.orcid_pattern = r"\d{4}-\d{4}-\d{4}-\w{4}"
        self.sb = SbSession()

        if authenticated:
            self.sb.login(input("User Name: "), getpass("Password: "******"email"
        elif re.search(self.orcid_pattern, criteria):
            q_operator = "q"
            verifier_operator = "orcId"
            verifier_criteria = criteria
        else:
            q_operator = "q"
            criteria = f"{criteria.split()[0]} {criteria.split()[-1]}"
            criteria = unidecode.unidecode(criteria)

        if verifier_operator is not None:
            unique = False

        query_url = f"{self.sb_root_url}&{q_operator}={criteria}"

        try:
            if self.authenticated:
                sb_results = self.sb._session.get(query_url).json()
            else:
                sb_results = requests.get(query_url).json()
        except:
            return None

        if len(sb_results["people"]) == 0 and attempt_last_name:
            name_criteria = criteria.split()[-1]
            query_url = f"{self.sb_root_url}&lastName={name_criteria}"
            try:
                if self.authenticated:
                    sb_results = self.sb._session.get(query_url).json()
                else:
                    sb_results = requests.get(query_url).json()
            except:
                return None
        elif len(sb_results["people"]) == 0 and not attempt_last_name:
            return None

        if unique and len(sb_results["people"]) == 1:
            return sb_results["people"][0]

        if not unique and verifier_operator is not None and verifier_criteria is not None:
            return next(
                (i for i in sb_results["people"] if verifier_operator in i
                 and i[verifier_operator] == verifier_criteria), None)

        if unique and len(sb_results["people"]) > 1:
            list_active = [i for i in sb_results["people"] if i["active"]]
            if len(list_active) == 1:
                return list_active[0]

        if not unique and len(sb_results["people"]) > 1:
            return sb_results["people"]

        return None

    def person_query_urls(self, limit=1000):
        query_url = f"{self.sb_root_url}&max=1"
        r_starter_query = requests.get(query_url).json()
        total_records = int(r_starter_query["total"])
        limit_for_offset = int(limit)
        upper_range = int((total_records / limit_for_offset) + 1)

        result_urls = list()
        for page_num in range(0, upper_range):
            result_urls.append(
                f"{self.sb_root_url}&max={limit}&offset={page_num * limit}")

        return result_urls

    def all_people(self):

        people_listing = list()

        next_url = f"{self.sb_root_url}&max=1000"
        while next_url is not None:
            sb_results = self.sb._session.get(next_url).json()
            if "people" in sb_results and len(sb_results["people"]) > 0:
                people_listing.extend(sb_results["people"])
            if "nextlink" in sb_results:
                next_url = sb_results["nextlink"]["url"]
            else:
                next_url = None

        filtered_people = [
            i for i in people_listing
            if i["distinguishedName"] is not None and i["email"] is not None
            and "OU=Shared Mailboxes" not in i["distinguishedName"]
            and "OU=Service Accounts" not in i["distinguishedName"]
            and "usgs.gov" in i["email"]
        ]

        return filtered_people

    def all_orgs(self):
        org_listing = list()

        next_url = f"{self.sb_org_search_url}&max=1000"
        while next_url is not None:
            sb_results = self.sb._session.get(next_url).json()
            if "organizations" in sb_results and len(
                    sb_results["organizations"]) > 0:
                org_listing.extend(sb_results["organizations"])
            if "nextlink" in sb_results:
                next_url = sb_results["nextlink"]["url"]
            else:
                next_url = None

        return org_listing
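
A minimal usage sketch for the Directory class above (lookup_person is the reconstructed method name noted in the listing; the ORCID value is a placeholder):

directory = Directory()  # anonymous session
person = directory.lookup_person("0000-0001-2345-6789")
orgs = directory.all_orgs()
print(person, len(orgs))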
Example 18
    def test_add_delete_user_acl(self):
        sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
        item = sb.create_item({'title': "ACL Test", 'parentId': sb.get_my_items_id()})
        acls = sb.get_permissions(item['id'])
        # The email address was redacted in the source listing; a placeholder keeps the test coherent
        test_email = 'spongebob@bikini_bottom.net'
        test_acl = 'USER:' + test_email
        self.assertFalse(test_acl in acls['read']['acl'])
        self.assertFalse(test_acl in acls['write']['acl'])

        sb.add_acl_user_read(test_email, item['id'])
        acls = sb.add_acl_user_write(test_email, item['id'])
        self.assertTrue(test_acl in acls['read']['acl'])
        self.assertTrue(test_acl in acls['write']['acl'])

        sb.remove_acl_user_read(test_email, item['id'])
        acls = sb.remove_acl_user_write(test_email, item['id'])
        self.assertFalse(test_acl in acls['read']['acl'])
        self.assertFalse(test_acl in acls['write']['acl'])

        sb.delete_item(item)
Example 19
    def __init__(self,
                 operation_mode="local",
                 cache_root=None,
                 cache_manager=None):
        self.description = "Set of functions for assembling the SGCN database"
        # This item_id gives all 112 state/year combos to process
        self.sgcn_root_item = '56d720ece4b015c306f442d5'

        # This item_id is our test location that gives just a few state/year combos
        #self.sgcn_root_item = '5ef51d8082ced62aaae69f05'  OBSOLETE Don't use

        self.resources_path = 'resources/'
        self.cache_manager = cache_manager

        self.sb = SbSession()
        self.sgcn_base_item = self.get_sb_item_with_retry(self.sgcn_root_item)

        self.historic_national_list_file = next(
            (f["url"] for f in self.sgcn_base_item["files"]
             if f["title"] == "Historic 2005 SWAP National List"), None)
        self.sgcn_itis_overrides_file = next(
            (f["url"] for f in self.sgcn_base_item["files"]
             if f["title"] == "SGCN ITIS Overrides"), None)

        self.sppin_collections = [
            "itis", "worms", "gbif", "ecos", "natureserve", "iucn", "gap"
        ]

        if operation_mode == "local":
            self.source_data_folder = "sgcn"
            self.source_metadata_folder = "sgnc_meta"
            self.mq_folder = "mq"
            self.sppin_folder = "sppin"
            self.raw_data_folder = "raw"

            if cache_root is None:
                if os.getenv("DATA_CACHE") is None:
                    raise ValueError(
                        "When operating this system locally, you must either supply an explicit cache_"
                        "location to a local path or include the DATA_CACHE variable in your environment "
                        "variables.")
                else:
                    self.cache_base = f'{os.getenv("DATA_CACHE")}'
            else:
                self.cache_base = cache_root

            self.source_data_path = f"{self.cache_base}/{self.source_data_folder}"
            self.source_metadata_path = f"{self.cache_base}/{self.source_metadata_folder}"
            self.mq_path = f"{self.cache_base}/{self.mq_folder}"
            self.sppin_path = f"{self.cache_base}/{self.sppin_folder}"
            self.raw_data_path = f"{self.cache_base}/{self.raw_data_folder}"

            # Make sure each local cache directory exists (including sppin_path,
            # which is needed by the sppin cache below)
            for cache_path in (self.cache_base, self.source_data_path,
                               self.source_metadata_path, self.mq_path,
                               self.sppin_path, self.raw_data_path):
                os.makedirs(cache_path, exist_ok=True)

            self.sql_metadata = pysppin.utils.Sql(
                cache_location=self.source_metadata_path)
            self.sql_data = pysppin.utils.Sql(
                cache_location=self.source_data_path)
            self.sql_mq = pysppin.utils.Sql(cache_location=self.mq_path)
            self.sql_sppin = pysppin.utils.Sql(cache_location=self.sppin_path)
        else:
            if self.cache_manager is None:
                raise ValueError(
                    "When operating this system you must supply cache_manager")
            self.raw_data_path = ""
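
A sketch of constructing this class for local processing (the class name Sgcn comes from the fuller listing in a later example; the cache directory is a placeholder, and DATA_CACHE can be set in the environment instead of passing cache_root):

sgcn = Sgcn(operation_mode="local", cache_root="/tmp/sgcn_cache")
print(sgcn.sgcn_base_item["title"])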
Example 20
Contact: [email protected]

Notes
----------
damaccessionnumber = dam_science_id
"""

# Import packages
import requests
import re
from sciencebasepy import SbSession
import pandas as pd
import io
import numpy as np

sb = SbSession()

######################################################################
######################################################################


def get_science_data_url(
        doi_meta="https://api.datacite.org/works/10.5066/P9IGEC9G"):
    """Get url for newest version of Dam Removal Science Database.

    Checks DOI for newest version of the Dam Removal Science Database.
    Returns download url for most recent version of databases as CSV.

    Parameters
    ----------
    doi_meta: str
Example 21
    def test_add_delete_role_acl(self):
        sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
        item = sb.create_item({'title': "ACL Test", 'parentId': sb.get_my_items_id()})
        acls = sb.get_permissions(item['id'])
        self.assertFalse('ROLE:ScienceBase_DataAdmin' in acls['read']['acl'])
        self.assertFalse('ROLE:ScienceBase_DataAdmin' in acls['write']['acl'])

        sb.add_acl_role_read("ScienceBase_DataAdmin", item['id'])
        acls = sb.add_acl_role_write("ScienceBase_DataAdmin", item['id'])
        self.assertTrue('ROLE:ScienceBase_DataAdmin' in acls['read']['acl'])
        self.assertTrue('ROLE:ScienceBase_DataAdmin' in acls['write']['acl'])
    
        sb.remove_acl_role_read("ScienceBase_DataAdmin", item['id'])
        acls = sb.remove_acl_role_write("ScienceBase_DataAdmin", item['id'])
        self.assertFalse('ROLE:ScienceBase_DataAdmin' in acls['read']['acl'])
        self.assertFalse('ROLE:ScienceBase_DataAdmin' in acls['write']['acl'])

        sb.delete_item(item)
Example 22
class Sgcn:
    def __init__(self,
                 operation_mode="local",
                 cache_root=None,
                 cache_manager=None):
        self.description = "Set of functions for assembling the SGCN database"
        # This item_id gives all 112 state/year combos to process
        self.sgcn_root_item = '56d720ece4b015c306f442d5'

        # This item_id is our test location that gives just a few state/year combos
        #self.sgcn_root_item = '5ef51d8082ced62aaae69f05'  OBSOLETE Don't use

        self.resources_path = 'resources/'
        self.cache_manager = cache_manager

        self.sb = SbSession()
        self.sgcn_base_item = self.get_sb_item_with_retry(self.sgcn_root_item)

        self.historic_national_list_file = next(
            (f["url"] for f in self.sgcn_base_item["files"]
             if f["title"] == "Historic 2005 SWAP National List"), None)
        self.sgcn_itis_overrides_file = next(
            (f["url"] for f in self.sgcn_base_item["files"]
             if f["title"] == "SGCN ITIS Overrides"), None)

        self.sppin_collections = [
            "itis", "worms", "gbif", "ecos", "natureserve", "iucn", "gap"
        ]

        if operation_mode == "local":
            self.source_data_folder = "sgcn"
            self.source_metadata_folder = "sgnc_meta"
            self.mq_folder = "mq"
            self.sppin_folder = "sppin"
            self.raw_data_folder = "raw"

            if cache_root is None:
                if os.getenv("DATA_CACHE") is None:
                    raise ValueError(
                        "When operating this system locally, you must either supply an explicit cache_"
                        "location to a local path or include the DATA_CACHE variable in your environment "
                        "variables.")
                else:
                    self.cache_base = f'{os.getenv("DATA_CACHE")}'
            else:
                self.cache_base = cache_root

            self.source_data_path = f"{self.cache_base}/{self.source_data_folder}"
            self.source_metadata_path = f"{self.cache_base}/{self.source_metadata_folder}"
            self.mq_path = f"{self.cache_base}/{self.mq_folder}"
            self.sppin_path = f"{self.cache_base}/{self.sppin_folder}"
            self.raw_data_path = f"{self.cache_base}/{self.raw_data_folder}"

            # Make sure each local cache directory exists (including sppin_path,
            # which is needed by the sppin cache below)
            for cache_path in (self.cache_base, self.source_data_path,
                               self.source_metadata_path, self.mq_path,
                               self.sppin_path, self.raw_data_path):
                os.makedirs(cache_path, exist_ok=True)

            self.sql_metadata = pysppin.utils.Sql(
                cache_location=self.source_metadata_path)
            self.sql_data = pysppin.utils.Sql(
                cache_location=self.source_data_path)
            self.sql_mq = pysppin.utils.Sql(cache_location=self.mq_path)
            self.sql_sppin = pysppin.utils.Sql(cache_location=self.sppin_path)
        else:
            if self.cache_manager is None:
                raise ValueError(
                    "When operating this system you must supply cache_manager")
            self.raw_data_path = ""

    def testWormsAndITISConnections(self):
        print('Testing connection to WoRMS and ITIS...')
        try:
            res = requests.get(
                "http://www.marinespecies.org/rest/AphiaRecordsByName/Typhlatya monae?like=false&marine_only=false&offset="
            )
            print(
                '    http GET http://www.marinespecies.org...: {}'.format(res))
            res = requests.get(
                "https://services.itis.gov/?wt=json&rows=10&q=nameWOInd:Megaptera novaeangliae"
            )
            print('    http GET https://services.itis.gov...: {}'.format(res))
        except Exception as e:
            print('    exception on http GET: {}'.format(e))

    def cache_sgcn_metadata(self, return_data=False):
        '''
        The SGCN collection item contains a number of metadata files that help to control and augment the process of
        building the SGCN integrated database. For running this process locally, it is more efficient to cache these
        data in a Sqlite database that can be referenced rather than having to retrieve them from ScienceBase every
        time they need to be consulted.

        :param return_data: Set to true to return the actual data structures instead of just a list of tables
        :return: List of table names created in caching process
        '''
        sgcn_collection = self.get_sb_item_with_retry(self.sgcn_root_item)

        if return_data:
            table_list = dict()
        else:
            table_list = list()

        for file in sgcn_collection["files"]:
            exception = None
            retries = 5
            start_time = time.time()
            for this_try in range(1, retries):
                backoff = math.pow(2, this_try - 1)
                try:
                    r_file = requests.get(file["url"])
                    if r_file.status_code != 200:
                        reason = "code ({}) {}".format(r_file.status_code,
                                                       r_file.reason)
                        raise Exception(reason)
                    exception = None
                    break
                except Exception as e:
                    print(
                        'failure to fetch : {}. Will retry {} more times...Sleeping ({})'
                        .format(file["url"], retries - this_try, backoff))
                    time.sleep(backoff)
                    exception = e
            if exception:
                elapsed_time = "{:.2f}".format(time.time() - start_time)
                raise Exception(
                    "({} seconds) error trying to fetch : {} : {}".format(
                        elapsed_time, file["url"], exception))

            if file["contentType"] == "text/plain":
                data_content = list()
                for item in r_file.text.split("\n"):
                    data_content.append({"scientific_name": item})
            else:
                data_content = r_file.json()

            if return_data:
                table_list[file["title"]] = data_content

            try:
                self.sql_metadata.bulk_insert("sgcn_meta", file["title"],
                                              data_content)
                if not return_data:
                    table_list.append(file["title"])
            except:
                if not return_data:
                    table_list.append(f'{file["title"]} - ALREADY CACHED')

        return table_list

    def get_sb_item_with_retry(self, sgcn_root_item):
        exception = None
        retries = 5
        start_time = time.time()
        for this_try in range(1, retries):
            try:
                sgcn_collection = self.sb.get_item(sgcn_root_item)
                return sgcn_collection
            except Exception as e:
                backoff = math.pow(2, this_try - 1)
                print(
                    'failure to fetch sgcn_root_item: {}. Will retry {} more times...Sleeping ({})'
                    .format(sgcn_root_item, retries - this_try, backoff))
                time.sleep(backoff)
                exception = e
        elapsed_time = "{:.2f}".format(time.time() - start_time)
        raise Exception(
            "({} elapsed time) error trying to fetch sgcn_root_item: {} : {}".
            format(elapsed_time, sgcn_root_item, exception))

    def check_historic_list(self, scientific_name, metadata_cache=None):
        '''
        This function takes a scientific name and checks to see if it was included in the 2005 SWAP list

        :param scientific_name: Scientific name string
        :param metadata_cache: A dictionary of the metadata used for processing species in the pipeline, optional
        :return: True if the name is in the historic list, otherwise False
        '''
        if metadata_cache:
            return len([
                spec
                for spec in metadata_cache["Historic 2005 SWAP National List"]
                if spec["scientific_name"] == scientific_name
            ]) > 0

        check_records = self.sql_metadata.get_select_records(
            "sgcn_meta", "Historic 2005 SWAP National List",
            "scientific_name = ?", scientific_name)

        if check_records is None:
            return False
        else:
            return True

    def check_itis_override(self, scientific_name, metadata_cache=None):
        '''
        This function takes the original scientific name found in certain source records and finds a corresponding
        ITIS identifier to be used in lieu of name lookup.

        :param scientific_name: Scientific name string
        :param metadata_cache: A dictionary of the metadata used for processing species in the pipeline, optional
        :return: ITIS TSN identifier in URL form
        '''
        if metadata_cache:
            records = [
                spec for spec in metadata_cache["SGCN ITIS Overrides"]
                if spec["ScientificName_original"] == scientific_name
            ]
            if len(records):
                return records[0]["taxonomicAuthorityID"]
            return None

        check_records = self.sql_metadata.get_select_records(
            "sgcn_meta", "SGCN ITIS Overrides", "ScientificName_original = ?",
            scientific_name)

        if check_records is None:
            return None

        return check_records[0]["taxonomicAuthorityID"]

    def cache_raw_data(self):
        '''
        Reading files directly from ScienceBase occasionally failed with a urlopen error, so this function takes
        another approach: it simply downloads all of the source files to a local cache.

        :return: List of files cached
        '''

        processable_items = self.get_processable_items()
        report = {
            "files_written": list(),
            "files_in_cache": list(),
            "file_download_errors": list()
        }

        for item in processable_items:
            file_name = item["source_file_url"].split("%2F")[-1]
            file_path = f"{self.raw_data_path}/{file_name}"

            if os.path.isfile(file_path):
                report["files_in_cache"].append(file_path)
            else:
                try:
                    item_file_content = requests.get(item["source_file_url"])
                    report["files_written"].append(file_path)
                    with open(file_path, "w") as f:
                        f.write(item_file_content.text)
                        f.close()
                except:
                    report["file_download_errors"].append(
                        item["sciencebase_item_id"])
                    pass

        return report

    def get_processable_items(self):
        '''
        Retrieves the items from the ScienceBase collection that have the necessary parameters for processing. It
        checks a process log (not yet in place) to determine whether or not the Process File has already been processed.

        :return: Summarized list of items with just the properties necessary to run the process
        '''
        params = {
            "parentId": self.sgcn_root_item,
            "fields": "title,dates,files,tags",
            "max": 1000
        }

        self.testWormsAndITISConnections()

        items = self.sb.find_items(params)

        source_items = list()
        while items and 'items' in items:
            source_items.extend(items["items"])
            items = self.sb.next(items)

        processable_sgcn_items = [
            {
                "sciencebase_item_id":
                i["link"]["url"],
                "state":
                next(t["name"] for t in i["tags"] if t["type"] == "Place"),
                "year":
                next(d["dateString"] for d in i["dates"]
                     if d["type"] == "Collected"),
                "source_file_url":
                next(f["url"] for f in i["files"]
                     if f["title"] == "Process File"),
                "source_file_date":
                next(f["dateUploaded"] for f in i["files"]
                     if f["title"] == "Process File")
            } for i in source_items
            if next((f for f in i["files"]
                     if f["title"] == "Process File"), None) is not None
        ]

        unprocessed_items = [
            i for i in processable_sgcn_items
            if self.check_source_url(i["source_file_url"]) is None
        ]

        return unprocessed_items

    def get_schema(self, schema):
        schema_file = pkg_resources.resource_filename(
            'pysgcn', f'resources/{schema}.json')

        with open(schema_file, "r") as f:
            schema = json.load(f)
            f.close()

        return schema

    def check_source_url(self, source_file_url):
        '''
        This is intended to check a processing log that doesn't yet exist to see if an item's file has already been
        processed. It should shift to using an API that takes the item and returns an existing processing provenance
        record. The parameters used here are the source file URL and the source file date. We could probably get away
        with just using the URL as that should be unique in the ScienceBase architecture today for any new file loaded
        to the items. In future, though, we may have some other source platform that would need to combine a date on a
        file with some other parameter.

        :param source_file_url: Source file URL from the simplified SGCN source item dictionary
        :return: Intended to return a process log record if an item has been processed; otherwise returns None
        '''
        if self.cache_manager:  # in the pipeline we always process all files
            return None

        return self.sql_data.get_select_records("sgcn", "sgcn",
                                                "source_file_url = ?",
                                                source_file_url)

    def build_sppin_key(self, scientific_name, itis_override_id):
        if itis_override_id is not None:
            return f"TSN:{itis_override_id.split(':')[-1]}"
        else:
            return f"Scientific Name:{scientific_name}"

    def process_sgcn_source_item(self,
                                 item,
                                 output_type="dict",
                                 metadata_cache=None):
        '''
        This function handles the process of pulling a source file from ScienceBase, reading the specified file via
        HTTP into a Pandas dataframe, infusing a little bit of additional source metadata into each record, infusing
        some additional information from the source collection, and then returning a ready and mostly harmonized
        data structure for further processing.

        :param item: Dictionary containing the summarized item message created and queued in the
        get_processable_items function
        :param output_type: Can be one of - dict, dataframe, or json - defaults to dict
        :param metadata_cache: A dictionary of the metadata used for processing species in the pipeline, optional
        :return: Returns a flattened data structure/table in one of a few specified formats
        '''
        file_name = item["source_file_url"].split("%2F")[-1]
        file_path = f"{self.raw_data_path}/{file_name}"

        if os.path.isfile(file_path):
            file_access_path = file_path
        else:
            file_access_path = item["source_file_url"]

        try:
            df_src = pd.read_csv(file_access_path, delimiter="\t")
        except UnicodeDecodeError:
            df_src = pd.read_csv(file_access_path,
                                 delimiter="\t",
                                 encoding='latin1')

        # Make lower case columns to deal with slight variation in source files
        df_src.columns = map(str.lower, df_src.columns)

        # Include the source item identifier
        df_src["sciencebase_item_id"] = item["sciencebase_item_id"]

        # Include a processing date
        df_src["record_processed"] = datetime.utcnow().isoformat()

        # Set the file date and url from the ScienceBase file to each record in the dataset for future reference
        df_src["source_file_date"] = item["source_file_date"]
        df_src["source_file_url"] = item["source_file_url"]

        # Set the state name from the ScienceBase Item tag if needed
        if "state" not in df_src.columns:
            df_src["state"] = item["state"]

        # Set the reporting year from the ScienceBase Item date if needed
        if "year" not in df_src.columns:
            df_src["year"] = item["year"]

        # Get rid of the reported '2005 SWAP' column because we can't count on it and it's too messy
        if "2005 swap" in df_src.columns:
            df_src.drop("2005 swap", axis=1, inplace=True)

        # Standardize naming of the reported taxonomic group column (though we may get rid of this eventually)
        if "taxonomy group" in df_src.columns:
            df_src.rename(columns={"taxonomy group": "taxonomic category"},
                          inplace=True)

        # Take care of the one weird corner case
        if "taxonomy group (use drop down box)" in df_src.columns:
            df_src.rename(columns={
                "taxonomy group (use drop down box)":
                "taxonomic category"
            },
                          inplace=True)

        # Make sure blank common name and taxonomic category values are "", otherwise their value is NaN (invalid json)
        df_src["common name"] = df_src.apply(
            lambda x: ""
            if isinstance(x["common name"], float) else x["common name"],
            axis=1)
        df_src["taxonomic category"] = df_src.apply(lambda x: "" if isinstance(
            x["taxonomic category"], float) else x["taxonomic category"],
                                                    axis=1)

        # Clean up the scientific name string for lookup by applying the clean_scientific_name utility function
        df_src["clean_scientific_name"] = df_src.apply(
            lambda x: common_utils.clean_scientific_name(x["scientific name"]),
            axis=1)

        # Check the historic list and flag any species names that should be considered part of the 2005 National List
        df_src["historic_list"] = df_src.apply(
            lambda x: self.check_historic_list(x["scientific name"],
                                               metadata_cache),
            axis=1)

        # Check to see if there is an explicit ITIS identifier that should be applied to the species name (ITIS Overrides)
        df_src["itis_override_id"] = df_src.apply(
            lambda x: self.check_itis_override(x["scientific name"],
                                               metadata_cache),
            axis=1)

        # Set up the sppin_key property for use in linking other discovered data from sppin processing
        df_src["sppin_key"] = df_src.apply(lambda x: self.build_sppin_key(
            x["clean_scientific_name"], x["itis_override_id"]),
                                           axis=1)

        if output_type == "dataframe":
            return df_src
        elif output_type == "dict":
            return df_src.to_dict("records")
        elif output_type == "json":
            return df_src.to_json(orient="records")
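    # Example call (illustrative): process_sgcn_source_item(item, output_type="dataframe") returns the
    # harmonized table with added columns such as sppin_key, clean_scientific_name, and source_file_url.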

    def cache_item_data(self,
                        item,
                        send_record_to_mq=True,
                        send_spp_to_mq=True):
        '''
        This function handles the process of caching a single SGCN item's data file. If the source file has not
        already been processed, it fires the process_sgcn_source_item function to pull the file from ScienceBase,
        build it into a harmonized dataset, and queue records and species names for further processing as requested.

        :param item: Dictionary containing the summarized item message created and queued in the
        get_processable_items function
        :param send_record_to_mq: Send each extracted source record to message queue for further processing
        :param send_spp_to_mq: Extract species names from the source dataset and send to message queue for processing
        :return: Dataset as a list of record dictionaries
        '''
        if self.check_source_url(item["source_file_url"]) is not None:
            raise ValueError(
                "Source file has already been processed and included in the database"
            )

        dataset = self.process_sgcn_source_item(item)

        if send_record_to_mq:
            for record in dataset:
                self.sql_mq.insert_record(db_name="mq",
                                          table_name="mq_source_records",
                                          record=record,
                                          mq=True)

        if send_spp_to_mq:
            for msg in self.sppin_messages(dataset=dataset):
                self.sql_mq.insert_record(db_name="mq",
                                          table_name="mq_itis_check",
                                          record=msg,
                                          mq=True)

        return dataset

    def sppin_messages(self,
                       dataset=None,
                       scientific_name_list=None,
                       name_source=None):
        '''
        This function takes either a source dataset (list of dicts) or a list of scientific names and packages the
        necessary message structure for further processing. It uses or creates the sppin_key property used
        throughout data generated with pySppIn methods to link information together.

        :param dataset: Source dataset in dictionary format
        :param scientific_name_list: List of names to assemble
        :param name_source: String value with information on where a list of names comes from
        :return: List of message body dictionary structures containing necessary information for executing lookup
        processes
        '''
        if dataset is None and scientific_name_list is None:
            raise ValueError(
                "You must supply either a dataset (list of dicts) or a list of scientific names"
            )

        if dataset is not None and scientific_name_list is not None:
            raise ValueError(
                "You can only process a dataset (list of dicts) or a list of scientific names, not both"
            )

        mq_list = None

        if dataset is not None:
            mq_list = [{
                "source": {
                    "type": "ScienceBase Source File",
                    "sciencebase_source_item":
                    dataset[0]['sciencebase_item_id'],
                    "sciencebase_source_file": dataset[0]['source_file_url'],
                    "sciencebase_source_file_date":
                    dataset[0]['source_file_date']
                },
                "sppin_key": sppin_key
            } for sppin_key in list(set([i["sppin_key"] for i in dataset]))]

        if scientific_name_list is not None:
            mq_list = [{
                "source": {
                    "type": "List of Scientific Names",
                    "name_source": name_source
                },
                "sppin_key": f"Scientific Name:{name}"
            } for name in list(set([n for n in scientific_name_list]))]

        return mq_list
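    # Example message shape for a plain name list (illustrative values):
    # {"source": {"type": "List of Scientific Names", "name_source": "ITIS Search"},
    #  "sppin_key": "Scientific Name:Ursus arctos"}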

    def check_sppin_key(self, message_body, sppin_collections=None):
        '''
        Uses the message_body format of a queued scientific name or identifier, checks and parses the sppin_key
        parameter into its parts, and checks a specified set of Species Information containers for records.

        :param message_body: Dictionary containing a queued species identifier
        :param sppin_collections: List of the SppIn collections to search
        :return: Dictionary containing sppin_key, sppin_key type, sppin_key value, and sppin_data from specified
        collections
        '''
        if "sppin_key" not in message_body.keys():
            raise ValueError(
                "The message body must contain the sppin_key parameter")

        sppin_key_parts = message_body["sppin_key"].split(":")

        if len(sppin_key_parts) < 2:
            raise ValueError(
                "Your sppin_key parameter could not be successfully parsed")

        if sppin_collections is None:
            sppin_collections = self.sppin_collections

        sppin_data = dict()
        for collection in sppin_collections:
            sppin_data[collection] = self.sql_sppin.sppin_key_current_record(
                collection, message_body["sppin_key"])

        return message_body["sppin_key"], sppin_key_parts[0], sppin_key_parts[
            1], sppin_data
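    # Illustrative return value for a TSN-based key (data values assumed):
    # ("TSN:12345", "TSN", "12345", {"itis": <cached record or None>})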

    def process_itis_result(self, itis_result):
        '''
        This function processes a set of results from ITIS to summarize data for use in SGCN, extract additional names
        for processing through information gathering functions, and set up WoRMS processing if a scientific name
        is not found in ITIS.

        :param itis_result: Dictionary with ITIS data structure returned from the pysppin module
        :return: Dictionary containing ITIS summary properties needed for this application and lists of messages for
        name processing in information gathering functions and WoRMS. Any of these can be None.
        '''
        sppin_key = itis_result["sppin_key"]
        sppin_key_type = itis_result["sppin_key"].split(":")[0]
        sppin_key_value = itis_result["sppin_key"].split(":")[1]

        name_list = list()
        if sppin_key_type == "Scientific Name":
            name_list = [sppin_key_value]
        itis_summary_msg = None
        name_queue = None
        worms_queue = None

        # BCB-1556
        class_name = None

        if "data" not in itis_result.keys() or isinstance(
                itis_result["data"], float):
            worms_queue = self.sppin_messages(scientific_name_list=name_list,
                                              name_source="ITIS Search")

        else:
            # BCB-1556
            data = itis_result["data"]
            for datum in data:
                for tax in datum['biological_taxonomy']:
                    if tax['rank'].lower() == "class":
                        class_name = tax['name']
                        break

            name_list.extend([i["nameWInd"] for i in itis_result["data"]])
            name_list.extend([i["nameWOInd"] for i in itis_result["data"]])

            valid_itis_doc = next((i for i in itis_result["data"]
                                   if i["usage"] in ["valid", "accepted"]),
                                  None)

            if valid_itis_doc is None:
                worms_queue = self.sppin_messages(
                    scientific_name_list=name_list, name_source="ITIS Search")
            else:
                itis_summary_msg = itis_result["summary"]
                itis_summary_msg["sppin_key"] = sppin_key

        if len(name_list) > 0:
            name_queue = self.sppin_messages(scientific_name_list=list(
                set(name_list)),
                                             name_source="ITIS Search")

        # BCB-1556
        if itis_summary_msg:
            itis_summary_msg[
                "class_name"] = class_name if class_name else "none"
        return itis_summary_msg, name_queue, worms_queue

    def process_worms_result(self, worms_result):
        '''
        This function processes a result from the WoRMS search function in pysppin and returns a summary result and
        additional names for processing through other information gathering functions.

        :param worms_result: Dictionary with WoRMS result from pysppin
        :return: Summary properties for processing in SGCN and a list of name messages for further processing
        '''
        sppin_key = worms_result["sppin_key"]
        sppin_key_value = worms_result["sppin_key"].split(":")[1]

        name_list = [sppin_key_value]
        name_queue = None
        worms_summary_msg = None

        # BCB-1556
        class_name = None

        if "data" in worms_result.keys():
            name_list.extend(
                [i["scientificname"] for i in worms_result["data"]])

            valid_worms_doc = next(
                (i for i in worms_result["data"] if i["status"] == "accepted"),
                None)

            if valid_worms_doc is not None:
                worms_summary_msg = worms_result["summary"]
                worms_summary_msg["sppin_key"] = sppin_key
                # BCB-1556
                data = worms_result["data"]
                for datum in data:
                    for tax in datum['biological_taxonomy']:
                        if tax['rank'].lower() == "class":
                            class_name = tax['name']
                            break

        if len(name_list) > 0:
            name_queue = self.sppin_messages(scientific_name_list=list(
                set(name_list)),
                                             name_source="WoRMS Search")

        # BCB-1556
        if worms_summary_msg:
            worms_summary_msg[
                "class_name"] = class_name if class_name else "none"

        return worms_summary_msg, name_queue

    def process_sppin_source_search_term(self,
                                         message_queue,
                                         sppin_source,
                                         message_id=None,
                                         message_body=None):
        '''
        This function operates any of the basic pySppIn gatherers that use the sppin_key parameter to lookup by
        Scientific Name or ITIS TSN. It fires the check_sppin_key function to both parse the sppin_key parameter and
        check the cache for an existing record. That function currently defaults to looking for records in the last
        30 days but this can be configured based on the situation.

        :param message_queue: the name of the message queue to process
        :param sppin_source: the species information source to operate against
        :param message_id: identifier for the message containing the search term (if None, the function will fire the
        get_message function to attempt to retrieve a message from the specified queue to process)
        :param message_body: body of the message containing the search term and other details
        :return: Text string indicating whether or not anything was found and cached or if a record already existed
        when the function ran
        '''
        if sppin_source not in self.sppin_collections:
            raise ValueError(
                "The sppin_source parameter must be one of a list of configured data collections"
            )

        if message_id is None:
            message = self.get_message(message_queue)
            if message is None:
                raise ValueError(
                    "There is no available message that can be processed")

            message_id = message["id"]
            message_body = message["body"]

        try:
            sppin_key, sppin_key_type, sppin_key_value, sppin_data = self.check_sppin_key(
                message_body, sppin_collections=[sppin_source])
        except Exception as e:
            return e

        if sppin_data[sppin_source] is not None:
            try:
                self.delete_message(message_queue, message_id)
            except:
                pass
            return f"ALREADY CACHED: {sppin_key}"

        if message_body["source"]["type"] == "ScienceBase Source File":
            name_source = message_body["source"]["sciencebase_source_file"]
            source_date = message_body["source"][
                "sciencebase_source_file_date"]
        else:
            name_source = message_body["source"]["name_source"]
            source_date = datetime.utcnow().isoformat()

        taxa_summary_msg = None
        name_queue = None
        worms_queue = None

        # Run the different types of pysppin processors
        if sppin_source == "itis":
            source_results = pysppin.itis.ItisApi().search(
                sppin_key, name_source=name_source, source_date=source_date)

            taxa_summary_msg, name_queue, worms_queue = self.process_itis_result(
                source_results)

        elif sppin_source == "worms":
            source_results = pysppin.worms.Worms().search(
                sppin_key, name_source="SGCN", source_date=source_date)

            taxa_summary_msg, name_queue = self.process_worms_result(
                source_results)

        elif sppin_source == "gbif":
            source_results = pysppin.gbif.Gbif().summarize_us_species(
                sppin_key, name_source=name_source)

        elif sppin_source == "ecos":
            source_results = pysppin.ecos.Tess().search(sppin_key)

        elif sppin_source == "iucn":
            source_results = pysppin.iucn.Iucn().search_species(
                sppin_key, name_source=name_source)

        elif sppin_source == "natureserve":
            source_results = pysppin.natureserve.Natureserve().search(
                sppin_key, name_source=name_source)

        # Pass on messages to additional queues
        if taxa_summary_msg is not None:
            self.queue_message(queue_name="mq_taxa_summary",
                               message=taxa_summary_msg)

        if name_queue is not None:
            self.queue_message(queue_name=[
                "mq_ecos_check", "mq_iucn_check", "mq_natureserve_check",
                "mq_gbif_check"
            ],
                               message=name_queue)

        if worms_queue is not None:
            self.queue_message(queue_name="mq_worms_check",
                               message=worms_queue)

        # Insert results into appropriate sppin container
        self.cache_sppin(sppin_source=sppin_source, sppin_data=source_results)

        # Delete processed message
        try:
            self.delete_message(message_queue, message_id)
        except:
            pass

        return f"MESSAGE PROCESSED: {sppin_key}"

    def process_sgcn_source_record(self, record):
        '''
        This function processes an individual source record from any SGCN source, validates it against a schema,
        and pushes a valid record into a database.

        :param record: Dictionary containing a single harmonized source record
        :return: nothing
        '''
        schema = self.get_schema("sgcn_source_records_schema")

        for record in pysppin_utils.validate_data(record, schema):
            if record["valid"]:
                self.sql_data.insert_record("sgcn",
                                            "sgcn",
                                            record["record"],
                                            mq=False)
            else:
                self.queue_message(queue_name="mq_invalid_source",
                                   message=record["record"])

    def queue_message(self, queue_name, message):
        if isinstance(queue_name, str):
            if isinstance(message, dict):
                self.sql_mq.insert_record("mq", queue_name, message, mq=True)
            elif isinstance(message, list):
                for msg in message:
                    self.sql_mq.insert_record("mq", queue_name, msg, mq=True)

        elif isinstance(queue_name, list):
            for q in queue_name:
                if isinstance(message, dict):
                    self.sql_mq.insert_record("mq", q, message, mq=True)
                elif isinstance(message, list):
                    for msg in message:
                        self.sql_mq.insert_record("mq", q, msg, mq=True)

    def get_message(self, queue_name):
        return self.sql_mq.get_single_record("mq", queue_name)

    def delete_message(self, queue_name, identifier):
        return self.sql_mq.delete_record("mq", queue_name, identifier)

    def get_records_by_sppin_key(self, sppin_key, ids_only=False):
        records = self.sql_data.get_select_records("sgcn", "sgcn",
                                                   "sppin_key = ?", sppin_key)

        if ids_only and records is not None:
            return [i["id"] for i in records]

        return records

    def update_taxa_summary_data(self, sppin_key, summary):
        '''
        This function infuses taxonomic authority summary properties into master SGCN records based on the sppin_key
        identifier.

        :param sppin_key: Compound key containing the type of value and value, either scientific name or ITIS TSN
        :param summary: Dictionary containing key value pairs of summary information
        :return: Summary list of updates committed
        '''
        taxonomic_authority = summary["taxonomic_authority_url"].split(
            "/")[2].split(".")[1].lower()

        sgcn_records = self.get_records_by_sppin_key(sppin_key)

        if sgcn_records is None or len(sgcn_records) == 0:
            return None

        if taxonomic_authority == "marinespecies":
            ids_to_update = [
                i["id"] for i in sgcn_records
                if i["taxonomic_authority_url"] is None
            ]
        else:
            ids_to_update = [i["id"] for i in sgcn_records]

        return self.sql_data.insert_sppin_props(db_name="sgcn",
                                                table_name="sgcn",
                                                props=summary,
                                                identifiers=ids_to_update)

    def cache_sppin(self, sppin_source, sppin_data, cache_type="sqlite"):
        '''
        Caches sppin data into a data store. The cache_type parameter specifies where to send the data.

        :param sppin_source: Logical name of the sppin source
        :param sppin_data: Dictionary containing sppin data to be cached
        :param cache_type: Set to a particular type to control where the data is sent. Defaults to local processing
        into a sqlite database
        :return: Dependent on the cache_type. In the case of sqlite, returns the unique identifier of the inserted
        record
        '''
        if cache_type == "sqlite":
            return self.sql_sppin.insert_record("sppin", sppin_source,
                                                sppin_data)

        else:
            return None

# Pipeline processing methods

    def validate_data(self, record):
        '''
        This function processes an individual source record from any SGCN source, validates it against a schema,
        and returns whether the record is valid.

        :param record: Dictionary containing a single harmonized source record
        :return: Boolean, True if the data matches the schema, False otherwise
        '''
        schema = self.get_schema("sgcn_source_records_schema")
        validation = pysppin_utils.validate_data(record, schema)

        return validation[0]["valid"]

    # The below methods replace the functionality of process_sppin_source_search_term for the pipeline
    def gather_taxa_summary(self, message):
        '''
        Attempt to create a taxonomic summary from ITIS. If ITIS doesn't have a match, create a taxonomic summary from WoRMS.
        Return the taxonomic summary along with name processing information.

        :param message: message containing the search term and other details
        :return: Dictionary containing ITIS summary properties needed for this application and lists of messages for
        name processing in information gathering functions and WoRMS. Any of these can be None.
        '''
        taxa_summary_msg, name_queue, worms_queue = self.search_itis(message)

        if worms_queue is not None:
            worms_summary = self.search_worms(worms_queue)
            if worms_summary[0] is not None:
                # BCB-1569: This appears to be missing from all WoRMS entries
                if 'commonname' not in worms_summary[0].keys(
                ) and 'common name' in message.keys():
                    worms_summary[0]['commonname'] = message['common name']

            return worms_summary

        if taxa_summary_msg is not None:
            if 'commonname' not in taxa_summary_msg.keys(
            ) and 'common name' in message.keys():
                taxa_summary_msg['commonname'] = message['common name']
        return taxa_summary_msg, name_queue

    def search_itis(self, message):
        '''
        Search the cache for an existing record from ITIS. If none exists, search ITIS. Return the processed ITIS information.

        :param message: Message containing the search term and other details
        :return: Dictionary containing ITIS summary properties needed for this application and lists of messages for
        name processing in information gathering functions and WoRMS. Any of these can be None.
        '''
        message_body = self.sppin_messages(dataset=[message])[0]
        get_data = lambda sppin_key, name_source, source_date: pysppin.itis.ItisApi(
        ).search(sppin_key, name_source=name_source, source_date=source_date)

        source_results = self.create_or_return_cache('itis', message_body,
                                                     get_data)

        return self.process_itis_result(source_results)

    def search_worms(self, message):
        '''
        Search the cache for an existing record from WoRMS. If none exists, search WoRMS.
        Return the processed WoRMS information.

        :param message: Message containing the search term and other details
        :return: Summary properties for processing in SGCN and a list of name messages for further processing
        '''
        get_data = lambda sppin_key, name_source, source_date: pysppin.worms.Worms(
        ).search(sppin_key, name_source="SGCN", source_date=source_date)

        source_results = self.create_or_return_cache('worms', message,
                                                     get_data)

        return self.process_worms_result(source_results)

    def gather_additional_cache_resources(self, name_queue, sppin_source):
        '''
        Search the cache for an existing record from the sppin source. If none exists, create one.

        :param name_queue: Name message for gathering additional data
        :param sppin_source: The species information source to operate against
        '''
        if sppin_source == "gbif":
            source_results = self.create_or_return_cache(
                'gbif', name_queue, self.search_gbif)
        elif sppin_source == "ecos":
            source_results = self.create_or_return_cache(
                'ecos', name_queue, self.search_ecos)
        elif sppin_source == "iucn":
            source_results = self.create_or_return_cache(
                'iucn', name_queue, self.search_iucn)
        elif sppin_source == "natureserve":
            source_results = self.create_or_return_cache(
                'natureserve', name_queue, self.search_natureserve)

    def search_ecos(self, sppin_key, name_source, source_date):
        print('Search ECOS')
        return pysppin.ecos.Tess().search(sppin_key)

    def search_iucn(self, sppin_key, name_source, source_date):
        print('Search IUCN')
        return pysppin.iucn.Iucn().search_species(sppin_key,
                                                  name_source=name_source)

    def search_natureserve(self, sppin_key, name_source, source_date):
        print('Search NatureServe')
        return pysppin.natureserve.Natureserve().search(
            sppin_key, name_source=name_source)

    def search_gbif(self, sppin_key, name_source, source_date):
        print('Search GBIF')
        return pysppin.gbif.Gbif().summarize_us_species(
            sppin_key, name_source=name_source)

    def create_or_return_cache(self, sppin_source, message, get_data):
        '''
        Search the cache for the data. If it doesn't exist, retrieve the data and store it in the cache.
        Return the cached data.

        :param sppin_source: The information source (used to create the cache key)
        :param message: Message containing the search term and other details
        :param get_data: Function to retrieve the data if it's not in the cache
        The function should take 3 params: sppin_key, name_source, source_date
        :return: The results of the sppin source data retrieval
        '''
        message = message if not isinstance(message, list) else message[0]
        if self.cache_manager:
            sppin_key = message["sppin_key"]
            key = "{}:{}".format(sppin_source, sppin_key)

            source_results = self.cache_manager.get_from_cache(key)
            if not source_results:
                name_source, source_date = self.get_source_data(message)
                source_results = get_data(sppin_key, name_source, source_date)
                # THIS SLEEP IS IMPORTANT.  We MUST guarantee that we don't hit the
                # WoRMS site any more than twice per second or they will block us.
                # We originally had this at 0.5 sec, but since our lambdas operate
                # at a concurrency of 2, we have to increase this to 1.0
                if sppin_source == "worms":
                    time.sleep(1.000)
                # Only cache results if they're successfully found
                if self.success(source_results):
                    self.cache_manager.add_to_cache(key, source_results)

            return source_results
        else:
            raise ValueError(
                "A cache_manager must be provided for non local processing.")

    def success(self, source_results):
        if not source_results:
            return False

        if 'processing_metadata' not in source_results:
            return False

        if 'status' not in source_results['processing_metadata']:
            return False

        if not source_results['processing_metadata']['status']:
            return False

        return source_results['processing_metadata']['status'].lower() == "success"
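    # The checks above assume pysppin results carry a structure like
    # {"processing_metadata": {"status": "success", ...}}; anything else is treated as a failed lookup.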

    def get_source_data(self, message_body):
        '''
        Get source data from the message.

        :param message_body: Body of the message containing source details
        :return: The name and the creation date of the source
        '''
        if message_body["source"]["type"] == "ScienceBase Source File":
            name_source = message_body["source"]["sciencebase_source_file"]
            source_date = message_body["source"][
                "sciencebase_source_file_date"]
        else:
            name_source = message_body["source"]["name_source"]
            source_date = datetime.utcnow().isoformat()
        return name_source, source_date
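
# A minimal usage sketch of the pipeline methods above. The Sgcn class name, the cache manager
# instance, and the record values are assumptions for illustration only.
sgcn = Sgcn(cache_manager=my_cache_manager)
record = {
    "sppin_key": "Scientific Name:Ursus arctos",
    "common name": "Brown bear",
    "sciencebase_item_id": "some_item_id",
    "source_file_url": "https://www.sciencebase.gov/catalog/file/get/some_item_id",
    "source_file_date": "2020-01-01T00:00:00"
}
if sgcn.validate_data(record):
    taxa_summary, name_queue = sgcn.gather_taxa_summary(record)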
Ejemplo n.º 23
0
 def test_has_public_read(self):
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     acls = sb.get_permissions(sb.get_my_items_id())
     self.assertFalse(sb.has_public_read(acls))   
Ejemplo n.º 24
0
class Search:
    def __init__(self):
        self.sb = SbSession()
        self.params = {
            "max": 1000,
            "fields": "id"
        }
        self.acceptable_system_types = [
            None,
            "Data Release",
            "Folder",
            "Community",
            "Downloadable",
            "Mappable",
            "Map Service"
        ]
        self.acceptable_browse_categories = [
            None,
            "Physical Item",
            "Publication",
            "Data",
            "Project",
            "Image",
            "Map",
            "Data Release - In Progress",
            "Web Site",
            "Collection",
            "Software",
            "Data Release - Under Revision"
        ]

    def search_snapshot(self, system_type=None, browse_category=None, q=None, fields="id"):
        '''
        Function is designed to return a snapshot of ScienceBase items at a point in time with a processing_metadata
        structure we are using in the Biogeographic Information System. It adds a little bit of logic to the
        sciencebasepy API to handle setting up specific filters of interest to our work.

        :param system_type: If not None, accepts one of the available special item types in ScienceBase
        :param browse_category: If not None, accepts one of the available browse category values in ScienceBase
        :param q: query term(s)
        :param fields: Comma delimited string of ScienceBase Item fields to return
        :return: processing_metadata and list of items returned from search
        '''

        result = bis_utils.processing_metadata()
        result["processing_metadata"]["status"] = "failure"
        result["processing_metadata"]["status_message"] = "Search failed"

        if system_type not in self.acceptable_system_types:
            result["processing_metadata"]["status_message"] = \
                f"systemType must be one of: {self.acceptable_system_types}"
            return result

        if browse_category not in self.acceptable_browse_categories:
            result["processing_metadata"]["status_message"] = \
                f"browseCategory must be one of: {self.acceptable_browse_categories}"
            return result

        parameters = {
            "fields": fields,
            "max": self.params["max"]
        }

        filters = list()

        if system_type is not None:
            filters.append(f"systemType={system_type}")

        if browse_category is not None:
            filters.append(f"browseCategory={browse_category}")

        if len(filters) > 0:
            for index, filter in enumerate(filters):
                parameters[f"filter{index}"] = filter

        if q is not None:
            parameters["q"] = q

#        try:
        items = self.sb.find_items(parameters)
        result["processing_metadata"]["api"] = items['selflink']['url']
        result["parameters"] = parameters
        if len(items['items']) == 0:
            result["processing_metadata"]["status"] = "success"
            result["processing_metadata"]["status_message"] = "no items found"
            return result
        else:
            result["data"] = list()
            while items and 'items' in items:
                result["data"].extend(items['items'])
                items = self.sb.next(items)
            result["processing_metadata"]["status"] = "success"
            result["processing_metadata"]["status_message"] = f'Number items found: {len(result["data"])}'
            return result
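
# A brief usage sketch of the Search class defined above; the filter and query values are illustrative.
searcher = Search()
snapshot = searcher.search_snapshot(browse_category="Data Release - In Progress", q="species")
if snapshot["processing_metadata"]["status"] == "success":
    item_ids = [item["id"] for item in snapshot.get("data", [])]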
Ejemplo n.º 25
0
 def test_print_acls(self): 
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     acls = sb.get_permissions(sb.get_my_items_id())
     sb.print_acls(acls)
Ejemplo n.º 26
0
 def test_get_item(self):
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     item = sb.get_item(self.SB_CATALOG_ITEM_ID)
     self.assertTrue(item is not None)
     self.assertEqual(item['title'], 'ScienceBase Catalog')
Ejemplo n.º 27
0
 def test_ancestors_field_in_get_item(self):
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     item = sb.get_item(self.BETA_TEST_COMMUNITY_ID, {'fields':'parentId,ancestors'})
     self.assertTrue(isinstance(item['ancestors'], list))
     self.assertTrue(item['parentId'] is not None)
     self.assertTrue(item['parentId'] in item['ancestors'])
Ejemplo n.º 28
0
 def test_get_hidden_properties(self):
     sb = SbSession('beta').login(self.TEST_USER, self.TEST_PASSWORD)
     hidden_properties = sb.get_hidden_properties(self.SB_CATALOG_ITEM_ID)
     self.assertTrue(str(hidden_properties.get("value")) is not None)
Ejemplo n.º 29
0
from sciencebasepy import SbSession

FILE_NAME = 'tests/resources/sample_error.png'

sb = SbSession()
CREATE_ITEM = True

# Login to ScienceBase
username = input("Username:  "******"some_item_id"

# Upload a File using GraphQL to the test item
print(sb.upload_cloud_file_to_item(item_id, FILE_NAME))