def make_occurrence_df(CollectionsChoice, SpecimensSplit, InstituteCol, CatalogCol):
    Collections = [CollectionsChoice] * len(SpecimensSplit)
    OccurrenceIDs = []
    for i in range(len(SpecimensSplit)):
        catalognumber = SpecimensSplit.iloc[i, CatalogCol]
        #fullcatalognumber = ['Uw ' + catalognumber]
        Query = {
            "institutioncode": SpecimensSplit.iloc[i, InstituteCol],
            "catalognumber": catalognumber,
            "collectioncode": CollectionsChoice
        }
        api = idigbio.json()  #shorten
        TempRecords = api.search_records(rq=Query)
        ### ! Future: if this query doesn't return anything, there's something very wrong. Flag.
        OccurrenceIDs.append(TempRecords['items'][0]['indexTerms']['occurrenceid'])
    SpecimenNumbers = SpecimensSplit.iloc[:, CatalogCol]
    #FullSpecimenNumbers = ['Uw ' + x for x in list(SpecimenNumbers)]
    SpecimenDictionary = {
        'Institution': list(SpecimensSplit.iloc[:, InstituteCol]),
        'Collection': Collections,
        'CatalogNumber': list(SpecimenNumbers),
        'OccurrenceID': OccurrenceIDs
    }
    SpecimenDf = pd.DataFrame.from_dict(SpecimenDictionary)
    return SpecimenDf
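# A minimal usage sketch for make_occurrence_df above, assuming its module
# imports idigbio and pandas as pd. The DataFrame, collection choice, and
# column positions are hypothetical placeholders; each row must describe a
# specimen that actually exists in iDigBio, otherwise the occurrence-id
# lookup inside the function finds no items and raises an IndexError.
import pandas as pd

specimens = pd.DataFrame({
    "Institution": ["UWBM", "UWBM"],      # column 0 -> InstituteCol
    "CatalogNumber": ["12345", "67890"],  # column 1 -> CatalogCol
})
occurrences = make_occurrence_df("Birds", specimens, InstituteCol=0, CatalogCol=1)
print(occurrences)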
def tree_build():
    opts = {}
    # Hardcoded options, potentially expose
    opts["data_file"] = "data.nex"
    opts["seq_type"] = "dna"
    opts["fields"] = ["uuid"]
    opts["sort"] = ["uuid"]
    opts["rq"] = request.args.get("rq")
    opts["limit"] = request.args.get("limit", 10)

    # Generate a uuid job id
    opts["job_id"] = str(uuid.uuid4())

    idb = idigbio.json()
    results = idb.search_records(rq=opts["rq"], limit=opts["limit"],
                                 fields=opts["fields"], sort=opts["sort"])

    idb_uuids = []
    for rec in results["items"]:
        idb_uuids.append(rec["indexTerms"]["uuid"])

    db = Database()
    opts["raw_seqs"] = {}
    for seq in db.sess.query(Sequence).filter(
            Sequence.idb_uuid.in_(idb_uuids)).filter(Sequence.can_use == True):
        # The "-" char messes up MrBayes even in the taxon name string field.
        # Change that here and it will percolate through the output without
        # affecting the data sources on the front end.
        opts["raw_seqs"][seq.idb_uuid.replace("-", "_")] = seq.seq

    pipeline.delay(opts)

    return jsonify({"job_id": opts["job_id"],
                    "raw_seqs": opts["raw_seqs"],
                    "rq": opts["rq"]})
def get_api_client(current_user=None):
    if current_user is None:
        current_user = get_current_user()
    env = os.getenv("IDIGBIO_ENV", "prod")
    api = idigbio.json(env=env,
                       user=current_user.user_uuid,
                       password=current_user.auth_key)
    return api
def find_options(InstitutionCode, CatalogNumber):
    # For now, use only the first specimen to find the correct collection.
    # Design a query to find all collections in an institution that contain
    # the first specimen number.
    Query = {
        "institutioncode": InstitutionCode,
        "catalognumber": CatalogNumber
    }
    api = idigbio.json()  #shorten
    # Search for records containing first institution code and catalog number
    MyRecordList = api.search_records(rq=Query)
    return MyRecordList
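# A minimal usage sketch for find_options above. The institution code and
# catalog number are hypothetical placeholders, and it assumes each returned
# record's indexTerms carries a collectioncode field; with real values the
# set below would list every collection in that institution holding the specimen.
record_list = find_options("UF", "12345")
collection_codes = {
    rec["indexTerms"].get("collectioncode")
    for rec in record_list.get("items", [])
}
print(collection_codes)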
def matchName(scientific_name):
    # PBDB with BHL OCR Text query
    pbdb = PBDBBHL(scientific_name)

    # Python client for iDigBio
    api = idigbio.json()
    record_list = api.search_records(rq={"scientificname": scientific_name})
    if not record_list:
        # try order level
        record_list = api.search_records(rq={"order": scientific_name})

    # Run through scoring algorithm
    return scoring.scorePubs(pbdb.records, record_list, 3)
def main():
    '''Main function for testing purposes only'''
    # Initialize idigbio API
    api = idigbio.json()
    # Define query dictionary
    rq = {"genus": "panthera"}
    # Assign query results
    result = api.search_records(rq, limit=30)
    table_name = "records1"
    # Create database table
    createSchema(result, table_name)
def main():
    '''Main function for testing purposes only'''
    # Initialize idigbio API
    api = idigbio.json()
    # Define query dictionary
    rq = {"genus": "himantura"}
    # Assign query results
    result = api.search_records(rq, limit=5000)
    table_name = "stingrays"
    # Use query results to create database table with all needed fields
    TableSchemaCreator.createSchema(result, table_name)
    # Populate database table with values in query
    populateTable(result, table_name)
def createTable(rq, table_name, limit=5000):
    '''Create a table in a PostgreSQL database containing the results of a
    query to iDigBio. Takes an rq dictionary (containing query params), a
    table_name string (name of the table to be created in the DB) and a
    limit int (max no. of records to be returned) as arguments.
    '''
    # Initialize idigbio's API
    api = idigbio.json()
    # Conduct query through API
    results = api.search_records(rq, limit)
    # Create table & appropriate fields based on query result
    createSchema(results, table_name)
    # Enter data in query into table
    populateTable(results, table_name)
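# A minimal usage sketch for createTable above. The query and table name are
# hypothetical placeholders, and createSchema/populateTable plus the
# PostgreSQL connection they use are assumed to be configured elsewhere in
# this module.
rq = {"genus": "puma"}
createTable(rq, "puma_records", limit=1000)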
def main():
    api = idigbio.json(
        env="prod",
        user=os.environ.get("IDB_API_UUID"),
        password=os.environ.get("IDB_API_KEY")
    )

    for url in HARVEST_LIST_URLS:
        try:
            r = requests.get(url)
            r.raise_for_status()
            header = None
            for line in csv.reader(StringIO(r.text)):
                if header is None:
                    header = line
                else:
                    line_dict = dict(zip(header, line))
                    mime = "text/csv"
                    if line_dict["archive_format"] == "DwCA":
                        mime = "application/zip"
                    cron_line = line_dict["cron"]
                    if PUSH_ALL or (len(cron_line) > 0 and pycron.is_now(cron_line)):
                        logger.info(line_dict)
                        try:
                            api_r = api.addurl(line_dict["archive_url"],
                                               media_type="guoda",
                                               mime_type=mime)
                            logger.info(api_r)
                        except:
                            # NOTE: api_r may be unbound here if addurl() itself raised.
                            print(api_r.content)
                            logger.exception("Error Pushing URL")
                    else:
                        logger.debug("Skipping %s", line_dict["name"])
        except:
            logger.exception("Error Fetching %s", url)
writer.writerow(outputheaderrow)

# inputset will hold the lines we actually want to work on
inputset = set()
smallinputset = set()
# zerorecordset will hold the lines that don't have any matching records
zerorecordsset = set()

def getmatchingcount(apiobj, thingtocount):
    num = apiobj.count_records(rq={searchfield: thingtocount})
    return num

count = 0
api = idigbio.json()

print("Reading input file, getting counts for each value...")
with open(inputfile, 'r') as f:
    for line in f:
        stripped = line.strip()
        la_record_count = getmatchingcount(api, stripped)
        if header_needs_skipped:
            header_needs_skipped = False
        else:
            if la_record_count > 5:
                inputset.add(stripped)
            elif la_record_count == 0:
                zerorecordsset.add(stripped)
            else:
                # 1-5 matching records: treat as a small input
                smallinputset.add(stripped)
args = parser.parse_args()

buff = ''
hr = '================================================================\n'

buff = buff + hr
buff = buff + '''
           DATA INGESTION STATUS REPORT
'''
buff = buff + datetime.date.today().strftime("%B %d, %Y") + "\n\n"
buff = buff + hr + "\n"

api = idigbio.json()

record_count = locale.format("%d", api.count_records(), grouping=True)
media_record_count = locale.format("%d", api.count_media(), grouping=True)
recordset_count = locale.format("%d", api.count_recordsets(), grouping=True)

# Paused count is the count of recordsets where ingest is true,
# and paused is also true. Will not count paused recordsets where
# ingest is false.
db = PostgresDB()
sql = """
    SELECT count(*) FROM recordsets WHERE ingest = true AND ingest_is_paused = true;
"""
db_r = db.fetchone(sql)
paused_count = db_r["count"]

# Updated recordsets is an approximation based on the number of items
#raw_seqs = {}
#raw_seqs["Tarsius_syrichta"] = "AAGTTTCATTGGAGCCACCACTCTTATAATTGCCCATGGCCTCACCTCCTCCCTATTATTTTGCCTAGCAAATACAAACTACGAACGAGTCCACAGTCGAACAATAGCACTAGCCCGTGGCCTTCAAACCCTATTACCTCTTGCAGCAACATGATGACTCCTCGCCAGCTTAACCAACCTGGCCCTTCCCCCAACAATTAATTTAATCGGTGAACTGTCCGTAATAATAGCAGCATTTTCATGGTCACACCTAACTATTATCTTAGTAGGCCTTAACACCCTTATCACCGCCCTATATTCCCTATATATACTAATCATAACTCAACGAGGAAAATACACATATCATATCAACAATATCATGCCCCCTTTCACCCGAGAAAATACATTAATAATCATACACCTATTTCCCTTAATCCTACTATCTACCAACCCCAAAGTAATTATAGGAACCATGTACTGTAAATATAGTTTAAACAAAACATTAGATTGTGAGTCTAATAATAGAAGCCCAAAGATTTCTTATTTACCAAGAAAGTA-TGCAAGAACTGCTAACTCATGCCTCCATATATAACAATGTGGCTTTCTT-ACTTTTAAAGGATAGAAGTAATCCATCGGTCTTAGGAACCGAAAA-ATTGGTGCAACTCCAAATAAAAGTAATAAATTTATTTTCATCCTCCATTTTACTATCACTTACACTCTTAATTACCCCATTTATTATTACAACAACTAAAAAATATGAAACACATGCATACCCTTACTACGTAAAAAACTCTATCGCCTGCGCATTTATAACAAGCCTAGTCCCAATGCTCATATTTCTATACACAAATCAAGAAATAATCATTTCCAACTGACATTGAATAACGATTCATACTATCAAATTATGCCTAAGCTT"
#raw_seqs["Lemur_catta"] = "AAGCTTCATAGGAGCAACCATTCTAATAATCGCACATGGCCTTACATCATCCATATTATTCTGTCTAGCCAACTCTAACTACGAACGAATCCATAGCCGTACAATACTACTAGCACGAGGGATCCAAACCATTCTCCCTCTTATAGCCACCTGATGACTACTCGCCAGCCTAACTAACCTAGCCCTACCCACCTCTATCAATTTAATTGGCGAACTATTCGTCACTATAGCATCCTTCTCATGATCAAACATTACAATTATCTTAATAGGCTTAAATATGCTCATCACCGCTCTCTATTCCCTCTATATATTAACTACTACACAACGAGGAAAACTCACATATCATTCGCACAACCTAAACCCATCCTTTACACGAGAAAACACCCTTATATCCATACACATACTCCCCCTTCTCCTATTTACCTTAAACCCCAAAATTATTCTAGGACCCACGTACTGTAAATATAGTTTAAA-AAAACACTAGATTGTGAATCCAGAAATAGAAGCTCAAAC-CTTCTTATTTACCGAGAAAGTAATGTATGAACTGCTAACTCTGCACTCCGTATATAAAAATACGGCTATCTCAACTTTTAAAGGATAGAAGTAATCCATTGGCCTTAGGAGCCAAAAA-ATTGGTGCAACTCCAAATAAAAGTAATAAATCTATTATCCTCTTTCACCCTTGTCACACTGATTATCCTAACTTTACCTATCATTATAAACGTTACAAACATATACAAAAACTACCCCTATGCACCATACGTAAAATCTTCTATTGCATGTGCCTTCATCACTAGCCTCATCCCAACTATATTATTTATCTCCTCAGGACAAGAAACAATCATTTCCAACTGACATTGAATAACAATCCAAACCCTAAAACTATCTATTAGCTT"
#raw_seqs["Homo_sapiens"] = "AAGCTTCACCGGCGCAGTCATTCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTGTGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTTA-CGACCCCTTATTTACCGAGAAAGCT-CACAAGAACTGCTAACTCATGCCCCCATGTCTAACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTT"
#raw_seqs["Pan"] = "AAGCTTCACCGGCGCAATTATCCTCATAATCGCCCACGGACTTACATCCTCATTATTATTCTGCCTAGCAAACTCAAATTATGAACGCACCCACAGTCGCATCATAATTCTCTCCCAAGGACTTCAAACTCTACTCCCACTAATAGCCTTTTGATGACTCCTAGCAAGCCTCGCTAACCTCGCCCTACCCCCTACCATTAATCTCCTAGGGGAACTCTCCGTGCTAGTAACCTCATTCTCCTGATCAAATACCACTCTCCTACTCACAGGATTCAACATACTAATCACAGCCCTGTACTCCCTCTACATGTTTACCACAACACAATGAGGCTCACTCACCCACCACATTAATAACATAAAGCCCTCATTCACACGAGAAAATACTCTCATATTTTTACACCTATCCCCCATCCTCCTTCTATCCCTCAATCCTGATATCATCACTGGATTCACCTCCTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTCA-CGACCCCTTATTTACCGAGAAAGCT-TATAAGAACTGCTAATTCATATCCCCATGCCTGACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCCATCCGTTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGTATACTACCATAACCACCTTAACCCTAACTCCCTTAATTCTCCCCATCCTCACCACCCTCATTAACCCTAACAAAAAAAACTCATATCCCCATTATGTGAAATCCATTATCGCGTCCACCTTTATCATTAGCCTTTTCCCCACAACAATATTCATATGCCTAGACCAAGAAGCTATTATCTCAAACTGGCACTGAGCAACAACCCAAACAACCCAGCTCTCCCTAAGCTT"
#opts["raw_seqs"] = raw_seqs

# Build raw seqs from db lookup given ids, consider doing this as a worker
# in the future if we move away from a static database.
#idb_uuids = ["23984",
#             "995440",
#             "81211",
#             "5559384"]

idb = idigbio.json()

rq_ = {"genus": "acer"}
limit_ = 10
fields_ = ["uuid"]
sort_ = ["uuid"]

results = idb.search_records(rq=rq_, limit=limit_, fields=fields_, sort=sort_)
#print(len(results["items"]))
#print(results["items"][0]["indexTerms"]["genus"])
#exit(0)

idb_uuids = []
for rec in results["items"]:
    idb_uuids.append(rec["indexTerms"]["uuid"])
print(idb_uuids)

db = Database()
def idigbioQuery(rq, limit=None):
    '''
    Conduct a user's query to iDigBio using its Python API. Takes an "rq"
    python dictionary containing the user's search terms as the first
    argument and an optional "limit" argument which specifies the no. of
    records returned. Returns a python dictionary containing the results of
    the user's query to iDigBio.
    '''
    # Define API from idigbio library being used
    api = idigbio.json()
    # Determine no. of records the query would return
    record_count = api.count_records(rq)

    '''Case #1: Limit is not given'''
    if limit is None:
        # Simple case: query size is smaller than 5k records, return query as is
        if record_count <= 5000:
            results = api.search_records(rq, limit=5000)
            return results
        # Complex case: query larger than 5k, break up query into 5k sized portions
        if record_count > 5000:
            # Records will be sorted by uuid to avoid overlapping queries
            sort = "uuid"
            # Defining needed parameters for offset management
            offset_jump = 5000                          # No. of records skipped each query, max. 5000
            offset_count = record_count // offset_jump  # No. of offsets needed
            offset = 5000                               # The offset parameter passed to idigbio API
            # First query, initializing results dictionary
            results = api.search_records(rq, limit=offset_jump, sort=sort)
            # Iterate through offsets and perform query
            for i in range(1, offset_count + 1):
                # Conduct query with offset
                query_results = api.search_records(rq, limit=offset_jump,
                                                   sort=sort, offset=offset)
                # Merge offset query results list into results dictionary list
                results["items"].extend(query_results["items"])
                # Iterate to next offset
                offset += offset_jump
            return results

    '''Case #2: Limit has been given'''
    if limit is not None:
        # Simple case: limit given is below or equal to 5000, return query as is
        if limit <= 5000:
            results = api.search_records(rq, limit)
            return results
        # Complex case: limit given is above 5k
        if limit > 5000:
            # Records will be sorted by uuid to avoid overlapping queries
            sort = "uuid"
            # Defining parameters for offset management
            offset_jump = 5000                       # No. of records jumped each query
            offset_count = limit // offset_jump      # No. of offsets needed
            offset_remainder = limit % offset_jump   # Remainder from last offset
            offset = 5000                            # Offset parameter passed to idigbio API
            # Initial query, offset=0, initialization of results dict
            results = api.search_records(rq, limit=offset_jump, sort=sort)
            # Iterate through next offset queries
            for i in range(1, offset_count + 1):
                if i == offset_count:
                    # If last offset, query for what is left over based on the original
                    # limit; skip it entirely when the limit is an exact multiple of 5000.
                    if offset_remainder == 0:
                        break
                    query_results = api.search_records(rq, limit=offset_remainder,
                                                       sort=sort, offset=offset)
                else:
                    # Conduct offset queries, starting at offset = 5000
                    query_results = api.search_records(rq, limit=offset_jump,
                                                       sort=sort, offset=offset)
                # Add the query results to the results dict
                results["items"].extend(query_results["items"])
                # Iterate to next offset
                offset += offset_jump
            return results
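# A minimal usage sketch for idigbioQuery above, assuming its module imports
# idigbio. The query below is a hypothetical example; when the requested
# limit exceeds 5000 records the function pages through the results in
# 5000-record chunks sorted by uuid, so results["items"] can hold more than
# a single API page.
results = idigbioQuery({"family": "felidae"}, limit=12000)
print(len(results["items"]))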