def make_occurrence_df(CollectionsChoice, SpecimensSplit, InstituteCol,
                       CatalogCol):
    """Build a DataFrame pairing each specimen row with an iDigBio occurrence ID.

    For every row of ``SpecimensSplit`` one iDigBio record search is issued
    on institution code, catalog number and ``CollectionsChoice``; the
    ``occurrenceid`` index term of the first hit is kept.

    Args:
        CollectionsChoice: collection code used in every query and repeated
            in the ``Collection`` column of the result.
        SpecimensSplit: table accessed positionally through ``.iloc``
            (presumably a pandas DataFrame -- confirm against the caller).
        InstituteCol: positional index of the institution-code column.
        CatalogCol: positional index of the catalog-number column.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Institution``,
        ``Collection``, ``CatalogNumber`` and ``OccurrenceID``.

    Raises:
        IndexError: if a query returns no records for a specimen.
    """
    # One client is enough for all queries; no need to rebuild it per row.
    api = idigbio.json()
    row_count = len(SpecimensSplit)

    OccurrenceIDs = []
    for i in range(row_count):
        institution = SpecimensSplit.iloc[i, InstituteCol]
        catalognumber = SpecimensSplit.iloc[i, CatalogCol]
        Query = {
            "institutioncode": institution,
            "catalognumber": catalognumber,
            "collectioncode": CollectionsChoice
        }
        items = api.search_records(rq=Query)['items']
        if not items:
            # An empty result means something is very wrong with the input;
            # fail with the same exception type as before, but say which
            # specimen could not be found.
            raise IndexError(
                "No iDigBio record found for institution %r, "
                "catalog number %r, collection %r"
                % (institution, catalognumber, CollectionsChoice))
        OccurrenceIDs.append(items[0]['indexTerms']['occurrenceid'])

    SpecimenDictionary = {
        'Institution': list(SpecimensSplit.iloc[:, InstituteCol]),
        'Collection': [CollectionsChoice] * row_count,
        'CatalogNumber': list(SpecimensSplit.iloc[:, CatalogCol]),
        'OccurrenceID': OccurrenceIDs
    }
    return pd.DataFrame.from_dict(SpecimenDictionary)
Beispiel #2
0
def tree_build():
    """Collect sequences for an iDigBio query and queue a tree-building job.

    The ``rq`` and ``limit`` values are read from ``request.args``
    (``limit`` falls back to 10). The uuids of the matching iDigBio records
    are used to select usable ``Sequence`` rows, the resulting options are
    handed to ``pipeline.delay`` and a JSON response with the job id, the
    raw sequences and the query is returned.
    """
    # Hardcoded options (data_file, seq_type, fields, sort), potentially expose
    opts = {
        "data_file": "data.nex",
        "seq_type": "dna",
        "fields": ["uuid"],
        "sort": ["uuid"],
        "rq": request.args.get("rq"),
        "limit": request.args.get("limit", 10),
        # Freshly generated uuid serves as the job id
        "job_id": str(uuid.uuid4()),
    }

    client = idigbio.json()
    found = client.search_records(rq=opts["rq"], limit=opts["limit"],
                                  fields=opts["fields"], sort=opts["sort"])
    record_uuids = [hit["indexTerms"]["uuid"] for hit in found["items"]]

    database = Database()
    usable_rows = database.sess.query(Sequence).filter(
        Sequence.idb_uuid.in_(record_uuids)).filter(Sequence.can_use == True)
    # The "-" char messes up MrBayes even in the taxon name string field.
    # Swapping it for "_" here percolates through the output without
    # affecting the data sources on the front end.
    opts["raw_seqs"] = {
        row.idb_uuid.replace("-", "_"): row.seq for row in usable_rows
    }

    pipeline.delay(opts)
    payload = {"job_id": opts["job_id"], "raw_seqs": opts["raw_seqs"], "rq": opts["rq"]}
    return jsonify(payload)
Beispiel #3
0
def get_api_client(current_user=None):
    """Return an iDigBio JSON API client authenticated as a user.

    When ``current_user`` is omitted, ``get_current_user()`` supplies it.
    The environment passed to the client comes from the ``IDIGBIO_ENV``
    environment variable and defaults to ``"prod"``.
    """
    user = get_current_user() if current_user is None else current_user
    return idigbio.json(env=os.getenv("IDIGBIO_ENV", "prod"),
                        user=user.user_uuid,
                        password=user.auth_key)
def find_options(InstitutionCode, CatalogNumber):
    """Search iDigBio for records matching an institution code and catalog number.

    For now only the first specimen is used to find the correct collection:
    the query returns every record in the institution that carries that
    specimen's catalog number. The search response is returned unchanged.
    """
    client = idigbio.json()
    return client.search_records(rq={
        "institutioncode": InstitutionCode,
        "catalognumber": CatalogNumber,
    })
def get_api_client(current_user=None):
    """Build an iDigBio JSON API client using a user's credentials.

    Falls back to ``get_current_user()`` when no user is passed. The target
    environment is read from ``IDIGBIO_ENV`` (default ``"prod"``).
    """
    if current_user is None:
        current_user = get_current_user()

    client_kwargs = dict(
        env=os.getenv("IDIGBIO_ENV", "prod"),
        user=current_user.user_uuid,
        password=current_user.auth_key,
    )
    return idigbio.json(**client_kwargs)
def matchName( scientific_name ):
  """Score publications for a name against matching iDigBio records.

  Queries PBDB/BHL through ``PBDBBHL`` and iDigBio by ``scientificname``.
  When the iDigBio search yields no items, the same name is retried as an
  ``order``. Both result sets are passed to ``scoring.scorePubs`` and its
  return value is returned.
  """
  # PBDB with BHL OCR Text query
  pbdb = PBDBBHL( scientific_name )

  # Python client for iDigBio
  api = idigbio.json()
  record_list = api.search_records(rq={"scientificname": scientific_name})

  # The search response is a mapping that stays truthy even with zero hits,
  # so the items list itself has to be checked for the fallback to trigger.
  if not record_list or not record_list.get("items"):
    # try order level
    record_list = api.search_records(rq={"order": scientific_name})

  # Run through scoring algorithm
  return scoring.scorePubs(pbdb.records, record_list, 3)
Beispiel #7
0
def main():
    '''Test driver: query iDigBio for genus "panthera" (limit 30) and pass
       the result to createSchema for the "records1" table.
    '''
    query = {"genus":"panthera"}
    target_table = "records1"

    # Run the query through a fresh idigbio API client
    records = idigbio.json().search_records(query, limit=30)

    # Create database table
    createSchema(records, target_table)
Beispiel #8
0
def main():
    '''Test driver: query iDigBio for genus "himantura" (limit 5000), then
       create and populate the "stingrays" table from the result.
    '''
    query = {"genus": "himantura"}
    target_table = "stingrays"

    # Run the query through a fresh idigbio API client
    records = idigbio.json().search_records(query, limit=5000)

    # First build the table with all needed fields, then fill it with the
    # values from the same query result
    TableSchemaCreator.createSchema(records, target_table)
    populateTable(records, target_table)
Beispiel #9
0
def createTable(rq, table_name, limit=5000):
    '''Create and fill a PostgreSQL table from the results of an idigbio query.

       rq         -- dictionary containing the query params
       table_name -- name of the table to be created in the DB
       limit      -- max no. of records to be returned (default 5000)
    '''
    # Conduct the query through idigbio's API
    client = idigbio.json()
    query_results = client.search_records(rq, limit)

    # Schema first (table & fields derived from the result), then the data
    for step in (createSchema, populateTable):
        step(query_results, table_name)
Beispiel #10
0
def tree_build():
    """Queue a tree-building job for the sequences matching an iDigBio query.

    Reads ``rq`` and ``limit`` (default 10) from ``request.args``, looks up
    the uuids of the matching iDigBio records, gathers the usable
    ``Sequence`` rows for those uuids, passes the assembled options to
    ``pipeline.delay`` and returns a JSON response containing the job id,
    the raw sequences and the query.
    """
    # Hardcoded options, potentially expose
    opts = dict(data_file="data.nex", seq_type="dna")
    opts["fields"] = ["uuid"]
    opts["sort"] = ["uuid"]

    # Caller-supplied query and limit
    opts["rq"] = request.args.get("rq")
    opts["limit"] = request.args.get("limit", 10)

    # Generate a uuid job id
    opts["job_id"] = str(uuid.uuid4())

    search_args = {key: opts[key] for key in ("rq", "limit", "fields", "sort")}
    matches = idigbio.json().search_records(**search_args)
    wanted_uuids = [item["indexTerms"]["uuid"] for item in matches["items"]]

    db = Database()
    raw_seqs = opts["raw_seqs"] = {}
    sequence_query = db.sess.query(Sequence).filter(
        Sequence.idb_uuid.in_(wanted_uuids)).filter(Sequence.can_use == True)
    for sequence in sequence_query:
        # The "-" char messes up MrBayes even in the taxon name string field.
        # Replace it here so the change percolates through the output
        # without affecting the data sources on the front end.
        taxon_key = sequence.idb_uuid.replace("-", "_")
        raw_seqs[taxon_key] = sequence.seq

    pipeline.delay(opts)
    response_fields = ("job_id", "raw_seqs", "rq")
    return jsonify({field: opts[field] for field in response_fields})
Beispiel #11
0
def main():
    """Push every harvest-list archive that is currently due to the iDigBio API.

    Each URL in ``HARVEST_LIST_URLS`` is fetched and parsed as CSV whose
    first row is the header. A row is pushed with ``api.addurl`` when
    ``PUSH_ALL`` is set or when its ``cron`` expression is non-empty and
    matches the current time according to ``pycron.is_now``. Rows with
    ``archive_format`` equal to ``"DwCA"`` are sent as ``application/zip``,
    all others as ``text/csv``.

    A failed push is logged and the remaining rows are still processed; a
    failure while fetching or parsing a list is logged and the next URL is
    tried. API credentials come from the ``IDB_API_UUID`` and
    ``IDB_API_KEY`` environment variables.
    """
    api = idigbio.json(
        env="prod",
        user=os.environ.get("IDB_API_UUID"),
        password=os.environ.get("IDB_API_KEY")
    )

    for url in HARVEST_LIST_URLS:
        try:
            r = requests.get(url)
            r.raise_for_status()

            header = None
            for line in csv.reader(StringIO(r.text)):
                if header is None:
                    header = line
                    continue

                line_dict = dict(zip(header, line))

                mime = "text/csv"
                if line_dict["archive_format"] == "DwCA":
                    mime = "application/zip"

                cron_line = line_dict["cron"]
                if not (PUSH_ALL or (cron_line and pycron.is_now(cron_line))):
                    logger.debug("Skipping %s", line_dict["name"])
                    continue

                logger.info(line_dict)
                try:
                    api_r = api.addurl(line_dict["archive_url"], media_type="guoda", mime_type=mime)
                    logger.info(api_r)
                except Exception as exc:
                    # The call itself failed, so there is no return value to
                    # inspect. If the exception carries an HTTP response (as
                    # requests' HTTPError does), log its body for diagnosis.
                    response = getattr(exc, "response", None)
                    if response is not None:
                        logger.error("API response: %s", response.content)
                    logger.exception("Error Pushing URL")

        except Exception:
            logger.exception("Error Fetching %s", url)
    writer.writerow(outputheaderrow)


# Buckets for the lines read from the input file:
#   inputset       - the lines we actually want to work on
#   smallinputset  - presumably the lines with only a few matching records
#                    (TODO confirm; its use is outside this excerpt)
#   zerorecordsset - the lines that don't have any matching records
inputset, smallinputset, zerorecordsset = set(), set(), set()


def getmatchingcount(apiobj, thingtocount):
    """Return how many iDigBio records have ``thingtocount`` in ``searchfield``.

    Args:
        apiobj: API client providing ``count_records(rq=...)``.
        thingtocount: value to look for in the module-level ``searchfield``.

    Returns:
        Whatever ``apiobj.count_records`` returns for that query.
    """
    # Use the client that was passed in rather than the module-level one.
    return apiobj.count_records(rq={searchfield: thingtocount})

# Shared iDigBio client for the per-line lookups; count starts at zero
# (where it is incremented lies outside this excerpt).
api = idigbio.json()
count = 0

print("Reading input file, getting counts for each value...")

with open(inputfile, 'r') as f:
    for line in f:
        stripped = line.strip()
        la_record_count = getmatchingcount(api,stripped)
        if header_needs_skipped:
            header_needs_skipped = False
        else:
            if la_record_count > 5:
                inputset.add(stripped)
            elif la_record_count == 0:
                zerorecordsset.add(stripped)
            else:
Beispiel #13
0
args = parser.parse_args()

# Horizontal rule used above and below the report title.
hr = '================================================================\n'

# Report header: rule, title, today's date, rule. Built in one join
# instead of repeated string concatenation.
buff = ''.join([
    hr,
    '\nDATA INGESTION STATUS REPORT\n',
    datetime.date.today().strftime("%B %d, %Y"),
    "\n\n",
    hr,
    "\n",
])

api = idigbio.json()
# locale.format() is deprecated and removed in Python 3.12;
# locale.format_string() is the supported equivalent and produces the same
# grouped output for a single "%d" specifier.
record_count = locale.format_string("%d", api.count_records(), grouping=True)
media_record_count = locale.format_string("%d", api.count_media(), grouping=True)
recordset_count = locale.format_string("%d", api.count_recordsets(), grouping=True)

# Paused count is the count of recordsets where ingest is true,
# and paused is also true.  Will not count paused recordsets where
# ingest is false.
db = PostgresDB()
sql = """
      SELECT count(*) FROM recordsets WHERE ingest = true AND ingest_is_paused = true;
"""
db_r = db.fetchone(sql)
paused_count = db_r["count"]

# Updated recordsets is an approximation based on the number of items
Beispiel #14
0
#raw_seqs = {}
#raw_seqs["Tarsius_syrichta"] = "AAGTTTCATTGGAGCCACCACTCTTATAATTGCCCATGGCCTCACCTCCTCCCTATTATTTTGCCTAGCAAATACAAACTACGAACGAGTCCACAGTCGAACAATAGCACTAGCCCGTGGCCTTCAAACCCTATTACCTCTTGCAGCAACATGATGACTCCTCGCCAGCTTAACCAACCTGGCCCTTCCCCCAACAATTAATTTAATCGGTGAACTGTCCGTAATAATAGCAGCATTTTCATGGTCACACCTAACTATTATCTTAGTAGGCCTTAACACCCTTATCACCGCCCTATATTCCCTATATATACTAATCATAACTCAACGAGGAAAATACACATATCATATCAACAATATCATGCCCCCTTTCACCCGAGAAAATACATTAATAATCATACACCTATTTCCCTTAATCCTACTATCTACCAACCCCAAAGTAATTATAGGAACCATGTACTGTAAATATAGTTTAAACAAAACATTAGATTGTGAGTCTAATAATAGAAGCCCAAAGATTTCTTATTTACCAAGAAAGTA-TGCAAGAACTGCTAACTCATGCCTCCATATATAACAATGTGGCTTTCTT-ACTTTTAAAGGATAGAAGTAATCCATCGGTCTTAGGAACCGAAAA-ATTGGTGCAACTCCAAATAAAAGTAATAAATTTATTTTCATCCTCCATTTTACTATCACTTACACTCTTAATTACCCCATTTATTATTACAACAACTAAAAAATATGAAACACATGCATACCCTTACTACGTAAAAAACTCTATCGCCTGCGCATTTATAACAAGCCTAGTCCCAATGCTCATATTTCTATACACAAATCAAGAAATAATCATTTCCAACTGACATTGAATAACGATTCATACTATCAAATTATGCCTAAGCTT"
#raw_seqs["Lemur_catta"] = "AAGCTTCATAGGAGCAACCATTCTAATAATCGCACATGGCCTTACATCATCCATATTATTCTGTCTAGCCAACTCTAACTACGAACGAATCCATAGCCGTACAATACTACTAGCACGAGGGATCCAAACCATTCTCCCTCTTATAGCCACCTGATGACTACTCGCCAGCCTAACTAACCTAGCCCTACCCACCTCTATCAATTTAATTGGCGAACTATTCGTCACTATAGCATCCTTCTCATGATCAAACATTACAATTATCTTAATAGGCTTAAATATGCTCATCACCGCTCTCTATTCCCTCTATATATTAACTACTACACAACGAGGAAAACTCACATATCATTCGCACAACCTAAACCCATCCTTTACACGAGAAAACACCCTTATATCCATACACATACTCCCCCTTCTCCTATTTACCTTAAACCCCAAAATTATTCTAGGACCCACGTACTGTAAATATAGTTTAAA-AAAACACTAGATTGTGAATCCAGAAATAGAAGCTCAAAC-CTTCTTATTTACCGAGAAAGTAATGTATGAACTGCTAACTCTGCACTCCGTATATAAAAATACGGCTATCTCAACTTTTAAAGGATAGAAGTAATCCATTGGCCTTAGGAGCCAAAAA-ATTGGTGCAACTCCAAATAAAAGTAATAAATCTATTATCCTCTTTCACCCTTGTCACACTGATTATCCTAACTTTACCTATCATTATAAACGTTACAAACATATACAAAAACTACCCCTATGCACCATACGTAAAATCTTCTATTGCATGTGCCTTCATCACTAGCCTCATCCCAACTATATTATTTATCTCCTCAGGACAAGAAACAATCATTTCCAACTGACATTGAATAACAATCCAAACCCTAAAACTATCTATTAGCTT"
#raw_seqs["Homo_sapiens"] = "AAGCTTCACCGGCGCAGTCATTCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTGTGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTTA-CGACCCCTTATTTACCGAGAAAGCT-CACAAGAACTGCTAACTCATGCCCCCATGTCTAACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTT"
#raw_seqs["Pan"] = "AAGCTTCACCGGCGCAATTATCCTCATAATCGCCCACGGACTTACATCCTCATTATTATTCTGCCTAGCAAACTCAAATTATGAACGCACCCACAGTCGCATCATAATTCTCTCCCAAGGACTTCAAACTCTACTCCCACTAATAGCCTTTTGATGACTCCTAGCAAGCCTCGCTAACCTCGCCCTACCCCCTACCATTAATCTCCTAGGGGAACTCTCCGTGCTAGTAACCTCATTCTCCTGATCAAATACCACTCTCCTACTCACAGGATTCAACATACTAATCACAGCCCTGTACTCCCTCTACATGTTTACCACAACACAATGAGGCTCACTCACCCACCACATTAATAACATAAAGCCCTCATTCACACGAGAAAATACTCTCATATTTTTACACCTATCCCCCATCCTCCTTCTATCCCTCAATCCTGATATCATCACTGGATTCACCTCCTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTCA-CGACCCCTTATTTACCGAGAAAGCT-TATAAGAACTGCTAATTCATATCCCCATGCCTGACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCCATCCGTTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGTATACTACCATAACCACCTTAACCCTAACTCCCTTAATTCTCCCCATCCTCACCACCCTCATTAACCCTAACAAAAAAAACTCATATCCCCATTATGTGAAATCCATTATCGCGTCCACCTTTATCATTAGCCTTTTCCCCACAACAATATTCATATGCCTAGACCAAGAAGCTATTATCTCAAACTGGCACTGAGCAACAACCCAAACAACCCAGCTCTCCCTAAGCTT"
#opts["raw_seqs"] = raw_seqs

# Build raw seqs from db lookup given ids, consider doing this as a worker
# in the future if we move away from a static database.

#idb_uuids = ["23984",
#"995440",
#"81211",
#"5559384"]

# Fetch up to ten "acer" genus records from iDigBio, restricted to and
# sorted by the uuid field, and collect their uuids.
rq_ = {"genus": "acer"}
limit_ = 10
fields_ = ["uuid"]
sort_ = ["uuid"]

idb = idigbio.json()
results = idb.search_records(rq=rq_, limit=limit_, fields=fields_, sort=sort_)
# Debugging aids, kept disabled:
#   print(len(results["items"]))
#   print(results["items"][0]["indexTerms"]["genus"])
#   exit(0)
idb_uuids = [rec["indexTerms"]["uuid"] for rec in results["items"]]

print(idb_uuids)

db = Database()
Beispiel #15
0
#raw_seqs = {}
#raw_seqs["Tarsius_syrichta"] = "AAGTTTCATTGGAGCCACCACTCTTATAATTGCCCATGGCCTCACCTCCTCCCTATTATTTTGCCTAGCAAATACAAACTACGAACGAGTCCACAGTCGAACAATAGCACTAGCCCGTGGCCTTCAAACCCTATTACCTCTTGCAGCAACATGATGACTCCTCGCCAGCTTAACCAACCTGGCCCTTCCCCCAACAATTAATTTAATCGGTGAACTGTCCGTAATAATAGCAGCATTTTCATGGTCACACCTAACTATTATCTTAGTAGGCCTTAACACCCTTATCACCGCCCTATATTCCCTATATATACTAATCATAACTCAACGAGGAAAATACACATATCATATCAACAATATCATGCCCCCTTTCACCCGAGAAAATACATTAATAATCATACACCTATTTCCCTTAATCCTACTATCTACCAACCCCAAAGTAATTATAGGAACCATGTACTGTAAATATAGTTTAAACAAAACATTAGATTGTGAGTCTAATAATAGAAGCCCAAAGATTTCTTATTTACCAAGAAAGTA-TGCAAGAACTGCTAACTCATGCCTCCATATATAACAATGTGGCTTTCTT-ACTTTTAAAGGATAGAAGTAATCCATCGGTCTTAGGAACCGAAAA-ATTGGTGCAACTCCAAATAAAAGTAATAAATTTATTTTCATCCTCCATTTTACTATCACTTACACTCTTAATTACCCCATTTATTATTACAACAACTAAAAAATATGAAACACATGCATACCCTTACTACGTAAAAAACTCTATCGCCTGCGCATTTATAACAAGCCTAGTCCCAATGCTCATATTTCTATACACAAATCAAGAAATAATCATTTCCAACTGACATTGAATAACGATTCATACTATCAAATTATGCCTAAGCTT"
#raw_seqs["Lemur_catta"] = "AAGCTTCATAGGAGCAACCATTCTAATAATCGCACATGGCCTTACATCATCCATATTATTCTGTCTAGCCAACTCTAACTACGAACGAATCCATAGCCGTACAATACTACTAGCACGAGGGATCCAAACCATTCTCCCTCTTATAGCCACCTGATGACTACTCGCCAGCCTAACTAACCTAGCCCTACCCACCTCTATCAATTTAATTGGCGAACTATTCGTCACTATAGCATCCTTCTCATGATCAAACATTACAATTATCTTAATAGGCTTAAATATGCTCATCACCGCTCTCTATTCCCTCTATATATTAACTACTACACAACGAGGAAAACTCACATATCATTCGCACAACCTAAACCCATCCTTTACACGAGAAAACACCCTTATATCCATACACATACTCCCCCTTCTCCTATTTACCTTAAACCCCAAAATTATTCTAGGACCCACGTACTGTAAATATAGTTTAAA-AAAACACTAGATTGTGAATCCAGAAATAGAAGCTCAAAC-CTTCTTATTTACCGAGAAAGTAATGTATGAACTGCTAACTCTGCACTCCGTATATAAAAATACGGCTATCTCAACTTTTAAAGGATAGAAGTAATCCATTGGCCTTAGGAGCCAAAAA-ATTGGTGCAACTCCAAATAAAAGTAATAAATCTATTATCCTCTTTCACCCTTGTCACACTGATTATCCTAACTTTACCTATCATTATAAACGTTACAAACATATACAAAAACTACCCCTATGCACCATACGTAAAATCTTCTATTGCATGTGCCTTCATCACTAGCCTCATCCCAACTATATTATTTATCTCCTCAGGACAAGAAACAATCATTTCCAACTGACATTGAATAACAATCCAAACCCTAAAACTATCTATTAGCTT"
#raw_seqs["Homo_sapiens"] = "AAGCTTCACCGGCGCAGTCATTCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTGTGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTTA-CGACCCCTTATTTACCGAGAAAGCT-CACAAGAACTGCTAACTCATGCCCCCATGTCTAACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTT"
#raw_seqs["Pan"] = "AAGCTTCACCGGCGCAATTATCCTCATAATCGCCCACGGACTTACATCCTCATTATTATTCTGCCTAGCAAACTCAAATTATGAACGCACCCACAGTCGCATCATAATTCTCTCCCAAGGACTTCAAACTCTACTCCCACTAATAGCCTTTTGATGACTCCTAGCAAGCCTCGCTAACCTCGCCCTACCCCCTACCATTAATCTCCTAGGGGAACTCTCCGTGCTAGTAACCTCATTCTCCTGATCAAATACCACTCTCCTACTCACAGGATTCAACATACTAATCACAGCCCTGTACTCCCTCTACATGTTTACCACAACACAATGAGGCTCACTCACCCACCACATTAATAACATAAAGCCCTCATTCACACGAGAAAATACTCTCATATTTTTACACCTATCCCCCATCCTCCTTCTATCCCTCAATCCTGATATCATCACTGGATTCACCTCCTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTCA-CGACCCCTTATTTACCGAGAAAGCT-TATAAGAACTGCTAATTCATATCCCCATGCCTGACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCCATCCGTTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACCATGTATACTACCATAACCACCTTAACCCTAACTCCCTTAATTCTCCCCATCCTCACCACCCTCATTAACCCTAACAAAAAAAACTCATATCCCCATTATGTGAAATCCATTATCGCGTCCACCTTTATCATTAGCCTTTTCCCCACAACAATATTCATATGCCTAGACCAAGAAGCTATTATCTCAAACTGGCACTGAGCAACAACCCAAACAACCCAGCTCTCCCTAAGCTT"
#opts["raw_seqs"] = raw_seqs

# Build raw seqs from db lookup given ids, consider doing this as a worker
# in the future if we move away from a static database.

#idb_uuids = ["23984",
#"995440",
#"81211",
#"5559384"]

# Query iDigBio for at most ten records of genus "acer" (only the uuid
# field, ordered by uuid) and gather the uuids of the hits.
idb = idigbio.json()
rq_, limit_ = {"genus": "acer"}, 10
fields_ = ["uuid"]
sort_ = ["uuid"]
results = idb.search_records(rq=rq_, limit=limit_, fields=fields_, sort=sort_)
# Disabled debug output:
#   print(len(results["items"]))
#   print(results["items"][0]["indexTerms"]["genus"])
#   exit(0)
idb_uuids = []
for rec in results["items"]:
    index_terms = rec["indexTerms"]
    idb_uuids.append(index_terms["uuid"])

print(idb_uuids)

db = Database()
Beispiel #16
0
def idigbioQuery(rq, limit=None):
    '''
    Conduct a query to iDigBio through its python API, paging when needed.

    Arguments:
        rq    -- python dictionary containing the user's search terms.
        limit -- optional max. no. of records to return. When omitted, all
                 records counted for the query are requested.

    Returns the python dictionary produced by the API's search_records. When
    more than 5000 records are needed, several queries sorted by uuid are
    issued and the "items" of the later pages are appended to the "items"
    list of the first page's dictionary; its other keys are left as returned
    by the first query.
    '''
    page_size = 5000  # No. of records per query, max. 5000
    sort = "uuid"     # Stable ordering so paged queries do not overlap

    #Define API from idigbio library being used
    api = idigbio.json()

    #Determine no. of records query would result in
    record_count = api.count_records(rq)

    if limit is None:
        #Simple case: query size fits in one request, return query as is
        if record_count <= page_size:
            return api.search_records(rq, limit=page_size)
        wanted = record_count
    else:
        #Simple case: limit given fits in one request, return query as is
        if limit <= page_size:
            return api.search_records(rq, limit=limit)
        #Never page beyond the records that actually exist
        wanted = min(limit, record_count)

    #First page, initializing the results dictionary
    results = api.search_records(rq, limit=page_size, sort=sort)

    #Remaining pages. The last page only asks for what is still missing,
    #so no empty or zero-limit query is sent when `wanted` is a multiple of
    #the page size.
    for offset in range(page_size, wanted, page_size):
        page = api.search_records(rq,
                                  limit=min(page_size, wanted - offset),
                                  sort=sort,
                                  offset=offset)
        results["items"].extend(page["items"])

    return results