def species_schemas(species_id, base_url, headers_get):
    """Retrieves the species and all the schemas for that species.

    Parameters
    ----------
    species_id : str
        The integer identifier of the species in the Chewie-NS.
    base_url : str
        Base URL of the Chewie-NS.
    headers_get : dict
        HTTP headers for GET requests.

    Returns
    -------
    res : list of dict
        The first dictionary contains the species URI and name
        and the following dictionaries contain the URI and name
        for all schemas associated with the species.
    """
    endpoint_list = ['species', species_id]
    # unpack list of sequential endpoints and pass to create URI
    res = cr.simple_get_request(base_url, headers_get, endpoint_list)[1]
    res = res.json()

    return res
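
# Usage sketch for species_schemas (hedged: the host below is a placeholder,
# the species identifier is illustrative and `ct.HEADERS_GET_JSON` is the
# headers constant used elsewhere in this module):
#
#   headers = ct.HEADERS_GET_JSON
#   res = species_schemas('1', 'https://<chewie-ns-host>/NS/api/', headers)
#   species_entry, schema_entries = res[0], res[1:]
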
def schema_stats(species_id, base_url, headers_get):
    """Retrieves schema properties, number of loci and number of
    alleles for all schemas of a species in the Chewie-NS.

    Parameters
    ----------
    species_id : str
        The integer identifier of the species in the Chewie-NS.
    base_url : str
        Base URL of the Chewie-NS.
    headers_get : dict
        HTTP headers for GET requests.

    Returns
    -------
    res : list of dict or None
        List with one dict per schema or NoneType if it was
        not possible to retrieve information.
    """
    endpoint_list = ['stats', 'species', species_id, 'totals']
    # unpack list of sequential endpoints and pass to create URI
    res = cr.simple_get_request(base_url, headers_get, endpoint_list)[1]
    status_code = res.status_code
    if status_code not in [200, 201]:
        res = None
    else:
        res = res.json()['message']

    return res
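
# Usage sketch for schema_stats (hedged: host and identifier are
# placeholders). A None return signals that the request failed:
#
#   stats = schema_stats('1', 'https://<chewie-ns-host>/NS/api/', headers)
#   if stats is not None:
#       for schema in stats:
#           ...  # each dict describes one schema of the species
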
def schema_loci(schema_uri, headers_get):
    """Retrieves the list of loci for a schema.

    Parameters
    ----------
    schema_uri : str
        The URI of the schema in the Chewie-NS.
    headers_get : dict
        HTTP headers for GET requests.

    Returns
    -------
    loci : dict
        A dictionary with loci URIs as keys and
        loci names as values.
    """
    # get the list of loci
    loci_url, loci_res = cr.simple_get_request(schema_uri, headers_get,
                                               ['loci'])
    loci_res = loci_res.json()['Loci']

    # locus URI to locus name
    loci = {}
    for locus in loci_res:
        loci[str(locus['locus']['value'])] = locus['name']['value']

    return loci
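
# Usage sketch for schema_loci (hedged: the schema URI is a placeholder):
#
#   loci = schema_loci('https://<chewie-ns-host>/NS/api/species/1/schemas/1',
#                      headers)
#   for locus_uri, locus_name in loci.items():
#       ...  # e.g. build the per-locus download endpoint
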
def check_compressed(schema_uri, headers_get):
    """Determines if there is a compressed version of a schema.

    Parameters
    ----------
    schema_uri : str
        The URI of the schema in the Chewie-NS.
    headers_get : dict
        HTTP headers for GET requests.

    Returns
    -------
    list
        A list with the following elements:

        - The URI for the compressed version of the schema (str).
        - The timestamp of the compressed version. Indicates the
          last modification date of the schema at time of compression.
    """
    zip_uri, zip_response = cr.simple_get_request(
        schema_uri, headers_get, ['zip'], parameters={'request_type': 'check'})
    zip_info = zip_response.json()
    if 'zip' in zip_info:
        zip_file = zip_info['zip'][0]
        zip_date = zip_file.split('_')[-1].split('.zip')[0]
    else:
        zip_date = None

    return [zip_uri, zip_date]
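
# Usage sketch for check_compressed (hedged). A zip_date of None means no
# compressed version exists and the schema has to be fetched locus by locus:
#
#   zip_uri, zip_date = check_compressed(schema_uri, headers)
#   if zip_date is not None:
#       ...  # download the ready-to-use archive
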
def get_fasta_seqs(url, headers_get, schema_date):
    """Retrieves the DNA sequences of a locus in the Chewie-NS.

    Parameters
    ----------
    url : str
        Endpoint URL to make the request.
    headers_get : dict
        HTTP headers for GET requests.
    schema_date : str
        The function will only retrieve alleles
        that were inserted up to this date.

    Returns
    -------
    tuple
        Tuple with the following elements:

        - URI of the locus.
        - Response object with the DNA sequences
          that were downloaded.
    """
    payload = {'date': schema_date}

    # retry the request up to three times
    tries = 0
    max_tries = 3
    downloaded = False
    while downloaded is False:
        res = cr.simple_get_request(url, headers_get, [],
                                    payload, False, 180)[1]
        tries += 1
        if res.status_code in [200, 201] or tries == max_tries:
            downloaded = True

    # remove the '/fasta' suffix to get the locus URI
    # (str.rstrip strips a set of characters, not a suffix, and could
    # also remove trailing characters that belong to the locus URI)
    locus_uri = url[:-len('/fasta')] if url.endswith('/fasta') else url

    return (locus_uri, res)
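
# Usage sketch for get_fasta_seqs (hedged: the endpoint shown is a
# placeholder). Callers should still check the response status, since the
# function also returns after three failed attempts:
#
#   locus_uri, res = get_fasta_seqs(schema_uri + '/loci/5/fasta',
#                                   headers, '2023-01-01T00:00:00')
#   if res.status_code in [200, 201]:
#       ...  # parse the returned sequences
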
def download_compressed(zip_uri, species_name, schema_name,
                        download_folder, headers_get):
    """Downloads and extracts a ZIP archive with a ready-to-use
    version of a schema in the Chewie-NS.

    Parameters
    ----------
    zip_uri : str
        Endpoint URL to make the request to download
        the compressed schema.
    species_name : str
        Scientific name of the schema species.
    schema_name : str
        Name of the schema in the Chewie-NS.
    download_folder : str
        Path to the directory to which the ZIP archive
        will be saved.
    headers_get : dict
        HTTP headers for GET requests.

    Returns
    -------
    schema_path : str
        ZIP archive contents will be extracted to
        this directory.
    """
    zip_name = '{0}{1}_{2}.zip'.format(species_name[0].lower(),
                                       species_name.split(' ')[-1],
                                       schema_name)
    schema_path = os.path.join(download_folder, zip_name.split('.zip')[0])
    fo.create_directory(schema_path)

    # download ZIP archive
    url, zip_response = cr.simple_get_request(
        zip_uri, headers_get, parameters={'request_type': 'download'})
    zip_path = os.path.join(schema_path, zip_name)
    with open(zip_path, 'wb') as file:
        file.write(zip_response.content)

    # uncompress
    print('Decompressing schema...')
    shutil.unpack_archive(zip_path, extract_dir=schema_path)
    # delete ZIP
    os.remove(zip_path)

    return schema_path
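
# Usage sketch for download_compressed (hedged: names and paths are
# illustrative). With the arguments below the archive name would be
# 'sagalactiae_my_schema.zip' and the contents would be extracted to
# '/tmp/schemas/sagalactiae_my_schema':
#
#   schema_path = download_compressed(zip_uri, 'Streptococcus agalactiae',
#                                     'my_schema', '/tmp/schemas', headers)
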
def species_schemas_count(base_url, headers_get):
    """Returns the number of schemas per species in the Chewie-NS.

    Parameters
    ----------
    base_url : str
        Base URL of the Chewie-NS.
    headers_get : dict
        HTTP headers for GET requests.

    Returns
    -------
    info : list of list
        A list with a sublist per species. Each sublist
        contains the species identifier, the name of the
        species and the total number of schemas.
    """
    endpoint_list = ['stats', 'species']
    # unpack list of sequential endpoints and pass to create URI
    res = cr.simple_get_request(base_url, headers_get, endpoint_list)[1]
    res = res.json()
    if 'message' in res:
        res = res['message']
    else:
        sys.exit('Could not retrieve species info.')

    if len(res) == 0:
        sys.exit('Could not retrieve species info.')
    else:
        info = []
        for s in res:
            sid = s['species']['value'].split('/')[-1]
            name = s['name']['value']
            num_schemas = s['schemas']['value']
            info.append([sid, name, num_schemas])

    # sort by species identifier
    info = sorted(info, key=lambda x: int(x[0]))

    return info
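
# Usage sketch for species_schemas_count (hedged: host is a placeholder).
# Useful for listing what is available before picking a schema:
#
#   base_url = 'https://<chewie-ns-host>/NS/api/'
#   for sid, name, num_schemas in species_schemas_count(base_url, headers):
#       print(sid, name, num_schemas)
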
def download_ptf(ptf_hash, download_folder, schema_id,
                 species_id, species_name, headers_get, base_url):
    """Downloads the Prodigal training file for a schema.

    Parameters
    ----------
    ptf_hash : str
        Unique identifier of the Prodigal training file
        (BLAKE2 hash).
    download_folder : str
        Path to the directory to which the Prodigal
        training file should be saved.
    schema_id : str
        The identifier of the schema in the Chewie-NS.
    species_id : str
        The identifier of the schema's species in the
        Chewie-NS.
    species_name : str
        Scientific name of the schema species.
    headers_get : dict
        HTTP headers for GET requests.
    base_url : str
        Base URL of the Chewie Nomenclature server.

    Returns
    -------
    ptf_file : str
        Path to the Prodigal training file.
    """
    ptf_url, ptf_response = cr.simple_get_request(
        base_url, headers_get,
        ['species', species_id, 'schemas', schema_id, 'ptf'])

    ptf_file = os.path.join(download_folder,
                            '{0}.trn'.format(species_name.replace(' ', '_')))

    with open(ptf_file, 'wb') as file:
        file.write(ptf_response.content)

    return ptf_file
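
# Usage sketch for download_ptf (hedged: identifiers and paths are
# illustrative). The file name is derived from the species name, so this
# call would write '/tmp/schemas/Streptococcus_agalactiae.trn':
#
#   ptf_file = download_ptf(ptf_hash, '/tmp/schemas', '1', '1',
#                           'Streptococcus agalactiae', headers, base_url)
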
def main(schema_directory, cpu_cores, nomenclature_server,
         submit, blast_path, update_profiles):

    # get ns configs
    local_date, schema_uri = pv.read_configs(schema_directory, '.ns_config')

    # get schema and species identifiers
    schema_id = schema_uri.split('/')[-1]
    species_id = schema_uri.split('/')[-3]

    if nomenclature_server is None:
        nomenclature_server = schema_uri.split('species/')[0]

    if submit is True and 'tutorial' not in nomenclature_server:
        print('\nOnly authorized registered users may submit new alleles.')
        token = cr.capture_login_credentials(nomenclature_server)
    else:
        token = ''

    # GET request headers
    headers_get = ct.HEADERS_GET_JSON
    headers_get['Authorization'] = token

    # determine current user ID and Role
    if submit is True and 'tutorial' not in nomenclature_server:
        user_id, user_role, user_auth = cr.user_info(nomenclature_server,
                                                     headers_get)
        # verify if user has authorization to submit
        url = cr.make_url(nomenclature_server, 'auth', 'check')
        response = cr.simple_get_request(url, headers_get)[1]
        if response.status_code == 200:
            user_auth = True
        else:
            sys.exit('Current user has no authorization to submit novel '
                     'alleles.\nYou can request authorization to submit '
                     'novel alleles by sending an e-mail to: '
                     '*****@*****.**')
        print('User id: {0}'.format(user_id))
        print('User role: {0}\n'.format(user_role))
    else:
        user_id = ''
        user_role = ''
        user_auth = True if 'tutorial' in nomenclature_server else False

    # POST requests headers
    headers_post = ct.HEADERS_POST_JSON
    headers_post['Authorization'] = token
    headers_post['user_id'] = user_id
    # POST headers to send binary data
    headers_post_bytes = ct.HEADERS_POST
    headers_post_bytes['Authorization'] = token
    headers_post_bytes['user_id'] = user_id

    schema_params = pv.read_configs(schema_directory, '.schema_config')

    # verify that local configs have a single value per parameter
    if all([len(schema_params[k]) == 1
            for k in schema_params
            if k != 'chewBBACA_version']) is not True:
        sys.exit('Cannot sync schema with multiple values per parameter.')

    # check if schema exists in the NS
    schema_name, ns_params = cr.get_species_schemas(schema_id, species_id,
                                                    nomenclature_server,
                                                    headers_get)[2:]

    # verify that local configs match NS configs
    # add window size
    if all([str(schema_params[k][0]) == ns_params[k]['value']
            for k in schema_params
            if k not in ['chewBBACA_version', 'window_size']]) is not True:
        sys.exit('Local configs do not match Chewie-NS configs.')

    # Get the name of the species from the provided id
    # or vice-versa
    species_id, species_name = cr.species_ids(species_id,
                                              nomenclature_server,
                                              headers_get)

    print('Schema id: {0}'.format(schema_id))
    print('Schema name: {0}'.format(schema_name))
    print("Schema's species: {0} (id={1})".format(species_name, species_id))
    print('Last synced: {0}'.format(local_date))

    # get last modification date
    # setting syncing date to last modification date will allow
    # all users to sync even when the schema is locked and being
    # updated by another user
    ns_date = ns_params['last_modified']['value']
    print('\nRemote schema was last modified on: {0}'.format(ns_date))

    # exit if remote schema has not been updated since last
    # sync date and current user does not wish to submit new alleles
    if local_date == ns_date and submit is False:
        sys.exit('\nRemote schema has not been updated since last sync '
                 'process.\nLocal schema is up-to-date.')

    # Create a temporary dir for the new alleles
    temp_dir = os.path.join(os.path.dirname(schema_directory), 'temp')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    # retrieve alleles added to schema after last sync date
    print('\nRetrieving alleles added to remote schema '
          'after {0}...'.format(local_date))
    loci_alleles, server_time, count = retrieve_latest(local_date, schema_uri,
                                                       headers_get, ns_date)

    print('Retrieved {0} alleles for {1} loci.'
          ''.format(count, len(loci_alleles)))

    # Get schema files from genes list file
    genes_list = os.path.join(schema_directory, '.genes_list')
    genes = fo.pickle_loader(genes_list)

    # update loci structure
    not_in_ns, pickled_loci, \
        updated, not_update, \
        rearranged = update_loci_files(loci_alleles, genes,
                                       schema_directory, temp_dir)

    total_local = sum([len(v[0]) for k, v in not_in_ns.items()])
    print('Local schema has {0} novel alleles for {1} '
          'loci.'.format(total_local, len(not_in_ns)))

    # check if there are any changes to make
    if len(pickled_loci) == 0:
        shutil.rmtree(temp_dir)
        sys.exit('Remote schema has not been altered and local schema '
                 'does not have novel alleles.')

    results = {}
    attributed = 0
    if submit is True and user_auth is True and len(not_in_ns) > 0:

        # attempt to lock schema
        lock_res = cr.simple_post_request(
            nomenclature_server, headers_post,
            ['species', species_id, 'schemas', schema_id, 'lock'],
            data=json.dumps({'action': 'lock'}))[1]
        # if schema is already locked user cannot send alleles
        lock_status = lock_res.status_code
        if lock_status == 403:
            print('Schema is already locked. Another user might be updating '
                  'the schema. Please repeat the syncing process after a '
                  'while to add your new alleles to the Chewie-NS.\nThe '
                  'process will now update your local schema with the alleles '
                  'retrieved from the Chewie-NS.')
        else:
            # after locking, check if date matches ns_date
            date_res = cr.simple_get_request(
                nomenclature_server, headers_get,
                ['species', species_id, 'schemas', schema_id, 'modified'])[1]

            date_value = (date_res.json()).split(' ')[-1]

            if date_value != ns_date:
                print('Data retrieved from the Chewie-NS has an older '
                      'timestamp than current schema timestamp. Schema '
                      'might have been updated before this syncing process. '
                      'Please repeat the syncing process in order to add '
                      'your new alleles to the schema. The process will now '
                      'update your local schema with the alleles retrieved '
                      'from the Chewie-NS.')

                # unlock schema
                lock_res = cr.simple_post_request(
                    nomenclature_server, headers_post,
                    ['species', species_id, 'schemas', schema_id, 'lock'],
                    data=json.dumps({'action': 'unlock'}))[1]
            else:
                print('Collecting data and creating files to '
                      'submit local alleles...')
                # get list of loci for schema in the NS
                loci_res = cr.simple_get_request(
                    nomenclature_server, headers_get,
                    ['species', species_id, 'schemas', schema_id, 'loci'])[1]

                # get loci files names from response
                for l in loci_res.json()['Loci']:
                    locus_name = l['name']['value'] + '.fasta'
                    locus_uri = l['locus']['value']
                    if locus_name in not_in_ns:
                        not_in_ns[locus_name].append(locus_uri)

                # create files with length values to update
                length_files = create_lengths_files(not_in_ns, temp_dir)

                # create new alleles data
                alleles_files, \
                    loci_ids, \
                    loci_names = create_alleles_files(not_in_ns,
                                                      nomenclature_server,
                                                      user_id, species_name,
                                                      species_id, schema_id,
                                                      temp_dir)

                # compress files with new alleles
                zipped_files = ['{0}.zip'.format(file)
                                for file in alleles_files]
                list(map(fo.file_zipper, alleles_files, zipped_files))
                alleles_data = list(zip(zipped_files, loci_ids, loci_names))

                print('Sending and inserting new alleles...')
                failed, \
                    start_count = upload_alleles_data(alleles_data,
                                                      length_files,
                                                      nomenclature_server,
                                                      headers_post,
                                                      headers_post_bytes,
                                                      species_id, schema_id)

                # track progress through endpoint
                # set time limit for task completion (seconds)
                print()
                time_limit = 2100
                current_time = 0
                status = 'Updating'
                start_count = int(start_count.json()['nr_alleles'])
                while status != 'Complete' and (current_time < time_limit):
                    insertion_status = cr.simple_get_request(
                        nomenclature_server, headers_get,
                        ['species', species_id, 'schemas', schema_id,
                         'loci', 'locus', 'update'])[1]
                    insertion_status = insertion_status.json()
                    if 'message' in insertion_status:
                        status = 'Complete'
                        results = insertion_status['identifiers']

                    current_count = int(insertion_status['nr_alleles'])

                    inserted = current_count - start_count
                    print('\r', '    Inserted {0} alleles.'.format(inserted),
                          end='')
                    time.sleep(2)
                    current_time += 2

                if current_time != time_limit:
                    # determine alleles that were attributed an identifier
                    repeated = sum([len(r[0]) for l, r in results.items()])
                    attributed = sum([len(r[1]) for l, r in results.items()])

                    print('\nThe Chewie-NS inserted {0} new alleles and '
                          'detected {1} repeated alleles.'.format(attributed,
                                                                  repeated))
                else:
                    print('\nCould not retrieve allele identifiers assigned '
                          'by Chewie-NS. Will adapt schema with retrieved '
                          'alleles.\n'
                          'Please repeat the syncing process in order to '
                          'assign the new identifiers for the submitted '
                          'alleles.')

                # remove files in temp folder
                fo.remove_files(length_files)
                fo.remove_files(alleles_files)
                fo.remove_files(zipped_files)

    # change pickled files to FASTA files
    for locus, pick in pickled_loci.items():
        rearranged = pickle_to_fasta(locus, pick, temp_dir,
                                     results, rearranged)

    # change identifiers in SQLite DB
    if len(rearranged) > 0 and update_profiles is True:
        print('\nUpdating local allele identifiers...')
        altered = ps.update_profiles(schema_directory, rearranged)
        if altered is not None:
            print('Updated {0} profiles.\n'.format(altered))
        else:
            print('Could not find local SQLite database to '
                  'update profiles.\n')

    # Re-determine the representative sequences
    if attributed > 0 or count > 0:
        PrepExternalSchema.main(temp_dir, schema_directory,
                                cpu_cores, float(schema_params['bsr'][0]),
                                int(schema_params['minimum_locus_length'][0]),
                                11, '', None, blast_path)

        # delete invalid alleles and genes files
        parent_dir = os.path.dirname(schema_directory)
        files = [os.path.join(parent_dir, file)
                 for file in os.listdir(parent_dir)
                 if 'invalid' in file]
        fo.remove_files(files)

    # get last modification date
    last_modified = cr.simple_get_request(
        nomenclature_server, headers_get,
        ['species', species_id, 'schemas', schema_id, 'modified'])[1]
    last_modified = (last_modified.json()).split(' ')[-1]
    server_time = last_modified

    # update NS config file with latest server time
    ns_configs = os.path.join(schema_directory, '.ns_config')
    fo.pickle_dumper([server_time, schema_uri], ns_configs)

    print('Received {0} new alleles for {1} loci and sent '
          '{2} for {3} loci.'.format(count, len(pickled_loci),
                                     attributed, len(not_in_ns)))

    # delete temp directory
    shutil.rmtree(temp_dir)

    # delete pre-computed BSR values from 'short' directory
    # representatives might have changed and BSR values are outdated
    short_dir = os.path.join(schema_directory, 'short')
    bsr_files = [os.path.join(short_dir, f)
                 for f in os.listdir(short_dir)
                 if f.endswith('_bsr.txt')]
    fo.remove_files(bsr_files)
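
# Invocation sketch for main (hedged: paths are placeholders and the keyword
# arguments simply mirror the function signature, not a documented CLI):
#
#   main(schema_directory='/path/to/schema_dir', cpu_cores=4,
#        nomenclature_server=None, submit=False,
#        blast_path='/usr/bin', update_profiles=False)
#
# With nomenclature_server=None the server URL is derived from the schema's
# .ns_config file, and submit=False performs a download-only sync.
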
def retrieve_alleles(loci_new_alleles, server_time, schema_uri,
                     count, headers_get, ns_date):
    """Retrieves alleles added to a schema in the Chewie-NS
    during a time interval, up to the maximum number of alleles
    that the server returns at a time (50000).

    Parameters
    ----------
    loci_new_alleles : dict
        A dictionary with loci identifiers as keys and
        dictionaries with alleles identifiers and DNA
        sequences as values.
    server_time : str
        The function will return alleles added to the
        schema after this date (format %Y-%m-%dT%H:%M:%S).
    schema_uri : str
        The URI of the schema in the Chewie-NS.
    count : int
        The cumulative number of sequences that have been
        returned.
    headers_get : dict
        HTTP headers for GET requests.
    ns_date : str
        The function will return alleles added to the
        schema up to this date (format %Y-%m-%dT%H:%M:%S).

    Returns
    -------
    tuple
        A tuple with the following variables:

        loci_new_alleles : dict
            Input `loci_new_alleles` dictionary with alleles
            returned in the current and previous iterations.
        server_time : str
            The date of insertion of the last allele returned
            by the Chewie-NS.
        count : int
            The cumulative number of sequences that have been
            returned.
    """
    # request the new alleles starting on the date given
    url = cr.make_url(schema_uri, 'loci')
    payload = {'local_date': server_time, 'ns_date': ns_date}
    # get the new alleles
    response = cr.simple_get_request(url, headers_get, parameters=payload)[1]
    response_content = response.json()

    # get info about sequences that were added since last date
    new_alleles = response_content['newAlleles']

    # get headers info
    response_headers = response.headers
    if len(new_alleles) > 0:
        # get date of last added allele
        server_time = response_headers['Last-Allele']

        # group retrieved alleles by locus
        for allele in new_alleles:
            locus = '{0}{1}'.format(allele['locus_name']['value'], '.fasta')
            allele_id = allele['allele_id']['value']
            sequence = allele['nucSeq']['value']
            loci_new_alleles[locus][allele_id] = sequence

        # keep count of the number of retrieved alleles
        count += len(new_alleles)
    else:
        # get current server date if no alleles were added
        # since last server time
        server_time = response_headers['Server-Date']

    return (loci_new_alleles, server_time, count)
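
# Usage sketch for retrieve_alleles (hedged: dates are illustrative). Because
# the server caps each response at 50000 sequences, callers are expected to
# call this repeatedly until the returned server_time reaches ns_date (the
# retrieve_latest helper referenced in main presumably wraps that loop). A
# defaultdict avoids KeyError on the first allele of each locus:
#
#   from collections import defaultdict
#   new_alleles = defaultdict(dict)
#   new_alleles, server_time, count = retrieve_alleles(
#       new_alleles, '2023-01-01T00:00:00', schema_uri, 0, headers,
#       '2023-06-01T00:00:00')
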