def read_configs(schema_path, filename):
    """ Reads file with schema config values.

    Parameters
    ----------
    schema_path : str
        Path to the schema's directory.
    filename : str
        Name of the file that contains the config values.

    Returns
    -------
    configs : dict
        Dictionary with config names as keys and config values as values.
    """

    config_file = os.path.join(schema_path, filename)
    if os.path.isfile(config_file):
        # load configs dictionary
        configs = fo.pickle_loader(config_file)
    else:
        sys.exit('Could not find a valid config file.')

    return configs
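
# Illustrative usage sketch (not part of the original module; the path is
# hypothetical): load the pickled '.schema_config' file from a schema
# directory and read one of its values, which are stored as lists.
#
#   configs = read_configs('/path/to/schema_seed', '.schema_config')
#   translation_table = configs.get('translation_table', [11])[0]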
def save_extracted_cds(genome, identifier, orf_file, protein_table, cds_file):
    """ Extracts coding sequences from a genome assembly based
    on Prodigal's gene predictions. Writes coding sequences to
    a FASTA file and information about coding sequences to a
    TSV file.

    Parameters
    ----------
    genome : str
        Path to the FASTA file with the FASTA sequences for a genome.
    identifier : str
        Genome identifier to add to FASTA records headers and to
        the first field in the TSV file.
    orf_file : str
        Path to the file with Prodigal results.
    protein_table : str
        Path to the TSV file to which coding sequences information
        will be written.
    cds_file : str
        Path to the FASTA file to which coding sequences will be written.

    Returns
    -------
    total_cds : int
        Total number of coding sequences extracted from the genome.
    """

    # import contigs for current genome/assembly
    contigs = fao.import_sequences(genome)

    # extract coding sequences from contigs
    reading_frames = fo.pickle_loader(orf_file)
    genome_info = extract_genome_cds(reading_frames, contigs, 1)

    # save coding sequences to file
    # create records and write them to file
    cds_lines = fao.create_fasta_lines(genome_info[0], identifier)
    fo.write_lines(cds_lines, cds_file)

    write_protein_table(protein_table, identifier, genome_info[1])

    total_cds = len(genome_info[0])

    return total_cds
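
# Illustrative usage sketch (all paths are hypothetical; assumes the Prodigal
# results for the genome were pickled beforehand, as this function expects):
#
#   total = save_extracted_cds('/path/to/genomes/genome1.fasta', 'genome1',
#                              '/path/to/temp/genome1_ORF',
#                              '/path/to/temp/cds_info.tsv',
#                              '/path/to/temp/genome1_cds.fasta')
#   print('Extracted {0} coding sequences.'.format(total))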
def main(input_files, output_directory, protein_table, blast_score_ratio,
         cpu_cores, taxa, proteome_matches, no_cleanup, blast_path):

    # create output directory
    fo.create_directory(output_directory)

    # create temp directory
    temp_directory = fo.join_paths(output_directory, ['temp'])
    fo.create_directory(temp_directory)

    # validate input files
    genes_list = fo.join_paths(temp_directory, ['listGenes.txt'])
    genes_list = pv.check_input_type(input_files, genes_list)
    loci_paths = fo.read_lines(genes_list)

    schema_directory = os.path.dirname(loci_paths[0])
    schema_basename = fo.file_basename(schema_directory)
    print('Schema: {0}'.format(schema_directory))
    print('Number of loci: {0}'.format(len(loci_paths)))

    # find annotations based on reference proteomes for species
    proteome_results = {}
    if taxa is not None:
        proteome_results = proteome_annotations(schema_directory,
                                                temp_directory,
                                                taxa,
                                                blast_score_ratio,
                                                cpu_cores,
                                                proteome_matches,
                                                blast_path)

    # find annotations in SPARQL endpoint
    print('\nQuerying UniProt\'s SPARQL endpoint...')
    # pass the filename inside a list, as expected by fo.join_paths
    config_file = fo.join_paths(input_files, ['.schema_config'])
    if os.path.isfile(config_file) is True:
        config = fo.pickle_loader(config_file)
        translation_table = config.get('translation_table', [11])[0]
    else:
        translation_table = 11
    sparql_results = sparql_annotations(loci_paths,
                                        translation_table,
                                        cpu_cores)

    loci_info = {}
    if protein_table is not None:
        # read "cds_info.tsv" table created by CreateSchema
        table_lines = fo.read_tabular(protein_table)
        for l in table_lines[1:]:
            # create locus identifier based on genome identifier and
            # CDS identifier in file
            locus_id = l[0].replace('_', '-')
            locus_id = locus_id + '-protein{0}'.format(l[-2])
            loci_info[locus_id] = l

    annotations = join_annotations(sparql_results, proteome_results, loci_info)

    # table header
    header = ['Locus_ID']
    if len(loci_info) > 0:
        header += table_lines[0]

    header += ['Uniprot_Name', 'UniProt_URL']

    if len(proteome_results) > 0:
        header.extend(['Proteome_ID', 'Proteome_Product',
                       'Proteome_Gene_Name', 'Proteome_Species',
                       'Proteome_BSR'])

    loci_info_bool = True if len(loci_info) > 0 else False
    output_table = create_annotations_table(annotations, output_directory,
                                            header, schema_basename,
                                            loci_info_bool)

    if no_cleanup is False:
        shutil.rmtree(temp_directory)

    print('\n\nThe table with new information can be found at:'
          '\n{0}'.format(output_table))
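
# Illustrative invocation sketch (all argument values are hypothetical):
# annotate a schema with UniProt's SPARQL endpoint and with the reference
# proteomes for one taxon.
#
#   main(input_files='/path/to/schema_seed',
#        output_directory='/path/to/annotations',
#        protein_table='/path/to/cds_info.tsv',
#        blast_score_ratio=0.6, cpu_cores=4,
#        taxa=['Streptococcus agalactiae'], proteome_matches=1,
#        no_cleanup=False, blast_path='')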
def main(schema_directory, cpu_cores, nomenclature_server, submit,
         blast_path, update_profiles):

    # get ns configs
    local_date, schema_uri = pv.read_configs(schema_directory, '.ns_config')

    # get schema and species identifiers
    schema_id = schema_uri.split('/')[-1]
    species_id = schema_uri.split('/')[-3]

    if nomenclature_server is None:
        nomenclature_server = schema_uri.split('species/')[0]

    if submit is True and 'tutorial' not in nomenclature_server:
        print('\nOnly authorized registered users may submit new alleles.')
        token = cr.capture_login_credentials(nomenclature_server)
    else:
        token = ''

    # GET request headers
    headers_get = ct.HEADERS_GET_JSON
    headers_get['Authorization'] = token

    # determine current user ID and Role
    if submit is True and 'tutorial' not in nomenclature_server:
        user_id, user_role, user_auth = cr.user_info(nomenclature_server,
                                                     headers_get)
        # verify if user has authorization to submit
        url = cr.make_url(nomenclature_server, 'auth', 'check')
        response = cr.simple_get_request(url, headers_get)[1]
        if response.status_code == 200:
            user_auth = True
        else:
            sys.exit('Current user has no authorization to submit novel '
                     'alleles.\nYou can request authorization to submit '
                     'novel alleles by sending an e-mail to: '
                     '*****@*****.**')
        print('User id: {0}'.format(user_id))
        print('User role: {0}\n'.format(user_role))
    else:
        user_id = ''
        user_role = ''
        user_auth = True if 'tutorial' in nomenclature_server else False

    # POST requests headers
    headers_post = ct.HEADERS_POST_JSON
    headers_post['Authorization'] = token
    headers_post['user_id'] = user_id
    # POST headers to send binary data
    headers_post_bytes = ct.HEADERS_POST
    headers_post_bytes['Authorization'] = token
    headers_post_bytes['user_id'] = user_id

    schema_params = pv.read_configs(schema_directory, '.schema_config')

    # verify that local configs have a single value per parameter
    if all([len(schema_params[k]) == 1
            for k in schema_params
            if k != 'chewBBACA_version']) is not True:
        sys.exit('Cannot sync schema with multiple values per parameter.')

    # check if schema exists in the NS
    schema_name, ns_params = cr.get_species_schemas(schema_id,
                                                    species_id,
                                                    nomenclature_server,
                                                    headers_get)[2:]

    # verify that local configs match NS configs
    # add window size
    if all([str(schema_params[k][0]) == ns_params[k]['value']
            for k in schema_params
            if k not in ['chewBBACA_version', 'window_size']]) is not True:
        sys.exit('Local configs do not match Chewie-NS configs.')

    # get the name of the species from the provided id or vice-versa
    species_id, species_name = cr.species_ids(species_id,
                                              nomenclature_server,
                                              headers_get)

    print('Schema id: {0}'.format(schema_id))
    print('Schema name: {0}'.format(schema_name))
    print("Schema's species: {0} (id={1})".format(species_name, species_id))
    print('Last synced: {0}'.format(local_date))

    # get last modification date
    # setting syncing date to last modification date will allow
    # all users to sync even when the schema is locked and being
    # updated by another user
    ns_date = ns_params['last_modified']['value']
    print('\nRemote schema was last modified on: {0}'.format(ns_date))

    # exit if remote schema has not been updated since last
    # sync date and current user does not wish to submit new alleles
    if local_date == ns_date and submit is False:
        sys.exit('\nRemote schema has not been updated since last sync '
                 'process. Local schema is up-to-date.')

    # create a temporary directory for the new alleles
    temp_dir = os.path.join(os.path.dirname(schema_directory), 'temp')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    # retrieve alleles added to schema after last sync date
    print('\nRetrieving alleles added to remote schema '
          'after {0}...'.format(local_date))
    loci_alleles, server_time, count = retrieve_latest(local_date, schema_uri,
                                                       headers_get, ns_date)

    print('Retrieved {0} alleles for {1} loci.'
          ''.format(count, len(loci_alleles)))

    # get schema files from genes list file
    genes_list = os.path.join(schema_directory, '.genes_list')
    genes = fo.pickle_loader(genes_list)

    # update loci structure
    not_in_ns, pickled_loci, updated, \
        not_update, rearranged = update_loci_files(loci_alleles, genes,
                                                   schema_directory, temp_dir)

    total_local = sum([len(v[0]) for k, v in not_in_ns.items()])
    print('Local schema has {0} novel alleles for {1} '
          'loci.'.format(total_local, len(not_in_ns)))

    # check if there are any changes to make
    if len(pickled_loci) == 0:
        shutil.rmtree(temp_dir)
        sys.exit('Remote schema has not been altered and local schema '
                 'does not have novel alleles.')

    results = {}
    attributed = 0
    if submit is True and user_auth is True and len(not_in_ns) > 0:

        # attempt to lock schema
        lock_res = cr.simple_post_request(nomenclature_server, headers_post,
                                          ['species', species_id,
                                           'schemas', schema_id, 'lock'],
                                          data=json.dumps({'action': 'lock'}))[1]

        # if schema is already locked user cannot send alleles
        lock_status = lock_res.status_code
        if lock_status == 403:
            print('Schema is already locked. Another user might be updating '
                  'the schema. Please repeat the syncing process after a '
                  'while to add your new alleles to the Chewie-NS.\nThe '
                  'process will now update your local schema with the alleles '
                  'retrieved from the Chewie-NS.')
        else:
            # after locking, check if date matches ns_date
            date_res = cr.simple_get_request(nomenclature_server, headers_get,
                                             ['species', species_id,
                                              'schemas', schema_id,
                                              'modified'])[1]

            date_value = (date_res.json()).split(' ')[-1]

            if date_value != ns_date:
                print('Data retrieved from the Chewie-NS has an older '
                      'timestamp than current schema timestamp. Schema '
                      'might have been updated before this syncing process. '
                      'Please repeat the syncing process in order to add '
                      'your new alleles to the schema. The process will now '
                      'update your local schema with the alleles retrieved '
                      'from the Chewie-NS.')

                # unlock schema
                lock_res = cr.simple_post_request(
                    nomenclature_server, headers_post,
                    ['species', species_id, 'schemas', schema_id, 'lock'],
                    data=json.dumps({'action': 'unlock'}))[1]
            else:
                print('Collecting data and creating files to '
                      'submit local alleles...')

                # get list of loci for schema in the NS
                loci_res = cr.simple_get_request(nomenclature_server,
                                                 headers_get,
                                                 ['species', species_id,
                                                  'schemas', schema_id,
                                                  'loci'])[1]

                # get loci files names from response
                for l in loci_res.json()['Loci']:
                    locus_name = l['name']['value'] + '.fasta'
                    locus_uri = l['locus']['value']
                    if locus_name in not_in_ns:
                        not_in_ns[locus_name].append(locus_uri)

                # create files with length values to update
                length_files = create_lengths_files(not_in_ns, temp_dir)

                # create new alleles data
                alleles_files, loci_ids, \
                    loci_names = create_alleles_files(not_in_ns,
                                                      nomenclature_server,
                                                      user_id, species_name,
                                                      species_id, schema_id,
                                                      temp_dir)

                # compress files with new alleles
                zipped_files = ['{0}.zip'.format(file)
                                for file in alleles_files]
                list(map(fo.file_zipper, alleles_files, zipped_files))
                alleles_data = list(zip(zipped_files, loci_ids, loci_names))

                print('Sending and inserting new alleles...')
                failed, start_count = upload_alleles_data(alleles_data,
                                                          length_files,
                                                          nomenclature_server,
                                                          headers_post,
                                                          headers_post_bytes,
                                                          species_id,
                                                          schema_id)

                # track progress through endpoint
                # set time limit for task completion (seconds)
                print()
                time_limit = 2100
                current_time = 0
                status = 'Updating'
                start_count = int(start_count.json()['nr_alleles'])
                while status != 'Complete' and (current_time < time_limit):
                    insertion_status = cr.simple_get_request(
                        nomenclature_server, headers_get,
                        ['species', species_id, 'schemas', schema_id,
                         'loci', 'locus', 'update'])[1]
                    insertion_status = insertion_status.json()
                    if 'message' in insertion_status:
                        status = 'Complete'
                        results = insertion_status['identifiers']

                    current_count = int(insertion_status['nr_alleles'])

                    inserted = current_count - start_count
                    print('\r', ' Inserted {0} alleles.'.format(inserted),
                          end='')
                    time.sleep(2)
                    current_time += 2

                if current_time != time_limit:
                    # determine alleles that were attributed an identifier
                    repeated = sum([len(r[0]) for l, r in results.items()])
                    attributed = sum([len(r[1]) for l, r in results.items()])

                    print('\nThe Chewie-NS inserted {0} new alleles and '
                          'detected {1} repeated alleles.'
                          ''.format(attributed, repeated))
                else:
                    print('\nCould not retrieve allele identifiers assigned '
                          'by Chewie-NS. Will adapt schema with retrieved '
                          'alleles. Please repeat the syncing process in '
                          'order to assign the new identifiers for the '
                          'submitted alleles.')

                # remove files in temp folder
                fo.remove_files(length_files)
                fo.remove_files(alleles_files)
                fo.remove_files(zipped_files)

    # change pickled files to FASTA files
    for locus, pick in pickled_loci.items():
        rearranged = pickle_to_fasta(locus, pick, temp_dir, results,
                                     rearranged)

    # change identifiers in SQLite DB
    if len(rearranged) > 0 and update_profiles is True:
        print('\nUpdating local allele identifiers...')
        altered = ps.update_profiles(schema_directory, rearranged)
        if altered is not None:
            print('Updated {0} profiles.\n'.format(altered))
        else:
            print('Could not find local SQLite database to upload profiles.\n')

    # re-determine the representative sequences
    if attributed > 0 or count > 0:
        PrepExternalSchema.main(temp_dir, schema_directory,
                                cpu_cores, float(schema_params['bsr'][0]),
                                int(schema_params['minimum_locus_length'][0]),
                                11, '', None, blast_path)

        # delete invalid alleles and genes files
        parent_dir = os.path.dirname(schema_directory)
        files = [os.path.join(parent_dir, file)
                 for file in os.listdir(parent_dir)
                 if 'invalid' in file]
        fo.remove_files(files)

    # get last modification date
    last_modified = cr.simple_get_request(nomenclature_server, headers_get,
                                          ['species', species_id,
                                           'schemas', schema_id,
                                           'modified'])[1]
    last_modified = (last_modified.json()).split(' ')[-1]
    server_time = last_modified

    # update NS config file with latest server time
    ns_configs = os.path.join(schema_directory, '.ns_config')
    fo.pickle_dumper([server_time, schema_uri], ns_configs)

    print('Received {0} new alleles for {1} loci and sent '
          '{2} for {3} loci. '.format(count, len(pickled_loci),
                                      attributed, len(not_in_ns)))

    # delete temp directory
    shutil.rmtree(temp_dir)

    # delete pre-computed BSR values from 'short' directory
    # representatives might have changed and BSR values are outdated
    short_dir = os.path.join(schema_directory, 'short')
    bsr_files = [os.path.join(short_dir, f)
                 for f in os.listdir(short_dir)
                 if f.endswith('_bsr.txt')]
    fo.remove_files(bsr_files)
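
# Illustrative invocation sketch (argument values are hypothetical): sync a
# local schema with the Chewie-NS without submitting novel alleles. Passing
# None for nomenclature_server makes the function derive the base URL from
# the schema URI stored in '.ns_config'.
#
#   main('/path/to/schema_seed', cpu_cores=4, nomenclature_server=None,
#        submit=False, blast_path='', update_profiles=False)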
def pickle_to_fasta(locus, pickled_file, temp_dir, identifiers, reassigned):
    """ Creates FASTA files with the information contained in
    a pickled file.

    Parameters
    ----------
    locus : str
        The identifier of the locus with '.fasta' suffix.
    pickled_file : str
        Path to the pickled file with a dictionary that has
        integer identifiers as keys and a tuple with two elements:
        the identifier that should be assigned to the allele
        (might differ from the key if the allele is new, in which
        case it starts with '*') and the DNA sequence of the allele.
    temp_dir : str
        Path to the directory where the output FASTA file will
        be created.
    identifiers : dict
        The `zip_res` variable returned by the
        :py:func:`upload_alleles_data` function. It will be used
        to change allele identifiers that were successfully
        inserted into the Chewie-NS.
    reassigned : dict
        Dictionary with loci identifiers as keys and dictionaries
        that map old allele identifiers to new identifiers as values.

    Returns
    -------
    reassigned : dict
        The input `reassigned` dictionary updated with the
        identifiers attributed by the Chewie-NS for this locus.
    """

    # remove the '.fasta' suffix to get the locus identifier
    # (str.rstrip is not used because it strips any trailing
    # characters in the given set, not the exact suffix)
    locus_id = locus.split('.fasta')[0]
    locus_int = locus_id.split('-')[-1].lstrip('0')
    if locus_int in identifiers:
        repeated = identifiers[locus_int][0]
        attributed = identifiers[locus_int][1]
    else:
        repeated = {}
        attributed = {}

    inv_reassigned = {}
    if locus in reassigned:
        inv_reassigned = {v: k for k, v in reassigned[locus].items()}

    locus_sequences = fo.pickle_loader(pickled_file)
    sorted_ids = sorted(locus_sequences)

    fasta_path = os.path.join(temp_dir, locus)
    records = []
    for seqid in sorted_ids:
        recid = locus_sequences[seqid][0]
        seq = locus_sequences[seqid][1]
        seq_hash = hashlib.sha256(seq.encode('utf-8')).hexdigest()
        # switch to the identifier attributed by the Chewie-NS
        if seq_hash in attributed:
            new_recid = attributed[seq_hash]
            if recid in inv_reassigned:
                old_id = inv_reassigned[recid]
                reassigned[locus][old_id] = new_recid
            else:
                if locus not in reassigned:
                    reassigned[locus] = {recid: new_recid}
                else:
                    reassigned[locus][recid] = new_recid
            recid = new_recid
        # sequence was already in the Chewie-NS, switch to its identifier
        elif seq_hash in repeated:
            new_recid = repeated[seq_hash]
            if recid in inv_reassigned:
                old_id = inv_reassigned[recid]
                reassigned[locus][old_id] = new_recid
            else:
                if locus not in reassigned:
                    reassigned[locus] = {recid: new_recid}
                else:
                    reassigned[locus][recid] = new_recid
            recid = new_recid

        record = '>{0}_{1}\n{2}'.format(locus_id, recid, seq)
        records.append(record)

    fasta_text = '\n'.join(records)
    with open(fasta_path, 'w') as fp:
        fp.write(fasta_text)

    os.remove(pickled_file)

    return reassigned
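
# Illustrative usage sketch (hypothetical inputs): convert the pickled file
# for one locus back into a FASTA file, updating the reassignment mapping
# with the identifiers returned by the Chewie-NS. The `results` value would
# be the `zip_res` dictionary returned by upload_alleles_data.
#
#   reassigned = {}
#   reassigned = pickle_to_fasta('locus-000001.fasta',
#                                '/path/to/temp/locus-000001_pickled',
#                                '/path/to/temp', results, reassigned)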
def upload_alleles_data(alleles_data, length_files, base_url, headers_post,
                        headers_post_bytes, species_id, schema_id):
    """ Uploads files with the data to insert alleles and the
    length values for the sequences of each locus.

    Parameters
    ----------
    alleles_data : list
        List with tuples, one per locus, that contain the path
        to the ZIP archive with the data to insert alleles, the
        identifier of the locus and the name of the locus.
    length_files : list
        List with paths to the pickled files that contain a
        dictionary with sequences hashes as keys and sequence
        length as values.
    base_url : str
        Base URL of the Nomenclature server.
    headers_post : dict
        HTTP headers for POST requests that accept JSON formatted data.
    headers_post_bytes : dict
        HTTP headers for POST requests that support file upload.
    species_id : int
        The identifier of the schema's species in the NS.
    schema_id : int
        The identifier of the schema in the NS.

    Returns
    -------
    failed : list of str
        List with the identifiers of the loci whose alleles data
        could not be fully uploaded.
    zip_res : dict
        A dictionary with the response returned by the last POST
        method. It has loci identifiers as keys and lists with two
        dictionaries as values (the dictionaries have sequences
        hashes as keys and sequence identifiers in the Chewie-NS as
        values. The first dictionary has the hashes of the sequences
        that were sent to the Chewie-NS but were already present in
        the loci, with the identifiers of those repeated alleles.
        The second dictionary has the same structure but for the
        sequences that were accepted and inserted into each locus).
    """

    uploaded = 0
    failed = []
    for i, a in enumerate(alleles_data):
        locus_id = a[1]

        # get length of alleles from current locus
        current_len = length_files[i]
        data = fo.pickle_loader(current_len)
        data = {locus_id: data[next(iter(data))]}
        data = {'content': data}

        # send length data to the NS
        send_url = cr.make_url(base_url, 'species', species_id,
                               'schemas', schema_id, 'loci',
                               locus_id, 'lengths')

        lengths_res = cr.simple_post_request(send_url, headers_post,
                                             data=json.dumps(data))[1]
        length_status = lengths_res.status_code

        # get path to ZIP archive with data to insert alleles
        current_zip = a[0]

        # send data to insert alleles in the NS
        zip_url = cr.make_url(base_url, 'species', species_id,
                              'schemas', schema_id, 'loci',
                              locus_id, 'update')

        # signal that this is the last locus to be sent
        if alleles_data[i] == alleles_data[-1]:
            headers_post_bytes['complete'] = 'True'

        zip_res = cr.upload_file(current_zip, os.path.basename(current_zip),
                                 zip_url, headers_post_bytes, False)
        zip_status = zip_res.status_code

        # determine if upload was successful
        if length_status not in [200, 201] or zip_status not in [200, 201]:
            failed.append(locus_id)
        elif length_status in [200, 201] and zip_status in [200, 201]:
            uploaded += 1
            print('\r', ' Sent data for alleles of '
                  '{0}/{1} loci.'.format(uploaded, len(alleles_data)), end='')

    return [failed, zip_res]
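
# Illustrative usage sketch (hypothetical values; `alleles_data`,
# `length_files` and the headers would be created earlier in the syncing
# process): upload the zipped alleles data and the pickled length files for
# each locus, then report the loci that failed.
#
#   failed, zip_res = upload_alleles_data(alleles_data, length_files,
#                                         nomenclature_server, headers_post,
#                                         headers_post_bytes, species_id,
#                                         schema_id)
#   if len(failed) > 0:
#       print('Failed to upload data for loci: {0}'.format(','.join(failed)))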