def single_schema(species_id, schema_id, virtuoso_graph, local_sparql, base_url): """ """ start = time.time() start_date = dt.datetime.now() start_date_str = dt.datetime.strftime(start_date, '%Y-%m-%dT%H:%M:%S') logging.info('Started determination of loci and alleles counts at: {0}'.format(start_date_str)) # create species uri species_uri = '{0}species/{1}'.format(base_url, species_id) species_result = aux.get_data(SPARQLWrapper(local_sparql), sparql_queries.SELECT_SINGLE_SPECIES.format(virtuoso_graph, species_uri)) result_data = species_result['results']['bindings'] if len(result_data) == 0: logging.warning('Could not find species with identifier {0}. ' 'Aborting.\n\n'.format(species_id)) sys.exit(1) schema_uri = '{0}/schemas/{1}'.format(species_uri, schema_id) schema_info = aux.get_data(SPARQLWrapper(local_sparql), (sparql_queries.SELECT_SPECIES_SCHEMA.format(virtuoso_graph, schema_uri))) schema_properties = schema_info['results']['bindings'] if len(schema_properties) == 0: logging.warning('Could not find properties values for schema with identifier {0}. ' 'Aborting.\n\n'.format(schema_id)) sys.exit(1) last_modified = schema_properties[0]['last_modified']['value'] # list files in folder computed_dir = Config.PRE_COMPUTE computed_files = os.listdir(computed_dir) # check if folder with schema alleles lengths files exists lengths_dir = '{0}_{1}_lengths'.format(species_id, schema_id) # get files with species prefix species_prefix = 'loci_{0}'.format(species_id) species_files = [f for f in computed_files if f.startswith(species_prefix)] species_file = os.path.join(computed_dir, '{0}.json'.format(species_prefix)) if len(species_files) == 0: create_file(species_file, {'message': []}) if lengths_dir in computed_files: lengths_dir = os.path.join(computed_dir, lengths_dir) fast_update(schema_uri, last_modified, species_file, lengths_dir) else: full_update(schema_uri, last_modified, species_file, virtuoso_graph, local_sparql) end = time.time() delta = end - start print(delta)
def full_update(schema, last_modified, file, virtuoso_graph, local_sparql):
    """ Determines the number of alleles per locus for a schema and updates the pre-computed file. """

    schema_id = int(schema.split('/')[-1])

    current_file = file
    # read current file
    with open(current_file, 'r') as json_file:
        json_data = json.load(json_file)

    json_schemas = json_data['message']
    schemas_indexes = {int(s['schema'].split('/')[-1]): i
                       for i, s in enumerate(json_schemas)}

    # if the schema is in the json file
    if schema_id in schemas_indexes:
        current_schema = json_schemas[schemas_indexes[schema_id]]
        # get modification date in json file
        json_date = current_schema['last_modified']
        virtuoso_date = last_modified
        if json_date == virtuoso_date:
            logging.info('Information about number of loci and number of '
                         'alleles for schema {0} is up-to-date.'.format(schema))
        elif json_date != virtuoso_date:
            result = aux.get_data(SPARQLWrapper(local_sparql),
                                  (sparql_queries.COUNT_SINGLE_SCHEMA_LOCI_ALLELE.format(virtuoso_graph, schema)))
            result_data = result['results']['bindings']

            loci_data = [{'locus': r['locus']['value'],
                          'nr_alleles': r['nr_alleles']['value']}
                         for r in result_data]

            proc_data = {'schema': schema,
                         'last_modified': virtuoso_date,
                         'loci': loci_data}

            json_data['message'][schemas_indexes[schema_id]] = proc_data
            with open(current_file, 'w') as json_outfile:
                json.dump(json_data, json_outfile)

            logging.info('Updated data for schema {0}'.format(schema))
    # new schema that is not in the json file
    elif schema_id not in schemas_indexes:
        result = aux.get_data(SPARQLWrapper(local_sparql),
                              (sparql_queries.COUNT_SINGLE_SCHEMA_LOCI_ALLELE.format(virtuoso_graph, schema)))
        result_data = result['results']['bindings']

        loci_data = [{'locus': r['locus']['value'],
                      'nr_alleles': r['nr_alleles']['value']}
                     for r in result_data]

        proc_data = {'schema': schema,
                     'last_modified': last_modified,
                     'loci': loci_data}

        if len(result_data) > 0:
            json_data['message'].append(proc_data)
            with open(current_file, 'w') as json_outfile:
                json.dump(json_data, json_outfile)
def schema_loci(schema_uri, local_sparql, virtuoso_graph):
    """ Gets the list of loci for a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.

        Returns
        -------
        loci_list : list of tup
            A list with tuples. Each tuple has two
            elements, a locus name and a locus URI.
    """

    # get loci
    loci_result = aux.get_data(SPARQLWrapper(local_sparql),
                               (sq.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema_uri)))

    # check if schema has loci
    loci_list = loci_result['results']['bindings']
    if loci_list != []:
        loci_list = [(l['name']['value'], l['locus']['value'])
                     for l in loci_list]

    return loci_list
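# Illustrative usage sketch for schema_loci: fetch a schema's loci and print
# the (name, URI) pairs it returns. It relies on this module's existing
# imports; the endpoint, graph and schema URI values below are hypothetical
# placeholders and a reachable Chewie-NS SPARQL endpoint is assumed.
def example_print_schema_loci():
    local_sparql = 'http://localhost:8890/sparql'
    virtuoso_graph = 'http://localhost:8890/chewiens'
    schema_uri = 'http://localhost/NS/api/species/1/schemas/1'
    loci = schema_loci(schema_uri, local_sparql, virtuoso_graph)
    # each element is a (locus name, locus URI) tuple
    for locus_name, locus_uri in loci:
        print('{0}\t{1}'.format(locus_name, locus_uri))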
def get_species(local_sparql, virtuoso_graph):
    """ Gets the list of species in the Chewie-NS.

        Parameters
        ----------
        local_sparql : str
            URI of the local SPARQL endpoint.
        virtuoso_graph : str
            URI of the default Virtuoso Graph.

        Returns
        -------
        species_list : dict
            A dictionary with species URIs as keys and species
            names as values. None if there are no species in
            the Chewie-NS.
    """

    # get the list of species in NS
    species_result = aux.get_data(SPARQLWrapper(local_sparql),
                                  (sq.SELECT_SPECIES.format(virtuoso_graph, ' typon:name ?name. ')))
    species = species_result['results']['bindings']
    if len(species) == 0:
        species_list = None
    else:
        species_list = {s['species']['value']: s['name']['value']
                        for s in species}

    return species_list
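# Illustrative usage sketch for get_species: list every species with its
# integer identifier. Relies on this module's imports; the endpoint and graph
# values are hypothetical placeholders and a live endpoint is assumed.
def example_list_species():
    local_sparql = 'http://localhost:8890/sparql'
    virtuoso_graph = 'http://localhost:8890/chewiens'
    species = get_species(local_sparql, virtuoso_graph)
    if species is None:
        print('No species in the Chewie-NS.')
    else:
        for species_uri, species_name in species.items():
            species_id = species_uri.split('/')[-1]
            print('{0}: {1}'.format(species_id, species_name))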
def determine_date(schema_uri, local_sparql, virtuoso_graph):
    """ Gets the last modification date for a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.

        Returns
        -------
        A list with the following variables:

        - last_date (str): The last modification date in
          the format YYYY-MM-DDTHH:MM:SS.f.
        - lock_state (str): Locking state of the schema.
        - schema_info (dict): A dictionary with schema
          properties values.
    """

    # get schema last modification date
    date_result = aux.get_data(SPARQLWrapper(local_sparql),
                               (sq.SELECT_SPECIES_SCHEMA.format(virtuoso_graph, schema_uri)))

    schema_info = date_result['results']['bindings'][0]

    lock_state = schema_info['Schema_lock']['value']
    last_date = schema_info['last_modified']['value']

    return [last_date, lock_state, schema_info]
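# Illustrative usage sketch for determine_date: read a schema's lock state and
# last modification date before acting on it. Relies on this module's imports;
# the endpoint, graph and schema URI values are hypothetical placeholders.
def example_check_schema_date():
    local_sparql = 'http://localhost:8890/sparql'
    virtuoso_graph = 'http://localhost:8890/chewiens'
    schema_uri = 'http://localhost/NS/api/species/1/schemas/1'
    last_date, lock_state, schema_info = determine_date(schema_uri,
                                                        local_sparql,
                                                        virtuoso_graph)
    if lock_state == 'Unlocked':
        print('Schema last modified at {0}'.format(last_date))
    else:
        print('Schema {0} is locked.'.format(schema_uri))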
def rm_loci(identifier, virtuoso_graph, local_sparql, base_url, virtuoso_user, virtuoso_pass): """ """ total_triples = 0 # check input type if os.path.isfile(identifier) is False: if ',' in identifier: loci_ids = identifier.split(',') else: loci_ids = [identifier] else: with open(identifier, 'r') as ids: loci_ids = [l.strip() for l in ids.readlines()] # create loci URIs loci_uris = ['{0}loci/{1}'.format(base_url, i) for i in loci_ids] logging.info('Started rm process for loci: {0}.'.format(loci_ids)) # check if loci exist invalid = [] for locus in loci_uris: locus_result = aux.get_data(SPARQLWrapper(local_sparql), (sq.ASK_LOCUS.format(locus))) if locus_result['boolean'] is not True: invalid.append(locus) logging.info('Could not find locus {0}.\n'.format(locus)) # exclude invalid URIs loci_uris = [l for l in loci_uris if l not in invalid] print('\nLoci to delete: {0}\n'.format(loci_uris)) results = collapse_loci(loci_uris, virtuoso_graph, local_sparql, virtuoso_user, virtuoso_pass) total_triples += results[0] print('\nDeleted a total of {0} triples.'.format(total_triples)) logging.info('Deleted a total of {0} triples.'.format(total_triples)) print('({0} loci, {1} species links, {2} schema links, ' '{3} alleles)'.format(results[2], results[3], results[4], results[1])) logging.info('({0} loci, {1} species links, {2} schema links, ' '{3} alleles)'.format(results[2], results[3], results[4], results[1])) return_dict = { 'loci': results[2], 'splinks': results[3], 'sclinks': results[4], 'alleles': results[1], 'total_triples': total_triples } return return_dict
def create_queries(locus_file, virtuoso_graph, local_sparql, base_url):
    """ Creates SPARQL INSERT queries to add the novel alleles sent by a user to a locus. """

    # get sequences sent by user
    with open(locus_file, 'rb') as f:
        locus_data = pickle.load(f)

    locus_url = locus_data[0]
    locus_id = locus_url.split('/')[-1]

    # get sequences in the NS
    sequences = fasta_sequences(locus_url, local_sparql, virtuoso_graph)
    ns_seqs = {f['nucSeq']['value']: f['allele_id']['value']
               for f in sequences}

    # count number of alleles for locus
    count_query = (sq.COUNT_LOCUS_ALLELES.format(virtuoso_graph, locus_url))
    count_res = aux.get_data(SPARQLWrapper(local_sparql), count_query)
    start_id = int(count_res['results']['bindings'][0]['count']['value']) + 1

    spec_name = locus_data[1]
    user_url = locus_data[2]

    alleles = locus_data[3]
    novel = [a for a in alleles if a not in ns_seqs]
    repeated = {hashlib.sha256(a.encode('utf-8')).hexdigest(): ns_seqs[a]
                for a in alleles if a in ns_seqs}

    attributed = {}
    if len(novel) > 0:
        max_length = max([len(a) for a in novel])

        if max_length < 7000:
            queries, attributed = create_multiple_insert(novel, spec_name,
                                                         locus_url, user_url,
                                                         start_id, base_url,
                                                         virtuoso_graph,
                                                         attributed)
        else:
            queries, attributed = create_single_insert(novel, spec_name,
                                                       locus_url, user_url,
                                                       start_id, base_url,
                                                       virtuoso_graph,
                                                       attributed)

        queries_file = '{0}_queries'.format(locus_file.split('alleles')[0])
        with open(queries_file, 'wb') as qf:
            pickle.dump(queries, qf)

        return [queries_file, locus_id, repeated, attributed]
    else:
        return [None, locus_id, repeated, attributed]
def rm_loci_links(mode, identifier, virtuoso_graph, local_sparql, base_url, virtuoso_user, virtuoso_pass): """ """ total_triples = 0 # check input type if os.path.isfile(identifier) is False: if ',' in identifier: loci_ids = identifier.split(',') else: loci_ids = [identifier] else: with open(identifier, 'r') as ids: loci_ids = [l.strip() for l in ids.readlines()] # create loci URIs loci_uris = ['{0}loci/{1}'.format(base_url, i) for i in loci_ids] logging.info('Started rm process to delete {0} for loci: {1}.'.format( mode, loci_uris)) # check if loci exist invalid = [] for locus in loci_uris: locus_result = aux.get_data(SPARQLWrapper(local_sparql), (sq.ASK_LOCUS.format(locus))) if locus_result['boolean'] is not True: invalid.append(locus) logging.info('Could not find locus {0}.\n'.format(locus)) # exclude invalid URIs loci_uris = [[l] for l in loci_uris if l not in invalid] if mode == 'splinks': statement = sq.DELETE_SPECIES_LOCUS elif mode == 'sclinks': statement = sq.DELETE_SCHEMA_LOCUS # delete loci links to species print('Deleting loci {0}...'.format(mode)) deleted, stderr, noeffect, triples = \ multiple_delete(statement, loci_uris, virtuoso_graph, local_sparql, virtuoso_user, virtuoso_pass) total_links = int(triples) if mode == 'splinks' else int(triples / 4) stdout_text = 'Deleted {0} {1} ({2} triples).'.format( deleted, mode, triples) log_results(stdout_text, stderr, noeffect) return_dict = {'{0}'.format(mode): total_links, 'total_triples': triples} return return_dict
def count_alleles(schema, virtuoso_graph, local_sparql):
    """ Counts the total number of alleles in a schema. """

    # get total number of alleles
    loci = aux.get_data(SPARQLWrapper(local_sparql),
                        sparql_queries.COUNT_SCHEMA_ALLELES.format(virtuoso_graph, schema))
    loci = loci['results']['bindings']
    total_alleles = sum(map(int, [a['nr_allele']['value'] for a in loci]))

    return total_alleles
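# Illustrative sketch of the aggregation performed by count_alleles, applied
# to a mocked Virtuoso response so it can run without an endpoint. The counts
# below are made-up values that only show the expected binding shape.
def example_sum_allele_counts():
    mocked_bindings = [{'nr_allele': {'value': '120'}},
                       {'nr_allele': {'value': '87'}},
                       {'nr_allele': {'value': '1'}}]
    total_alleles = sum(map(int, [a['nr_allele']['value']
                                  for a in mocked_bindings]))
    print(total_alleles)  # 208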
def global_species(virtuoso_graph, local_sparql, base_url):
    """ Runs the single species update for all species in the Chewie-NS. """

    # get all species in the NS
    species_result = aux.get_data(SPARQLWrapper(local_sparql),
                                  sparql_queries.SELECT_SPECIES.format(virtuoso_graph, ' typon:name ?name. '))
    result_data = species_result['results']['bindings']

    ns_species = {s['species']['value']: s['name']['value']
                  for s in result_data}

    species_ids = [s.split('/')[-1] for s in ns_species]
    for i in species_ids:
        single_species(i, virtuoso_graph, local_sparql, base_url)
def alleles_lengths(total_alleles, schema, offset, limit, virtuoso_graph,
                    local_sparql):
    """ Gets the length values for all alleles in a schema, paging through results with OFFSET/LIMIT. """

    count = 0
    result = []
    while count != total_alleles:
        alleles = aux.get_data(SPARQLWrapper(local_sparql),
                               sparql_queries.SELECT_ALLELES_LENGTH.format(virtuoso_graph, schema, offset, limit))

        data = alleles['results']['bindings']
        result.extend(data)

        count += len(data)
        offset += limit

    return result
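# Illustrative usage sketch for alleles_lengths: page through all allele
# length records of a schema in fixed-size batches. Relies on this module's
# imports and assumes count_alleles from this module provides the expected
# total; the endpoint, graph, schema URI and page size are hypothetical
# placeholders.
def example_fetch_allele_lengths():
    local_sparql = 'http://localhost:8890/sparql'
    virtuoso_graph = 'http://localhost:8890/chewiens'
    schema_uri = 'http://localhost/NS/api/species/1/schemas/1'
    total_alleles = count_alleles(schema_uri, virtuoso_graph, local_sparql)
    # fetch results in pages of 10000, starting at offset 0
    lengths = alleles_lengths(total_alleles, schema_uri, 0, 10000,
                              virtuoso_graph, local_sparql)
    print('Retrieved {0} allele length records.'.format(len(lengths)))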
def loci_annotations(schema, virtuoso_graph, local_sparql):
    """ Gets the annotations for all loci in a schema. """

    # get loci annotations (retry up to 5 times)
    tries = 0
    fetched = False
    while fetched is False:
        loci = aux.get_data(SPARQLWrapper(local_sparql),
                            sparql_queries.SELECT_SCHEMA_LOCI_ANNOTATIONS.format(virtuoso_graph, schema))
        try:
            annotations = loci['results']['bindings']
            fetched = True
        except Exception:
            print(sparql_queries.SELECT_SCHEMA_LOCI_ANNOTATIONS.format(virtuoso_graph, schema))
            logging.warning('Could not get annotations.')
            logging.warning(loci)
            tries += 1
            if tries == 5:
                sys.exit('Could not retrieve annotations for schema {0}.'.format(schema))

    annotations = [{'locus': l['locus']['value'],
                    'name': l['name']['value'],
                    'original_name': l['original_name']['value'],
                    'UniprotName': l['UniprotName']['value'],
                    'UniprotURI': l['UniprotURI']['value'],
                    'UserAnnotation': l['UserAnnotation']['value'],
                    'CustomAnnotation': l['CustomAnnotation']['value']}
                   for l in annotations]

    return annotations
def determine_date(schema_uri, local_sparql, virtuoso_graph):
    """ Gets the insertion date for a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.

        Returns
        -------
        insertion_date : str
            The insertion date in the format
            YYYY-MM-DDTHH:MM:SS.f.
    """

    # get schema insertion date
    date_result = aux.get_data(SPARQLWrapper(local_sparql),
                               (sq.SELECT_SPECIES_SCHEMA.format(virtuoso_graph, schema_uri)))

    schema_info = date_result['results']['bindings'][0]

    insertion_date = schema_info['dateEntered']['value']

    return insertion_date
def species_schemas(species_uri, schemas, local_sparql, virtuoso_graph): """ Gets the list of schemas for a species. Parameters ---------- species_uri : str The URI of the species in the Chewie-NS. schemas : dict An empty dictionary to store schemas' data. Returns ------- A list with the following variables: - status (int): status code of the response. - schemas (dict): A dictionary with the species URI as key and a list of tuples as value. Each tuple has a schema URI and the name of that schema. """ result = aux.get_data( SPARQLWrapper(local_sparql), (sq.SELECT_SPECIES_SCHEMAS.format(virtuoso_graph, species_uri))) try: ns_schemas = result['results']['bindings'] if len(ns_schemas) > 0: for schema in ns_schemas: schemas.setdefault(species_uri, []).append( (schema['schemas']['value'], schema['name']['value'])) except Exception: logging.warning('Could not retrieve schemas for ' '{0}. Exception:\n{1}'.format(species_uri, result)) return schemas
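# Illustrative usage sketch for species_schemas: collect the schemas of
# several species into one dictionary by reusing the same accumulator across
# calls. Relies on this module's imports; the endpoint, graph and species URI
# values are hypothetical placeholders.
def example_collect_species_schemas():
    local_sparql = 'http://localhost:8890/sparql'
    virtuoso_graph = 'http://localhost:8890/chewiens'
    species_uris = ['http://localhost/NS/api/species/1',
                    'http://localhost/NS/api/species/2']
    schemas = {}
    for species_uri in species_uris:
        schemas = species_schemas(species_uri, schemas,
                                  local_sparql, virtuoso_graph)
    for sp_uri, sp_schemas in schemas.items():
        print('{0}: {1} schemas'.format(sp_uri, len(sp_schemas)))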
def main(temp_dir, graph, sparql, base_url, user, password, c_user): start = time.time() # get species and schema identifiers species_id = os.path.basename(temp_dir).split('_')[0] schema_id = os.path.basename(temp_dir).split('_')[1] # create schema URI schema_uri = '{0}species/{1}/schemas/{2}'.format(base_url, species_id, schema_id) # count total loci count_schema_loci = (sq.COUNT_SINGLE_SCHEMA_LOCI_ALLELES.format( graph, schema_uri)) count_schema_loci_res = aux.get_data(SPARQLWrapper(sparql), count_schema_loci) total_loci = int( count_schema_loci_res["results"]["bindings"][0]["nr_loci"]["value"]) post_files = [ os.path.join(temp_dir, file) for file in os.listdir(temp_dir) ] # extract files schema_files = [] for file in post_files: dest_dir = os.path.dirname(file) locus_file = unzip(file, dest_dir) locus_file = os.path.join(temp_dir, locus_file) schema_files.append(locus_file) new_alleles_per_locus = [] # create SPARQL multiple INSERT queries new_seqs = 0 identifiers = {} queries_files = [] with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: for res in executor.map(create_queries, schema_files, repeat(graph), repeat(sparql), repeat(base_url)): if res[0] is not None: queries_files.append(res[0]) identifiers[res[1]] = [res[2], res[3]] new_seqs += len(res[3]) new_alleles_per_locus.append( {"locus_{0}".format(res[1]): { "newAlleles": [len(res[3])] }}) # create/update allele contributions file allele_contrib_path = os.path.join( "pre-computed-data", "allele_contributions_{0}_{1}.json".format(species_id, schema_id)) if os.path.exists(allele_contrib_path): update_contributions_file(allele_contrib_path, schema_uri, c_user, new_alleles_per_locus, new_seqs) else: create_contributions_file(allele_contrib_path, schema_uri, c_user, new_alleles_per_locus, new_seqs, total_loci) start = time.time() # insert data # sort reponses to include summary in log file # create lock file with open(sync_lock, 'w') as lf: lf.write('{0}\n{1}'.format(temp_dir, user)) post_results = send_alleles(queries_files, sparql, user, password) # remove lock file after insertion os.remove(sync_lock) # create file with identifiers identifiers_file = os.path.join(temp_dir, 'identifiers') with open(identifiers_file, 'wb') as rf: pickle.dump(identifiers, rf) end = time.time() delta = end - start print('Insertion: {0}'.format(delta), flush=True) # change last_modified date if new_seqs > 0: modification_date = str( dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')) change_date(schema_uri, 'last_modified', modification_date, graph, sparql, user, password) # change_schema_version(schema_uri, '2.5.1', # graph, sparql, user, password) # create pre-computed frontend files os.system('python schema_totals.py -m single_schema ' '--sp {0} --sc {1} --g {2} --s {3} --b {4}' ''.format(species_id, schema_id, graph, sparql, base_url)) os.system('python loci_totals.py -m single_schema ' '--sp {0} --sc {1} --g {2} --s {3} --b {4}' ''.format(species_id, schema_id, graph, sparql, base_url)) os.system('python loci_mode.py -m single_schema ' '--sp {0} --sc {1} --g {2} --s {3} --b {4}' ''.format(species_id, schema_id, graph, sparql, base_url)) os.system('python annotations.py -m single_schema ' '--sp {0} --sc {1} --g {2} --s {3} --b {4}' ''.format(species_id, schema_id, graph, sparql, base_url)) os.system('python loci_boxplot.py -m single_schema ' '--sp {0} --sc {1} --g {2} --s {3} --b {4}' ''.format(species_id, schema_id, graph, sparql, base_url)) # unlock schema change_lock(schema_uri, 'Unlocked', graph, sparql, user, password)
def main(temp_dir, graph, sparql, base_url, user, password):

    # get species and schema identifiers
    species_id = os.path.basename(temp_dir).split('_')[0]
    schema_id = os.path.basename(temp_dir).split('_')[1]

    # create species and schema URIs
    species_uri = '{0}species/{1}'.format(base_url, species_id)
    schema_uri = '{0}/schemas/{1}'.format(species_uri, schema_id)

    logging.info('Started loci insertion for '
                 'schema {0}'.format(schema_uri))

    # define path to file with loci data
    loci_file = os.path.join(temp_dir,
                             '{0}_{1}_loci'.format(species_id, schema_id))

    # read loci data
    if os.path.isfile(loci_file) is True:
        with open(loci_file, 'rb') as lf:
            loci_prefix, loci_data = pickle.load(lf)
        logging.info('Loci prefix is {0}.'.format(loci_prefix))
    else:
        logging.warning('Could not find file {0}. '
                        'Aborting.\n\n'.format(loci_file))
        sys.exit(1)

    # determine locus with highest identifier
    result = aux.get_data(SPARQLWrapper(sparql),
                          (sq.SELECT_HIGHEST_LOCUS.format(graph)))
    highest_locus = result['results']['bindings']

    # if there are no loci
    if highest_locus != []:
        highest_locus = highest_locus[0]['locus']['value']
        highest_id = int(highest_locus.split('/')[-1])
        start_id = highest_id + 1
    elif highest_locus == []:
        start_id = 1

    # define path to file with schema upload status data
    hashes_file = os.path.join(temp_dir,
                               '{0}_{1}_hashes'.format(species_id, schema_id))

    # read schema upload status data
    if os.path.isfile(hashes_file) is True:
        with open(hashes_file, 'rb') as hf:
            schema_hashes = pickle.load(hf)
    else:
        logging.warning('Could not find schema upload status file. '
                        'Aborting.\n\n')
        sys.exit(1)

    # assign identifiers to new loci based on the total
    # number of loci in the Chewie-NS
    response, hash_to_uri, loci_data = assign_identifiers(loci_data,
                                                          schema_hashes,
                                                          loci_prefix,
                                                          start_id,
                                                          base_url)

    # insert loci
    insert_queries, insert = create_insert_queries(loci_data,
                                                   schema_hashes,
                                                   graph)
    logging.info('{0} loci to insert out of {1} total loci '
                 'in schema.'.format(insert, len(loci_data)))
    logging.info('Loci integer identifiers interval: [{0} .. {1}]'
                 ''.format(start_id, start_id + insert))

    # insert data to create loci
    if len(insert_queries) > 0:
        loci_insertion = send_queries(insert_queries, sparql, user, password)
        insert_status, success, failed = results_status(loci_insertion)
        for h, v in insert_status.items():
            if v is True:
                schema_hashes[h][1][0] = hash_to_uri[h]
                response[h].append(v)
            elif v is False:
                response[h].append(v)
        logging.info('Successfully inserted {0} loci. '
                     'Failed {1}'.format(success, failed))

        # halt process if it could not insert all loci
        if failed > 0:
            logging.warning('Could not insert all loci. Aborting.\n\n')
            sys.exit(1)

    # link loci to species
    sp_queries, link = species_link_queries(loci_data,
                                            schema_hashes,
                                            species_uri,
                                            graph)
    logging.info('{0} loci to link to species out of {1} total loci '
                 'in schema.'.format(link, len(loci_data)))

    if len(sp_queries) > 0:
        species_links = send_queries(sp_queries, sparql, user, password)
        link_status, success, failed = results_status(species_links)
        for h, v in link_status.items():
            if v is True:
                schema_hashes[h][1][1] = True
                response[h].append(v)
            elif v is False:
                response[h].append(v)
        logging.info('Successfully linked {0} loci to species. '
                     'Failed {1}'.format(success, failed))

    # link loci to schema
    sc_queries, link = schema_link_queries(loci_data,
                                           schema_hashes,
                                           schema_uri,
                                           graph)
    logging.info('{0} loci to link to schema out of {1} total loci '
                 'in schema.'.format(link, len(loci_data)))

    if len(sc_queries) > 0:
        schema_links = send_queries(sc_queries, sparql, user, password)
        link_status, success, failed = results_status(schema_links)
        for h, v in link_status.items():
            if v is True:
                schema_hashes[h][1][2] = True
                response[h].append(v)
            elif v is False:
                response[h].append(v)
        logging.info('Successfully linked {0} loci to schema. '
                     'Failed {1}'.format(success, failed))

    # save updated schema hashes
    with open(hashes_file, 'wb') as hf:
        pickle.dump(schema_hashes, hf)

    # write response to file
    response_file = os.path.join(temp_dir,
                                 '{0}_{1}_loci_response'.format(species_id,
                                                                schema_id))
    with open(response_file, 'wb') as rf:
        pickle.dump(response, rf)

    # remove temp file
    os.remove(loci_file)

    logging.info('Finished loci insertion for '
                 'schema {0}\n\n'.format(schema_uri))

    return response_file
def rm_schema(identifier, species_id, virtuoso_graph, local_sparql, base_url,
              virtuoso_user, virtuoso_pass):
    """ Deletes a schema, its loci links and the associated pre-computed files. """

    total_triples = 0

    # create schema URI
    schema_uri = ('{0}species/{1}/schemas/{2}'
                  '').format(base_url, species_id, identifier)

    logging.info('Started rm process for schema {0}.'.format(schema_uri))

    # check if schema exists
    schema_result = aux.get_data(SPARQLWrapper(local_sparql),
                                 (sq.ASK_SCHEMA.format(schema_uri)))

    if schema_result['boolean'] is not True:
        logging.info('Could not find schema.\n')
        sys.exit('\nThere is no schema with specified ID.')

    print('\nDeleting loci and alleles for schema: {0}'.format(schema_uri))

    # get schema's loci
    schema_result = aux.get_data(SPARQLWrapper(local_sparql),
                                 (sq.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema_uri)))
    schema_result = schema_result['results']['bindings']

    results = [0, 0, 0, 0, 0]
    if len(schema_result) == 0:
        logging.info('{0} has no loci.'.format(schema_uri))
    else:
        loci_uris = [l['locus']['value'] for l in schema_result]
        print('Loci to delete: {0}\n'.format(len(loci_uris)))
        logging.info('{0} loci to delete'.format(len(loci_uris)))

        # collapse all loci (sequences are not deleted)
        results = collapse_loci(loci_uris, virtuoso_graph, local_sparql,
                                virtuoso_user, virtuoso_pass)
        total_triples += results[0]

    # delete description
    #schema_desc = aux.get_data(SPARQLWrapper(local_sparql),
    #                           (sq.SELECT_SCHEMA_DESCRIPTION.format(virtuoso_graph, schema_uri)))
    #schema_desc = schema_desc['results']['bindings'][0]['description']['value']
    #desc_file = '{0}/{1}'.format(Config.PRE_COMPUTE, schema_desc)
    #if os.path.isfile(desc_file) is True:
    #    subprocess.call(['rm', desc_file])

    # delete compressed version
    zip_file = [f for f in os.listdir(Config.SCHEMAS_ZIP)
                if f.startswith('{0}_{1}'.format(species_id, identifier))]
    if len(zip_file) > 0:
        zip_file = '{0}/{1}'.format(Config.SCHEMAS_ZIP, zip_file[0])
        subprocess.call(['rm', zip_file])
        print('Deleted compressed version ({0})'.format(zip_file))
        logging.info('Deleted compressed version ({0})'.format(zip_file))

    # delete pre-computed files
    length_files = '{0}/{1}_{2}_lengths'.format(Config.PRE_COMPUTE,
                                                species_id, identifier)
    if os.path.isdir(length_files) is True:
        subprocess.call(['rm', '-rf', length_files])
        print('Deleted directory with length values ({0})'.format(length_files))
        logging.info('Deleted directory with length values ({0})'.format(length_files))

    annotation_file = '{0}/annotations_{1}_{2}.json'.format(Config.PRE_COMPUTE,
                                                            species_id, identifier)
    if os.path.isfile(annotation_file) is True:
        subprocess.call(['rm', annotation_file])
        print('Deleted pre-computed annotations ({0})'.format(annotation_file))
        logging.info('Deleted pre-computed annotations ({0})'.format(annotation_file))

    mode_file = '{0}/mode_{1}_{2}.json'.format(Config.PRE_COMPUTE,
                                               species_id, identifier)
    if os.path.isfile(mode_file) is True:
        subprocess.call(['rm', mode_file])
        print('Deleted pre-computed modes ({0})'.format(mode_file))
        logging.info('Deleted pre-computed modes ({0})'.format(mode_file))

    boxplot_file = '{0}/boxplot_{1}_{2}.json'.format(Config.PRE_COMPUTE,
                                                     species_id, identifier)
    if os.path.isfile(boxplot_file) is True:
        subprocess.call(['rm', boxplot_file])
        print('Deleted pre-computed boxplot values ({0})'.format(boxplot_file))
        logging.info('Deleted pre-computed boxplot values ({0})'.format(boxplot_file))

    # remove schema data from pre-computed files
    loci_file = '{0}/loci_{1}.json'.format(Config.PRE_COMPUTE, species_id)
    if os.path.isfile(loci_file) is True:
        with open(loci_file, 'r') as json_file:
            json_data = json.load(json_file)
        schemas = json_data['message']
        schemas = [s for s in schemas if s['schema'] != schema_uri]
        json_data['message'] = schemas
        with open(loci_file, 'w') as json_outfile:
            json.dump(json_data, json_outfile)
        print('Deleted pre-computed values from file with loci values ({0})'.format(loci_file))
        logging.info('Deleted pre-computed values from file with loci values ({0})'.format(loci_file))

    totals_file = '{0}/totals_{1}.json'.format(Config.PRE_COMPUTE, species_id)
    if os.path.isfile(totals_file) is True:
        with open(totals_file, 'r') as json_file:
            json_data = json.load(json_file)
        schemas = json_data['message']
        schemas = [s for s in schemas if s['uri'] != schema_uri]
        json_data['message'] = schemas
        with open(totals_file, 'w') as json_outfile:
            json.dump(json_data, json_outfile)
        print('Deleted pre-computed values from file with schema totals ({0})'.format(totals_file))
        logging.info('Deleted pre-computed values from file with schema totals ({0})'.format(totals_file))

    # delete schema
    status_code, message = single_delete(sq.DELETE_SCHEMA, [schema_uri],
                                         virtuoso_graph, local_sparql,
                                         virtuoso_user, virtuoso_pass)
    schema_triples = int(extract_triples(message))
    schema_del = 0
    if status_code in [200, 201]:
        if schema_triples > 0:
            schema_del = 1
            print('Deleted {0}'.format(schema_uri))
            logging.info('Deleted {0}'.format(schema_uri))
            total_triples += schema_triples
        else:
            print('Could not delete triples for {0}'.format(schema_uri))
            logging.info('Could not delete triples for {0}'.format(schema_uri))
    else:
        print('Failed to delete schema: {0}'.format(schema_uri))
        logging.info('Failed to delete {0}'.format(schema_uri))
        logging.info('Failed stderr:\n{0}'.format(message))

    print('\nDeleted a total of {0} triples.'.format(total_triples))
    print('({0} loci, {1} species links, {2} schema links, '
          '{3} alleles)'.format(results[2], results[3],
                                results[4], results[1]))

    return_dict = {'schema': schema_del,
                   'loci': results[2],
                   'splinks': results[3],
                   'sclinks': results[4],
                   'alleles': results[1],
                   'total_triples': total_triples}

    return return_dict
def single_species(species_id, virtuoso_graph, local_sparql, base_url): """ """ start_date = dt.datetime.now() start_date_str = dt.datetime.strftime(start_date, '%Y-%m-%dT%H:%M:%S') logging.info( 'Started determination of loci and alleles counts at: {0}'.format( start_date_str)) # create species uri species_uri = '{0}species/{1}'.format(base_url, species_id) species_result = aux.get_data( SPARQLWrapper(local_sparql), sparql_queries.SELECT_SINGLE_SPECIES.format(virtuoso_graph, species_uri)) result_data = species_result['results']['bindings'] if len(result_data) == 0: logging.warning('Could not find species with identifier {0}. ' 'Aborting.\n\n'.format(species_id)) # get all schemas for the species species_result = aux.get_data( SPARQLWrapper(local_sparql), sparql_queries.SELECT_SPECIES_SCHEMAS.format(virtuoso_graph, species_uri)) result_data = species_result['results']['bindings'] if len(result_data) == 0: logging.info('Species has no schemas.') schemas = [s['schemas']['value'] for s in result_data] # sort by integer identifier to be able to fetch schemas by index schemas = sorted(schemas, key=lambda x: int(x.split('/')[-1])) # list files in folder computed_dir = Config.PRE_COMPUTE computed_files = os.listdir(computed_dir) for schema in schemas: schema_id = schema.split('/')[-1] schema_prefix = 'mode_{0}_{1}'.format(species_id, schema_id) schema_files = [ f for f in computed_files if f == '{0}.json'.format(schema_prefix) ] schema_file = os.path.join(computed_dir, '{0}.json'.format(schema_prefix)) # check if schema is locked schema_lock = aux.get_data( SPARQLWrapper(local_sparql), (sparql_queries.ASK_SCHEMA_LOCK.format(schema))) lock_status = schema_lock['boolean'] if lock_status is True: schema_info = aux.get_data( SPARQLWrapper(local_sparql), (sparql_queries.SELECT_SPECIES_SCHEMA.format( virtuoso_graph, schema))) schema_properties = schema_info['results']['bindings'] if len(schema_properties) == 0: logging.warning( 'Could not find properties values for schema with identifier {0}. ' 'Aborting.\n\n'.format(schema_id)) continue last_modified = schema_properties[0]['last_modified']['value'] if len(schema_files) == 0: create_file(schema_file, { 'mode': [], 'total_alleles': [], 'scatter_data': [] }) lengths_dir = '{0}_{1}_lengths'.format(species_id, schema_id) if lengths_dir in computed_files: lengths_dir = os.path.join(computed_dir, lengths_dir) fast_update(schema, last_modified, schema_file, lengths_dir, virtuoso_graph, local_sparql) else: full_update(schema, last_modified, schema_file, virtuoso_graph, local_sparql) else: logging.warning('Schema {0} is locked. Aborting.'.format(schema))
def fast_update(schema, last_modified, file, lengths_dir, virtuoso_graph, local_sparql): """ """ schema_id = int(schema.split('/')[-1]) current_file = file # read current file with open(current_file, 'r') as json_file: json_data = json.load(json_file) loci_modes = json_data['mode'] loci_alleles = json_data['total_alleles'] loci_scatter = json_data['scatter_data'] # get schema loci loci = aux.get_data( SPARQLWrapper(local_sparql), sparql_queries.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema)) loci = loci['results']['bindings'] loci_names = {l['locus']['value']: l['name']['value'] for l in loci} if len(loci_modes) == 0: length_files = [ os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir) ] loci_stats = [] for locus_file in length_files: with open(locus_file, 'rb') as lf: locus_data = pickle.load(lf) locus_uri = list(locus_data.keys())[0] locus_name = loci_names[locus_uri] locus_id = locus_name.split('-')[-1] alleles_lengths = [v for k, v in locus_data[locus_uri].items()] nr_alleles = len(alleles_lengths) locus_mode = Counter(alleles_lengths).most_common()[0][0] locus_mean = round(sum(alleles_lengths) / nr_alleles) locus_median = round(statistics.median(alleles_lengths)) locus_min = min(alleles_lengths) locus_max = max(alleles_lengths) loci_stats.append((locus_name, locus_id, nr_alleles, locus_mode, locus_mean, locus_median, locus_min, locus_max)) modes = determine_modes(loci_stats) total_alleles = loci_total_alleles(loci_stats) scatter_data = get_scatter_data(loci_stats) json_to_file = { 'schema': schema, 'last_modified': last_modified, 'mode': modes, 'total_alleles': total_alleles, 'scatter_data': scatter_data } with open(file, 'w') as json_outfile: json.dump(json_to_file, json_outfile) # if the schema is in the json file elif len(loci_modes) > 0: # get modification date in json file json_date = json_data['last_modified'] virtuoso_date = last_modified if json_date == virtuoso_date: logging.info( 'Information about number for schema {0} is up-to-date.'. format(schema)) elif json_date != virtuoso_date: length_files = [ os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir) ] loci_stats = [] for locus_file in length_files: with open(locus_file, 'rb') as f: locus_data = pickle.load(f) locus_uri = list(locus_data.keys())[0] locus_name = loci_names[locus_uri] locus_id = locus_name.split('-')[-1] alleles_lengths = [v for k, v in locus_data[locus_uri].items()] nr_alleles = len(alleles_lengths) locus_mode = Counter(alleles_lengths).most_common()[0][0] locus_mean = round(sum(alleles_lengths) / nr_alleles) locus_median = round(statistics.median(alleles_lengths)) locus_min = min(alleles_lengths) locus_max = max(alleles_lengths) loci_stats.append( (locus_name, locus_id, nr_alleles, locus_mode, locus_mean, locus_median, locus_min, locus_max)) modes = determine_modes(loci_stats) total_alleles = loci_total_alleles(loci_stats) scatter_data = get_scatter_data(loci_stats) json_to_file = { 'schema': schema, 'last_modified': last_modified, 'mode': modes, 'total_alleles': total_alleles, 'scatter_data': scatter_data } with open(file, 'w') as json_outfile: json.dump(json_to_file, json_outfile) logging.info('Updated data for schema {0}'.format(schema))
def fast_update(schema, last_modified, file, lengths_dir, virtuoso_graph, local_sparql): """ """ schema_id = int(schema.split('/')[-1]) current_file = file # read current file with open(current_file, 'r') as json_file: json_data = json.load(json_file) loci_info = json_data['message'] # get schema loci loci = aux.get_data( SPARQLWrapper(local_sparql), sparql_queries.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema)) loci = loci['results']['bindings'] loci_names = {l['locus']['value']: l['name']['value'] for l in loci} if len(loci_info) == 0: length_files = [ os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir) ] loci_stats = {} for locus_file in length_files: with open(locus_file, 'rb') as lf: locus_data = pickle.load(lf) locus_uri = list(locus_data.keys())[0] locus_name = loci_names[locus_uri] alleles_lengths = [v for k, v in locus_data[locus_uri].items()] total_alleles = len(alleles_lengths) locus_mode = Counter(alleles_lengths).most_common()[0][0] locus_min = min(alleles_lengths) locus_max = max(alleles_lengths) loci_stats[locus_name] = [ locus_mode, total_alleles, locus_min, locus_max ] annotations = loci_annotations(schema, virtuoso_graph, local_sparql) for a in annotations: locus = a['name'] a['mode'] = loci_stats[locus][0] a['nr_alleles'] = loci_stats[locus][1] a['min'] = loci_stats[locus][2] a['max'] = loci_stats[locus][3] json_to_file = { 'schema': schema, 'last_modified': last_modified, 'message': annotations } with open(file, 'w') as json_outfile: json.dump(json_to_file, json_outfile) # if the schema is in the json file elif len(loci_info) > 0: # get modification date in json file json_date = json_data['last_modified'] virtuoso_date = last_modified if json_date == virtuoso_date: logging.info( 'Information about loci annotations and length modes for schema {0} is up-to-date.' .format(schema)) elif json_date != virtuoso_date: length_files = [ os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir) ] loci_stats = {} for locus_file in length_files: with open(locus_file, 'rb') as lf: locus_data = pickle.load(lf) locus_uri = list(locus_data.keys())[0] locus_name = loci_names[locus_uri] alleles_lengths = [v for k, v in locus_data[locus_uri].items()] total_alleles = len(alleles_lengths) locus_mode = Counter(alleles_lengths).most_common()[0][0] locus_min = min(alleles_lengths) locus_max = max(alleles_lengths) loci_stats[locus_name] = [ locus_mode, total_alleles, locus_min, locus_max ] annotations = loci_annotations(schema, virtuoso_graph, local_sparql) for a in annotations: locus = a['name'] a['mode'] = loci_stats[locus][0] a['nr_alleles'] = loci_stats[locus][1] a['min'] = loci_stats[locus][2] a['max'] = loci_stats[locus][3] json_to_file = { 'schema': schema, 'last_modified': last_modified, 'message': annotations } with open(file, 'w') as json_outfile: json.dump(json_to_file, json_outfile) logging.info('Updated data for schema {0}'.format(schema))
def fasta_sequences(locus, date, local_sparql, virtuoso_graph): """ Get the DNA sequences of all alleles of a locus. Parameters ---------- locus : str The URI of the locus in the Chewie-NS. date : str Last modification date of the schema in the format YYYY-MM-DDTHH:MM:SS.f. Returns ------- fasta_seqs : list of dict A list with one dictionary per allele. Each dictionary has the identifier and the DNA sequence of an allele. """ # setting [SPARQL] ResultSetMaxRows = 400000 in virtuoso.ini # is important to return all sequences at once fasta_result = aux.get_data( SPARQLWrapper(local_sparql), (sq.SELECT_LOCUS_FASTA_BY_DATE.format(virtuoso_graph, locus, date))) # virtuoso returned an error because request length exceeded maximum value of Temp Col # get each allele separately try: fasta_seqs = fasta_result['results']['bindings'] # virtuoso returned an error # probably because sequence/request length exceeded maximum value except: logging.warning('Could not retrieve FASTA records for locus {0}\n' 'Response content:\n{1}\nTrying to get each sequence ' 'separately...\n'.format(locus, fasta_result)) # get each allele separately result = aux.get_data( SPARQLWrapper(local_sparql), (sq.SELECT_LOCUS_SEQS_BY_DATE.format(virtuoso_graph, locus, date))) try: fasta_seqs = result['results']['bindings'] if len(fasta_seqs) == 0: logging.warning('Locus {0} has 0 sequences.'.format(locus)) return False except: logging.warning('Could not retrieve sequences hashes ' 'for locus {0}.'.format(locus)) return False total = 0 hashes = [] for s in range(len(fasta_seqs)): # get the sequence corresponding to the hash result2 = aux.get_data( SPARQLWrapper(local_sparql), (sq.SELECT_SEQ_FASTA.format( virtuoso_graph, fasta_seqs[s]['sequence']['value']))) hashes.append(fasta_seqs[s]['sequence']['value']) fasta_seqs[s]['nucSeq'] = result2['results']['bindings'][0][ 'nucSeq'] total += 1 return fasta_seqs
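# Illustrative usage sketch for fasta_sequences: write the returned records to
# a FASTA file. Relies on this module's imports; the endpoint, graph, locus
# URI, date and output path are hypothetical placeholders, and each record is
# expected to expose 'allele_id' and 'nucSeq' bindings with a 'value' key.
def example_write_locus_fasta():
    local_sparql = 'http://localhost:8890/sparql'
    virtuoso_graph = 'http://localhost:8890/chewiens'
    locus_uri = 'http://localhost/NS/api/loci/1'
    date = '2020-01-01T00:00:00.000000'
    records = fasta_sequences(locus_uri, date, local_sparql, virtuoso_graph)
    if records is not False:
        with open('locus_1.fasta', 'w') as fasta_file:
            for rec in records:
                fasta_file.write('>allele_{0}\n{1}\n'.format(
                    rec['allele_id']['value'], rec['nucSeq']['value']))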
def full_update(schema_uri, file, schema_data, virtuoso_graph, local_sparql, base_url): """ """ schema_id = int(schema_uri.split('/')[-1]) current_file = file # read current file with open(current_file, 'r') as json_file: json_data = json.load(json_file) json_schemas = json_data['message'] schemas_indexes = { int(s['uri'].split('/')[-1]): i for i, s in enumerate(json_schemas) } # if the schema is in the json file if schema_id in schemas_indexes: current_schema = json_schemas[schemas_indexes[schema_id]] # get modification date in json file json_date = current_schema['last_modified'] virtuoso_date = schema_data['last_modified'] if json_date == virtuoso_date: logging.info( 'Information about number of loci and number of alleles for schema {0} is up-to-date.' .format(schema_uri)) elif json_date != virtuoso_date: result = aux.get_data(SPARQLWrapper(local_sparql), (sq.COUNT_SINGLE_SCHEMA_LOCI_ALLELES.format( virtuoso_graph, schema_uri))) result_data = result['results']['bindings'][0] current_schema['last_modified'] = virtuoso_date current_schema['nr_loci'] = result_data['nr_loci']['value'] current_schema['nr_alleles'] = result_data['nr_alleles']['value'] json_data['message'][schemas_indexes[schema_id]] = current_schema with open(current_file, 'w') as json_outfile: json.dump(json_data, json_outfile) logging.info('Updated data for schema {0}'.format(schema_uri)) # new schema that is not in the json file elif schema_id not in schemas_indexes: result = aux.get_data(SPARQLWrapper(local_sparql), (sq.COUNT_SINGLE_SCHEMA_LOCI_ALLELES.format( virtuoso_graph, schema_uri))) result_data = result['results']['bindings'][0] # determine user that uploaded the file admin = aux.get_data( SPARQLWrapper(local_sparql), sq.SELECT_SCHEMA_ADMIN.format(virtuoso_graph, schema_uri)) admin = admin['results']['bindings'][0]['admin']['value'] new_schema = schema_data new_schema['user'] = admin new_schema['uri'] = schema_uri new_schema['nr_loci'] = result_data['nr_loci']['value'] new_schema['nr_alleles'] = result_data['nr_alleles']['value'] del (new_schema['Schema_lock']) json_data['message'].append(new_schema) with open(current_file, 'w') as json_outfile: json.dump(json_data, json_outfile)
def fast_update(schema, last_modified, file, lengths_dir, virtuoso_graph,
                local_sparql):
    """ Updates the pre-computed boxplot statistics for a schema based on the allele length files. """

    current_file = file
    # read current file
    with open(current_file, 'r') as json_file:
        json_data = json.load(json_file)

    loci_list = json_data['loci']

    # get schema loci
    loci = aux.get_data(SPARQLWrapper(local_sparql),
                        sparql_queries.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema))
    loci = loci['results']['bindings']
    loci_names = {l['locus']['value']: l['name']['value'] for l in loci}

    if len(loci_list) == 0:
        length_files = [os.path.join(lengths_dir, f)
                        for f in os.listdir(lengths_dir)]
        # sort by locus id
        length_files = sorted(length_files,
                              key=lambda x: int(x.split('_')[-1]))

        loci_list = []
        loci_min = []
        loci_q1 = []
        loci_median = []
        loci_q3 = []
        loci_max = []
        loci_mean = []
        loci_sd = []
        alleles_counts = []
        for locus_file in length_files:
            with open(locus_file, 'rb') as lf:
                locus_data = pickle.load(lf)

            locus_uri = list(locus_data.keys())[0]
            # get name of locus
            locus_name = loci_names[locus_uri]
            loci_list.append(locus_name)

            alleles_lengths = [v for k, v in locus_data[locus_uri].items()]
            alleles_lengths.sort()

            nr_alleles = len(alleles_lengths)
            alleles_counts.append(nr_alleles)

            # minimum and maximum values
            loci_min.append(min(alleles_lengths))
            loci_max.append(max(alleles_lengths))

            # standard deviation
            if nr_alleles > 1:
                locus_sd = statistics.stdev(alleles_lengths)
            else:
                locus_sd = 0.0
            loci_sd.append(locus_sd)

            # mean
            locus_mean = round(sum(alleles_lengths) / nr_alleles)
            loci_mean.append(locus_mean)

            # median
            locus_median = round(statistics.median(alleles_lengths))
            loci_median.append(locus_median)

            # q1 and q3
            if nr_alleles > 1:
                half = int(nr_alleles // 2)
                q1 = statistics.median(alleles_lengths[:half])
                q3 = statistics.median(alleles_lengths[-half:])
            else:
                q1 = alleles_lengths[0]
                q3 = alleles_lengths[0]
            loci_q1.append(q1)
            loci_q3.append(q3)

        json_to_file = {'schema': schema,
                        'last_modified': last_modified,
                        'loci': loci_list,
                        'min': loci_min,
                        'q1': loci_q1,
                        'median': loci_median,
                        'q3': loci_q3,
                        'max': loci_max,
                        'mean': loci_mean,
                        'sd': loci_sd,
                        'nr_alleles': alleles_counts}

        with open(file, 'w') as json_outfile:
            json.dump(json_to_file, json_outfile)
    # if the schema is in the json file
    elif len(loci_list) > 0:
        # get modification date in json file
        json_date = json_data['last_modified']
        virtuoso_date = last_modified
        if json_date == virtuoso_date:
            logging.info('Information for schema {0} is up-to-date.'.format(schema))
        elif json_date != virtuoso_date:
            length_files = [os.path.join(lengths_dir, f)
                            for f in os.listdir(lengths_dir)]
            # sort by locus id
            length_files = sorted(length_files,
                                  key=lambda x: int(x.split('_')[-1]))

            loci_list = []
            loci_min = []
            loci_q1 = []
            loci_median = []
            loci_q3 = []
            loci_max = []
            loci_mean = []
            loci_sd = []
            alleles_counts = []
            for locus_file in length_files:
                with open(locus_file, 'rb') as f:
                    locus_data = pickle.load(f)

                locus_uri = list(locus_data.keys())[0]
                # get name of locus
                locus_name = loci_names[locus_uri]
                loci_list.append(locus_name)

                alleles_lengths = [v for k, v in locus_data[locus_uri].items()]
                alleles_lengths.sort()

                nr_alleles = len(alleles_lengths)
                alleles_counts.append(nr_alleles)

                # minimum and maximum values
                loci_min.append(min(alleles_lengths))
                loci_max.append(max(alleles_lengths))

                # standard deviation
                if nr_alleles > 1:
                    locus_sd = statistics.stdev(alleles_lengths)
                else:
                    locus_sd = 0.0
                loci_sd.append(locus_sd)

                # mean
                locus_mean = round(sum(alleles_lengths) / nr_alleles)
                loci_mean.append(locus_mean)

                # median
                locus_median = round(statistics.median(alleles_lengths))
                loci_median.append(locus_median)

                # q1 and q3
                if nr_alleles > 1:
                    half = int(nr_alleles // 2)
                    q1 = statistics.median(alleles_lengths[:half])
                    q3 = statistics.median(alleles_lengths[-half:])
                else:
                    q1 = alleles_lengths[0]
                    q3 = alleles_lengths[0]
                loci_q1.append(q1)
                loci_q3.append(q3)

            json_to_file = {'schema': schema,
                            'last_modified': last_modified,
                            'loci': loci_list,
                            'min': loci_min,
                            'q1': loci_q1,
                            'median': loci_median,
                            'q3': loci_q3,
                            'max': loci_max,
                            'mean': loci_mean,
                            'sd': loci_sd,
                            'nr_alleles': alleles_counts}

            with open(file, 'w') as json_outfile:
                json.dump(json_to_file, json_outfile)

            logging.info('Updated data for schema {0}'.format(schema))
def single_compressor(species_id, schema_id, graph, sparql, base_url, user, password): """ Determines if a schema needs to be compressed and generates a compressed version if needed. """ logging.info('Started single compressor for schema {0} ' 'of species {1}'.format(schema_id, species_id)) # check if species exists species_uri = '{0}species/{1}'.format(base_url, species_id) species_result = aux.get_data( SPARQLWrapper(sparql), sq.SELECT_SINGLE_SPECIES.format(graph, species_uri)) result_data = species_result['results']['bindings'] if len(result_data) == 0: logging.warning('Could not find species with identifier {0}. ' 'Aborting schema compression.\n\n'.format(species_id)) sys.exit(1) sp_name = result_data[0]['name']['value'] sp_name = '_'.join(sp_name.split(' ')) # get schema info # construct schema URI schema_uri = '{0}/schemas/{1}'.format(species_uri, schema_id) schema_info = aux.get_data( SPARQLWrapper(sparql), (sq.SELECT_SPECIES_SCHEMA.format(graph, schema_uri))) schema_properties = schema_info['results']['bindings'] if len(schema_properties) == 0: logging.warning( 'Could not find properties values for schema with identifier {0}. ' 'Aborting schema compression.\n\n'.format(schema_id)) sys.exit(1) schema_name = schema_properties[0]['name']['value'] schemas = [(schema_uri, schema_name)] # list compressed schemas compressed_schemas = os.listdir(Config.SCHEMAS_ZIP) to_compress = [] old_zips = {} to_compress, old_zip = compress_determiner(schemas, species_id, sp_name, compressed_schemas, to_compress, old_zips, sparql, graph) if len(to_compress) == 0: logging.info('Aborting schema compression.\n\n') sys.exit(0) else: schemas = ['{0} ({1})'.format(s[0], s[-2]) for s in to_compress] logging.info('Schema to compress: {0}'.format(';'.join(schemas))) # check if schema is locked schema_lock = aux.get_data(SPARQLWrapper(sparql), (sq.ASK_SCHEMA_LOCK.format(schema_uri))) lock_status = schema_lock['boolean'] if lock_status is True: # lock schema locked = change_lock(schema_uri, 'LOCKED', graph, sparql, user, password) if isinstance(locked, list) is True: logging.warning('Could not lock schema {0}. Response:' '\n{1}\n\n'.format(schema_uri, locked[1])) sys.exit(1) single_schema_name = to_compress[0][-2] if old_zip[schema_uri] is not None: old_zip[schema_uri] = os.path.join(Config.SCHEMAS_ZIP, old_zip[schema_uri]) # adapt and compress schema response = compress_schema(to_compress[0], old_zip[schema_uri], sparql, graph) if response == 0: logging.info('Successfully compressed schema {0} ' '({1})'.format(schema_uri, single_schema_name)) else: logging.info('Could not compress schema {0} ' '({1})'.format(schema_uri, single_schema_name)) # unlock schema unlocked = change_lock(schema_uri, 'Unlocked', graph, sparql, user, password) if isinstance(unlocked, list) is True: logging.warning( 'Could not unlock schema at the end of compression process.') logging.info('Finished single compressor for schema {0} ' 'of species {1}'.format(schema_id, species_id))
def fast_update(species_file, files_dir, schema_uri, schema_data, virtuoso_graph, local_sparql, base_url): """ """ schema_id = int(schema_uri.split('/')[-1]) current_file = species_file # read current file with open(current_file, 'r') as json_file: json_data = json.load(json_file) json_schemas = json_data['message'] schemas_indexes = { int(s['uri'].split('/')[-1]): i for i, s in enumerate(json_schemas) } # if the schema is in the json file if schema_id in schemas_indexes: current_schema = json_schemas[schemas_indexes[schema_id]] # get modification date in json file json_date = current_schema['last_modified'] # get schema info that is in Virtuoso virtuoso_date = schema_data['last_modified'] if json_date == virtuoso_date: logging.info( 'Information about number of loci and number of alleles for schema {0} is up-to-date.' .format(schema_uri)) elif json_date != virtuoso_date: length_files = [ os.path.join(files_dir, file) for file in os.listdir(files_dir) ] total_loci = len(length_files) total_alleles = 0 for file in length_files: locus_id = os.path.basename(file).split('_')[-1] locus_uri = '{0}loci/{1}'.format(base_url, locus_id) with open(file, 'rb') as f: locus_data = pickle.load(f) total_alleles += len(locus_data[locus_uri]) current_schema['last_modified'] = virtuoso_date current_schema['nr_loci'] = str(total_loci) current_schema['nr_alleles'] = str(total_alleles) json_data['message'][schemas_indexes[schema_id]] = current_schema with open(current_file, 'w') as json_outfile: json.dump(json_data, json_outfile) logging.info('Updated data for schema {0}'.format(schema_uri)) # new schema that is not in the json file elif schema_id not in schemas_indexes: length_files = [ os.path.join(files_dir, file) for file in os.listdir(files_dir) ] total_loci = len(length_files) total_alleles = 0 for file in length_files: locus_id = os.path.basename(file).split('_')[-1] locus_uri = '{0}loci/{1}'.format(base_url, locus_id) with open(file, 'rb') as f: locus_data = pickle.load(f) total_alleles += len(locus_data[locus_uri]) # determine user that uploaded the file admin = aux.get_data( SPARQLWrapper(local_sparql), sq.SELECT_SCHEMA_ADMIN.format(virtuoso_graph, schema_uri)) admin = admin['results']['bindings'][0]['admin']['value'] new_schema = schema_data new_schema['user'] = admin new_schema['uri'] = schema_uri new_schema['nr_loci'] = str(total_loci) new_schema['nr_alleles'] = str(total_alleles) del (new_schema['Schema_lock']) json_data['message'].append(new_schema) with open(current_file, 'w') as json_outfile: json.dump(json_data, json_outfile)