Example 1
def single_schema(species_id, schema_id, virtuoso_graph, local_sparql, base_url):
	"""
	"""

	start = time.time()
	start_date = dt.datetime.now()
	start_date_str = dt.datetime.strftime(start_date, '%Y-%m-%dT%H:%M:%S')
	logging.info('Started determination of loci and alleles counts at: {0}'.format(start_date_str))

	# create species uri
	species_uri = '{0}species/{1}'.format(base_url, species_id)
	species_result = aux.get_data(SPARQLWrapper(local_sparql),
                                  sparql_queries.SELECT_SINGLE_SPECIES.format(virtuoso_graph, species_uri))
	result_data = species_result['results']['bindings']

	if len(result_data) == 0:
		logging.warning('Could not find species with identifier {0}. '
						'Aborting.\n\n'.format(species_id))
		sys.exit(1)

	schema_uri = '{0}/schemas/{1}'.format(species_uri, schema_id)
	schema_info = aux.get_data(SPARQLWrapper(local_sparql),
                          (sparql_queries.SELECT_SPECIES_SCHEMA.format(virtuoso_graph, schema_uri)))

	schema_properties = schema_info['results']['bindings']
	if len(schema_properties) == 0:
		logging.warning('Could not find properties values for schema with identifier {0}. '
                        'Aborting.\n\n'.format(schema_id))
		sys.exit(1)

	last_modified = schema_properties[0]['last_modified']['value']

	# list files in folder
	computed_dir = Config.PRE_COMPUTE
	computed_files = os.listdir(computed_dir)

	# check if folder with schema alleles lengths files exists
	lengths_dir = '{0}_{1}_lengths'.format(species_id, schema_id)

	# get files with species prefix
	species_prefix = 'loci_{0}'.format(species_id)
	species_files = [f for f in computed_files if f.startswith(species_prefix)]

	species_file = os.path.join(computed_dir, '{0}.json'.format(species_prefix))
	if len(species_files) == 0:
		create_file(species_file, {'message': []})

	if lengths_dir in computed_files:
		lengths_dir = os.path.join(computed_dir, lengths_dir)
		fast_update(schema_uri, last_modified, species_file, lengths_dir)
	else:
		full_update(schema_uri, last_modified, species_file, virtuoso_graph,
	                local_sparql)

	end = time.time()
	delta = end - start
	print(delta)
Example 2
def full_update(schema, last_modified, file, virtuoso_graph,
	            local_sparql):
	"""
	"""

	schema_id = int(schema.split('/')[-1])
	current_file = file

	# read current file
	with open(current_file, 'r') as json_file:
		json_data = json.load(json_file)

	json_schemas = json_data['message']
	schemas_indexes = {int(s['schema'].split('/')[-1]): i for i, s in enumerate(json_schemas)}
	# if the schema is in the json file
	if schema_id in schemas_indexes:
		current_schema = json_schemas[schemas_indexes[schema_id]]

		# get modification date in json file
		json_date = current_schema['last_modified']
		virtuoso_date = last_modified

		if json_date == virtuoso_date:
			logging.info('Information about number of loci and number of alleles for schema {0} is up-to-date.'.format(schema))

		elif json_date != virtuoso_date:
			result = aux.get_data(SPARQLWrapper(local_sparql),
                           		  (sparql_queries.COUNT_SINGLE_SCHEMA_LOCI_ALLELE.format(virtuoso_graph, schema)))

			result_data = result['results']['bindings']
			loci_data = [{'locus': r['locus']['value'], 'nr_alleles': r['nr_alleles']['value']} for r in result_data]
			proc_data = {'schema': schema,
					 	 'last_modified': virtuoso_date,
					 	 'loci': loci_data}

			json_data['message'][schemas_indexes[schema_id]] = proc_data
			with open(current_file, 'w') as json_outfile:
				json.dump(json_data, json_outfile)

			logging.info('Updated data for schema {0}'.format(schema))
	# new schema that is not in the json file
	elif schema_id not in schemas_indexes:
		result = aux.get_data(SPARQLWrapper(local_sparql),
                          	  (sparql_queries.COUNT_SINGLE_SCHEMA_LOCI_ALLELE.format(virtuoso_graph, schema)))

		result_data = result['results']['bindings']
		loci_data = [{'locus': r['locus']['value'], 'nr_alleles': r['nr_alleles']['value']} for r in result_data]
		proc_data = {'schema': schema,
					 'last_modified': last_modified,
					 'loci': loci_data}

		if len(result_data) > 0:
			json_data['message'].append(proc_data)
			with open(current_file, 'w') as json_outfile:
				json.dump(json_data, json_outfile)
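
For reference, this is a sketch of the structure that full_update maintains in the pre-computed loci counts file; all URIs and values below are illustrative, not taken from a real Chewie-NS instance.

import json

example = {
    'message': [
        {'schema': 'https://example.org/NS/api/species/1/schemas/1',
         'last_modified': '2021-01-01T00:00:00.000000',
         'loci': [{'locus': 'https://example.org/NS/api/loci/1',
                   'nr_alleles': '42'}]}
    ]
}

# written the same way full_update writes it
with open('loci_1.json', 'w') as json_outfile:
    json.dump(example, json_outfile)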
Example 3
def schema_loci(schema_uri, local_sparql, virtuoso_graph):
    """ Gets the list of loci for a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.

        Returns
        -------
        loci_list : list of tup
            A list with tuples. Each tuple has two
            elements, a locus name and a locus URI.
    """

    # get loci
    loci_result = aux.get_data(
        SPARQLWrapper(local_sparql),
        (sq.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema_uri)))

    # check if schema has loci
    loci_list = loci_result['results']['bindings']
    if loci_list != []:
        loci_list = [(l['name']['value'], l['locus']['value'])
                     for l in loci_list]

    return loci_list
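
A minimal usage sketch for schema_loci, assuming a running Chewie-NS Virtuoso instance; the endpoint, graph and schema URI below are placeholders rather than values from a real deployment.

LOCAL_SPARQL = 'http://localhost:8890/sparql'                # placeholder endpoint
VIRTUOSO_GRAPH = 'http://localhost:8890/chewiens'            # placeholder graph
schema_uri = 'http://localhost/NS/api/species/1/schemas/1'   # placeholder URI

# each element is a (locus name, locus URI) tuple
for name, uri in schema_loci(schema_uri, LOCAL_SPARQL, VIRTUOSO_GRAPH):
    print(name, uri)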
Example 4
def get_species(local_sparql, virtuoso_graph):
    """ Gets the list of species in the Chewie-NS.

        Parameters
        ----------
        local_sparql : str
            The URI of the local SPARQL endpoint.
        virtuoso_graph : str
            The URI of the default Virtuoso graph.

        Returns
        -------
        species_list : dict
            A dictionary with species URIs as keys and species
            names as values. None if the Chewie-NS has no species.
    """

    # get the list of species in NS
    species_result = aux.get_data(
        SPARQLWrapper(local_sparql),
        (sq.SELECT_SPECIES.format(virtuoso_graph, ' typon:name ?name. ')))

    species = species_result['results']['bindings']
    if len(species) == 0:
        species_list = None
    else:
        species_list = {
            s['species']['value']: s['name']['value']
            for s in species
        }

    return species_list
Example 5
def determine_date(schema_uri, local_sparql, virtuoso_graph):
    """ Gets the last modification date for a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.

        Returns
        -------
        A list with the following variables:

        - last_date (str): The last modification date in
          the format YYYY-MM-DDTHH:MM:SS.f.
        - lock_state (str): Locking state of the schema.
        - schema_info (dict): A dictionary with schema
          properties values.
    """

    # get schema last modification date
    date_result = aux.get_data(
        SPARQLWrapper(local_sparql),
        (sq.SELECT_SPECIES_SCHEMA.format(virtuoso_graph, schema_uri)))

    schema_info = date_result['results']['bindings'][0]

    lock_state = schema_info['Schema_lock']['value']
    last_date = schema_info['last_modified']['value']

    return [last_date, lock_state, schema_info]
Example 6
def rm_loci(identifier, virtuoso_graph, local_sparql, base_url, virtuoso_user,
            virtuoso_pass):
    """
	"""

    total_triples = 0

    # check input type
    if os.path.isfile(identifier) is False:
        if ',' in identifier:
            loci_ids = identifier.split(',')
        else:
            loci_ids = [identifier]
    else:
        with open(identifier, 'r') as ids:
            loci_ids = [l.strip() for l in ids.readlines()]

    # create loci URIs
    loci_uris = ['{0}loci/{1}'.format(base_url, i) for i in loci_ids]

    logging.info('Started rm process for loci: {0}.'.format(loci_ids))

    # check if loci exist
    invalid = []
    for locus in loci_uris:
        locus_result = aux.get_data(SPARQLWrapper(local_sparql),
                                    (sq.ASK_LOCUS.format(locus)))

        if locus_result['boolean'] is not True:
            invalid.append(locus)
            logging.info('Could not find locus {0}.\n'.format(locus))

    # exclude invalid URIs
    loci_uris = [l for l in loci_uris if l not in invalid]

    print('\nLoci to delete: {0}\n'.format(loci_uris))

    results = collapse_loci(loci_uris, virtuoso_graph, local_sparql,
                            virtuoso_user, virtuoso_pass)
    total_triples += results[0]

    print('\nDeleted a total of {0} triples.'.format(total_triples))
    logging.info('Deleted a total of {0} triples.'.format(total_triples))
    print('({0} loci, {1} species links, {2} schema links, '
          '{3} alleles)'.format(results[2], results[3], results[4],
                                results[1]))
    logging.info('({0} loci, {1} species links, {2} schema links, '
                 '{3} alleles)'.format(results[2], results[3], results[4],
                                       results[1]))

    return_dict = {
        'loci': results[2],
        'splinks': results[3],
        'sclinks': results[4],
        'alleles': results[1],
        'total_triples': total_triples
    }

    return return_dict
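
The input handling at the top of rm_loci (and of rm_loci_links below) accepts a single identifier, a comma-separated list or a file path; a self-contained sketch of that logic factored into a helper with a hypothetical name.

import os

def parse_loci_ids(identifier):
    """Accepts a file path, a comma-separated list or a single identifier."""
    if os.path.isfile(identifier) is False:
        return identifier.split(',') if ',' in identifier else [identifier]
    with open(identifier, 'r') as ids:
        return [line.strip() for line in ids.readlines()]

print(parse_loci_ids('1,2,3'))  # ['1', '2', '3']
print(parse_loci_ids('7'))      # ['7']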
Example 7
def create_queries(locus_file, virtuoso_graph, local_sparql, base_url):
    """
    """

    # get sequences sent by user
    with open(locus_file, 'rb') as f:
        locus_data = pickle.load(f)

    locus_url = locus_data[0]
    locus_id = locus_url.split('/')[-1]

    # get sequences in the NS
    sequences = fasta_sequences(locus_url, local_sparql, virtuoso_graph)
    ns_seqs = {
        f['nucSeq']['value']: f['allele_id']['value']
        for f in sequences
    }

    # count number of alleles for locus
    count_query = (sq.COUNT_LOCUS_ALLELES.format(virtuoso_graph, locus_url))

    count_res = aux.get_data(SPARQLWrapper(local_sparql), count_query)

    start_id = int(count_res['results']['bindings'][0]['count']['value']) + 1

    spec_name = locus_data[1]
    user_url = locus_data[2]
    alleles = locus_data[3]
    novel = [a for a in alleles if a not in ns_seqs]
    repeated = {
        hashlib.sha256(a.encode('utf-8')).hexdigest(): ns_seqs[a]
        for a in alleles if a in ns_seqs
    }

    attributed = {}
    if len(novel) > 0:
        max_length = max([len(a) for a in novel])
        if max_length < 7000:
            queries, attributed = create_multiple_insert(
                novel, spec_name, locus_url, user_url, start_id, base_url,
                virtuoso_graph, attributed)
        else:
            queries, attributed = create_single_insert(novel, spec_name,
                                                       locus_url, user_url,
                                                       start_id, base_url,
                                                       virtuoso_graph,
                                                       attributed)

        queries_file = '{0}_queries'.format(locus_file.split('alleles')[0])
        with open(queries_file, 'wb') as qf:
            pickle.dump(queries, qf)

        return [queries_file, locus_id, repeated, attributed]
    else:
        return [None, locus_id, repeated, attributed]
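
The split between novel and repeated alleles only depends on the dictionary of sequences already in the NS; a self-contained sketch of that partitioning with made-up sequences.

import hashlib

# sequences already in the NS, mapped to their allele identifiers
ns_seqs = {'ATGAAATGA': '1'}
alleles = ['ATGAAATGA', 'ATGCCCTGA']

novel = [a for a in alleles if a not in ns_seqs]
repeated = {hashlib.sha256(a.encode('utf-8')).hexdigest(): ns_seqs[a]
            for a in alleles if a in ns_seqs}

print(novel)          # ['ATGCCCTGA']
print(len(repeated))  # 1 (keyed by the SHA-256 hash of the repeated sequence)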
Example 8
def rm_loci_links(mode, identifier, virtuoso_graph, local_sparql, base_url,
                  virtuoso_user, virtuoso_pass):
    """
	"""

    total_triples = 0

    # check input type
    if os.path.isfile(identifier) is False:
        if ',' in identifier:
            loci_ids = identifier.split(',')
        else:
            loci_ids = [identifier]
    else:
        with open(identifier, 'r') as ids:
            loci_ids = [l.strip() for l in ids.readlines()]

    # create loci URIs
    loci_uris = ['{0}loci/{1}'.format(base_url, i) for i in loci_ids]

    logging.info('Started rm process to delete {0} for loci: {1}.'.format(
        mode, loci_uris))

    # check if loci exist
    invalid = []
    for locus in loci_uris:
        locus_result = aux.get_data(SPARQLWrapper(local_sparql),
                                    (sq.ASK_LOCUS.format(locus)))

        if locus_result['boolean'] is not True:
            invalid.append(locus)
            logging.info('Could not find locus {0}.\n'.format(locus))

    # exclude invalid URIs
    loci_uris = [[l] for l in loci_uris if l not in invalid]

    if mode == 'splinks':
        statement = sq.DELETE_SPECIES_LOCUS
    elif mode == 'sclinks':
        statement = sq.DELETE_SCHEMA_LOCUS

    # delete loci links to species or schemas
    print('Deleting loci {0}...'.format(mode))
    deleted, stderr, noeffect, triples = multiple_delete(
        statement, loci_uris, virtuoso_graph, local_sparql,
        virtuoso_user, virtuoso_pass)

    total_links = int(triples) if mode == 'splinks' else int(triples / 4)
    stdout_text = 'Deleted {0} {1} ({2} triples).'.format(
        deleted, mode, triples)
    log_results(stdout_text, stderr, noeffect)

    return_dict = {'{0}'.format(mode): total_links, 'total_triples': triples}

    return return_dict
Example 9
def count_alleles(schema, virtuoso_graph, local_sparql):
    """
	"""

    # get total number of alleles
    loci = aux.get_data(
        SPARQLWrapper(local_sparql),
        sparql_queries.COUNT_SCHEMA_ALLELES.format(virtuoso_graph, schema))

    loci = loci['results']['bindings']
    total_alleles = sum(map(int, [a['nr_allele']['value'] for a in loci]))

    return total_alleles
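
Virtuoso returns the per-locus counts as strings, so they are cast to int before summing; a tiny self-contained sketch with made-up result bindings.

# made-up bindings mimicking the SPARQL response structure
loci = [{'nr_allele': {'value': '10'}}, {'nr_allele': {'value': '25'}}]

total_alleles = sum(map(int, [a['nr_allele']['value'] for a in loci]))
print(total_alleles)  # 35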
Example 10
def global_species(virtuoso_graph, local_sparql, base_url):
	"""
	"""
	
	# get all species in the NS
	species_result = aux.get_data(SPARQLWrapper(local_sparql),
	                              sparql_queries.SELECT_SPECIES.format(virtuoso_graph, ' typon:name ?name. '))
	result_data = species_result['results']['bindings']

	ns_species = {s['species']['value']: s['name']['value'] for s in result_data}

	species_ids = [s.split('/')[-1] for s in ns_species]
	for i in species_ids:
		single_species(i, virtuoso_graph, local_sparql, base_url)
Example 11
def alleles_lengths(total_alleles, schema, offset, limit, virtuoso_graph,
                    local_sparql):
    """
	"""

    count = 0
    result = []
    while count != total_alleles:
        alleles = aux.get_data(
            SPARQLWrapper(local_sparql),
            sparql_queries.SELECT_ALLELES_LENGTH.format(
                virtuoso_graph, schema, offset, limit))
        data = alleles['results']['bindings']
        result.extend(data)
        count += len(data)
        offset += limit

    return result
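
The OFFSET/LIMIT loop above is a generic pagination pattern; a self-contained sketch with a stand-in fetch function in place of the SPARQL query (all names are hypothetical).

def fetch_page(offset, limit):
    """Stand-in for the SPARQL query; slices a fake dataset."""
    data = list(range(250))
    return data[offset:offset + limit]

def fetch_all(total, limit=100):
    offset = 0
    count = 0
    result = []
    while count != total:
        page = fetch_page(offset, limit)
        result.extend(page)
        count += len(page)
        offset += limit
    return result

print(len(fetch_all(250)))  # 250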
Example 12
def loci_annotations(schema, virtuoso_graph, local_sparql):
    """
	"""

    # get loci annotations, retrying if the response is malformed
    tries = 0
    fetched = False
    while fetched is False:
        loci = aux.get_data(
            SPARQLWrapper(local_sparql),
            sparql_queries.SELECT_SCHEMA_LOCI_ANNOTATIONS.format(
                virtuoso_graph, schema))

        try:
            annotations = loci['results']['bindings']
            fetched = True
        except Exception:
            print(
                sparql_queries.SELECT_SCHEMA_LOCI_ANNOTATIONS.format(
                    virtuoso_graph, schema))
            logging.warning('Could not get annotations.')
            logging.warning(loci)
            tries += 1

        if tries == 5:
            sys.exit('Could not retrieve annotations for schema {0} '
                     'after 5 attempts.'.format(schema))

    annotations = [{
        'locus': l['locus']['value'],
        'name': l['name']['value'],
        'original_name': l['original_name']['value'],
        'UniprotName': l['UniprotName']['value'],
        'UniprotURI': l['UniprotURI']['value'],
        'UserAnnotation': l['UserAnnotation']['value'],
        'CustomAnnotation': l['CustomAnnotation']['value']
    } for l in annotations]

    return annotations
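
The bounded retry loop in loci_annotations can also be written as a small generic helper; a minimal self-contained sketch under the assumption that a retry wrapper is acceptable here (the helper name is hypothetical).

import time

def retry(func, attempts=5, delay=1):
    """Calls func until it returns without raising or attempts run out."""
    last_exc = None
    for _ in range(attempts):
        try:
            return func()
        except Exception as exc:
            last_exc = exc
            time.sleep(delay)
    raise last_exc

# e.g. loci = retry(lambda: aux.get_data(SPARQLWrapper(local_sparql), query))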
Example 13
def determine_date(schema_uri, local_sparql, virtuoso_graph):
    """ Gets the last modification date for a schema.

        Parameters
        ----------
        schema_uri : str
            The URI of the schema in the Chewie-NS.

        Returns
        -------
        insertion_date : str
            The insertion date in the format YYYY-MM-DDTHH:MM:SS.f.
    """

    # get schema insertion date
    date_result = aux.get_data(
        SPARQLWrapper(local_sparql),
        (sq.SELECT_SPECIES_SCHEMA.format(virtuoso_graph, schema_uri)))

    schema_info = date_result['results']['bindings'][0]

    insertion_date = schema_info['dateEntered']['value']

    return insertion_date
Example 14
def species_schemas(species_uri, schemas, local_sparql, virtuoso_graph):
    """ Gets the list of schemas for a species.

        Parameters
        ----------
        species_uri : str
            The URI of the species in the Chewie-NS.
        schemas : dict
            An empty dictionary to store schemas' data.

        Returns
        -------
        schemas : dict
            A dictionary with the species URI as key and a
            list of tuples as value. Each tuple has a schema
            URI and the name of that schema.
    """

    result = aux.get_data(
        SPARQLWrapper(local_sparql),
        (sq.SELECT_SPECIES_SCHEMAS.format(virtuoso_graph, species_uri)))

    try:
        ns_schemas = result['results']['bindings']
        if len(ns_schemas) > 0:
            for schema in ns_schemas:
                schemas.setdefault(species_uri, []).append(
                    (schema['schemas']['value'], schema['name']['value']))
    except Exception:
        logging.warning('Could not retrieve schemas for '
                        '{0}. Exception:\n{1}'.format(species_uri, result))

    return schemas
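
The setdefault call groups each (schema URI, schema name) tuple under its species URI; a tiny self-contained sketch of that grouping with made-up values.

schemas = {}
species_uri = 'https://example.org/NS/api/species/1'
ns_schemas = [('https://example.org/NS/api/species/1/schemas/1', 'cgMLST'),
              ('https://example.org/NS/api/species/1/schemas/2', 'wgMLST')]

for uri, name in ns_schemas:
    schemas.setdefault(species_uri, []).append((uri, name))

# schemas[species_uri] now holds both (URI, name) tuples
print(len(schemas[species_uri]))  # 2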
Example 15
def main(temp_dir, graph, sparql, base_url, user, password, c_user):

    start = time.time()

    # get species and schema identifiers
    species_id = os.path.basename(temp_dir).split('_')[0]
    schema_id = os.path.basename(temp_dir).split('_')[1]

    # create schema URI
    schema_uri = '{0}species/{1}/schemas/{2}'.format(base_url, species_id,
                                                     schema_id)

    # count total loci
    count_schema_loci = (sq.COUNT_SINGLE_SCHEMA_LOCI_ALLELES.format(
        graph, schema_uri))

    count_schema_loci_res = aux.get_data(SPARQLWrapper(sparql),
                                         count_schema_loci)

    total_loci = int(
        count_schema_loci_res["results"]["bindings"][0]["nr_loci"]["value"])

    post_files = [
        os.path.join(temp_dir, file) for file in os.listdir(temp_dir)
    ]

    # extract files
    schema_files = []
    for file in post_files:
        dest_dir = os.path.dirname(file)
        locus_file = unzip(file, dest_dir)
        locus_file = os.path.join(temp_dir, locus_file)
        schema_files.append(locus_file)

    new_alleles_per_locus = []

    # create SPARQL multiple INSERT queries
    new_seqs = 0
    identifiers = {}
    queries_files = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        for res in executor.map(create_queries, schema_files, repeat(graph),
                                repeat(sparql), repeat(base_url)):
            if res[0] is not None:
                queries_files.append(res[0])
            identifiers[res[1]] = [res[2], res[3]]
            new_seqs += len(res[3])
            new_alleles_per_locus.append(
                {"locus_{0}".format(res[1]): {
                     "newAlleles": [len(res[3])]
                 }})

    # create/update allele contributions file
    allele_contrib_path = os.path.join(
        "pre-computed-data",
        "allele_contributions_{0}_{1}.json".format(species_id, schema_id))

    if os.path.exists(allele_contrib_path):
        update_contributions_file(allele_contrib_path, schema_uri, c_user,
                                  new_alleles_per_locus, new_seqs)
    else:
        create_contributions_file(allele_contrib_path, schema_uri, c_user,
                                  new_alleles_per_locus, new_seqs, total_loci)

    start = time.time()
    # insert data
    # sort responses to include summary in log file
    # create lock file
    with open(sync_lock, 'w') as lf:
        lf.write('{0}\n{1}'.format(temp_dir, user))
    post_results = send_alleles(queries_files, sparql, user, password)
    # remove lock file after insertion
    os.remove(sync_lock)

    # create file with identifiers
    identifiers_file = os.path.join(temp_dir, 'identifiers')
    with open(identifiers_file, 'wb') as rf:
        pickle.dump(identifiers, rf)

    end = time.time()
    delta = end - start
    print('Insertion: {0}'.format(delta), flush=True)

    # change last_modified date
    if new_seqs > 0:
        modification_date = str(
            dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f'))
        change_date(schema_uri, 'last_modified', modification_date, graph,
                    sparql, user, password)

        # change_schema_version(schema_uri, '2.5.1',
        # 	graph, sparql, user, password)

        # create pre-computed frontend files
        os.system('python schema_totals.py -m single_schema '
                  '--sp {0} --sc {1} --g {2} --s {3} --b {4}'
                  ''.format(species_id, schema_id, graph, sparql, base_url))
        os.system('python loci_totals.py -m single_schema '
                  '--sp {0} --sc {1} --g {2} --s {3} --b {4}'
                  ''.format(species_id, schema_id, graph, sparql, base_url))
        os.system('python loci_mode.py -m single_schema '
                  '--sp {0} --sc {1} --g {2} --s {3} --b {4}'
                  ''.format(species_id, schema_id, graph, sparql, base_url))
        os.system('python annotations.py -m single_schema '
                  '--sp {0} --sc {1} --g {2} --s {3} --b {4}'
                  ''.format(species_id, schema_id, graph, sparql, base_url))
        os.system('python loci_boxplot.py -m single_schema '
                  '--sp {0} --sc {1} --g {2} --s {3} --b {4}'
                  ''.format(species_id, schema_id, graph, sparql, base_url))

    # unlock schema
    change_lock(schema_uri, 'Unlocked', graph, sparql, user, password)
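
The sync lock written around send_alleles is a plain text file holding the working directory and the user performing the insertion; a minimal self-contained sketch of that pattern (sync_lock is defined outside this excerpt, so the path below is an assumption).

import os

sync_lock = 'sync_lock'        # assumed path; defined elsewhere in the module
temp_dir = '/tmp/1_1'          # example working directory
user = 'user@example.org'      # example user

# create lock file before inserting alleles
with open(sync_lock, 'w') as lf:
    lf.write('{0}\n{1}'.format(temp_dir, user))

# ... the insertion would run here ...

# remove lock file after insertion
os.remove(sync_lock)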
Example 16
def main(temp_dir, graph, sparql, base_url, user, password):

    # get species and schema identifiers
    species_id = os.path.basename(temp_dir).split('_')[0]
    schema_id = os.path.basename(temp_dir).split('_')[1]

    # create species and schema URIs
    species_uri = '{0}species/{1}'.format(base_url, species_id)
    schema_uri = '{0}/schemas/{1}'.format(species_uri, schema_id)

    logging.info('Started loci insertion for ' 'schema {0}'.format(schema_uri))

    # define path to file with loci data
    loci_file = os.path.join(temp_dir,
                             '{0}_{1}_loci'.format(species_id, schema_id))

    # read loci data
    if os.path.isfile(loci_file) is True:
        with open(loci_file, 'rb') as lf:
            loci_prefix, loci_data = pickle.load(lf)
            logging.info('Loci prefix is {0}.'.format(loci_prefix))
    else:
        logging.warning('Could not find file {0}. '
                        'Aborting\n\n'.format(loci_file))
        sys.exit(1)

    # determine locus with highest identifier
    result = aux.get_data(SPARQLWrapper(sparql),
                          (sq.SELECT_HIGHEST_LOCUS.format(graph)))

    highest_locus = result['results']['bindings']
    # continue from the highest existing locus identifier or start at 1
    if highest_locus != []:
        highest_locus = highest_locus[0]['locus']['value']
        highest_id = int(highest_locus.split('/')[-1])
        start_id = highest_id + 1
    elif highest_locus == []:
        start_id = 1

    # define path to file with schema upload status data
    hashes_file = os.path.join(temp_dir,
                               '{0}_{1}_hashes'.format(species_id, schema_id))

    # read schema upload status data
    if os.path.isfile(hashes_file) is True:
        with open(hashes_file, 'rb') as hf:
            schema_hashes = pickle.load(hf)
    else:
        logging.warning(
            'Could not find schema upload status file. Aborting.\n\n')
        sys.exit(1)

    # assign identifiers to new loci based on the total number of loci in the Chewie-NS
    response, hash_to_uri, loci_data = assign_identifiers(
        loci_data, schema_hashes, loci_prefix, start_id, base_url)

    # insert loci
    insert_queries, insert = create_insert_queries(loci_data, schema_hashes,
                                                   graph)
    logging.info('{0} loci to insert out of {1} total loci '
                 'in schema.'.format(insert, len(loci_data)))
    logging.info('Loci integer identifiers interval: [{0} .. {1}]'
                 ''.format(start_id, start_id + insert))

    # insert data to create loci
    if len(insert_queries) > 0:
        loci_insertion = send_queries(insert_queries, sparql, user, password)
        insert_status, success, failed = results_status(loci_insertion)

        for h, v in insert_status.items():
            if v is True:
                schema_hashes[h][1][0] = hash_to_uri[h]
                response[h].append(v)
            elif v is False:
                response[h].append(v)

        logging.info('Successfully inserted {0} loci. '
                     'Failed {1}'.format(success, failed))
        # halt process if it could not insert all loci
        if failed > 0:
            logging.warning('Could not insert all loci. Aborting.\n\n')
            sys.exit(1)

    # link loci to species
    sp_queries, link = species_link_queries(loci_data, schema_hashes,
                                            species_uri, graph)

    logging.info('{0} loci to link to species out of {1} total loci '
                 'in schema.'.format(link, len(loci_data)))

    if len(sp_queries) > 0:
        species_links = send_queries(sp_queries, sparql, user, password)
        link_status, success, failed = results_status(species_links)

        for h, v in link_status.items():
            if v is True:
                schema_hashes[h][1][1] = True
                response[h].append(v)
            elif v is False:
                response[h].append(v)

        logging.info('Successfully linked {0} loci to species. '
                     'Failed {1}'.format(success, failed))

    # link loci to schema
    sc_queries, link = schema_link_queries(loci_data, schema_hashes,
                                           schema_uri, graph)

    logging.info('{0} loci to link to schema out of {1} total loci '
                 'in schema.'.format(link, len(loci_data)))

    if len(sc_queries) > 0:
        schema_links = send_queries(sc_queries, sparql, user, password)
        link_status, success, failed = results_status(schema_links)

        for h, v in link_status.items():
            if v is True:
                schema_hashes[h][1][2] = True
                response[h].append(v)
            elif v is False:
                response[h].append(v)

        logging.info('Successfully linked {0} loci to schema. '
                     'Failed {1}'.format(success, failed))

    # save updated schema hashes
    with open(hashes_file, 'wb') as hf:
        pickle.dump(schema_hashes, hf)

    # write response to file
    response_file = os.path.join(
        temp_dir, '{0}_{1}_loci_response'.format(species_id, schema_id))
    with open(response_file, 'wb') as rf:
        pickle.dump(response, rf)

    # remove temp file
    os.remove(loci_file)

    logging.info('Finished loci insertion for '
                 'schema {0}\n\n'.format(schema_uri))

    return response_file
Example 17
def rm_schema(identifier, species_id, virtuoso_graph, local_sparql, base_url,
              virtuoso_user, virtuoso_pass):
    """
	"""

    total_triples = 0

    # create schema URI
    schema_uri = ('{0}species/{1}/schemas/{2}'
                  '').format(base_url, species_id, identifier)

    logging.info('Started rm process for schema {0}.'.format(schema_uri))

    # check if schema exists
    schema_result = aux.get_data(SPARQLWrapper(local_sparql),
                                 (sq.ASK_SCHEMA.format(schema_uri)))

    if schema_result['boolean'] is not True:
        logging.info('Could not find schema.\n')
        sys.exit('\nThere is no schema with specified ID.')

    print('\nDeleting loci and alleles for schema: {0}'.format(schema_uri))

    # get schema's loci
    schema_result = aux.get_data(
        SPARQLWrapper(local_sparql),
        (sq.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema_uri)))

    schema_result = schema_result['results']['bindings']

    results = [0, 0, 0, 0, 0]
    if len(schema_result) == 0:
        logging.info('{0} has no loci.'.format(schema_uri))
    else:
        loci_uris = [l['locus']['value'] for l in schema_result]

        print('Loci to delete: {0}\n'.format(len(loci_uris)))
        logging.info('{0} loci to delete'.format(len(loci_uris)))

        # collapse all loci (sequences are not deleted)
        results = collapse_loci(loci_uris, virtuoso_graph, local_sparql,
                                virtuoso_user, virtuoso_pass)
        total_triples += results[0]

    # delete description
    #schema_desc = aux.get_data(SPARQLWrapper(local_sparql),
    #                            (sq.SELECT_SCHEMA_DESCRIPTION.format(virtuoso_graph, schema_uri)))

    #schema_desc = schema_desc['results']['bindings'][0]['description']['value']
    #desc_file = '{0}/{1}'.format(Config.PRE_COMPUTE, schema_desc)
    #if os.path.isfile(desc_file) is True:
    #	subprocess.call(['rm', desc_file])

    # delete compressed version
    zip_file = [
        f for f in os.listdir(Config.SCHEMAS_ZIP)
        if f.startswith('{0}_{1}'.format(species_id, identifier))
    ]
    if len(zip_file) > 0:
        zip_file = '{0}/{1}'.format(Config.SCHEMAS_ZIP, zip_file[0])
        subprocess.call(['rm', zip_file])
        print('Deleted compressed version ({0})'.format(zip_file))
        logging.info('Deleted compressed version ({0})'.format(zip_file))

    # delete pre-computed files
    length_files = '{0}/{1}_{2}_lengths'.format(Config.PRE_COMPUTE, species_id,
                                                identifier)
    if os.path.isdir(length_files) is True:
        subprocess.call(['rm', '-rf', length_files])
        print(
            'Deleted directory with length values ({0})'.format(length_files))
        logging.info(
            'Deleted directory with length values ({0})'.format(length_files))

    annotation_file = '{0}/annotations_{1}_{2}.json'.format(
        Config.PRE_COMPUTE, species_id, identifier)
    if os.path.isfile(annotation_file) is True:
        subprocess.call(['rm', annotation_file])
        print('Deleted pre-computed annotations ({0})'.format(annotation_file))
        logging.info(
            'Deleted pre-computed annotations ({0})'.format(annotation_file))

    mode_file = '{0}/mode_{1}_{2}.json'.format(Config.PRE_COMPUTE, species_id,
                                               identifier)
    if os.path.isfile(mode_file) is True:
        subprocess.call(['rm', mode_file])
        print('Deleted pre-computed modes ({0})'.format(mode_file))
        logging.info('Deleted pre-computed modes ({0})'.format(mode_file))

    boxplot_file = '{0}/boxplot_{1}_{2}.json'.format(Config.PRE_COMPUTE,
                                                     species_id, identifier)
    if os.path.isfile(boxplot_file) is True:
        subprocess.call(['rm', boxplot_file])
        print('Deleted pre-computed boxplot values ({0})'.format(boxplot_file))
        logging.info(
            'Deleted pre-computed boxplot values ({0})'.format(boxplot_file))

    # remove schema data from pre-computed files
    loci_file = '{0}/loci_{1}.json'.format(Config.PRE_COMPUTE, species_id)
    if os.path.isfile(loci_file) is True:
        with open(loci_file, 'r') as json_file:
            json_data = json.load(json_file)

        schemas = json_data['message']
        schemas = [s for s in schemas if s['schema'] != schema_uri]
        json_data['message'] = schemas
        with open(loci_file, 'w') as json_outfile:
            json.dump(json_data, json_outfile)
        print('Deleted pre-computed values from file with loci values ({0})'.
              format(loci_file))
        logging.info(
            'Deleted pre-computed values from file with loci values ({0})'.
            format(loci_file))

    totals_file = '{0}/totals_{1}.json'.format(Config.PRE_COMPUTE, species_id)
    if os.path.isfile(totals_file) is True:
        with open(totals_file, 'r') as json_file:
            json_data = json.load(json_file)

        schemas = json_data['message']
        schemas = [s for s in schemas if s['uri'] != schema_uri]
        json_data['message'] = schemas
        with open(totals_file, 'w') as json_outfile:
            json.dump(json_data, json_outfile)
        print('Deleted pre-computed values from file with schema totals ({0})'.
              format(totals_file))
        logging.info(
            'Deleted pre-computed values from file with schema totals ({0})'.
            format(totals_file))

    # delete schema
    status_code, message = single_delete(sq.DELETE_SCHEMA, [schema_uri],
                                         virtuoso_graph, local_sparql,
                                         virtuoso_user, virtuoso_pass)

    schema_triples = int(extract_triples(message))
    schema_del = 0
    if status_code in [200, 201]:
        if schema_triples > 0:
            schema_del = 1
            print('Deleted {0}'.format(schema_uri))
            logging.info('Deleted {0}'.format(schema_uri))
            total_triples += schema_triples
        else:
            print('Could not delete triples for {0}'.format(schema_uri))
            logging.info('Could not delete triples for {0}'.format(schema_uri))
    else:
        print('Failed to delete schema: {0}'.format(schema_uri))
        logging.info('Failed to delete {0}'.format(schema_uri))
        logging.info('Failed stderr:\n{0}'.format(message))

    print('\nDeleted a total of {0} triples.'.format(total_triples))
    print('({0} loci, {1} species links, {2} schema links, '
          '{3} alleles)'.format(results[2], results[3], results[4],
                                results[1]))

    return_dict = {
        'schema': schema_del,
        'loci': results[2],
        'splinks': results[3],
        'sclinks': results[4],
        'alleles': results[1],
        'total_triples': total_triples
    }

    return return_dict
Example 18
def single_species(species_id, virtuoso_graph, local_sparql, base_url):
    """
	"""

    start_date = dt.datetime.now()
    start_date_str = dt.datetime.strftime(start_date, '%Y-%m-%dT%H:%M:%S')
    logging.info(
        'Started determination of loci and alleles counts at: {0}'.format(
            start_date_str))

    # create species uri
    species_uri = '{0}species/{1}'.format(base_url, species_id)
    species_result = aux.get_data(
        SPARQLWrapper(local_sparql),
        sparql_queries.SELECT_SINGLE_SPECIES.format(virtuoso_graph,
                                                    species_uri))
    result_data = species_result['results']['bindings']

    if len(result_data) == 0:
        logging.warning('Could not find species with identifier {0}. '
                        'Aborting.\n\n'.format(species_id))

    # get all schemas for the species
    species_result = aux.get_data(
        SPARQLWrapper(local_sparql),
        sparql_queries.SELECT_SPECIES_SCHEMAS.format(virtuoso_graph,
                                                     species_uri))
    result_data = species_result['results']['bindings']

    if len(result_data) == 0:
        logging.info('Species has no schemas.')

    schemas = [s['schemas']['value'] for s in result_data]
    # sort by integer identifier to be able to fetch schemas by index
    schemas = sorted(schemas, key=lambda x: int(x.split('/')[-1]))

    # list files in folder
    computed_dir = Config.PRE_COMPUTE
    computed_files = os.listdir(computed_dir)
    for schema in schemas:
        schema_id = schema.split('/')[-1]
        schema_prefix = 'mode_{0}_{1}'.format(species_id, schema_id)
        schema_files = [
            f for f in computed_files if f == '{0}.json'.format(schema_prefix)
        ]
        schema_file = os.path.join(computed_dir,
                                   '{0}.json'.format(schema_prefix))

        # check if schema is locked
        schema_lock = aux.get_data(
            SPARQLWrapper(local_sparql),
            (sparql_queries.ASK_SCHEMA_LOCK.format(schema)))
        lock_status = schema_lock['boolean']
        if lock_status is True:
            schema_info = aux.get_data(
                SPARQLWrapper(local_sparql),
                (sparql_queries.SELECT_SPECIES_SCHEMA.format(
                    virtuoso_graph, schema)))

            schema_properties = schema_info['results']['bindings']
            if len(schema_properties) == 0:
                logging.warning(
                    'Could not find properties values for schema with identifier {0}. '
                    'Aborting.\n\n'.format(schema_id))
                continue

            last_modified = schema_properties[0]['last_modified']['value']
            if len(schema_files) == 0:
                create_file(schema_file, {
                    'mode': [],
                    'total_alleles': [],
                    'scatter_data': []
                })

            lengths_dir = '{0}_{1}_lengths'.format(species_id, schema_id)
            if lengths_dir in computed_files:
                lengths_dir = os.path.join(computed_dir, lengths_dir)
                fast_update(schema, last_modified, schema_file, lengths_dir,
                            virtuoso_graph, local_sparql)
            else:
                full_update(schema, last_modified, schema_file, virtuoso_graph,
                            local_sparql)
        else:
            logging.warning('Schema {0} is locked. Aborting.'.format(schema))
Example 19
def fast_update(schema, last_modified, file, lengths_dir, virtuoso_graph,
                local_sparql):
    """
	"""

    schema_id = int(schema.split('/')[-1])
    current_file = file

    # read current file
    with open(current_file, 'r') as json_file:
        json_data = json.load(json_file)

    loci_modes = json_data['mode']
    loci_alleles = json_data['total_alleles']
    loci_scatter = json_data['scatter_data']

    # get schema loci
    loci = aux.get_data(
        SPARQLWrapper(local_sparql),
        sparql_queries.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema))
    loci = loci['results']['bindings']
    loci_names = {l['locus']['value']: l['name']['value'] for l in loci}

    if len(loci_modes) == 0:
        length_files = [
            os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir)
        ]

        loci_stats = []
        for locus_file in length_files:
            with open(locus_file, 'rb') as lf:
                locus_data = pickle.load(lf)

            locus_uri = list(locus_data.keys())[0]
            locus_name = loci_names[locus_uri]
            locus_id = locus_name.split('-')[-1]
            alleles_lengths = [v for k, v in locus_data[locus_uri].items()]

            nr_alleles = len(alleles_lengths)
            locus_mode = Counter(alleles_lengths).most_common()[0][0]
            locus_mean = round(sum(alleles_lengths) / nr_alleles)
            locus_median = round(statistics.median(alleles_lengths))
            locus_min = min(alleles_lengths)
            locus_max = max(alleles_lengths)

            loci_stats.append((locus_name, locus_id, nr_alleles, locus_mode,
                               locus_mean, locus_median, locus_min, locus_max))

        modes = determine_modes(loci_stats)
        total_alleles = loci_total_alleles(loci_stats)
        scatter_data = get_scatter_data(loci_stats)

        json_to_file = {
            'schema': schema,
            'last_modified': last_modified,
            'mode': modes,
            'total_alleles': total_alleles,
            'scatter_data': scatter_data
        }

        with open(file, 'w') as json_outfile:
            json.dump(json_to_file, json_outfile)

    # if the schema is in the json file
    elif len(loci_modes) > 0:
        # get modification date in json file
        json_date = json_data['last_modified']
        virtuoso_date = last_modified

        if json_date == virtuoso_date:
            logging.info(
                'Information about allele length statistics for schema {0} '
                'is up-to-date.'.format(schema))

        elif json_date != virtuoso_date:
            length_files = [
                os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir)
            ]

            loci_stats = []
            for locus_file in length_files:
                with open(locus_file, 'rb') as f:
                    locus_data = pickle.load(f)

                locus_uri = list(locus_data.keys())[0]
                locus_name = loci_names[locus_uri]
                locus_id = locus_name.split('-')[-1]
                alleles_lengths = [v for k, v in locus_data[locus_uri].items()]

                nr_alleles = len(alleles_lengths)
                locus_mode = Counter(alleles_lengths).most_common()[0][0]
                locus_mean = round(sum(alleles_lengths) / nr_alleles)
                locus_median = round(statistics.median(alleles_lengths))
                locus_min = min(alleles_lengths)
                locus_max = max(alleles_lengths)

                loci_stats.append(
                    (locus_name, locus_id, nr_alleles, locus_mode, locus_mean,
                     locus_median, locus_min, locus_max))

            modes = determine_modes(loci_stats)
            total_alleles = loci_total_alleles(loci_stats)
            scatter_data = get_scatter_data(loci_stats)

            json_to_file = {
                'schema': schema,
                'last_modified': last_modified,
                'mode': modes,
                'total_alleles': total_alleles,
                'scatter_data': scatter_data
            }

            with open(file, 'w') as json_outfile:
                json.dump(json_to_file, json_outfile)

            logging.info('Updated data for schema {0}'.format(schema))
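
The per-locus statistics computed in both branches above come directly from the list of allele lengths; a self-contained sketch with made-up length values.

import statistics
from collections import Counter

alleles_lengths = [300, 303, 303, 306, 309]

nr_alleles = len(alleles_lengths)
locus_mode = Counter(alleles_lengths).most_common()[0][0]
locus_mean = round(sum(alleles_lengths) / nr_alleles)
locus_median = round(statistics.median(alleles_lengths))
locus_min = min(alleles_lengths)
locus_max = max(alleles_lengths)

print(locus_mode, locus_mean, locus_median, locus_min, locus_max)
# 303 304 303 300 309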
Example 20
def fast_update(schema, last_modified, file, lengths_dir, virtuoso_graph,
                local_sparql):
    """
	"""

    schema_id = int(schema.split('/')[-1])
    current_file = file

    # read current file
    with open(current_file, 'r') as json_file:
        json_data = json.load(json_file)

    loci_info = json_data['message']

    # get schema loci
    loci = aux.get_data(
        SPARQLWrapper(local_sparql),
        sparql_queries.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema))
    loci = loci['results']['bindings']
    loci_names = {l['locus']['value']: l['name']['value'] for l in loci}

    if len(loci_info) == 0:
        length_files = [
            os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir)
        ]

        loci_stats = {}
        for locus_file in length_files:
            with open(locus_file, 'rb') as lf:
                locus_data = pickle.load(lf)

            locus_uri = list(locus_data.keys())[0]
            locus_name = loci_names[locus_uri]
            alleles_lengths = [v for k, v in locus_data[locus_uri].items()]
            total_alleles = len(alleles_lengths)
            locus_mode = Counter(alleles_lengths).most_common()[0][0]
            locus_min = min(alleles_lengths)
            locus_max = max(alleles_lengths)
            loci_stats[locus_name] = [
                locus_mode, total_alleles, locus_min, locus_max
            ]

        annotations = loci_annotations(schema, virtuoso_graph, local_sparql)
        for a in annotations:
            locus = a['name']
            a['mode'] = loci_stats[locus][0]
            a['nr_alleles'] = loci_stats[locus][1]
            a['min'] = loci_stats[locus][2]
            a['max'] = loci_stats[locus][3]

        json_to_file = {
            'schema': schema,
            'last_modified': last_modified,
            'message': annotations
        }

        with open(file, 'w') as json_outfile:
            json.dump(json_to_file, json_outfile)

    # if the schema is in the json file
    elif len(loci_info) > 0:
        # get modification date in json file
        json_date = json_data['last_modified']
        virtuoso_date = last_modified

        if json_date == virtuoso_date:
            logging.info(
                'Information about loci annotations and length modes for schema {0} is up-to-date.'
                .format(schema))

        elif json_date != virtuoso_date:
            length_files = [
                os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir)
            ]

            loci_stats = {}
            for locus_file in length_files:
                with open(locus_file, 'rb') as lf:
                    locus_data = pickle.load(lf)

                locus_uri = list(locus_data.keys())[0]
                locus_name = loci_names[locus_uri]
                alleles_lengths = [v for k, v in locus_data[locus_uri].items()]
                total_alleles = len(alleles_lengths)
                locus_mode = Counter(alleles_lengths).most_common()[0][0]
                locus_min = min(alleles_lengths)
                locus_max = max(alleles_lengths)
                loci_stats[locus_name] = [
                    locus_mode, total_alleles, locus_min, locus_max
                ]

            annotations = loci_annotations(schema, virtuoso_graph,
                                           local_sparql)
            for a in annotations:
                locus = a['name']
                a['mode'] = loci_stats[locus][0]
                a['nr_alleles'] = loci_stats[locus][1]
                a['min'] = loci_stats[locus][2]
                a['max'] = loci_stats[locus][3]

            json_to_file = {
                'schema': schema,
                'last_modified': last_modified,
                'message': annotations
            }

            with open(file, 'w') as json_outfile:
                json.dump(json_to_file, json_outfile)

            logging.info('Updated data for schema {0}'.format(schema))
Example 21
def fasta_sequences(locus, date, local_sparql, virtuoso_graph):
    """ Get the DNA sequences of all alleles of a locus.

        Parameters
        ----------
        locus : str
            The URI of the locus in the Chewie-NS.
        date : str
            Last modification date of the schema in
            the format YYYY-MM-DDTHH:MM:SS.f.

        Returns
        -------
        fasta_seqs : list of dict
            A list with one dictionary per allele.
            Each dictionary has the identifier and the DNA
            sequence of an allele.
    """

    # setting [SPARQL] ResultSetMaxRows = 400000 in virtuoso.ini
    # is important to return all sequences at once
    fasta_result = aux.get_data(
        SPARQLWrapper(local_sparql),
        (sq.SELECT_LOCUS_FASTA_BY_DATE.format(virtuoso_graph, locus, date)))

    try:
        fasta_seqs = fasta_result['results']['bindings']
    # Virtuoso returned an error, probably because the request length
    # exceeded the maximum value of a temporary column;
    # get each allele separately
    except Exception:
        logging.warning('Could not retrieve FASTA records for locus {0}\n'
                        'Response content:\n{1}\nTrying to get each sequence '
                        'separately...\n'.format(locus, fasta_result))
        # get each allele separately
        result = aux.get_data(
            SPARQLWrapper(local_sparql),
            (sq.SELECT_LOCUS_SEQS_BY_DATE.format(virtuoso_graph, locus, date)))
        try:
            fasta_seqs = result['results']['bindings']
            if len(fasta_seqs) == 0:
                logging.warning('Locus {0} has 0 sequences.'.format(locus))
                return False
        except Exception:
            logging.warning('Could not retrieve sequences hashes '
                            'for locus {0}.'.format(locus))
            return False

        total = 0
        hashes = []
        for s in range(len(fasta_seqs)):
            # get the sequence corresponding to the hash
            result2 = aux.get_data(
                SPARQLWrapper(local_sparql), (sq.SELECT_SEQ_FASTA.format(
                    virtuoso_graph, fasta_seqs[s]['sequence']['value'])))
            hashes.append(fasta_seqs[s]['sequence']['value'])

            fasta_seqs[s]['nucSeq'] = result2['results']['bindings'][0][
                'nucSeq']
            total += 1

    return fasta_seqs
Example 22
def full_update(schema_uri, file, schema_data, virtuoso_graph, local_sparql,
                base_url):
    """
	"""

    schema_id = int(schema_uri.split('/')[-1])
    current_file = file

    # read current file
    with open(current_file, 'r') as json_file:
        json_data = json.load(json_file)

    json_schemas = json_data['message']
    schemas_indexes = {
        int(s['uri'].split('/')[-1]): i
        for i, s in enumerate(json_schemas)
    }
    # if the schema is in the json file
    if schema_id in schemas_indexes:
        current_schema = json_schemas[schemas_indexes[schema_id]]

        # get modification date in json file
        json_date = current_schema['last_modified']

        virtuoso_date = schema_data['last_modified']
        if json_date == virtuoso_date:
            logging.info(
                'Information about number of loci and number of alleles for schema {0} is up-to-date.'
                .format(schema_uri))

        elif json_date != virtuoso_date:
            result = aux.get_data(SPARQLWrapper(local_sparql),
                                  (sq.COUNT_SINGLE_SCHEMA_LOCI_ALLELES.format(
                                      virtuoso_graph, schema_uri)))

            result_data = result['results']['bindings'][0]
            current_schema['last_modified'] = virtuoso_date
            current_schema['nr_loci'] = result_data['nr_loci']['value']
            current_schema['nr_alleles'] = result_data['nr_alleles']['value']

            json_data['message'][schemas_indexes[schema_id]] = current_schema
            with open(current_file, 'w') as json_outfile:
                json.dump(json_data, json_outfile)

            logging.info('Updated data for schema {0}'.format(schema_uri))
    # new schema that is not in the json file
    elif schema_id not in schemas_indexes:
        result = aux.get_data(SPARQLWrapper(local_sparql),
                              (sq.COUNT_SINGLE_SCHEMA_LOCI_ALLELES.format(
                                  virtuoso_graph, schema_uri)))
        result_data = result['results']['bindings'][0]

        # determine user that uploaded the file
        admin = aux.get_data(
            SPARQLWrapper(local_sparql),
            sq.SELECT_SCHEMA_ADMIN.format(virtuoso_graph, schema_uri))

        admin = admin['results']['bindings'][0]['admin']['value']
        new_schema = schema_data
        new_schema['user'] = admin
        new_schema['uri'] = schema_uri
        new_schema['nr_loci'] = result_data['nr_loci']['value']
        new_schema['nr_alleles'] = result_data['nr_alleles']['value']
        del new_schema['Schema_lock']

        json_data['message'].append(new_schema)
        with open(current_file, 'w') as json_outfile:
            json.dump(json_data, json_outfile)
Example 23
def fast_update(schema, last_modified, file, lengths_dir, virtuoso_graph,
                local_sparql):
    """
    """

    current_file = file

    # read current file
    with open(current_file, 'r') as json_file:
        json_data = json.load(json_file)

    loci_list = json_data['loci']

    # get schema loci
    loci = aux.get_data(
        SPARQLWrapper(local_sparql),
        sparql_queries.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema))
    loci = loci['results']['bindings']
    loci_names = {l['locus']['value']: l['name']['value'] for l in loci}

    if len(loci_list) == 0:
        length_files = [
            os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir)
        ]
        # sort by locus id
        length_files = sorted(length_files,
                              key=lambda x: int(x.split('_')[-1]))

        loci_list = []
        loci_min = []
        loci_q1 = []
        loci_median = []
        loci_q3 = []
        loci_max = []
        loci_mean = []
        loci_sd = []
        alleles_counts = []

        for locus_file in length_files:
            with open(locus_file, 'rb') as lf:
                locus_data = pickle.load(lf)

            locus_uri = list(locus_data.keys())[0]

            # get name of locus
            locus_name = loci_names[locus_uri]
            loci_list.append(locus_name)
            alleles_lengths = [v for k, v in locus_data[locus_uri].items()]
            alleles_lengths.sort()
            nr_alleles = len(alleles_lengths)
            alleles_counts.append(nr_alleles)
            # minimum and maximum values
            loci_min.append(min(alleles_lengths))
            loci_max.append(max(alleles_lengths))
            # standard deviation
            if nr_alleles > 1:
                locus_sd = statistics.stdev(alleles_lengths)
            else:
                locus_sd = 0.0
            loci_sd.append(locus_sd)
            # mean
            locus_mean = round(sum(alleles_lengths) / nr_alleles)
            loci_mean.append(locus_mean)
            # median
            locus_median = round(statistics.median(alleles_lengths))
            loci_median.append(locus_median)
            # q1 and q3
            if nr_alleles > 1:
                half = int(nr_alleles // 2)
                q1 = statistics.median(alleles_lengths[:half])
                q3 = statistics.median(alleles_lengths[-half:])
            else:
                q1 = alleles_lengths[0]
                q3 = alleles_lengths[0]
            loci_q1.append(q1)
            loci_q3.append(q3)

        json_to_file = {
            'schema': schema,
            'last_modified': last_modified,
            'loci': loci_list,
            'min': loci_min,
            'q1': loci_q1,
            'median': loci_median,
            'q3': loci_q3,
            'max': loci_max,
            'mean': loci_mean,
            'sd': loci_sd,
            'nr_alleles': alleles_counts
        }

        with open(file, 'w') as json_outfile:
            json.dump(json_to_file, json_outfile)

    # if the schema is in the json file
    elif len(loci_list) > 0:
        # get modification date in json file
        json_date = json_data['last_modified']
        virtuoso_date = last_modified

        if json_date == virtuoso_date:
            logging.info(
                'Information for schema {0} is up-to-date.'.format(schema))

        elif json_date != virtuoso_date:
            length_files = [
                os.path.join(lengths_dir, f) for f in os.listdir(lengths_dir)
            ]
            # sort by locus id
            length_files = sorted(length_files,
                                  key=lambda x: int(x.split('_')[-1]))

            loci_list = []
            loci_min = []
            loci_q1 = []
            loci_median = []
            loci_q3 = []
            loci_max = []
            loci_mean = []
            loci_sd = []
            alleles_counts = []

            for locus_file in length_files:
                with open(locus_file, 'rb') as f:
                    locus_data = pickle.load(f)

                locus_uri = list(locus_data.keys())[0]
                # get name of locus
                locus_name = loci_names[locus_uri]
                loci_list.append(locus_name)
                alleles_lengths = [v for k, v in locus_data[locus_uri].items()]
                alleles_lengths.sort()
                nr_alleles = len(alleles_lengths)
                alleles_counts.append(nr_alleles)
                # minimum and maximum values
                loci_min.append(min(alleles_lengths))
                loci_max.append(max(alleles_lengths))
                # standard deviation
                if nr_alleles > 1:
                    locus_sd = statistics.stdev(alleles_lengths)
                else:
                    locus_sd = 0.0
                loci_sd.append(locus_sd)
                # mean
                locus_mean = round(sum(alleles_lengths) / nr_alleles)
                loci_mean.append(locus_mean)
                # median
                locus_median = round(statistics.median(alleles_lengths))
                loci_median.append(locus_median)
                # q1 and q3
                if nr_alleles > 1:
                    half = int(nr_alleles // 2)
                    q1 = statistics.median(alleles_lengths[:half])
                    q3 = statistics.median(alleles_lengths[-half:])
                else:
                    q1 = alleles_lengths[0]
                    q3 = alleles_lengths[0]
                loci_q1.append(q1)
                loci_q3.append(q3)

            json_to_file = {
                'schema': schema,
                'last_modified': last_modified,
                'loci': loci_list,
                'min': loci_min,
                'q1': loci_q1,
                'median': loci_median,
                'q3': loci_q3,
                'max': loci_max,
                'mean': loci_mean,
                'sd': loci_sd,
                'nr_alleles': alleles_counts
            }

            with open(file, 'w') as json_outfile:
                json.dump(json_to_file, json_outfile)

            logging.info('Updated data for schema {0}'.format(schema))
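The per-locus summary statistics in the example above are computed twice with identical logic. Below is a minimal sketch of that computation as a standalone helper, assuming a plain list of allele lengths as input (the name locus_length_stats is illustrative and not part of the original module); quartiles are taken as the medians of the lower and upper halves of the sorted values:

import statistics

def locus_length_stats(alleles_lengths):
    """Hypothetical helper: summary statistics for a list of allele lengths."""
    lengths = sorted(alleles_lengths)
    nr_alleles = len(lengths)
    if nr_alleles > 1:
        sd = statistics.stdev(lengths)
        half = nr_alleles // 2
        # Q1/Q3 as medians of the lower and upper halves of the sorted values
        q1 = statistics.median(lengths[:half])
        q3 = statistics.median(lengths[-half:])
    else:
        # single-allele loci have no spread
        sd = 0.0
        q1 = q3 = lengths[0]
    return {'min': lengths[0],
            'q1': q1,
            'median': round(statistics.median(lengths)),
            'q3': q3,
            'max': lengths[-1],
            'mean': round(sum(lengths) / nr_alleles),
            'sd': sd,
            'nr_alleles': nr_alleles}

# e.g. locus_length_stats([450, 452, 452, 455])
# -> min 450, q1 451.0, median 452, q3 453.5, max 455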
Ejemplo n.º 24
0
def single_compressor(species_id, schema_id, graph, sparql, base_url, user,
                      password):
    """ Determines if a schema needs to be compressed and
        generates a compressed version if needed.
    """

    logging.info('Started single compressor for schema {0} '
                 'of species {1}'.format(schema_id, species_id))

    # check if species exists
    species_uri = '{0}species/{1}'.format(base_url, species_id)
    species_result = aux.get_data(
        SPARQLWrapper(sparql),
        sq.SELECT_SINGLE_SPECIES.format(graph, species_uri))
    result_data = species_result['results']['bindings']

    if len(result_data) == 0:
        logging.warning('Could not find species with identifier {0}. '
                        'Aborting schema compression.\n\n'.format(species_id))
        sys.exit(1)

    sp_name = result_data[0]['name']['value']
    sp_name = '_'.join(sp_name.split(' '))

    # get schema info
    # construct schema URI
    schema_uri = '{0}/schemas/{1}'.format(species_uri, schema_id)
    schema_info = aux.get_data(
        SPARQLWrapper(sparql),
        (sq.SELECT_SPECIES_SCHEMA.format(graph, schema_uri)))

    schema_properties = schema_info['results']['bindings']
    if len(schema_properties) == 0:
        logging.warning(
            'Could not find properties values for schema with identifier {0}. '
            'Aborting schema compression.\n\n'.format(schema_id))
        sys.exit(1)

    schema_name = schema_properties[0]['name']['value']
    schemas = [(schema_uri, schema_name)]
    # list compressed schemas
    compressed_schemas = os.listdir(Config.SCHEMAS_ZIP)
    to_compress = []
    old_zips = {}
    to_compress, old_zip = compress_determiner(schemas, species_id, sp_name,
                                               compressed_schemas, to_compress,
                                               old_zips, sparql, graph)

    if len(to_compress) == 0:
        logging.info('Aborting schema compression.\n\n')
        sys.exit(0)
    else:
        schemas = ['{0} ({1})'.format(s[0], s[-2]) for s in to_compress]
        logging.info('Schema to compress: {0}'.format(';'.join(schemas)))

    # check if schema is locked
    schema_lock = aux.get_data(SPARQLWrapper(sparql),
                               (sq.ASK_SCHEMA_LOCK.format(schema_uri)))

    lock_status = schema_lock['boolean']
    if lock_status is True:
        # lock schema
        locked = change_lock(schema_uri, 'LOCKED', graph, sparql, user,
                             password)
        if isinstance(locked, list) is True:
            logging.warning('Could not lock schema {0}. Response:'
                            '\n{1}\n\n'.format(schema_uri, locked[1]))
            sys.exit(1)

    single_schema_name = to_compress[0][-2]
    if old_zip[schema_uri] is not None:
        old_zip[schema_uri] = os.path.join(Config.SCHEMAS_ZIP,
                                           old_zip[schema_uri])

    # adapt and compress schema
    response = compress_schema(to_compress[0], old_zip[schema_uri], sparql,
                               graph)
    if response == 0:
        logging.info('Successfully compressed schema {0} '
                     '({1})'.format(schema_uri, single_schema_name))
    else:
        logging.info('Could not compress schema {0} '
                     '({1})'.format(schema_uri, single_schema_name))

    # unlock schema
    unlocked = change_lock(schema_uri, 'Unlocked', graph, sparql, user,
                           password)
    if isinstance(unlocked, list) is True:
        logging.warning(
            'Could not unlock schema at the end of compression process.')

    logging.info('Finished single compressor for schema {0} '
                 'of species {1}'.format(schema_id, species_id))
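One note on the lock handling in the example above: the unlocking call only runs if compress_schema returns, so an exception raised during compression would leave the schema locked. A hedged sketch of wrapping the same calls in try/finally to guarantee the unlock; it assumes change_lock and compress_schema with the signatures used in the example and is not part of the original code:

import logging

def compress_with_lock(schema_uri, schema_entry, old_zip_path, sparql, graph,
                       user, password):
    """Hypothetical wrapper: always unlock the schema, even if compression fails."""
    locked = change_lock(schema_uri, 'LOCKED', graph, sparql, user, password)
    if isinstance(locked, list) is True:
        logging.warning('Could not lock schema {0}.'.format(schema_uri))
        return 1
    try:
        # adapt and compress schema
        return compress_schema(schema_entry, old_zip_path, sparql, graph)
    finally:
        unlocked = change_lock(schema_uri, 'Unlocked', graph, sparql, user,
                               password)
        if isinstance(unlocked, list) is True:
            logging.warning('Could not unlock schema {0}.'.format(schema_uri))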
Ejemplo n.º 25
0
def fast_update(species_file, files_dir, schema_uri, schema_data,
                virtuoso_graph, local_sparql, base_url):
    """
	"""

    schema_id = int(schema_uri.split('/')[-1])
    current_file = species_file

    # read current file
    with open(current_file, 'r') as json_file:
        json_data = json.load(json_file)

    json_schemas = json_data['message']
    schemas_indexes = {
        int(s['uri'].split('/')[-1]): i
        for i, s in enumerate(json_schemas)
    }

    # if the schema is in the json file
    if schema_id in schemas_indexes:
        current_schema = json_schemas[schemas_indexes[schema_id]]

        # get modification date in json file
        json_date = current_schema['last_modified']

        # get schema info that is in Virtuoso
        virtuoso_date = schema_data['last_modified']
        if json_date == virtuoso_date:
            logging.info(
                'Information about number of loci and number of alleles for schema {0} is up-to-date.'
                .format(schema_uri))

        elif json_date != virtuoso_date:
            length_files = [
                os.path.join(files_dir, file) for file in os.listdir(files_dir)
            ]

            total_loci = len(length_files)

            total_alleles = 0
            for file in length_files:
                locus_id = os.path.basename(file).split('_')[-1]
                locus_uri = '{0}loci/{1}'.format(base_url, locus_id)
                with open(file, 'rb') as f:
                    locus_data = pickle.load(f)

                total_alleles += len(locus_data[locus_uri])

            current_schema['last_modified'] = virtuoso_date
            current_schema['nr_loci'] = str(total_loci)
            current_schema['nr_alleles'] = str(total_alleles)

            json_data['message'][schemas_indexes[schema_id]] = current_schema
            with open(current_file, 'w') as json_outfile:
                json.dump(json_data, json_outfile)

            logging.info('Updated data for schema {0}'.format(schema_uri))
    # new schema that is not in the json file
    elif schema_id not in schemas_indexes:
        length_files = [
            os.path.join(files_dir, file) for file in os.listdir(files_dir)
        ]

        total_loci = len(length_files)

        total_alleles = 0
        for file in length_files:
            locus_id = os.path.basename(file).split('_')[-1]
            locus_uri = '{0}loci/{1}'.format(base_url, locus_id)
            with open(file, 'rb') as f:
                locus_data = pickle.load(f)

            total_alleles += len(locus_data[locus_uri])

        # determine user that uploaded the file
        admin = aux.get_data(
            SPARQLWrapper(local_sparql),
            sq.SELECT_SCHEMA_ADMIN.format(virtuoso_graph, schema_uri))

        admin = admin['results']['bindings'][0]['admin']['value']
        new_schema = schema_data
        new_schema['user'] = admin
        new_schema['uri'] = schema_uri
        new_schema['nr_loci'] = str(total_loci)
        new_schema['nr_alleles'] = str(total_alleles)
        del new_schema['Schema_lock']

        json_data['message'].append(new_schema)
        with open(current_file, 'w') as json_outfile:
            json.dump(json_data, json_outfile)
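The fast path above relies on the pre-computed length files being pickles that map a locus URI to a dict of allele identifiers and their sequence lengths. A minimal sketch of producing and reading one such file under that assumption (the URI and file name are illustrative):

import pickle

# assumed layout: {locus_uri: {allele_id: sequence_length}}
locus_uri = 'http://example.org/loci/42'  # illustrative locus URI
locus_data = {locus_uri: {'1': 450, '2': 452, '3': 455}}

with open('lengths_42', 'wb') as f:  # illustrative file name ending in the locus id
    pickle.dump(locus_data, f)

with open('lengths_42', 'rb') as f:
    loaded = pickle.load(f)

nr_alleles = len(loaded[locus_uri])           # 3 alleles
lengths = sorted(loaded[locus_uri].values())  # [450, 452, 455]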