Example #1
def find_core_genome(biosample):
    """
    Finds all Genomes with the specified BioSample id that are core genomes
    (labelled with CORE)

    Args:
        biosample: BioSample id of interest. It is substituted verbatim into
            a quoted SPARQL literal, so it must not contain unescaped double
            quotes or backslashes.

    Returns: a list holding, for each Genome that matches the BioSample and
    is a core genome, the part of its URI that follows the first "#"

    """
    # Adjacent string literals are joined at compile time, so the single
    # "%s" placeholder is filled from the whole query text.
    results = _sparql_query(
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        # xsd: is used below; declare it rather than rely on endpoint defaults
        'PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n'
        'PREFIX : <https://github.com/superphy#>\n'
        'PREFIX gfvo: <http://www.biointerchange.org/gfvo#>\n'
        'SELECT ?Genome \n'
        'WHERE {'
        '?Genome rdf:type gfvo:Genome .'
        '?Genome :has_biosample "%s"^^xsd:string .'
        '?Genome :has_sequence ?Sequence .'
        '?Sequence :is_from "CORE"^^xsd:string .'
        '}' % biosample
    )

    return [
        result["Genome"]["value"].split("#", 1)[1]
        for result in results["results"]["bindings"]
    ]
Example #2
def find_duplicate_biosamples():
    """
    Checks to see if a BioSample id is unique or not; if it is not, identify
    all Genomes that refer to it

    Genomes that have a sequence marked as coming from "WGS" are excluded
    before the grouping is done.

    Returns: a generator of tuples, each composed of a BioSample id and a
    list with the part after the "#" of every Genome URI sharing that id

    """
    results = _sparql_query(
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        # xsd: is used below; declare it rather than rely on endpoint defaults
        'PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n'
        'PREFIX : <https://github.com/superphy#>\n'
        'PREFIX gfvo: <http://www.biointerchange.org/gfvo#>\n'
        'SELECT ?BioSample (GROUP_CONCAT( ?Genome ; SEPARATOR = "#") AS \
        ?Genomes) (COUNT (?Genome) AS ?Elements)\n'
        'WHERE { ?Genome rdf:type gfvo:Genome . ?Genome :has_biosample \
        ?BioSample . '
        'MINUS { ?Genome :has_sequence ?Sequence . ?Sequence :is_from \
        "WGS"^^xsd:string .}}\n'
        'GROUP BY ?BioSample HAVING ( ?Elements > 1)'
    )

    # The Genome URIs are joined with "#" and each URI itself contains one
    # "#", so after splitting on "#" the odd positions hold the fragments.
    return (
        (
            result["BioSample"]["value"],
            result["Genomes"]["value"].split("#")[1::2],
        )
        for result in results["results"]["bindings"]
    )
Example #3
def check_blank_nodes():
    """
    Checks to see if there are any blank nodes present on the database

    A triple counts when either its subject or its object is a blank node.

    Returns: a boolean indicating if blank nodes exists or not in the database

    """
    results = _sparql_query(
        'ASK {?x ?y ?z . FILTER ( isBlank(?x) || isBlank(?z) )}'
    )

    # An ASK query answers with a single "boolean" member.
    return results["boolean"]
Example #4
def find_genome(accession):
    """
    Finds the genome instance in Blazegraph. Returns None if nothing is found.

    Args:
        accession: genome accession number. It is substituted verbatim into
            a quoted SPARQL literal, so it must not contain unescaped double
            quotes or backslashes.

    Returns: the SPARQL URI for the associated genome instance (the full
    value of the first match). Returns None if nothing found.

    """
    query = (
        'PREFIX : <https://github.com/superphy#>\n'
        'SELECT ?s WHERE {?s :has_accession "%s" . }' % accession
    )
    results = _sparql_query(query)

    bindings = results["results"]["bindings"]
    if not bindings:
        # No match: honour the documented contract instead of letting the
        # [0] lookup raise IndexError.
        return None

    return bindings[0]["s"]["value"]
Example #5
def check_validation(genome):
    """
    Checks to see if a particular genome has already had its sequence validated

    Args:
        genome(str): A genome's accession number. It is used as the local
            name of a prefixed SPARQL name (":<genome>"), so it must be valid
            in that position.

    Returns: a boolean indication if the genome has been through validation
    (whether validation was true or false)

    """
    # Any value of :has_valid_sequence counts, which is why the result says
    # only that validation ran, not what its outcome was.
    results = _sparql_query(
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        'PREFIX owl: <http://www.w3.org/2002/07/owl#>\n'
        'PREFIX : <https://github.com/superphy#>\n'
        'ASK { :%s :has_valid_sequence ?o .}' % genome
    )

    return results["boolean"]
Example #6
def has_ref_gene(gene_name):
    """
    Determines if a particular gene already has a genome its sequence is
    referenced from

    Args:
        gene_name(str): name of the gene. It is used as the local name of a
            prefixed SPARQL name (":<gene_name>"), so it must be valid in
            that position.

    Returns: A boolean, True if the gene has a copy typed as
    :reference_gene, False if not.

    """
    # Adjacent string literals are joined at compile time, so the single
    # "%s" placeholder is filled from the whole query text.
    results = _sparql_query(
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        'PREFIX : <https://github.com/superphy#>\n'
        'ASK {'
        ':%s :has_copy ?location .'
        '?location rdf:type :reference_gene'
        '}' % gene_name
    )

    return results["boolean"]
Example #7
def find_missing_sequences():
    """
    Finds Genome instances in Blazegraph that have an accession but no
    :has_valid_sequence value, i.e. that have not been through sequence
    validation yet

    Returns: a generator of tuples, each composed of the part of the Genome
    URI after its last "#" and the Genome's accession

    """
    results = _sparql_query(
        'PREFIX : <https://github.com/superphy#>\n'
        'PREFIX gfvo: <http://www.biointerchange.org/gfvo#>\n'
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        'SELECT ?s ?acc WHERE { ?s rdf:type gfvo:Genome . \
        ?s :has_accession ?acc . '
        'MINUS { ?s :has_valid_sequence ?o }}'
    )

    return (
        (result["s"]["value"].rsplit("#", 1)[1], result["acc"]["value"])
        for result in results["results"]["bindings"]
    )
Example #8
def check_named_individual(name):
    """
    Checks to see if a given SPARQL URI is an instance of any RDF class encoded
    into the database

    Args:
        name: the SPARQL URI of the instance (must be from the superphy
            ontology). It is used as the local name of a prefixed SPARQL
            name (":<name>") and is tested for the type owl:NamedIndividual.

    Returns: a boolean indicating if the instance exists or not in the database

    """
    results = _sparql_query(
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        'PREFIX owl: <http://www.w3.org/2002/07/owl#>\n'
        'PREFIX : <https://github.com/superphy#>\n'
        'ASK { :%s rdf:type owl:NamedIndividual .}' % name
    )

    return results["boolean"]
Example #9
def find_source(source):
    """
    Finds the correct isolation_from_source instance in Blazegraph given a
    term, or returns None if nothing is found

    Args:
        source: a term used to identify the isolation_from_source. It is
            substituted verbatim into a quoted SPARQL literal, so it must not
            contain unescaped double quotes or backslashes.

    Returns: the part after the first "#" of the URI of the first matching
    isolation_from_source, or None if there is no match

    """
    # Adjacent string literals are joined at compile time, so the single
    # "%s" placeholder is filled from the whole query text.
    results = _sparql_query(
        # rdf: and xsd: are used below; declare them rather than rely on
        # endpoint defaults
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        'PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n'
        'PREFIX : <https://github.com/superphy#>\n'
        'SELECT ?s WHERE {'
        '?s ?o "%s"^^xsd:string .'
        '?s rdf:type :isolation_from_source'
        '}' % source
    )

    bindings = results["results"]["bindings"]
    if not bindings:
        # No match: honour the documented contract instead of letting the
        # [0] lookup raise IndexError.
        return None

    return bindings[0]["s"]["value"].split("#", 1)[1]
Example #10
def check_checksum(checksum):
    """
    Checks if a particular checksum exists in the database.

    As checksums are supposed to be unique to the sequence, if any are found in
    the database, the chances of there being a duplicate sequence is high.

    Args:
        checksum (str): the hash for a sequence. It is substituted verbatim
            into a quoted SPARQL literal.

    Returns: a boolean indicating if the hash was found in the database

    """
    results = _sparql_query(
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        # xsd: is used below; declare it rather than rely on endpoint defaults
        'PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n'
        'PREFIX : <https://github.com/superphy#>\n'
        'ASK { ?Sequence :has_checksum "%s"^^xsd:string}' % checksum
    )

    return results["boolean"]
Example #11
def find_from_host(host):
    """
    Finds the correct isolation_from_host instance in Blazegraph given a host
    descriptor, or returns None if nothing is found

    Args:
        host: a term used to identify the host (common or scientific name).
            It is substituted verbatim into a quoted SPARQL literal, so it
            must not contain unescaped double quotes or backslashes.

    Returns: the part after the first "#" of the URI of the first matching
    isolation_from_host object, or None if there is no match

    """
    results = _sparql_query(
        # rdf: and xsd: are used below; declare them rather than rely on
        # endpoint defaults
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        'PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n'
        'PREFIX : <https://github.com/superphy#>\n'
        'SELECT ?p WHERE {?s ?o "%s"^^xsd:string . ?s :is_object_of ?p . ?p \
        rdf:type :isolation_from_host}' % host
    )

    bindings = results["results"]["bindings"]
    if not bindings:
        # No match: honour the documented contract instead of letting the
        # [0] lookup raise IndexError.
        return None

    return bindings[0]["p"]["value"].split("#", 1)[1]