Beispiel #1
0
def insert_new_proteins_n_alignments(session, target_sequence, record):
    """Insert the hits of one alignment record, skipping what already exists.

    For the record's database and for every alignment in ``record.alignments``
    the following are inserted through ``insert_if_non_existent`` (each one is
    looked up with a ``match ... get`` query first):

    - the ``database`` entity named after ``record.database``
    - a ``protein`` entity per alignment, named from ``alignment.hit_def``
    - per HSP: the matched ``sequence`` attribute on that protein, a
      ``sourcing-of-information`` relation (database -> sequence) and a
      ``sequence-sequence-alignment`` relation (target -> matched sequence)
      carrying positivity, identicality, gaps, midline and identifier
    - when ``hit_def`` contains a ``[...]`` part: a ``species`` entity and a
      ``protein-ownership`` relation (species -> protein)

    Args:
        session: passed through unchanged to ``insert_if_non_existent``.
        target_sequence: string value of the sequence the record was
            produced for; used to match the target ``sequence`` attribute.
        record: object exposing ``database`` and ``alignments``; each
            alignment exposes ``hit_def``, ``hit_id``, ``length`` and
            ``hsps``; each HSP exposes ``sbjct``, ``positives``,
            ``identities``, ``gaps`` and ``match``.
            NOTE(review): this looks like a Biopython BLAST record - confirm.

    Raises:
        IndexError: if ``alignment.hit_id`` has fewer than four
            ``|``-separated fields.

    NOTE(review): values are concatenated into the queries without escaping,
    so a double quote inside a name or sequence would break the query.
    """
    database = record.database
    # insert the database entity (if it doesn't already exist)
    q_get_database = 'match $db isa database, has name "' + database + '"; get $db;'
    q_insert_database = 'insert $db isa database, has name "' + database + '";'
    # the returned value is spliced into later queries as a concept id
    database_id = insert_if_non_existent(session, q_get_database,
                                         q_insert_database, "$db")

    for alignment in record.alignments:
        # insert the protein entity (if it doesn't already exist)
        # the name is the text before the first " >" and the first ";"
        protein_name = alignment.hit_def.split(" >")[0].split(";")[0]
        q_get_protein = 'match $pr isa protein, has name "' + protein_name + '"; get $pr;'
        q_insert_protein = 'insert $pr isa protein, has name "' + protein_name + '";'
        protein_id = insert_if_non_existent(session, q_get_protein,
                                            q_insert_protein, "$pr")

        for hsp in alignment.hsps:
            sequence = hsp.sbjct
            # insert the sequence attribute for the protein entity (if it doesn't already exist)
            q_get_protein_sequence = 'match $pr id ' + protein_id + ', has sequence $seq; $seq "' + sequence + '"; get $seq;'
            q_insert_protein_sequence = 'match $pr id ' + protein_id + '; insert $seq isa sequence; $seq "' + sequence + '"; $pr has sequence $seq;'
            insert_if_non_existent(session, q_get_protein_sequence,
                                   q_insert_protein_sequence, "$seq")

            # insert the sourcing-of-information relationship (if it doesn't already exist)
            q_get_sourcing = (
                'match $seq isa sequence; $seq "' + sequence + '"; $db id ' +
                database_id + '; ' +
                '$sourcing (information-source: $db, sourced-information: $seq) isa sourcing-of-information; '
                + 'get $sourcing;')
            q_insert_sourcing = (
                'match $seq isa sequence; $seq "' + sequence + '"; $db id ' +
                database_id + ';' +
                'insert $sourcing (information-source: $db, sourced-information: $seq) isa sourcing-of-information;'
            )
            insert_if_non_existent(session, q_get_sourcing, q_insert_sourcing,
                                   "$sourcing")

            # insert the alignment relationship (if it doesn't already exist)
            # ratios are taken relative to alignment.length and rounded
            sequence_positivity = round(hsp.positives / alignment.length, 3)
            sequence_identicality = round(hsp.identities / alignment.length, 3)
            sequence_gaps = round(hsp.gaps / alignment.length, 5)
            sequence_midline = hsp.match
            # fourth "|"-separated field of the hit id
            alignment_identifier = alignment.hit_id.split("|", 4)[3]
            q_get_alignment = (
                'match $target-seq isa sequence; $target-seq "' +
                target_sequence + '"; ' +
                '$matched-seq isa sequence; $matched-seq "' + sequence +
                '"; ' +
                '$alignment (target-sequence: $target-seq, matched-sequence: $matched-seq) isa sequence-sequence-alignment; '
                + 'get $alignment;')
            q_insert_alignment = (
                'match $target-seq isa sequence; $target-seq "' +
                target_sequence + '"; ' +
                '$matched-seq isa sequence; $matched-seq "' + sequence +
                '"; ' +
                'insert $alignment (target-sequence: $target-seq, matched-sequence: $matched-seq) isa sequence-sequence-alignment; '
                + '$alignment has sequence-positivity ' +
                str(sequence_positivity) + ', has sequence-identicality ' +
                str(sequence_identicality) + ', has sequence-gaps ' +
                str(sequence_gaps) + ', has sequence-midline "' +
                sequence_midline + '"' + ', has identifier "' +
                alignment_identifier + '";')
            insert_if_non_existent(session, q_get_alignment,
                                   q_insert_alignment, "$alignment")

        # insert the species entity (if it doesn't already exist)
        # the species is the text inside the first "[...]" of the hit
        # definition; hits without a "[" get no species and no ownership
        if "[" in alignment.hit_def:
            species = alignment.hit_def.split("[")[1].split("]")[0]
            q_get_species = 'match $species isa species, has name "' + species + '"; get $species;'
            q_insert_species = 'insert $species isa species, has name "' + species + '"; '
            species_id = insert_if_non_existent(session, q_get_species,
                                                q_insert_species, "$species")

            # insert protein-ownership relationship (if it doesn't already exist)
            # the species id is passed unquoted, exactly like the protein
            # and database ids used in the other queries of this function
            q_get_protein_ownership = (
                'match $sp id ' + species_id + '; ' + '$pr id ' +
                protein_id + '; ' +
                '$pr-ownership (species-owner: $sp, owned-protein: $pr) isa protein-ownership;'
                + 'get $pr-ownership;')
            q_insert_protein_ownership = (
                'match $sp id ' + species_id + '; ' + '$pr id ' +
                protein_id + '; ' +
                'insert $pr-ownership (species-owner: $sp, owned-protein: $pr) isa protein-ownership;'
            )
            insert_if_non_existent(session, q_get_protein_ownership,
                                   q_insert_protein_ownership, "$pr-ownership")
Beispiel #2
0
            species = protein_details[2]

            # insert the protein entity
            q_insert_protein = ("insert $pr isa protein " +
                                'has identifier "' + identifier + '" ' +
                                'has name "' + name + '" ' + 'has sequence "' +
                                sequence + '";')
            protein_id = insert_anyway(session, q_insert_protein)

            # insert the sourcing-of-information relationship
            q_insert_sourcing_of_information = (
                'match $pr id ' + protein_id + '; ' + '$db id ' + db_id +
                '; ' +
                "insert (information-source: $db, sourced-information: $pr) isa sourcing-of-information;"
            )
            insert_anyway(session, q_insert_sourcing_of_information)

            # insert the species entity (if it doesn't already exist)
            q_insert_species = 'insert $species isa species has name "' + species + '"; '
            species_id = insert_if_non_existent(session, q_insert_species,
                                                "$species")

            # insert protein-ownership relationship (protein <> species)
            q_insert_protein_ownership = (
                'match $species id ' + species_id + '; ' + '$protein id ' +
                protein_id + '; ' +
                "insert (species-owner: $species, owned-protein: $protein) isa protein-ownership;"
            )
            insert_anyway(session, q_insert_protein_ownership)
            print("- - - - - - - - - - - - - - - - -")
Beispiel #3
0
def init(data_path):
    """Load the proteins of a FASTA file into the 'blast' keyspace.

        1. creates a Grakn session to talk to the 'blast' keyspace
           (client URI "localhost:48555")
        2. inserts the database entity named 'uniprot'
        3. for each protein stored in the FASTA file at data_path, inserts the:
            - protein entity
            - species entity
            - species <> protein relationship
            - protein <> database relationship

        Every insert goes through insert_if_non_existent, which is given a
        matching lookup query alongside the insert query.

        Args:
            data_path: path of the FASTA file to read. Each title line is
                expected to look like
                "<db>|<identifier>|<entry> <name> OS=<species> OX=...".

        Raises:
            IndexError: if a title line lacks the "|"-separated identifier
                or the " OS=" species field.
    """
    with GraknClient(uri="localhost:48555") as client:
        with client.session(keyspace="blast") as session:
            # insert the database entity
            q_get_database = 'match $db isa database, has name "uniprot"; get $db;'
            q_insert_database = 'insert $db isa database, has name "uniprot";'
            database_id = insert_if_non_existent(session, q_get_database,
                                                 q_insert_database, "$db")

            with open(data_path) as data:
                for first_line, sequence in SimpleFastaParser(data):
                    # extract relevant data from first_line of each fasta (protein)
                    # The first space separates the accession field from the
                    # description; only " OS=" / " OX=" split the description,
                    # so commas inside a protein or species name are kept
                    # instead of being mistaken for field separators.
                    accession, _, description = first_line.partition(' ')
                    protein_details = re.split(' OS=| OX=', description)
                    identifier = accession.split("|")[1]
                    name = protein_details[0]
                    species = protein_details[1]

                    # insert the protein entity
                    q_get_protein = ('match $pr isa protein ' +
                                     ', has identifier "' + identifier + '" ' +
                                     ', has name "' + name + '" ' +
                                     ', has sequence "' + sequence + '"; ' +
                                     'get $pr;')
                    q_insert_protein = ('insert $pr isa protein ' +
                                        ', has identifier "' + identifier +
                                        '" ' + ', has name "' + name + '" ' +
                                        ', has sequence "' + sequence + '";')
                    protein_id = insert_if_non_existent(
                        session, q_get_protein, q_insert_protein, "$pr")

                    # insert the sourcing-of-information relationship
                    q_get_sourcing_of_information = (
                        'match $pr id ' + protein_id + '; ' + '$db id ' +
                        database_id + '; ' +
                        '$sourcing (information-source: $db, sourced-information: $pr) isa sourcing-of-information; '
                        + 'get $sourcing;')
                    q_insert_sourcing_of_information = (
                        'match $pr id ' + protein_id + '; ' + '$db id ' +
                        database_id + '; ' +
                        'insert $sourcing (information-source: $db, sourced-information: $pr) isa sourcing-of-information;'
                    )
                    insert_if_non_existent(session,
                                           q_get_sourcing_of_information,
                                           q_insert_sourcing_of_information,
                                           "$sourcing")

                    # insert the species entity
                    q_get_species = 'match $species isa species, has name "' + species + '"; get $species;'
                    q_insert_species = 'insert $species isa species, has name "' + species + '";'
                    species_id = insert_if_non_existent(
                        session, q_get_species, q_insert_species, "$species")

                    # insert protein-ownership relationship
                    q_get_protein_ownership = (
                        'match $species id ' + species_id + '; ' +
                        '$protein id ' + protein_id + '; ' +
                        '$pr-ownership (species-owner: $species, owned-protein: $protein) isa protein-ownership; '
                        + 'get $pr-ownership;')
                    q_insert_protein_ownership = (
                        'match $species id ' + species_id + '; ' +
                        '$protein id ' + protein_id + '; ' +
                        'insert $pr-ownership (species-owner: $species, owned-protein: $protein) isa protein-ownership;'
                    )
                    insert_if_non_existent(session, q_get_protein_ownership,
                                           q_insert_protein_ownership,
                                           "$pr-ownership")