Beispiel #1
0
    def __init__(
        self,
        # constants
        c_localization_enty_class=LOC_ID,
        c_localization_norm_class=GO_NORM_ID,
        c_protein_enty_class=PRO_ID,
        c_protein_norm_class=UNIPROT_NORM_ID,
        # None means "use the default organisms list"; avoids a mutable default
        # argument that would be shared across all instances.
        c_SwissProt_organisms_used=None,  # TODO perhaps use other organisms ?
        #
        # The following two were DOCUMENT-BASED features and not domain-specific
        #
        f_corpus_unnormalized_total_background_loc_rels_ratios=None,
        f_corpus_normalized_total_background_loc_rels_ratios=None,
        #
        f_SwissProt_normalized_total_absolute_loc_rels_ratios=None,
        f_SwissProt_normalized_total_background_loc_rels_ratios=None,
        #
        f_SwissProt_normalized_exists_relation=None,
    ):
        """
        Store the entity/normalization class constants (c_*) and the feature
        keys (f_*), and eagerly load the precomputed corpus background ratios
        from a pickle under resources/features.

        The f_* parameters default to None (feature disabled/unset) and are
        stored as-is.
        """
        # FIX: build the default organisms list per instance, not at def time.
        if c_SwissProt_organisms_used is None:
            c_SwissProt_organisms_used = [9606]

        self.c_localization_enty_class = c_localization_enty_class
        self.c_localization_norm_class = c_localization_norm_class
        self.c_protein_enty_class = c_protein_enty_class
        self.c_protein_norm_class = c_protein_norm_class

        # Precomputed corpus-wide GO background ratios (pickled dict-like blob;
        # exact structure defined by the script that produced the pickle).
        path = repo_path(
            "resources", "features",
            "corpus_unnormalized_total_background_loc_rels_ratios.pickle")
        with open(path, "rb") as f:
            self.c_corpus_unnormalized_total_background_loc_rels_ratios = pickle.load(
                f)

        #
        # Kept commented out as it doesn't provide benefits in performance
        #
        # path = repo_path("resources", "features", "corpus_normalized_total_background_loc_rels_ratios.pickle")
        # with open(path, "rb") as f:
        #     self.c_corpus_normalized_total_background_loc_rels_ratios = pickle.load(f)
        #
        # path = repo_path("resources", "features", "SwissProt_normalized_total_absolute_loc_rels_ratios.pickle")
        # with open(path, "rb") as f:
        #     self.c_SwissProt_normalized_total_absolute_loc_rels_ratios = pickle.load(f)
        #
        # path = repo_path("resources", "features", "SwissProt_normalized_total_background_loc_rels_ratios.pickle")
        # with open(path, "rb") as f:
        #     self.c_SwissProt_normalized_total_background_loc_rels_ratios = pickle.load(f)

        self.c_SwissProt_organisms_used = c_SwissProt_organisms_used

        self.c_SwissProt_all_relations = SWISSPROT_ALL_RELATIONS

        # Feature keys; None means the corresponding feature is not used.
        self.f_corpus_unnormalized_total_background_loc_rels_ratios = f_corpus_unnormalized_total_background_loc_rels_ratios
        self.f_corpus_normalized_total_background_loc_rels_ratios = f_corpus_normalized_total_background_loc_rels_ratios
        #
        self.f_SwissProt_normalized_total_absolute_loc_rels_ratios = f_SwissProt_normalized_total_absolute_loc_rels_ratios
        self.f_SwissProt_normalized_total_background_loc_rels_ratios = f_SwissProt_normalized_total_background_loc_rels_ratios

        self.f_SwissProt_normalized_exists_relation = f_SwissProt_normalized_exists_relation
Beispiel #2
0
import pickle
from nalaf.utils.download import DownloadArticle
from nalaf.utils.readers import StringReader, PMIDReader
from loctext.learning.annotators import StringTagger
from loctext.util import PRO_ID, LOC_ID, ORG_ID, REL_PRO_LOC_ID, UNIPROT_NORM_ID, GO_NORM_ID, TAXONOMY_NORM_ID, repo_path
from loctext.learning.annotators import LocTextDXModelRelationExtractor

# Locations of the pre-trained relation-extraction model and the file listing
# its selected feature names.
RE_MODEL_PATH = repo_path(
    "resources", "models", "D0_9606,3702,4932_1497520729.163767.bin")
RE_FEATURES_PATH = repo_path(
    "resources", "features", "selected",
    "0_True_LinearSVC_C=2.0-1487943476.673364-NAMES.py")

# Deserialize the model eagerly at import time; a failure aborts the import.
with open(RE_MODEL_PATH, "rb") as model_file:
    RE_MODEL_BIN = pickle.load(model_file)


def parse_arguments(argv=[]):

    import argparse

    parser = argparse.ArgumentParser(
        description=
        'Run LocText on some text to extract Protein<-->Cell Compartments relations'
    )

    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--text", help="Run against given text/string")
    input_group.add_argument(
        "--pmid",
        help=
Beispiel #3
0
from loctext.util import PRO_ID, LOC_ID, ORG_ID, REL_PRO_LOC_ID, UNIPROT_NORM_ID, GO_NORM_ID, TAXONOMY_NORM_ID, repo_path
from loctext.learning.evaluations import are_go_parent_and_child
from loctext.util import simple_parse_GO

# GO cellular-component hierarchy parsed from the bundled OBO file.
# NOTE(review): presumably maps a GO id to an info entry (used for membership
# tests below) -- confirm the exact entry layout in simple_parse_GO.
GO_TREE = simple_parse_GO.simple_parse(
    repo_path("resources", "ontologies",
              "go-basic.cellular_component.latest.obo"))

# Curated list of main cellular-compartment GO terms ("Lars" selection).
Lars = [
    "GO:0005576",  # extracellular
    "GO:0005634",  # nucleus
    "GO:0005739",  # mitochondrion
    "GO:0005764",  # lysosome
    "GO:0005768",  # endosome
    "GO:0005773",  # vacuole
    "GO:0005777",  # peroxisome
    "GO:0005783",  # endoplasmic reticulum
    "GO:0005794",  # golgi apparatus
    "GO:0005829",  # cytosol
    "GO:0005856",  # cytoskeleton
    "GO:0005886",  # plasma membrane
    "GO:0009507",  # chloroplast
]

Tanya = [
    # "GO:0016021",  # integral to membrane
    "GO:0009507",  # chloroplast
    "GO:0009535",  # chloroplast thylakoid membrane
    # "GO:0005737",  # cytoplasm
    "GO:0005783",  # endoplasmic reticulum
    "GO:0005789",  # endoplasmic reticulum membrane
Beispiel #4
0
import glob
import json
import re
from collections import Counter
from pprint import pprint

# Run grep -o "GO:[0-9]*" "resources/features/human_localization_all__2016-11-20.tsv" | sort | uniq -c | sort > scripts/precomputed_SwissProt_normalized_total_absolute_loc_rels_ratio.txt

# Input: `uniq -c`-style lines, i.e. "<count> <GO id>" after stripping.
in_path = "scripts/precomputed_SwissProt_normalized_total_absolute_loc_rels_ratio.txt"

mode = "normalized"

# FIX: raw string -- "\s" and "\W" are invalid escape sequences in a plain
# literal (same value today, but deprecated and warning-prone).
regex_canonical_go_id = re.compile(r"canonical[\s\W]*?(GO:[0-9]+)")

# GO id -> number of SwissProt relations counted by the grep above.
with open(in_path) as f:
    counter_relations = Counter(
        {go: int(count)
         for line in f
         for count, go in [line.strip().split(" ")]})

# FIX: repo_path takes the path components as varargs (as at every other call
# site in this project), not as a single list.
GO_TREE = simple_parse_GO.simple_parse(
    repo_path("resources", "ontologies",
              "go-basic.cellular_component.latest.obo"))

# GO id -> number of "canonical" mentions found in the StringTagger results.
counter_mentions = Counter()

for json_path in glob.glob("resources/features/human_localization_all_PMIDs_only_StringTagger_results__2016-11-20/*.json"):
    with open(json_path) as f:
        # We read the file as string since we saw json-parsing errors or inconsistencies
        data = f.read()

        for go in regex_canonical_go_id.findall(data):
            if go in GO_TREE:
                counter_mentions.update([go])

# Smoothing: every GO id that has relations gets at least one mention, so a
# later ratio relations/mentions never divides by zero.
for key in counter_relations:
    if key not in counter_mentions:
        counter_mentions[key] = 1
Beispiel #5
0
regex_go_id = re.compile(r'GO:[0-9]+')

# Parse the SwissProt TSV (uniprot id, organism id, localization GO terms)
# into: organism_id -> {uniprot_id -> set of GO term ids}.
with open(in_path) as f:
    next(f)  # skip header

    all_relations = {}

    for line in f:
        upid, organism_id, localization_gos = line.split("\t")
        organism_id = int(organism_id)

        # setdefault replaces the get-then-reassign dance with one lookup.
        organism_relations = all_relations.setdefault(organism_id, {})
        organism_relations[upid] = set(regex_go_id.findall(localization_gos))

    print(
        "Total uniprot entries:",
        sum(
            len(organism_relations)
            for organism_relations in all_relations.values()))
    # FIX: "unit prot" -> "uniprot" in the report output.
    print("4932 uniprot entries:", (len(all_relations[4932])))
    print("9606 uniprot entries:", (len(all_relations[9606])))
    print("Organisms:", set(all_relations.keys()))

# FIX: write the pickle after the input file is closed, and do not shadow the
# input handle `f` with the output handle.
out_path = repo_path("resources", "features",
                     "SwissProt_all_relations.pickle")
with open(out_path, "wb") as out_file:
    pickle.dump(all_relations, out_file)
Beispiel #6
0
import pickle
from itertools import product
from loctext.util import repo_path, UNIPROT_NORM_ID, GO_NORM_ID, TAXONOMY_NORM_ID
from loctext.util import simple_parse_GO
from loctext.util.ncbi_global_align import global_align

# FIX: the former triple-quoted string after this assignment was a no-op
# expression statement, not attached documentation; made it a real comment.
# Dictionary with go term child --> to [list of go term parents] relationships
GO_TREE = simple_parse_GO.simple_parse(
    repo_path("resources", "ontologies",
              "go-basic.cellular_component.latest.obo"))


def get_localization_name(go_id, default=""):
    """Return the human-readable name stored for *go_id* in GO_TREE, or
    *default* when the term is unknown."""
    entry = GO_TREE.get(go_id, (default, "", ""))
    return entry[0]


def _verify_in_ontology(term):
    """Raise KeyError when *term* is not part of the loaded GO hierarchy."""
    if term in GO_TREE:
        return
    raise KeyError(
        "The term '{}' is not recognized in the considered GO ontology hierarchy"
        .format(term))


def are_go_parent_and_child(parent, child):
    """
    True if terms are equal or parent is indeed a parent in the localization GO of the child. False otherwise.
    """
    # _go_ids_accept_single may return a non-boolean verdict; only a strict
    # True counts as "parent and child".
    verdict = _go_ids_accept_single(parent, child)
    return verdict is True