def __init__(
        self,
        # constants: entity / normalization class ids
        c_localization_enty_class=LOC_ID,
        c_localization_norm_class=GO_NORM_ID,
        c_protein_enty_class=PRO_ID,
        c_protein_norm_class=UNIPROT_NORM_ID,
        # None means the default [9606] (TODO perhaps use other organisms?).
        # A None sentinel is used instead of a list literal to avoid the
        # shared-mutable-default-argument pitfall.
        c_SwissProt_organisms_used=None,
        #
        # The following two were DOCUMENT-BASED features and not domain-specific
        #
        f_corpus_unnormalized_total_background_loc_rels_ratios=None,
        f_corpus_normalized_total_background_loc_rels_ratios=None,
        #
        f_SwissProt_normalized_total_absolute_loc_rels_ratios=None,
        f_SwissProt_normalized_total_background_loc_rels_ratios=None,
        #
        f_SwissProt_normalized_exists_relation=None,
):
    """
    Configure constants (``c_*``) and feature switches (``f_*``) for the
    localization-relation feature generator.

    Loads the precomputed corpus background-ratio table from the repository's
    ``resources/features`` directory (pickle of a trusted, repo-local file).
    """
    self.c_localization_enty_class = c_localization_enty_class
    self.c_localization_norm_class = c_localization_norm_class
    self.c_protein_enty_class = c_protein_enty_class
    self.c_protein_norm_class = c_protein_norm_class

    # Precomputed background ratios of localization relations in the corpus
    path = repo_path(
        "resources", "features",
        "corpus_unnormalized_total_background_loc_rels_ratios.pickle")
    with open(path, "rb") as f:
        self.c_corpus_unnormalized_total_background_loc_rels_ratios = pickle.load(f)

    #
    # Comment so far out as it doesn't provide benefits in performance
    #
    # path = repo_path("resources", "features", "corpus_normalized_total_background_loc_rels_ratios.pickle")
    # with open(path, "rb") as f:
    #     self.c_corpus_normalized_total_background_loc_rels_ratios = pickle.load(f)
    #
    # path = repo_path("resources", "features", "SwissProt_normalized_total_absolute_loc_rels_ratios.pickle")
    # with open(path, "rb") as f:
    #     self.c_SwissProt_normalized_total_absolute_loc_rels_ratios = pickle.load(f)
    #
    # path = repo_path("resources", "features", "SwissProt_normalized_total_background_loc_rels_ratios.pickle")
    # with open(path, "rb") as f:
    #     self.c_SwissProt_normalized_total_background_loc_rels_ratios = pickle.load(f)

    # Build the default list per instance so callers never share one mutable object
    self.c_SwissProt_organisms_used = (
        [9606] if c_SwissProt_organisms_used is None else c_SwissProt_organisms_used)
    self.c_SwissProt_all_relations = SWISSPROT_ALL_RELATIONS

    #
    self.f_corpus_unnormalized_total_background_loc_rels_ratios = f_corpus_unnormalized_total_background_loc_rels_ratios
    self.f_corpus_normalized_total_background_loc_rels_ratios = f_corpus_normalized_total_background_loc_rels_ratios
    #
    self.f_SwissProt_normalized_total_absolute_loc_rels_ratios = f_SwissProt_normalized_total_absolute_loc_rels_ratios
    self.f_SwissProt_normalized_total_background_loc_rels_ratios = f_SwissProt_normalized_total_background_loc_rels_ratios
    #
    # self.f_SwissProt_normalized_exists_relation = f_SwissProt_normalized_exists_relation
# Entry-point module: loads the pre-trained LocText relation-extraction model
# and exposes a CLI argument parser.
import pickle
from nalaf.utils.download import DownloadArticle
from nalaf.utils.readers import StringReader, PMIDReader
from loctext.learning.annotators import StringTagger
from loctext.util import PRO_ID, LOC_ID, ORG_ID, REL_PRO_LOC_ID, UNIPROT_NORM_ID, GO_NORM_ID, TAXONOMY_NORM_ID, repo_path
from loctext.learning.annotators import LocTextDXModelRelationExtractor

# Serialized relation-extraction model and the matching selected-features list
RE_MODEL_PATH = repo_path("resources", "models", "D0_9606,3702,4932_1497520729.163767.bin")
RE_FEATURES_PATH = repo_path(
    "resources", "features", "selected",
    "0_True_LinearSVC_C=2.0-1487943476.673364-NAMES.py")

RE_MODEL_BIN = None

# NOTE(review): unpickling at import time — safe only while the model file is
# a trusted, repository-local resource
with open(RE_MODEL_PATH, "rb") as f:
    RE_MODEL_BIN = pickle.load(f)


def parse_arguments(argv=[]):
    # NOTE(review): mutable default `argv=[]` — harmless only if argv is never
    # mutated; the None-sentinel idiom would be safer. TODO confirm callers.
    import argparse

    parser = argparse.ArgumentParser(
        description=
        'Run LocText on some text to extract Protein<-->Cell Compartments relations'
    )

    # Exactly one input source must be given
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--text", help="Run against given text/string")
    input_group.add_argument(
        "--pmid", help=
from loctext.util import PRO_ID, LOC_ID, ORG_ID, REL_PRO_LOC_ID, UNIPROT_NORM_ID, GO_NORM_ID, TAXONOMY_NORM_ID, repo_path
from loctext.learning.evaluations import are_go_parent_and_child
from loctext.util import simple_parse_GO

# GO cellular-component ontology parsed from the repo-local OBO file
GO_TREE = simple_parse_GO.simple_parse(
    repo_path("resources", "ontologies", "go-basic.cellular_component.latest.obo"))

# Curated lists of GO localization terms (presumably named after the
# annotators who curated them — TODO confirm)
Lars = [
    "GO:0005576",  # extracellular
    "GO:0005634",  # nucleus
    "GO:0005739",  # mitochondrion
    "GO:0005764",  # lysosome
    "GO:0005768",  # endosome
    "GO:0005773",  # vacuole
    "GO:0005777",  # peroxisome
    "GO:0005783",  # endoplasmic reticulum
    "GO:0005794",  # golgi apparatus
    "GO:0005829",  # cytosol
    "GO:0005856",  # cytoskeleton
    "GO:0005886",  # plasma membrane
    "GO:0009507",  # chloroplast
]

Tanya = [
    # "GO:0016021",  # integral to membrane
    "GO:0009507",  # chloroplast
    "GO:0009535",  # chloroplast thylakoid membrane
    # "GO:0005737",  # cytoplasm
    "GO:0005783",  # endoplasmic reticulum
    "GO:0005789",  # endoplasmic reticulum membrane
"""
Precompute SwissProt-based GO-term relation counts and mention counts from
StringTagger JSON results, for later normalization into ratios.
"""
import glob
import json
import re
from collections import Counter
from pprint import pprint

# These were missing in the original import section and would have raised
# NameError at runtime (Counter, glob, repo_path, simple_parse_GO are all used below).
from loctext.util import repo_path, simple_parse_GO

# Input produced with:
# Run grep -o "GO:[0-9]*" "resources/features/human_localization_all__2016-11-20.tsv" | sort | uniq -c | sort > scripts/precomputed_SwissProt_normalized_total_absolute_loc_rels_ratio.txt
in_path = "scripts/precomputed_SwissProt_normalized_total_absolute_loc_rels_ratio.txt"
mode = "normalized"

# Raw string so the \s / \W escapes are unambiguous
regex_canonical_go_id = re.compile(r"canonical[\s\W]*?(GO:[0-9]+)")

with open(in_path) as f:
    # Each `uniq -c` line is "<count> <GO id>"; split() (no argument) strips
    # and tolerates any run of surrounding whitespace
    counter_relations = Counter(
        {go: int(count) for line in f for count, go in [line.split()]})

# Consistency fix: repo_path is called with varargs path components everywhere
# else in this codebase, not with a single list argument
GO_TREE = simple_parse_GO.simple_parse(
    repo_path("resources", "ontologies", "go-basic.cellular_component.latest.obo"))

counter_mentions = Counter()

for json_path in glob.glob("resources/features/human_localization_all_PMIDs_only_StringTagger_results__2016-11-20/*.json"):
    with open(json_path) as f:
        # We read the file as string since we saw json-parsing errors or inconsistencies
        data = f.read()

    for go in regex_canonical_go_id.findall(data):
        if go in GO_TREE:
            counter_mentions.update([go])

# Smoothing: every GO term that appears in relations gets at least one mention,
# so later ratio computations never divide by zero
for key in counter_relations:
    if key not in counter_mentions:
        counter_mentions.update({key: 1})
# Parse a SwissProt TSV (uniprot id, organism id, GO localization column) into
# a nested mapping: organism id -> uniprot id -> set of GO term ids, then
# report summary counts and persist the mapping as a pickle.
regex_go_id = re.compile('GO:[0-9]+')

with open(in_path) as tsv_file:
    next(tsv_file)  # skip header

    all_relations = {}

    for row in tsv_file:
        uniprot_id, org_id_field, gos_column = row.split("\t")
        # One sub-dict per organism, created on first sight
        per_organism = all_relations.setdefault(int(org_id_field), {})
        per_organism[uniprot_id] = set(regex_go_id.findall(gos_column))

total_entries = sum(
    len(per_organism) for per_organism in all_relations.values())
print("Total uniprot entries:", total_entries)
print("4932 unit prot entries:", (len(all_relations[4932])))
print("9606 unit prot entries:", (len(all_relations[9606])))
print("Organisms:", set(all_relations))

destination = repo_path("resources", "features", "SwissProt_all_relations.pickle")
with open(destination, "wb") as pickle_file:
    pickle.dump(all_relations, pickle_file)
import pickle
from itertools import product
from loctext.util import repo_path, UNIPROT_NORM_ID, GO_NORM_ID, TAXONOMY_NORM_ID
from loctext.util import simple_parse_GO
from loctext.util.ncbi_global_align import global_align

# Dictionary with go term child --> [list of go term parents] relationships.
# (This note was previously a bare, no-op string statement after the
# assignment; it is now a real comment.)
GO_TREE = simple_parse_GO.simple_parse(
    repo_path("resources", "ontologies", "go-basic.cellular_component.latest.obo"))


def get_localization_name(go_id, default=""):
    """
    Return the name of the GO term `go_id`, or `default` when the term is not
    in GO_TREE. (GO_TREE values are tuples whose first element is the name.)
    """
    return GO_TREE.get(go_id, (default, "", ""))[0]


def _verify_in_ontology(term):
    """Raise KeyError if `term` is not part of the considered GO ontology."""
    if term not in GO_TREE:
        raise KeyError(
            "The term '{}' is not recognized in the considered GO ontology hierarchy"
            .format(term))


def are_go_parent_and_child(parent, child):
    """
    True if terms are equal or parent is indeed a parent in the localization GO
    of the child. False otherwise.
    """
    # _go_ids_accept_single is defined elsewhere in this module (outside this
    # view); `is True` coerces any non-True sentinel it may return to False
    return _go_ids_accept_single(parent, child) is True