def get_raw_references(datapath, phenotype_set):
    """Read a phenotab file and build one representation per reference.

    Phenotypes are kept as they appear in the file, i.e. in the original
    ontology ("raw" phenotypes).

    Args:
        datapath: path to the tab-separated phenotab file
        phenotype_set: collection of phenotype ids that are acceptable

    Returns:
        two objects
        - dict mapping reference codes ("Source:Disease_number") to
          Representation objects carrying a title and phenotype values
        - set of phenotypes present in the file but not in phenotype_set
    """

    rejected = set()
    collected = {}
    with open_file(datapath, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="\""):
            # rows without a usable reference id are ignored altogether
            if not valid_reference_id(record["Reference"]):
                continue
            term = record["Phenotype"]
            if term not in phenotype_set:
                rejected.add(term)
                continue
            frequency = tofreq[record["Frequency"]]
            code = record["Source"] + ":" + record["Disease_number"]
            # the first row for a reference creates its representation
            if code not in collected:
                collected[code] = Representation(name=code)
                collected[code].title = record["Disease_title"]
            collected[code].set(term, frequency)
    return collected, rejected
def get_emapa_map(emap_path, obo):
    """Load EMAPA-to-MP mappings and keep the best MP terms per EMAPA id.

    The "best" MP terms for an EMAPA id are those with the fewest
    ancestors in the ontology; ties are all kept.

    Args:
        emap_path: path to a tab-separated file with columns
            EMAPA_ID and MP_ID
        obo: ontology object providing has() and ancestors()

    Returns:
        dict mapping EMAPA ids to lists of MP ids
    """

    # collect (number of ancestors, mp id) pairs for every EMAPA id
    candidates = {}
    with open_file(emap_path, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            emapa_id = str(record["EMAPA_ID"])
            mp_id = str(record["MP_ID"])
            if not obo.has(mp_id):
                print("Skipping mp term: " + mp_id)
                continue
            depth = len(obo.ancestors(mp_id))
            candidates.setdefault(emapa_id, []).append((depth, mp_id))

    # retain only the terms that share the smallest ancestor count
    best = {}
    for emapa_id, scored in candidates.items():
        ranked = sorted(scored)
        fewest = ranked[0][0]
        best[emapa_id] = [mp_id for count, mp_id in ranked if count == fewest]
    return best
def prep_oo(owlfile, obo, siblings=False):
    """Scan one file and identify the best ontology-to-ontology mappings.

    The file is read line by line. Consecutive lines that share the same
    first term are gathered into one OOset; when the first term changes
    (and once more at the end of the file) the hits of the gathered set are
    appended to the result.

    Args:
        owlfile: path to a tab-separated file; the first two fields are
            terms written with "_" in place of ":", fields three and four
            are numbers whose product is used as the mapping score
        obo: ontology object; only mappings onto terms in obo.ids() are
            considered
        siblings: forwarded to OOset.hits()

    Returns:
        list with the concatenated output of OOset.hits() for every term
    """

    okterms = set(obo.ids())
    result = []

    # scan file, collect lines into an OOset, then remember best hits
    with open_file(owlfile, "rt") as f:
        state = OOset()
        for line in f:
            # blank lines (e.g. a trailing newline) carry no mapping
            if not line.strip():
                continue
            tokens = line.split("\t")
            term1 = tokens[0].replace("_", ":")
            term2 = tokens[1].replace("_", ":")
            # skip over mapping to terms that are not in the ontology
            if term2 not in okterms:
                continue
            # perhaps flush the current OOset and start a new one
            if term1 != state.term1:
                # an OOset that never received a term has nothing to report;
                # both None and "" are treated as "no term" so that this
                # check agrees with the one after the loop
                if state.term1 not in (None, ""):
                    result.extend(state.hits(obo, siblings=siblings))
                state = OOset(term1)
            # add the mapping into the current store
            state.add(term2, float(tokens[2]) * float(tokens[3]))

    # flush the mappings gathered for the last term in the file
    if state.term1 not in (None, ""):
        result.extend(state.hits(obo, siblings=siblings))
    return result
def write_priors(priors, outprefix):
    """Save phenotype priors as a two-column, tab-separated table.

    Args:
        priors: dict mapping phenotype ids to prior values
        outprefix: prefix of the output path; "-priors.tsv.gz" is appended
    """

    target = outprefix + "-priors.tsv.gz"
    with open_file(target, "wt") as out:
        fwrite(out, "\t".join(("phenotype", "value")))
        for term, prior in priors.items():
            fwrite(out, "\t".join((term, str(prior))))
def fill_phenotype_frequency_table(dbpath, datapath):
    """Copy phenotype frequencies from a tab-separated file into the db.

    Args:
        dbpath: passed to the PhenotypeFrequencyTable constructor
        datapath: path to a table with columns "phenotype" and "value"
    """

    table = PhenotypeFrequencyTable(dbpath)
    with open_file(datapath, "rt") as handle:
        records = csv.DictReader(handle, delimiter="\t", quotechar="\"")
        for record in records:
            table.add(record["phenotype"], float(record["value"]))
    # commit once, after all rows have been queued
    table.save()
def fill_concise_reference_table(dbpath, datapath):
    """Copy reference phenotypes from a tab-separated file into the db.

    Args:
        dbpath: passed to the ReferenceConcisePhenotypeTable constructor
        datapath: path to a table with columns "id", "phenotype", "value"
    """

    table = ReferenceConcisePhenotypeTable(dbpath)
    with open_file(datapath, "rt") as handle:
        records = csv.DictReader(handle, delimiter="\t", quotechar="\"")
        for record in records:
            score = float(record["value"])
            table.add(record["id"], record["phenotype"], score)
    # commit once, after all rows have been queued
    table.save()
def write_descriptions(models, outfile, exclude=("timestamp",)):
    """Write a tab-separated table with model descriptions.

    Args:
        models: dict whose values are model objects; keys are not used
        outfile: path to the output file
        exclude: iterable of column names from
            ModelDescriptionTable.text_fields to leave out of the output

    Returns:
        nothing, writes a header line and one line per model into outfile
    """

    # build the lookup set once instead of once per column
    skipped = set(exclude)
    colnames = [name for name in ModelDescriptionTable.text_fields
                if name not in skipped]
    with open_file(outfile, "wt") as f:
        fwrite(f, "\t".join(colnames))
        for model in models.values():
            fwrite(f, format_line(model, colnames))
def write_oo(ooarray, outprefix):
    """Save ontology-ontology mappings as a tab-separated table.

    Args:
        ooarray: iterable of records; the fields of each record are
            converted to strings and written as one line
        outprefix: prefix of the output path; "-oomap.tsv.gz" is appended
    """

    target = outprefix + "-oomap.tsv.gz"
    with open_file(target, "wt") as out:
        fwrite(out, "\t".join(("term1", "term2", "score")))
        for record in ooarray:
            fwrite(out, "\t".join(map(str, record)))
def write_references(references, outprefix):
    """Save the phenotypes of a set of references as a long-format table.

    Args:
        references: dict mapping reference ids to objects whose .data
            attribute maps phenotypes to values
        outprefix: prefix of the output path; "-phenotypes.tsv.gz" is
            appended
    """

    target = outprefix + "-phenotypes.tsv.gz"
    with open_file(target, "wt") as out:
        fwrite(out, "\t".join(("id", "phenotype", "value")))
        for ref_id, reference in references.items():
            # one output line per (reference, phenotype) pair
            for term, score in reference.data.items():
                fwrite(out, "\t".join((ref_id, term, str(score))))
def test_writing(self):
    """write the parsed IMPC data onto files"""

    ontology = MinimalObo(obo_file)
    parsed = prep_IMPC(impc_file, (0.8, 0.05), 0.01, obo=ontology)
    write_models(parsed, out_prefix)

    # both output files must exist before their content is read back
    for path in (desc_file, pheno_file):
        self.assertTrue(exists(path))
    with open_file(desc_file, "rt") as handle:
        desc_lines = handle.read().strip().split("\n")
    with open_file(pheno_file, "rt") as handle:
        pheno_lines = handle.read().strip().split("\n")

    # descriptions: 24 data lines plus one header line
    self.assertEqual(len(desc_lines), 25)
    # phenotypes: more than 7 lines are expected
    self.assertGreater(len(pheno_lines), 7)
def get_file_models(filepath, timestamp):
    """Read model descriptions from a tab-separated file.

    Args:
        filepath: path to the file, or None
        timestamp: forwarded to make_model() for every row

    Returns:
        dict mapping the "id" column of each row to the object produced
        by make_model(); empty when filepath is None
    """

    described = {}
    if filepath is not None:
        with open_file(filepath, "rt") as handle:
            for record in csv.DictReader(handle, delimiter="\t", quotechar="'"):
                model = make_model(record, timestamp)
                described[record["id"]] = model
    return described
def test_writing(self):
    """write the parsed MGI data onto files"""

    parsed = prep_MGI(mgi_file, (0.8, 0.05), obo)
    write_models(parsed, out_prefix)

    # both output files must exist before their content is read back
    for path in (desc_file, pheno_file):
        self.assertTrue(exists(path))
    with open_file(desc_file, "rt") as handle:
        desc_lines = handle.read().strip().split("\n")
    with open_file(pheno_file, "rt") as handle:
        pheno_lines = handle.read().strip().split("\n")

    # descriptions: 4 data lines plus one header line
    # (one allele_id in two zygosities gives 2 genotype models,
    # and the 2 genotype models give 2 marker models)
    self.assertEqual(len(desc_lines), 5)
    # phenotypes: more than 5 lines are expected
    self.assertGreater(len(pheno_lines), 5)
def test_writing(self):
    """write the parsed phenotypes into file"""

    write_references(self.references, out_prefix)

    # the output must exist before its content is read back
    self.assertTrue(exists(out_file))
    with open_file(out_file, "rt") as handle:
        lines = handle.read().strip().split("\n")
    expectation = "rows 3+3+3+2 for phenotypes and 1 for header"
    self.assertEqual(len(lines), 12, expectation)
def test_writing(self):
    """check output is a two column tsv"""

    categories = {"genotype"}
    # the second returned value is not needed in this test
    priors, _ = get_priors_from_models(self.models, categories, obo, dark=1)
    write_priors(priors, out_prefix)

    self.assertTrue(exists(priors_file))
    with open_file(priors_file, "rt") as handle:
        lines = handle.read().strip().split("\n")
    # one line per ontology term plus the header
    self.assertEqual(len(lines), 1+len(obo.ids()))
    header_fields = lines[0].split("\t")
    self.assertEqual(len(header_fields), 2)
def test_imputing(self):
    """create new models based on UA."""

    ontology = MinimalObo(obo_file)
    parsed = prep_IMPC(impc_file, (0.8, 0.05), 0.01, obo=ontology)
    allele_models = get_UA_models(parsed, "allele")
    imputed_models = impute_IMPC(allele_models, ontology, 0)
    write_models(imputed_models, out_prefix + "-imputed")

    # output files must exist before their content is read back
    for path in (imputed_desc_file, imputed_pheno_file):
        self.assertTrue(exists(path))
    with open_file(imputed_desc_file, "rt") as handle:
        desc_lines = handle.read().strip().split("\n")
    with open_file(imputed_pheno_file, "rt") as handle:
        pheno_lines = handle.read().strip().split("\n")

    # descriptions: 2 data lines plus one header line
    self.assertEqual(len(desc_lines), 3)
    # phenotypes: a few lines are expected
    self.assertGreater(len(pheno_lines), 3)
def write_hits_summary(tested, hits, outprefix):
    """Save a table linking parameters and MP terms to marker counts.

    Args:
        tested: dict mapping string keys to collections of tested markers
        hits: dict with the same keys, mapping to collections of
            significant markers
        outprefix: prefix of the output path; "-hits-summary.tsv.gz" is
            appended

    NOTE(review): the header has four columns while each row is made of
    the key plus two counts, so the keys presumably already hold the
    parameter and MP term joined by a tab - confirm against the caller.
    """

    columns = ("parameter", "MP_term", "markers_tested", "markers_significant")
    target = outprefix + "-hits-summary.tsv.gz"
    with open_file(target, "wt") as out:
        fwrite(out, "\t".join(columns))
        for key in tested:
            counts = (str(len(tested[key])), str(len(hits[key])))
            fwrite(out, key + "\t" + "\t".join(counts))
def test_write_hits_summary(self):
    """get a summary of number of hits."""

    summary_inputs = get_IMPC_hits_summary(impc_file, 0.01)
    tested, hits = summary_inputs
    write_hits_summary(tested, hits, out_prefix)

    self.assertTrue(exists(hits_file))
    with open_file(hits_file, "rt") as handle:
        lines = handle.read().strip().split("\n")
    # three phenotype lines plus one header line
    self.assertEqual(len(lines), 4)
def get_gxd(gxd_path, emp_map, tprfpr):
    """Read a file with marker-EMAPA associations.

    Arguments:
        gxd_path    tab-separated file with (at least) the columns
                    feature.primaryIdentifier, structure.identifier
                    and strength
        emp_map     dict mapping EMAPA ids to iterables of phenotype ids
        tprfpr      sequence whose first two elements are (tpr, fpr)

    Returns:
        dict mapping marker ids to Entity objects holding the phenotypes
        derived from the expression data, after consensus()
    """

    tpr, fpr = tprfpr[0], tprfpr[1]

    entities = {}
    with open_file(gxd_path, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            marker = record["feature.primaryIdentifier"]
            structure = record["structure.identifier"]
            level = record["strength"]
            # every marker gets an entity, even if none of its rows map
            if marker not in entities:
                entities[marker] = Entity("GXD_" + marker, "expression",
                                          marker_id=marker)
                entities[marker].set_description("expression", 1)
                entities[marker].set_description("source", "GXD")
            # rows with unmapped structures or unknown strengths add nothing
            if structure not in emp_map or level not in gxd_strength:
                continue
            # the strength scales the tpr between fpr and the full tpr
            factor = gxd_strength[level]
            evidence = Experiment(1, fpr + (tpr - fpr) * factor, fpr)
            # absence of expression is recorded as a negative phenotype
            if level == "Absent":
                evidence.value = 0
            for mp in emp_map[structure]:
                entities[marker].add(PhenotypeDatum(mp, evidence))

    # collapse the data of each entity into a consensus
    for entity in entities.values():
        entity.consensus()
    return entities
def write_phenotype_cooc(cooc, phenindex, outprefix):
    """Save non-zero phenotype co-occurrence values as a long table.

    Args:
        cooc: container indexed as cooc[i, j]
        phenindex: dict mapping phenotype ids to indexes into cooc
        outprefix: prefix of the output path; "-cooc.tsv.gz" is appended
    """

    target = outprefix + "-cooc.tsv.gz"
    with open_file(target, "wt") as out:
        fwrite(out, "\t".join(("A", "B", "value")))
        for first, row_index in phenindex.items():
            for second, col_index in phenindex.items():
                # pairs with a zero value are left out of the table
                if cooc[row_index, col_index] == 0:
                    continue
                count = str(cooc[row_index, col_index])
                fwrite(out, "\t".join((first, second, count)))
def get_file_phenotypes(filepath, timestamp):
    """Read model phenotypes from a tab-separated file.

    Args:
        filepath: path to the file, or None
        timestamp: fallback timestamp for rows of files without a
            "timestamp" column

    Returns:
        dict mapping model ids to lists of PhenotypeDatum objects;
        empty when filepath is None
    """

    phenotypes = {}
    if filepath is None:
        return phenotypes

    with open_file(filepath, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="'"):
            outcome = Experiment(record["value"], record["TPR"], record["FPR"])
            # a timestamp stored in the file takes precedence
            stamp = record["timestamp"] if "timestamp" in record else timestamp
            datum = PhenotypeDatum(record["phenotype"], outcome, stamp)
            phenotypes.setdefault(record["id"], []).append(datum)
    return phenotypes
def write_model_phenotypes(models, outfile, exclude=["id"]):
    """Write a tab-separated table with model phenotypes.

    Arguments:
        models     dict whose values are model objects with attributes
                   .id and .data
        outfile    path to output file
        exclude    column names to omit, forwarded to get_colnames()

    Returns:
        nothing, writes data into output file
    """

    # the id column is not among these; it is written separately below
    columns = get_colnames(ModelPhenotypeTable, exclude)
    header = "id\t" + "\t".join(columns)

    with open_file(outfile, "wt") as out:
        fwrite(out, header)
        for model in models.values():
            # one output line per datum, prefixed with the model's own id
            for datum in model.data:
                fwrite(out, model.id + "\t" + format_line(datum, columns))
def get_oo_map(oopath):
    """Read a file mapping ontology terms onto another set of terms.

    Args:
        oopath: path to a tab-separated file with columns term1, term2,
            score; the term2 field may hold several terms joined by ";"

    Return:
        dict mapping term1 -> [(term2, score)]
    """

    mapping = {}
    with open_file(oopath, "rt") as handle:
        for record in csv.DictReader(handle, delimiter="\t", quotechar="\""):
            source = record["term1"]
            score = float(record["score"])
            targets = mapping.setdefault(source, [])
            # every ";"-separated target receives the score of the row
            targets.extend((target, score)
                           for target in record["term2"].split(";"))
    return mapping
def prep_IMPC(datapath, tprfpr, pthreshold, simplify="average", obo=None):
    """Parse IMPC statistical results and assemble a set of models.

    For every marker and allele in the data, a family of six models is
    created per zygosity (suffixes F, FA, M, MA, U, UA). Models whose
    suffix ends in "A" receive every datum; the others only receive
    data with value 1.

    Args:
        datapath: path to the comma-separated IMPC results file, or None
        tprfpr: sequence whose first two elements are (tpr, fpr)
        pthreshold: float, forwarded to get_value() to decide the value
            of each row
        simplify: string, method for collapsing repeated data in a model;
            'consensus' and 'average' call the model method of that name,
            any other value (e.g. 'none') leaves the data untouched
        obo: object of class MinimalObo, forwarded to
            get_parameter_phenotype_map()

    Returns:
        dict mapping model ids to models; empty when datapath is None
    """

    models = {}
    if datapath is None:
        return models

    now = now_timestamp()
    base_tpr, base_fpr = tprfpr[0], tprfpr[1]
    male = {"M", "B", "U"}
    female = {"F", "B", "U"}
    suffixes = ("F", "FA", "M", "MA", "U", "UA")

    def model_id(accession, zygosity, suffix):
        """compose the identifier of one model in a family."""
        return "IMPC_" + accession + "_" + zygosity + "_" + suffix

    def create_models(accession, category, zygosity, row):
        """Create the models of a family that do not exist yet."""
        for suffix in suffixes:
            full_id = model_id(accession, zygosity, suffix)
            if full_id in models:
                continue
            models[full_id] = impc_model(full_id, category, row, zygosity)
            models[full_id].set_description("sex", sex_code(suffix))
            with_negative = negative_code(suffix)
            models[full_id].set_description("neg_phenotypes", with_negative)

    def add_set_to_models(datum, row, val, sex):
        """Record a datum into the marker and allele models of a row.

        Arguments:
            datum    phenotype and experiment result
            row      dict
            val      value of phenotype (0/1)
            sex      one-letter code
        """
        zygosity = row["zygosity"][:3]
        if zygosity == "hem":
            zygosity = "hom"
        accessions = (row["marker_accession_id"], row["allele_accession_id"])
        # perhaps create model definitions
        create_models(accessions[0], "marker", zygosity, row)
        create_models(accessions[1], "allele", zygosity, row)
        # positive-only models take the datum only when val is 1,
        # the "A" models take every datum
        suffixes_to_fill = [sex, sex + "A"] if val == 1 else [sex + "A"]
        for suffix in suffixes_to_fill:
            for accession in accessions:
                models[model_id(accession, zygosity, suffix)].add(datum)

    # get a map from parameter to mp_terms - used for negative phenotypes
    parameter_phenotype_map = get_parameter_phenotype_map(datapath, obo)

    with open_file(datapath, "rt") as handle:
        for row in csv.DictReader(handle, delimiter=",", quotechar="\""):
            # skip over bad data rows
            if row["status"] not in ("Success", "Successful"):
                continue
            if row["allele_symbol"] == "":
                continue
            phenotype = row["mp_term_id"].strip()
            parameter = row["parameter_name"].strip()
            # redefine some phenotypes
            # (this handles morphology MP:0002169 annotations)
            redef_key = phenotype + " " + parameter
            if redef_key in redef:
                phenotype = redef[redef_key]
            # rows without an MP id fall back on the parameter map
            if phenotype == "" and parameter in parameter_phenotype_map:
                phenotype = parameter_phenotype_map[parameter]
            if phenotype == "" or phenotype == "MP:0002169":
                continue
            sex = sex_code(row["phenotype_sex"])
            # identify whether this is a positive or a negative phenotype
            value = get_value(row, pthreshold)
            # add data at marker level, allele level, by gender
            hit = Experiment(value, base_tpr, base_fpr)
            datum = PhenotypeDatum(phenotype, hit, now)
            add_set_to_models(datum, row, value, "U")
            if sex in male:
                add_set_to_models(datum, row, value, "M")
            if sex in female:
                add_set_to_models(datum, row, value, "F")

    # some models may have redundant rows (e.g. a phenotype recorded twice)
    # so collapse them here
    if simplify == "consensus":
        for model in models.values():
            model.consensus()
    elif simplify == "average":
        for model in models.values():
            model.average()

    return models
""" Prep data from IMPC into a format for Phenoscoring """ import csv import pkg_resources from tools.files import open_file from scoring.experiment import Experiment from phenoscoring.phenotypedatum import PhenotypeDatum from phenoscoring.entity import Entity from phenoscoring.time import now_timestamp # fetch a dictionary of term redefinitions for MP:0002169 (no abnormal phenotype detected) redef = dict() redef_file = pkg_resources.resource_filename(__name__, "impc.2169.tsv") with open_file(redef_file, "rt") as f: reader = csv.DictReader(f, delimiter="\t", quotechar="\"") for row in reader: rowkey = row["original_id"] + " " + row["parameter_name"] redef[rowkey] = row["redefined_id"] # ########################################################################### # Functions relevant to prep_IMPC def is_float(x): """an ad-hoc way to determine if a string encodes a pvalue.""" try: float(x) except:
# Script section: download expression data for a set of markers and write
# it to disk as a tab-separated table.
# (parser, values_in_column, download_mousemine, sleep and open_file are
# defined or imported outside this excerpt)

# settings read here: config.input, config.output, config.group_size
# and config.sleep
config = parser.parse_args()

mousemine = Service("http://www.mousemine.org/mousemine/service")
# column names written as the header line of the output file
# NOTE(review): presumably these are also the views requested inside
# download_mousemine() - confirm
mousemine_views = ["assayType", "feature.symbol", "feature.primaryIdentifier", "stage", "age", "structure.name", "strength", "pattern", "genotype.symbol", "assayId", "probe", "image", "publication.mgiJnum", "emaps", "structure.identifier"]

# fetch all markers
# (config.input is a comma-separated list of files; marker ids come from
# the "marker_id" column of each file and duplicates are merged)
markers = set()
for filename in config.input.split(","):
    markers.update(values_in_column(filename, "marker_id"))
print("Working with "+str(len(markers))+" markers")

# fetch data from mousemine
# (markers are queried in groups of config.group_size, with a pause of
# config.sleep between consecutive queries)
markers = list(markers)
result = []
for i in range(0, len(markers), config.group_size):
    imarkers = markers[i:(i+config.group_size)]
    print("querying: " + str(i) + " of " + str(len(markers)))
    result.extend(download_mousemine(imarkers))
    sleep(config.sleep)
print("done")

# write expression data to disk
# (one header line, then one tab-separated line per downloaded row)
with open_file(config.output, "wt") as f:
    f.write("\t".join(mousemine_views)+"\n")
    for row in result:
        f.write("\t".join([str(_) for _ in row]) + "\n")