def build_json(self, filename1, filename2):
    # *** Work through manually curated chemical probes from the different portals ***
    # chemicalprobes column names are Probe, Target, SGClink, CPPlink, OSPlink, Note
    with URLZSource(filename1).open() as r_file:
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            # Generate 'line' for current target
            probelinks = []
            if row["SGClink"] != "":
                probelinks.append({'source': "Structural Genomics Consortium",
                                   'link': row["SGClink"]})
            if row["CPPlink"] != "":
                probelinks.append({'source': "Chemical Probes Portal",
                                   'link': row["CPPlink"]})
            if row["OSPlink"] != "":
                probelinks.append({'source': "Open Science Probes",
                                   'link': row["OSPlink"]})

            line = {
                "gene": row["Target"],
                "chemicalprobe": row["Probe"],
                "sourcelinks": probelinks,
                "note": row["Note"]
            }
            # Add data for current chemical probe to self.chemicalprobes[Target]['portalprobes'].
            # If the gene has not appeared in the chemical probe list yet,
            # initialise self.chemicalprobes with an empty list.
            if row["Target"] not in self.chemicalprobes:
                self.chemicalprobes[row["Target"]] = {}
                self.chemicalprobes[row["Target"]]['portalprobes'] = []
            self.chemicalprobes[row["Target"]]['portalprobes'].append(line)

    # *** Work through Probe Miner targets ***
    # probeminer column names are hgnc_symbol, uniprot_symbol, nr_of_probes
    with URLZSource(filename2).open() as r_file:
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            PMdata = {
                "probenumber": row["nr_of_probes"],
                "link": "https://probeminer.icr.ac.uk/#/" + row["uniprot_symbol"]
            }
            if row["hgnc_symbol"] not in self.chemicalprobes:
                self.chemicalprobes[row["hgnc_symbol"]] = {}
            self.chemicalprobes[row["hgnc_symbol"]]['probeminer'] = PMdata
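# Illustrative shape of self.chemicalprobes after both files are parsed
# (the gene/probe names below are made-up examples, not data from the files):
#
#   self.chemicalprobes["BRD4"] == {
#       'portalprobes': [
#           {"gene": "BRD4",
#            "chemicalprobe": "JQ1",
#            "sourcelinks": [{'source': "Structural Genomics Consortium", 'link': "..."}],
#            "note": "..."}
#       ],
#       'probeminer': {"probenumber": "...",
#                      "link": "https://probeminer.icr.ac.uk/#/..."}
#   }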
def get_pathway_relations(self):
    added_relations = []
    with URLZSource(self.pathway_relation_url).open() as source:
        for i, row in enumerate(csv.DictReader(source,
                                               fieldnames=self.headers_pathway_rel,
                                               dialect='excel-tab'),
                                start=1):
            if len(row) != 2:
                raise ValueError('Reactome.py: Pathway Relation file format unexpected at line %d.' % i)

            parent_id = row["id"]
            child_id = row["related_id"]

            relation = (parent_id, child_id)
            if relation not in added_relations:
                if parent_id in self.valid_pathway_ids:
                    yield dict(
                        id=parent_id,
                        child=child_id,
                    )
                    added_relations.append(relation)
                    if len(added_relations) % 1000 == 0:
                        self.logger.debug("%i rows parsed from reactome_pathway_relation" % len(added_relations))
            else:
                self.logger.warn("Pathway relation %s is already loaded, skipping duplicate data" % str(relation))
    self.logger.info('parsed %i rows from reactome_pathway_relation' % len(added_relations))
def get_pathway_data(self):
    self.valid_pathway_ids = []
    with URLZSource(self.pathway_data_url).open() as source:
        for i, row in enumerate(csv.DictReader(source,
                                               fieldnames=self.headers,
                                               dialect='excel-tab'),
                                start=1):
            if len(row) != 3:
                raise ValueError('Reactome.py: Pathway file format unexpected at line %d.' % i)

            pathway_id = row["id"]
            pathway_name = row["description"]
            species = row["species"]

            if pathway_id not in self.valid_pathway_ids:
                if species in self.allowed_species:
                    self.valid_pathway_ids.append(pathway_id)
                    yield dict(
                        id=pathway_id,
                        name=pathway_name,
                        species=species,
                    )
                    if len(self.valid_pathway_ids) % 1000 == 0:
                        self.logger.debug("%i rows parsed for reactome_pathway_data" % len(self.valid_pathway_ids))
            else:
                self.logger.warn("Pathway id %s is already loaded, skipping duplicate data" % pathway_id)
    self.logger.info('parsed %i rows for reactome_pathway_data' % len(self.valid_pathway_ids))
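# Usage sketch (the 'reactome' object name is hypothetical): get_pathway_data()
# must be fully consumed before get_pathway_relations(), because it populates
# self.valid_pathway_ids, which the relation generator uses to filter parents:
#
#   pathways = list(reactome.get_pathway_data())
#   relations = list(reactome.get_pathway_relations())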
def retrieve_normal_tissue_data(self):
    """Parse the 'normal_tissue' TSV file, the expression profiles
    for proteins in human tissues from HPA.

    :return: petl table with columns 'gene' and 'result'
    """
    self.logger.info('get normal tissue rows into dicts')
    table = (
        petl.fromcsv(URLZSource(self.normal_tissue_url), delimiter='\t')
        .rename({'Tissue': 'tissue',
                 'Cell type': 'cell_type',
                 'Level': 'level',
                 'Reliability': 'reliability',
                 'Gene': 'gene'})
        .cut('tissue', 'cell_type', 'level', 'reliability', 'gene')
        .addfield('tissue_label',
                  lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
        .addfield('tissue_code',
                  lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('tissue_level', lambda rec: level_from_text(rec['level']))
        .addfield('anatomical_systems',
                  lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('organs',
                  lambda rec: organs_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('tissue_reliability',
                  lambda rec: reliability_from_text(rec['reliability']))
        .cut('gene', 'tissue_code', 'tissue_label', 'tissue_level',
             'tissue_reliability', 'cell_type', 'anatomical_systems', 'organs')
        .aggregate(('gene', 'tissue_code'),
                   aggregation={
                       'cell_types': (('cell_type', 'tissue_level', 'tissue_reliability'), list),
                       'tissue_label': ('tissue_label', set),
                       'anatomical_systems': ('anatomical_systems', list),
                       'organs': ('organs', list)},
                   presorted=True)
        .aggregate('gene',
                   aggregation={
                       'data': (('tissue_code', 'tissue_label', 'cell_types',
                                 'anatomical_systems', 'organs'), list)},
                   presorted=True)
        .addfield('result', lambda rec: format_expression(rec))
        .cut('gene', 'result'))
    return table
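# Shape sketch of the pipeline above (gene id is illustrative): the first
# aggregate collapses rows to one record per (gene, tissue_code), the second
# to one record per gene, so each output row ends up as
#
#   ('ENSG...', <whatever format_expression() builds from the per-gene 'data' list>)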
def test_urlzsource(self):
    lines4 = []
    with URLZSource('http://www.google.com/robots.txt').open() as f:
        # compose applies right-to-left: take the first 4 lines,
        # then strip the trailing newline from each
        take_and_rstrip = compose(curry(map, lambda l: rstrip(l, '\n')),
                                  curry(take, 4))
        lines4 = list(take_and_rstrip(f))
    print(str(lines4))

    self.assertGreaterEqual(len(lines4), 1, "Failed to get more than 0 lines")
def populate_molecules_dict(self):
    self._logger.info('ChEMBL getting Molecule from ' + self.molecule_set_uri_pattern)
    # Shelve creates a file with a specific database format. Using a temp file
    # requires a workaround to open it: dumbdbm creates an empty database file,
    # which shelve can then open properly.
    t_filename = tempfile.NamedTemporaryFile(delete=False).name
    dumb_dict = dumbdbm.open(t_filename)
    shelve_out = shelve.Shelf(dict=dumb_dict)

    with URLZSource(self.molecule_set_uri_pattern).open() as f_obj:
        for line in f_obj:
            mol = json.loads(line)
            shelve_out[str(mol["molecule_chembl_id"])] = mol

    self._logger.info('ChEMBL Molecule loading done.')
    return shelve_out
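# Lookup sketch for the returned shelf, which is keyed by molecule_chembl_id
# (the ChEMBL ID below is only an example):
#
#   molecules = self.populate_molecules_dict()
#   aspirin = molecules['CHEMBL25']   # the full molecule JSON document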
def __init__(self, tissue_translation_map, tissue_curation_map,
             normal_tissue_url, rna_level_url, rna_value_url, rna_zscore_url):
    self.logger = logging.getLogger(__name__)

    self.tissue_translation_map = tissue_translation_map
    self.tissue_curation_map = tissue_curation_map
    self.normal_tissue_url = normal_tissue_url
    self.rna_level_url = rna_level_url
    self.rna_value_url = rna_value_url
    self.rna_zscore_url = rna_zscore_url

    # load t2m: the tissue translation and curation lookup tables
    t2m = {'tissues': {}, 'curations': {}}

    with URLZSource(self.tissue_translation_map).open() as r_file:
        t2m['tissues'] = json.load(r_file)['tissues']

    with URLZSource(self.tissue_curation_map).open() as r_file:
        t2m['curations'] = {
            el['name']: el['canonical']
            for el in csv.DictReader(r_file,
                                     fieldnames=['name', 'canonical'],
                                     delimiter='\t')
        }
    self.t2m = t2m
def get_data_config(data_url):
    with URLZSource(data_url).open() as r_file:
        # note: use safe loading, as described at https://pyyaml.org/wiki/PyYAMLDocumentation
        # TL;DR - it only constructs dicts, lists and primitives
        data_config = yaml.safe_load(r_file)

    # Replace hyphens with underscores in variable names. This is because we
    # want to use addict to access the config as config.foo_bar instead of
    # config["foo-bar"].
    data_config_underscores = {}
    for key in data_config:
        key_underscore = key.replace("-", "_")
        data_config_underscores[key_underscore] = data_config[key]
    return addict.Dict(data_config_underscores)
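# Minimal usage sketch (the URL is hypothetical; a hyphenated YAML key such as
# "hgnc-complete-set" becomes attribute access after the rename above):
#
#   config = get_data_config('https://example.com/mrtarget.data.yml')
#   print(config.hgnc_complete_set)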
def merge_data(self, genes, loader, r_server, data_config):
    self._logger.info("HGNC parsing - requesting from URL %s", data_config.hgnc_complete_set)

    with URLZSource(data_config.hgnc_complete_set).open() as source:
        data = json.load(source)

        for row in data['response']['docs']:
            gene = Gene()
            gene.load_hgnc_data_from_json(row)
            genes.add_gene(gene)

        self._logger.info("STATS AFTER HGNC PARSING:\n" + genes.get_stats())
def process(self, ensembl_filename, dry_run):
    def _put_line(line):  # note: currently unused
        return 1

    self.logger.info('Reading Ensembl gene info from %s' % ensembl_filename)

    # setup elasticsearch
    if not dry_run:
        self.loader.create_new_index(Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
        # need to directly get the versioned index name for this function
        self.loader.prepare_for_bulk_indexing(
            self.loader.get_versioned_index(Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME))

    inserted_lines = 0
    for line in more_itertools.with_iter(URLZSource(ensembl_filename).open()):
        entry = json.loads(line)
        # store in elasticsearch if not dry running
        if not dry_run:
            self.loader.put(Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME,
                            Const.ELASTICSEARCH_ENSEMBL_DOC_NAME,
                            entry['id'],
                            line)
        inserted_lines += 1

    self.logger.info("Read %d lines from %s", inserted_lines, ensembl_filename)
    self.logger.info("flush index")

    # cleanup elasticsearch
    if not dry_run:
        self.loader.flush_all_and_wait(Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
        # restore old pre-load settings
        # note this automatically does all prepared indexes
        self.loader.restore_after_bulk_indexing()
def merge_data(self, genes, loader, r_server, data_config):
    # turn the species id/label mappings from the argument list into a dict
    self.orthologs_species = dict()
    if data_config.hgnc_orthologs_species:
        for value in data_config.hgnc_orthologs_species:
            # split on the first hyphen only, in case the label contains one
            code, label = value.split("-", 1)
            label = label.strip()
            code = code.strip()
            self.orthologs_species[code] = label

    self._logger.info("Ortholog parsing - requesting from URL %s", data_config.hgnc_orthologs)

    with URLZSource(data_config.hgnc_orthologs).open() as source:
        reader = csv.DictReader(source, delimiter="\t")
        for row in reader:
            if row['human_ensembl_gene'] in genes:
                self.add_ortholog_data_to_gene(gene=genes[row['human_ensembl_gene']],
                                               data=row)

    self._logger.info("STATS AFTER HGNC ortholog PARSING:\n" + genes.get_stats())
def build_json(self, filename):
    # Just for reference: column names are "ID_CENSUS_ANNOT", "ID_CENSUS", "ID_GENE",
    # "GENE_NAME", "CELL_TYPE", "PUBMED_PMID", "ID_DATA_CATEGORY", "DESCRIPTION",
    # "DISPLAY", "SHORT", "CELL_LINE", "DESCRIPTION_1"
    with URLZSource(filename).open() as r_file:
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            PMID = re.sub(r'^"|"$', '', row["PUBMED_PMID"])
            Short = re.sub(r'^"|"$', '', row["SHORT"])
            GeneSymbol = re.sub(r'^"|"$', '', row["GENE_NAME"])
            Description_1 = re.sub(r'^"|"$', '', row["DESCRIPTION_1"])
            # str.rstrip() returns a new string, so the result must be assigned back
            Description_1 = Description_1.rstrip()
            Description = re.sub(r'^"|"$', '', row["DESCRIPTION"])

            if GeneSymbol not in self.hallmarks:
                self.hallmarks[GeneSymbol] = dict()

            if Description_1 in self.hallmarks_labels:
                promote = False
                suppress = False
                if Short == 'a':
                    promote = True
                if Short == 's':
                    suppress = True

                line = {
                    "label": Description_1,
                    "description": Description,
                    "promote": promote,
                    "suppress": suppress,
                    "pmid": PMID
                }
                self.hallmarks[GeneSymbol].setdefault("cancer_hallmarks", []).append(line)
            elif Description_1 == 'function summary':
                line = {"pmid": PMID, "description": Description}
                self.hallmarks[GeneSymbol].setdefault("function_summary", []).append(line)
            else:
                line = {
                    "attribute_name": Description_1,
                    "description": Description,
                    "pmid": PMID
                }
                self.hallmarks[GeneSymbol].setdefault("attributes", []).append(line)
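# Illustrative shape of self.hallmarks after parsing (gene symbol and field
# values below are examples only; keys come from the three branches above):
#
#   self.hallmarks["EGFR"] == {
#       "cancer_hallmarks": [{"label": "...", "description": "...",
#                             "promote": True, "suppress": False, "pmid": "..."}],
#       "function_summary": [{"pmid": "...", "description": "..."}],
#       "attributes": [{"attribute_name": "...", "description": "...", "pmid": "..."}]
#   }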
def retrieve_rna_data(self):
    """Parse the 'rna_tissue' TSV file: RNA levels in 56 cell lines and
    37 tissues based on RNA-seq, from HPA.

    :return: petl table with one aggregated 'data' record per gene
    """
    self.logger.info('get rna tissue rows into dicts')
    self.logger.debug('melting rna level table into geneid tissue level')

    t_level = (
        petl.fromcsv(URLZSource(self.rna_level_url), delimiter='\t')
        .melt(key='ID', variablefield='tissue', valuefield='rna_level')
        .rename({'ID': 'gene'})
        .addfield('tissue_label',
                  lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
        .addfield('tissue_code',
                  lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('anatomical_systems',
                  lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('organs',
                  lambda rec: organs_from_tissue(rec['tissue_label'], self.t2m))
        .cutout('tissue'))

    t_value = (
        petl.fromcsv(URLZSource(self.rna_value_url), delimiter='\t')
        .melt(key='ID', variablefield='tissue', valuefield='rna_value')
        .rename({'ID': 'gene'})
        .addfield('tissue_label',
                  lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
        .addfield('tissue_code',
                  lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
        .addfield('rna_unit', 'TPM')
        .cutout('tissue'))

    t_zscore = (
        petl.fromcsv(URLZSource(self.rna_zscore_url), delimiter='\t')
        .melt(key='ID', variablefield='tissue', valuefield='zscore_level')
        .rename({'ID': 'gene'})
        .addfield('tissue_label',
                  lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
        .addfield('tissue_code',
                  lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
        .cutout('tissue'))

    t_vl = petl.join(t_level, t_value,
                     key=('gene', 'tissue_code', 'tissue_label'),
                     presorted=True)

    t_join = (
        petl.join(t_vl, t_zscore,
                  key=('gene', 'tissue_code', 'tissue_label'),
                  presorted=True)
        .aggregate('gene',
                   aggregation={
                       'data': (('tissue_code', 'tissue_label', 'rna_level',
                                 'rna_value', 'rna_unit', 'anatomical_systems',
                                 'organs', 'zscore_level'), list)},
                   presorted=True))

    return t_join
def get_genotype_phenotype(self):
    self._logger.debug("get_genotype_phenotype")

    with URLZSource(self.data_config.mouse_phenotypes_orthology).open() as fi:
        self._logger.debug("get %s", self.data_config.mouse_phenotypes_orthology)

        for li, line in enumerate(fi):
            # strip stray whitespace from every field
            array = map(str.strip, line.strip().split("\t"))
            if len(array) == 7:
                (human_gene_symbol, a, b, c,
                 mouse_gene_symbol, mouse_gene_id, phenotypes_raw) = array

                # at least 1 phenotype in phenotypes_raw
                if len(phenotypes_raw) > 0:
                    try:
                        mouse_gene_id = mouse_gene_id.strip()
                        mouse_gene_symbol = mouse_gene_symbol.strip()
                        if mouse_gene_id not in self.mouse_genes:
                            self.mouse_genes[mouse_gene_id] = {
                                "gene_id": mouse_gene_id,
                                "gene_symbol": mouse_gene_symbol,
                                "phenotypes": {},
                                "human_orthologs": [],
                                # split on runs of whitespace; note the original
                                # split("\s+") treated the regex as a literal separator
                                "phenotypes_summary": phenotypes_raw.strip().split()
                            }
                        self.mouse_genes[mouse_gene_id]["human_orthologs"].append({
                            "gene_symbol": human_gene_symbol,
                            "gene_id": None
                        })

                        if human_gene_symbol not in self.human_genes:
                            self.human_genes[human_gene_symbol] = {
                                "gene_symbol": human_gene_symbol,
                                "ensembl_gene_id": None,
                                "gene_id": None,
                                "mouse_orthologs": []
                            }
                    except Exception as e:
                        self._logger.debug("exception processing a line %d: %s", li, str(e))

    self._logger.info("Retrieved %i mouse genes", len(self.mouse_genes))

    count_symbols = set()
    count_accepted_symbols = set()

    with URLZSource(self.data_config.mouse_phenotypes_report).open() as fi:
        self._logger.debug("get lines from mgi report phenotypes file %s",
                           self.data_config.mouse_phenotypes_report)
        # Columns: Allelic Composition, Allele Symbol(s), Genetic Background,
        # Mammalian Phenotype ID, PubMed ID, MGI Marker Accession ID
        for li, line in enumerate(fi):
            # strip stray whitespace from every field
            array = map(str.strip, line.strip().split("\t"))
            self._logger.debug('mouse KO array %s in line %d', str(array), li)
            if len(array) == 6:
                (allelic_composition, allele_symbol, genetic_background,
                 mp_id, pmid, mouse_gene_ids) = array

                # check for double-mutant but exclude duplicates
                for mouse_gene_id in set(mouse_gene_ids.split(",")):
                    # exclude heritable phenotypic markers like
                    # http://www.informatics.jax.org/marker/MGI:97446
                    count_symbols.add(mouse_gene_id)
                    mp_id_key = mp_id.split("/")[-1].replace(":", "_")
                    self._logger.debug("Looking for mouse_gene_id " + mouse_gene_id)
                    self._logger.debug("Looking for mp_id_key " + mp_id_key)
                    if mouse_gene_id in self.mouse_genes and mp_id_key in self.mps:
                        self._logger.debug('process mouse KO gene %s', mouse_gene_id)
                        count_accepted_symbols.add(mouse_gene_id)
                        self._logger.debug('get class for %s' % mp_id)

                        mp_class = self.mps[mp_id_key]
                        mp_label = mp_class["label"]

                        for k, v in PHENOTYPE_CATEGORIES.iteritems():
                            if k not in self.mouse_genes[mouse_gene_id]["phenotypes"]:
                                self.mouse_genes[mouse_gene_id]["phenotypes"][k] = {
                                    "category_mp_identifier": k,
                                    "category_mp_label": v,
                                    "genotype_phenotype": []
                                }

                        # it's possible that there are multiple paths to the same root
                        mp_category_ids = set(map(lambda x: x[0], mp_class["path_codes"]))
                        for category_id in mp_category_ids:
                            mp_category_id = category_id.replace("_", ":")
                            self.mouse_genes[mouse_gene_id]["phenotypes"][mp_category_id]["genotype_phenotype"].append({
                                "subject_allelic_composition": allelic_composition,
                                "subject_background": genetic_background,
                                "pmid": pmid,
                                "mp_identifier": mp_id,
                                "mp_label": mp_label
                            })
                    else:
                        self._logger.warning(
                            'process mouse KO gene %s failed because not in self.mouse_genes set in line %d',
                            mouse_gene_id, li)
            else:
                self._logger.warning("could not process %i %s", len(array), line)

    self._logger.info("Count symbols %i / %i with phenotypes",
                      len(count_accepted_symbols), len(count_symbols))
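# Illustrative shape of one self.mouse_genes entry after both passes
# (the gene, MP identifier and label below are examples, not parsed data):
#
#   self.mouse_genes["MGI:97490"] == {
#       "gene_id": "MGI:97490",
#       "gene_symbol": "Pax6",
#       "human_orthologs": [{"gene_symbol": "PAX6", "gene_id": None}],
#       "phenotypes_summary": ["..."],
#       "phenotypes": {
#           "MP:0005391": {"category_mp_identifier": "MP:0005391",
#                          "category_mp_label": "vision/eye phenotype",
#                          "genotype_phenotype": [...]}
#       }
#   }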
def open_to_read(filename):
    """Return an iterator of (filename, (line_number, line)) tuples,
    built with izip(cycle([filename]), enumerate(file_handle, start=1)).
    """
    _l.debug('generate an iterator of (filename,enumerate) for filename %s', filename)
    it = more_itertools.with_iter(URLZSource(filename).open())
    return itertools.izip(itertools.cycle([filename]), enumerate(it, start=1))
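# Usage sketch (file name and 'handle' callback are hypothetical; URLZSource
# transparently handles plain, gzipped and zipped sources):
#
#   for fname, (line_no, line) in open_to_read('evidence.json.gz'):
#       handle(fname, line_no, line)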
def build_json(self, filename):
    with URLZSource(filename).open() as r_file:
        # fieldnames=cancerbiomarker_columns not used at the moment
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            Source = row["Source"]
            Gene = row["Gene"]
            IndividualMutation = row["IndividualMutation"]
            PrimaryTumorTypeFullName = row["PrimaryTumorTypeFullName"]

            # Split Source and Gene to separate out multiple entries
            mSource = map(str.strip, Source.split(";"))
            geneList = list(map(str.strip, Gene.split(";")))

            # If the two genes are identical, only keep one copy
            # to prevent duplication of the current biomarker
            if len(geneList) > 1 and geneList[0] == geneList[1]:
                geneList = [geneList[0]]

            # Edit IndividualMutation from e.g. FGFR3:V555M to FGFR3 (V555M):
            # replace ':' with ' (' and add ')' at the end
            if ":" in IndividualMutation:
                IndividualMutation = IndividualMutation.replace(':', ' (') + ')'

            # Get tumor type names and EFO IDs/links
            PrimaryTumorTypeFullName = PrimaryTumorTypeFullName.replace(' ', '_')
            PrimaryTumorTypeFullName = PrimaryTumorTypeFullName.replace('-', '_')

            if ";" in PrimaryTumorTypeFullName:
                TumorTypes = PrimaryTumorTypeFullName.split(";")
                diseases = []
                for TumorType in TumorTypes:
                    diseases.append({
                        'label': BIOMARKER_DISEASE_MAPPINGS[TumorType]['label'],
                        'id': (BIOMARKER_DISEASE_MAPPINGS[TumorType]['url']).split('/')[-1]
                    })
            else:
                diseases = [{
                    'label': BIOMARKER_DISEASE_MAPPINGS[PrimaryTumorTypeFullName]['label'],
                    'id': (BIOMARKER_DISEASE_MAPPINGS[PrimaryTumorTypeFullName]['url']).split('/')[-1]
                }]

            # Iterate through genes and sources
            for singleGene in geneList:
                # Replace 3 gene symbols with their approved_symbol
                # (C15orf55=NUTM1, MLL=KMT2A, MLL2=KMT2D)
                if singleGene == 'C15orf55':
                    singleGene = 'NUTM1'
                elif singleGene == 'MLL':
                    singleGene = 'KMT2A'
                elif singleGene == 'MLL2':
                    singleGene = 'KMT2D'

                # If the gene has not appeared in the biomarker list yet,
                # initialise self.cancerbiomarkers with an empty list
                if singleGene not in self.cancerbiomarkers:
                    self.cancerbiomarkers[singleGene] = []

                # Create empty lists for PMIDs and other references
                pubmed = []
                other = []

                # Go through the references/sources
                for singleSource in mSource:
                    if "PMID" in singleSource:
                        # The source is a PMID: remove the 'PMID:' prefix
                        currPMID = singleSource[5:]
                        pubmed.append({'pmid': currPMID})
                    else:
                        # Otherwise the source is either a clinical trial
                        # or a conference abstract
                        if 'NCT' in singleSource:
                            other.append({
                                'name': singleSource,
                                'link': 'https://clinicaltrials.gov/ct2/show/' + singleSource,
                                'description': 'Clinical Trial'
                            })
                        elif singleSource.split(" (")[0] in BIOMARKER_SOURCE_MAPPINGS:
                            other.append({
                                'name': singleSource,
                                'link': BIOMARKER_SOURCE_MAPPINGS[singleSource.split(" (")[0]]['url'],
                                'description': BIOMARKER_SOURCE_MAPPINGS[singleSource.split(" (")[0]]['label']
                            })

                # Put the reference info together for each biomarker
                myReferences = {"pubmed": pubmed, "other": other}

                line = {
                    "gene": singleGene,
                    "biomarker": row["Biomarker"],
                    # use the reformatted mutation, e.g. "FGFR3 (V555M)",
                    # rather than the raw row value
                    "individualbiomarker": IndividualMutation,
                    "association": row["Association"],
                    "drug": row["Drug"],
                    "drugfamily": row["DrugFamily"],
                    "drugfullname": row["DrugFullName"],
                    "diseases": diseases,
                    "evidencelevel": row["EvidenceLevel"],
                    "references": myReferences
                }
                # Add data for current biomarker to self.cancerbiomarkers
                self.cancerbiomarkers[singleGene].append(line)
def get_chembl_info_by_file(uri):
    with URLZSource(uri).open() as f_obj:
        # each line of the file is a single JSON document
        for i, line in enumerate(f_obj, start=1):
            chembl_dict = json.loads(line)
            yield chembl_dict
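# Usage sketch (hypothetical URI; the file is expected to be JSON-lines):
#
#   for mol in get_chembl_info_by_file('https://example.com/chembl_molecules.jsonl.gz'):
#       print(mol['molecule_chembl_id'])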
def build_json(self, filename):
    self._logger.info("Data in the TSV file comes in non-standard ways, e.g. booleans arrive "
                      "as categorical Y/N data, so we cast to bool, int and float with default "
                      "fallback values instead of throwing exceptions, as we are parsing a TSV "
                      "file where types do not exist")
    to_bool = SaferBool(with_fallback=False)
    to_int = SaferInt(with_fallback=0)
    to_float = SaferFloat(with_fallback=0.)

    sm_bucket_list = [1, 2, 3, 4, 5, 6, 7, 8]
    ab_bucket_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    with URLZSource(filename).open() as r_file:
        for i, row in enumerate(csv.DictReader(r_file, dialect='excel-tab'), start=1):
            try:
                # Get lists of small molecule and antibody buckets
                buckets = list(row[k] for k in ("Bucket_1", "Bucket_2", "Bucket_3", "Bucket_4",
                                                "Bucket_5", "Bucket_6", "Bucket_7", "Bucket_8"))
                buckets_ab = list(row[k] for k in ("Bucket_1_ab", "Bucket_2_ab", "Bucket_3_ab",
                                                   "Bucket_4_ab", "Bucket_5_ab", "Bucket_6_ab",
                                                   "Bucket_7_ab", "Bucket_8_ab", "Bucket_9_ab"))
                sm_buckets = list(compress(sm_bucket_list, [x == '1' for x in buckets]))
                ab_buckets = list(compress(ab_bucket_list, [x == '1' for x in buckets_ab]))

                # the struct is built inline, as that is the most pythonic and explicit way
                line = {
                    'smallmolecule': {
                        'buckets': sm_buckets,  # list of buckets
                        'categories': {
                            'clinical_precedence': to_float(row["Clinical_Precedence"]),
                            'discovery_precedence': to_float(row["Discovery_Precedence"]),
                            'predicted_tractable': to_float(row["Predicted_Tractable"])
                        },
                        'top_category': row["Category"],
                        # TODO: druggability score not used at the moment, but will be in the future
                        'ensemble': to_float(row["ensemble"]),
                        'high_quality_compounds': to_int(row["High_Quality_ChEMBL_compounds"]),
                        'small_molecule_genome_member': to_bool(row["Small_Molecule_Druggable_Genome_Member"])
                    },
                    'antibody': {
                        'buckets': ab_buckets,
                        'categories': {
                            'clinical_precedence': to_float(row["Clinical_Precedence_ab"]),
                            'predicted_tractable_high_confidence': to_float(row["Predicted_Tractable__High_confidence"]),
                            'predicted_tractable_med_low_confidence': to_float(row["Predicted_Tractable__Medium_to_low_confidence"])
                        },
                        'top_category': row["Category_ab"]
                    }
                }

                # Add data for current gene to self.tractability
                self.tractability[row["ensembl_gene_id"]] = line
            except Exception as k_ex:
                self._logger.exception("this line %d won't be inserted %s with ex: %s",
                                       i, str(row), str(k_ex))
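# How the bucket selection above works; a standalone sketch with made-up flag
# values (compress is itertools.compress):
#
#   flags = ['1', '0', '1', '0', '0', '0', '1', '0']   # Bucket_1..Bucket_8 cells
#   list(compress([1, 2, 3, 4, 5, 6, 7, 8], [x == '1' for x in flags]))
#   # -> [1, 3, 7], i.e. the bucket numbers whose column holds '1'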