def make_details(locus_id=None, reference_id=None, topic=None):
    """Build the literature-details payload for a locus, a reference, or a topic.

    Exactly one usage mode is expected:
      - locus_id given: returns a JSON string grouping references by topic
        ('primary', 'additional', 'review') plus the primary references that
        also back GO, phenotype, regulation, and interaction annotations.
      - reference_id or topic given: returns a JSON array string of the raw
        evidence json fields.
    Returns an {'Error': ...} dict when no argument is given or when the
    underlying query would exceed the row limit.
    """
    if locus_id is None and reference_id is None and topic is None:
        return {'Error': 'No locus_id or reference_id or topic given.'}
    evidences = get_literature_evidence(locus_id=locus_id, reference_id=reference_id, topic=topic)
    if evidences is None:
        # get_literature_evidence signals "too many rows" with None.
        return {'Error': 'Too much data to display.'}
    if locus_id is not None:
        # References curated as primary literature for this locus; used below to
        # restrict GO/phenotype references to primary ones.
        primary_ids = set([x.reference_id for x in evidences if x.topic == 'Primary Literature'])
        # Newest (year, pubmed_id) first.
        evidences.sort(key=lambda x: (x.reference.year, x.reference.pubmed_id), reverse=True)
        go_references = sorted(set([x.reference for x in DBSession.query(Goevidence).filter_by(locus_id=locus_id).options(joinedload('reference')).all() if x.reference_id in primary_ids]), key=lambda x: (x.year, x.pubmed_id), reverse=True)
        phenotype_references = sorted(set([x.reference for x in DBSession.query(Phenotypeevidence).filter_by(locus_id=locus_id).options(joinedload('reference')).all() if x.reference_id in primary_ids]), key=lambda x: (x.year, x.pubmed_id), reverse=True)
        # Regulation evidence can have the locus on either side (locus1/locus2).
        regulation_references = set([x.reference for x in DBSession.query(Regulationevidence).filter_by(locus1_id=locus_id).options(joinedload('reference')).all()])
        regulation_references.update([x.reference for x in DBSession.query(Regulationevidence).filter_by(locus2_id=locus_id).options(joinedload('reference')).all()])
        regulation_references = list(regulation_references)
        regulation_references.sort(key=lambda x: (x.year, x.pubmed_id), reverse=True)
        # Interaction references combine genetic and physical interactions,
        # again with the locus on either side.
        interaction_references = set([x.reference for x in DBSession.query(Geninteractionevidence).filter_by(locus1_id=locus_id).options(joinedload('reference')).all()])
        interaction_references.update([x.reference for x in DBSession.query(Geninteractionevidence).filter_by(locus2_id=locus_id).options(joinedload('reference')).all()])
        interaction_references.update([x.reference for x in DBSession.query(Physinteractionevidence).filter_by(locus1_id=locus_id).options(joinedload('reference')).all()])
        interaction_references.update([x.reference for x in DBSession.query(Physinteractionevidence).filter_by(locus2_id=locus_id).options(joinedload('reference')).all()])
        interaction_references = list(interaction_references)
        interaction_references.sort(key=lambda x: (x.year, x.pubmed_id), reverse=True)
        return json.dumps({'primary': [x.to_semi_json() for x in set([y.reference for y in evidences if y.topic == 'Primary Literature'])],
                           'additional': [x.to_semi_json() for x in set([y.reference for y in evidences if y.topic == 'Additional Literature'])],
                           'review': [x.to_semi_json() for x in set([y.reference for y in evidences if y.topic == 'Reviews'])],
                           'go': [x.to_semi_json() for x in go_references],
                           'phenotype': [x.to_semi_json() for x in phenotype_references],
                           'regulation': [x.to_semi_json() for x in regulation_references],
                           'interaction': [x.to_semi_json() for x in sorted(interaction_references, key=lambda x: (x.year, x.pubmed_id), reverse=True)]})
    elif reference_id is not None:
        # Reference mode: hand back the pre-rendered json strings.
        return '[' + ', '.join([x.json for x in evidences if x.json is not None]) + ']'
    # Topic-only mode: same raw-json rendering.
    return '[' + ', '.join([x.json for x in evidences if x.json is not None]) + ']'
def make_graph(bioent_id):
    """Build a regulation-network graph centered on one bioentity.

    Collects directed 'expression repressed'/'expression activated' evidence
    touching bioent_id, classifies each neighbor as REGULATOR, TARGET, or
    BOTH, prunes to at most 100 neighbors by raising an evidence-count cutoff,
    then adds edges among the surviving neighbors.

    Returns a dict with 'nodes', 'edges', and 'min_evidence_count' (the final
    cutoff applied to edges).
    """
    evidences = DBSession.query(Regulationevidence).filter(or_(Regulationevidence.locus1_id == bioent_id, Regulationevidence.locus2_id == bioent_id)).all()
    id_to_neighbor = {}               # locus_id -> (locus, 'REGULATOR'|'TARGET'|'BOTH')
    edge_key_to_evidence_count = {}   # (locus1_id, locus2_id, direction) -> count
    for evidence in evidences:
        if evidence.direction == 'expression repressed' or evidence.direction == 'expression activated':
            # locus1 is the regulator side of the pair.
            if evidence.locus1_id not in id_to_neighbor:
                id_to_neighbor[evidence.locus1_id] = (evidence.locus1, 'REGULATOR')
            elif id_to_neighbor[evidence.locus1_id][1] == 'TARGET':
                id_to_neighbor[evidence.locus1_id] = (id_to_neighbor[evidence.locus1_id][0], 'BOTH')
            # locus2 is the target side.
            if evidence.locus2_id not in id_to_neighbor:
                id_to_neighbor[evidence.locus2_id] = (evidence.locus2, 'TARGET')
            # BUG FIX: the original compared the whole (locus, role) tuple to
            # 'REGULATOR' (always False), so a regulator later seen as a target
            # was never promoted to 'BOTH'. Compare the role element instead,
            # mirroring the locus1 branch above.
            elif id_to_neighbor[evidence.locus2_id][1] == 'REGULATOR':
                id_to_neighbor[evidence.locus2_id] = (id_to_neighbor[evidence.locus2_id][0], 'BOTH')
            edge_key = (evidence.locus1_id, evidence.locus2_id, evidence.direction)
            if edge_key in edge_key_to_evidence_count:
                edge_key_to_evidence_count[edge_key] += 1
            else:
                edge_key_to_evidence_count[edge_key] = 1
    neighbor_ids = id_to_neighbor.keys()
    cutoff = 1
    # Raise the evidence-count cutoff until at most 100 neighbors remain.
    while len(neighbor_ids) > 100:
        cutoff += 1
        edge_key_to_evidence_count = dict([(x, y) for x, y in edge_key_to_evidence_count.iteritems() if y >= cutoff])
        neighbor_ids = set([x[0] for x in edge_key_to_evidence_count.keys()])
        neighbor_ids.update(([x[1] for x in edge_key_to_evidence_count.keys()]))
        id_to_neighbor = dict([(x, y) for x, y in id_to_neighbor.iteritems() if x in neighbor_ids])
    # Add evidence among the surviving neighbors (excluding edges that touch
    # the central bioentity, which are already counted above).
    tangent_evidences = DBSession.query(Regulationevidence).filter(and_(Regulationevidence.locus1_id.in_(neighbor_ids), Regulationevidence.locus2_id.in_(neighbor_ids))).all()
    for evidence in tangent_evidences:
        if evidence.locus1_id != bioent_id and evidence.locus2_id != bioent_id and (evidence.direction == 'expression repressed' or evidence.direction == 'expression activated'):
            edge_key = (evidence.locus1_id, evidence.locus2_id, evidence.direction)
            if edge_key in edge_key_to_evidence_count:
                edge_key_to_evidence_count[edge_key] += 1
            else:
                edge_key_to_evidence_count[edge_key] = 1
    # Re-apply the final cutoff after adding tangent evidence.
    edge_key_to_evidence_count = dict([(x, y) for x, y in edge_key_to_evidence_count.iteritems() if y >= cutoff])
    edges = [create_edge(x[0], x[1], y, x[2]) for x, y in edge_key_to_evidence_count.iteritems()]
    nodes = [create_node(x[0], x[0].id==bioent_id, x[1]) for x in id_to_neighbor.values()]
    return {'nodes': nodes, 'edges': edges, 'min_evidence_count': cutoff}
def get_all_bioconcept_children(parent_id):
    """Return the set of ids of parent_id plus all of its 'is a' descendants.

    Performs a breadth-first walk over Bioconceptrelation rows, querying each
    frontier in IN-clause chunks of 500 ids to keep statements bounded.
    """
    from src.sgd.model.nex.bioconcept import Bioconceptrelation
    collected = set()
    frontier = [parent_id]
    while frontier:
        collected.update(frontier)
        if len(frontier) == 1:
            rows = DBSession.query(Bioconceptrelation).filter(Bioconceptrelation.relation_type == 'is a').filter(Bioconceptrelation.parent_id == frontier[0]).all()
            frontier = [row.child_id for row in rows]
        else:
            next_frontier = []
            for start in range(0, len(frontier), 500):
                rows = DBSession.query(Bioconceptrelation).filter(Bioconceptrelation.relation_type == 'is a').filter(Bioconceptrelation.parent_id.in_(frontier[start:start + 500])).all()
                next_frontier.extend([row.child_id for row in rows])
            frontier = next_frontier
    return collected
def get_binding_evidence(locus_id):
    """Fetch Bindingevidence rows, optionally restricted to one locus.

    Returns None instead of a list when the row count exceeds query_limit.
    """
    base = DBSession.query(Bindingevidence)
    query = base if locus_id is None else base.filter_by(locus_id=locus_id)
    return None if query.count() > query_limit else query.all()
def get_interactions_among(interaction_type, bioent_ids, interactor_ids):
    """Return Interaction rows of the given type linking the two id sets.

    Yields an empty list when either id collection is empty (an IN clause
    over an empty list would match nothing anyway).
    """
    if not bioent_ids or not interactor_ids:
        return []
    query = (DBSession.query(Interaction)
             .filter(Interaction.interaction_type == interaction_type)
             .filter(Interaction.bioentity_id.in_(bioent_ids))
             .filter(Interaction.interactor_id.in_(interactor_ids)))
    return query.all()
def get_proteinsequence_evidence(locus_id=None):
    """Fetch Proteinsequenceevidence rows with locus and strain eager-loaded.

    Optionally restricted to one locus; returns None when the result would
    exceed query_limit.
    """
    query = DBSession.query(Proteinsequenceevidence).options(joinedload('locus'), joinedload('strain'))
    if locus_id is not None:
        query = query.filter_by(locus_id=locus_id)
    over_limit = query.count() > query_limit
    return None if over_limit else query.all()
def get_protein_experiment_evidence(locus_id):
    """Fetch Proteinexperimentevidence rows, optionally for a single locus.

    Returns None when more than query_limit rows would be returned.
    """
    filters = {}
    if locus_id is not None:
        filters['locus_id'] = locus_id
    query = DBSession.query(Proteinexperimentevidence).filter_by(**filters)
    return None if query.count() > query_limit else query.all()
def get_posttranslational_evidence(locus_id):
    """Fetch Posttranslationalevidence rows, optionally for a single locus.

    Returns None when more than query_limit rows would be returned.
    """
    filters = {}
    if locus_id is not None:
        filters['locus_id'] = locus_id
    query = DBSession.query(Posttranslationalevidence).filter_by(**filters)
    return None if query.count() > query_limit else query.all()
def get_phosphorylation_evidence(locus_id):
    """Fetch Phosphorylationevidence rows, optionally for a single locus.

    Returns None when more than query_limit rows would be returned.
    """
    base = DBSession.query(Phosphorylationevidence)
    query = base if locus_id is None else base.filter_by(locus_id=locus_id)
    return None if query.count() > query_limit else query.all()
def get_physical_interaction_evidence(locus_id, reference_id):
    """Fetch Physinteractionevidence rows for a locus and/or reference.

    The locus may appear on either side of the interaction (locus1/locus2).
    Returns None when more than query_limit rows would be returned.
    """
    query = DBSession.query(Physinteractionevidence)
    if locus_id is not None:
        either_side = or_(Physinteractionevidence.locus1_id == locus_id,
                          Physinteractionevidence.locus2_id == locus_id)
        query = query.filter(either_side)
    if reference_id is not None:
        query = query.filter_by(reference_id=reference_id)
    return None if query.count() > query_limit else query.all()
def get_dnasequence_evidence(locus_id=None, contig_id=None):
    """Fetch DNAsequenceevidence with locus, strain, and contig eager-loaded.

    May be restricted by contig and/or locus; returns None when the result
    would exceed query_limit.
    """
    filters = {}
    if contig_id is not None:
        filters['contig_id'] = contig_id
    if locus_id is not None:
        filters['locus_id'] = locus_id
    query = (DBSession.query(DNAsequenceevidence)
             .options(joinedload('locus'), joinedload('strain'), joinedload('contig'))
             .filter_by(**filters))
    return None if query.count() > query_limit else query.all()
def get_ec_number_evidence(locus_id, ec_number_id):
    """Fetch ECNumberevidence rows filtered by locus and/or EC number.

    Returns None when more than query_limit rows would be returned.
    """
    filters = {}
    if locus_id is not None:
        filters['locus_id'] = locus_id
    if ec_number_id is not None:
        filters['ecnumber_id'] = ec_number_id
    query = DBSession.query(ECNumberevidence).filter_by(**filters)
    return None if query.count() > query_limit else query.all()
def get_protein_domain_evidence(locus_id, domain_id):
    """Fetch Domainevidence rows (domain eager-loaded) by locus and/or domain.

    Returns None when more than query_limit rows would be returned.
    """
    filters = {}
    if locus_id is not None:
        filters['locus_id'] = locus_id
    if domain_id is not None:
        filters['domain_id'] = domain_id
    query = DBSession.query(Domainevidence).options(joinedload('domain')).filter_by(**filters)
    return None if query.count() > query_limit else query.all()
def get_regulation_evidence(locus_id, reference_id, between_ids):
    """Fetch Regulationevidence rows.

    Filters (all optional, combined with AND):
      reference_id -- evidence from a single reference.
      between_ids  -- both regulation partners drawn from this id collection.
      locus_id     -- the locus appears on either side of the pair.
    Returns None when more than query_limit rows would be returned.
    """
    query = DBSession.query(Regulationevidence)
    if reference_id is not None:
        query = query.filter_by(reference_id=reference_id)
    if between_ids is not None:
        both_sides = and_(Regulationevidence.locus1_id.in_(between_ids),
                          Regulationevidence.locus2_id.in_(between_ids))
        query = query.filter(both_sides)
    if locus_id is not None:
        either_side = or_(Regulationevidence.locus1_id == locus_id,
                          Regulationevidence.locus2_id == locus_id)
        query = query.filter(either_side)
    return None if query.count() > query_limit else query.all()
def get_archived_literature_evidence(locus_id, reference_id, topic):
    """Fetch ArchiveLiteratureevidence by locus, reference, and/or topic.

    The topic comparison is case-insensitive. Returns None when more than
    query_limit rows would be returned.
    """
    query = DBSession.query(ArchiveLiteratureevidence)
    if locus_id is not None:
        # Archived evidence stores the locus under bioentity_id.
        query = query.filter_by(bioentity_id=locus_id)
    if reference_id is not None:
        query = query.filter_by(reference_id=reference_id)
    if topic is not None:
        query = query.filter(func.lower(ArchiveLiteratureevidence.topic) == topic.lower())
    return None if query.count() > query_limit else query.all()
def get_phenotype_evidence(locus_id, phenotype_id, observable_id, chemical_id, reference_id, with_children):
    """Fetch Phenotypeevidence rows under several optional filters.

    Filters: locus, reference, phenotype, observable (optionally expanded to
    its 'is a' descendants when with_children is truthy), or chemical.
    Observable and chemical filters run in IN-clause chunks of 500 ids and
    short-circuit with an explicit return.  Returns None whenever the
    accumulated row count would exceed query_limit.
    """
    query = DBSession.query(Phenotypeevidence)
    if locus_id is not None:
        query = query.filter_by(locus_id=locus_id)
    if reference_id is not None:
        query = query.filter_by(reference_id=reference_id)
    if phenotype_id is not None:
        query = query.filter_by(phenotype_id=phenotype_id)
    if observable_id is not None:
        if with_children:
            # Collect phenotype ids for the observable and every descendant.
            phenotype_ids = set()
            for new_observable_id in list(get_all_bioconcept_children(observable_id)):
                phenotype_ids.update([x.id for x in DBSession.query(Phenotype.id).filter_by(observable_id=new_observable_id).all()])
        else:
            phenotype_ids = set([x.id for x in DBSession.query(Phenotype.id).filter_by(observable_id=observable_id).all()])
        phenotype_ids = list(phenotype_ids)
        # Chunk the IN clause at 500 ids; bail out as soon as the running
        # total would pass the limit.
        num_chunks = int(ceil(1.0*len(phenotype_ids)/500))
        evidences = []
        for i in range(num_chunks):
            subquery = query.filter(Phenotypeevidence.phenotype_id.in_(phenotype_ids[i*500:(i+1)*500]))
            if len(evidences) + subquery.count() > query_limit:
                return None
            evidences.extend(subquery.all())
        return evidences
    if chemical_id is not None:
        # Phenotype evidence is linked to chemicals via Chemicalproperty rows.
        chemical_evidence_ids = list(set([x.evidence_id for x in DBSession.query(Chemicalproperty).filter_by(bioitem_id=chemical_id).all()]))
        num_chunks = int(ceil(1.0*len(chemical_evidence_ids)/500))
        evidences = []
        for i in range(num_chunks):
            subquery = query.filter(Phenotypeevidence.id.in_(chemical_evidence_ids[i*500:(i+1)*500]))
            if len(evidences) + subquery.count() > query_limit:
                return None
            evidences.extend(subquery.all())
        return evidences
    else:
        # No observable/chemical expansion: run the filtered query directly.
        if query.count() > query_limit:
            return None
        return query.all()
def get_literature_evidence(locus_id, reference_id, topic):
    """Fetch Literatureevidence by locus, reference, and/or topic.

    When a locus is given, the reference relationship is eager-loaded (the
    locus-mode caller sorts on reference attributes). Topic matching is
    case-insensitive. Returns None when more than query_limit rows would be
    returned.
    """
    query = DBSession.query(Literatureevidence)
    if locus_id is not None:
        query = query.options(joinedload('reference')).filter_by(locus_id=locus_id)
    if reference_id is not None:
        query = query.filter_by(reference_id=reference_id)
    if topic is not None:
        query = query.filter(func.lower(Literatureevidence.topic) == topic.lower())
    return None if query.count() > query_limit else query.all()
def make_enrichment(bioent_ids):
    """Run GO biological-process enrichment over a set of bioentity ids.

    Resolves the ids to Locus format names (in IN-clause chunks of 500),
    submits them to the external enrichment service, and maps each returned
    GO identifier back to a local Go record.

    Returns a list of {'go', 'match_count', 'pvalue'} dicts; results whose
    GO identifier cannot be parsed or resolved are logged and skipped.
    """
    print(len(bioent_ids))
    bioent_ids = list(set(bioent_ids))
    bioent_format_names = []
    # Chunk the IN clause at 500 ids to keep statements bounded.
    num_chunks = int(ceil(1.0*len(bioent_ids)/500))
    for i in range(0, num_chunks):
        bioent_format_names.extend([x.format_name for x in DBSession.query(Locus).filter(Locus.id.in_(bioent_ids[i*500:(i+1)*500])).all()])
    enrichment_results = query_batter.query_go_processes(bioent_format_names)
    json_format = []
    for enrichment_result in enrichment_results:
        try:
            # Normalize e.g. 'GO:6914' to the zero-padded 'GO:0006914' form.
            identifier = 'GO:' + str(int(enrichment_result[0][3:])).zfill(7)
            goterm_id = get_obj_id(identifier, class_type='BIOCONCEPT', subclass_type='GO')
            if goterm_id is not None:
                goterm = DBSession.query(Go).filter_by(id=goterm_id).first().to_json()
                json_format.append({'go': goterm, 'match_count': enrichment_result[1], 'pvalue': enrichment_result[2]})
            else:
                print('Go term not found: ' + str(enrichment_result[0]))
        except (ValueError, TypeError, IndexError, AttributeError):
            # Narrowed from a bare `except:` so that unrelated failures
            # (KeyboardInterrupt, database errors) are no longer swallowed;
            # these types cover malformed identifiers and missing Go rows.
            print('Bad GO ID' + enrichment_result[0])
    return json_format
def make_details(locus_id=None):
    """Build the expression-details payload for a locus.

    Buckets every expression evidence value into half-unit bins (clamped to
    [-5.5, 5]) for an overview histogram, and records which bins each dataset
    contributes to.  Returns a JSON string with 'overview', 'datasets',
    'min_value', and 'max_value'.
    """
    if locus_id is None:
        return {"Error": "No locus_id given."}
    expressionevidences = get_expression_evidence(locus_id=locus_id)
    # Expression
    # Preload every dataset column (with its dataset) so evidence rows can be
    # mapped to datasets without per-row queries.
    id_to_datasetcolumn = dict([(x.id, x) for x in DBSession.query(Datasetcolumn).options(joinedload(Datasetcolumn.dataset)).all()])
    expression_collapsed = {}       # bin value -> evidence count (overview histogram)
    id_to_dataset = dict()
    dataset_id_to_histogram = dict()  # dataset id -> set of bins it contributes to
    min_value = 0
    max_value = 0
    for x in expressionevidences:
        value = float(x.value)
        # Round down to the nearest 0.5 step.
        rounded = math.floor(value)
        if value - rounded >= 0.5:
            rounded += 0.5
        # Track the unclamped extremes for the axis range.
        if rounded < min_value:
            min_value = rounded
        if rounded > max_value:
            max_value = rounded
        # Clamp the histogram bin itself to [-5.5, 5].
        rounded = max(-5.5, min(5, rounded))
        if rounded in expression_collapsed:
            expression_collapsed[rounded] += 1
        else:
            expression_collapsed[rounded] = 1
        datasetcolumn = id_to_datasetcolumn[x.evidence.datasetcolumn_id]
        dataset_id = datasetcolumn.dataset_id
        if dataset_id not in id_to_dataset:
            id_to_dataset[dataset_id] = datasetcolumn.dataset
            dataset_id_to_histogram[dataset_id] = set()
        dataset_id_to_histogram[dataset_id].add(rounded)
    datasets = []
    for dataset in id_to_dataset.values():
        obj_json = dataset.to_semi_json()
        obj_json["hist_values"] = sorted(dataset_id_to_histogram[dataset.id])
        datasets.append(obj_json)
    return json.dumps({"overview": expression_collapsed, "datasets": datasets, "min_value": min_value, "max_value": max_value + 0.5})
def get_relations(cls, subclass_type, parent_ids=None, child_ids=None):
    """Fetch relation rows of `cls` with child and parent eager-loaded.

    Optionally restricted by relation_type and by parent/child id lists.
    An explicitly empty id list short-circuits to [] (it can match nothing).
    """
    query = DBSession.query(cls).options(joinedload('child'), joinedload('parent'))
    if subclass_type is not None:
        query = query.filter(cls.relation_type == subclass_type)
    if (parent_ids is not None and len(parent_ids) == 0) or (child_ids is not None and len(child_ids) == 0):
        return []
    # Apply each id restriction, using equality for single ids and IN for lists.
    for column, ids in ((cls.parent_id, parent_ids), (cls.child_id, child_ids)):
        if ids is None:
            continue
        query = query.filter(column == ids[0]) if len(ids) == 1 else query.filter(column.in_(ids))
    return query.all()
def get_interactions_among(locus_ids, interaction_cls, interaction_type, min_evidence_count=0):
    """Return one interaction per locus pair among the given loci.

    Queries interactions of the given type where both endpoints are in
    locus_ids (optionally requiring a minimum evidence count), then collapses
    duplicates: undirected A-B and B-A rows map to a single canonical key, so
    only one row per pair survives.
    """
    if len(locus_ids) > 0:
        query = (DBSession.query(interaction_cls)
                 .filter_by(interaction_type=interaction_type)
                 .filter(interaction_cls.bioentity_id.in_(locus_ids))
                 .filter(interaction_cls.interactor_id.in_(locus_ids)))
        if min_evidence_count > 0:
            query = query.filter(interaction_cls.evidence_count >= min_evidence_count)
        interactions = query.all()
    else:
        interactions = []
    deduped = {}
    for edge in interactions:
        a, b = edge.bioentity_id, edge.interactor_id
        # Canonicalize undirected pairs so mirrored rows share one key;
        # directed rows keep their (bioentity, interactor) orientation.
        if edge.direction == 'undirected' and a < b:
            key = (b, a)
        else:
            key = (a, b)
        deduped[key] = edge
    return deduped.values()
def make_neighbor_details(locus_id=None):
    """Build, per strain, the genomic neighborhood around a locus.

    For each genomic DNA sequence of the locus, takes a ~10kb window centered
    (rounded to the nearest kb) on the feature and returns every active
    feature overlapping that window, sorted by genomic position.
    """
    if locus_id is None:
        return {'Error': 'No locus_id given.'}
    dnaseqevidences = get_dnasequence_evidence(locus_id=locus_id, contig_id=None)
    if dnaseqevidences is None:
        # Helper signals "too many rows" with None.
        return {'Error': 'Too much data to display.'}
    neighbors = {}
    genomic_dnaseqevidences = [x for x in dnaseqevidences if x.dna_type == 'GENOMIC']
    for evidence in genomic_dnaseqevidences:
        # Feature midpoint rounded to the nearest 1000 bp.
        midpoint = int(round((evidence.start + (evidence.end-evidence.start)/2)/1000))*1000
        # 10kb window, clamped to the contig bounds.
        start = max(1, midpoint - 5000)
        end = min(len(evidence.contig.residues), start + 10000)
        neighbor_evidences = DBSession.query(DNAsequenceevidence).filter_by(dna_type='GENOMIC').filter_by(contig_id=evidence.contig_id).filter(DNAsequenceevidence.end >= start).filter(DNAsequenceevidence.start <= end).options(joinedload('locus'), joinedload('strain')).all()
        # Keep only active loci (plus the query locus itself), ordered by the
        # strand-appropriate coordinate.
        neighbors[evidence.strain.format_name] = {'neighbors': [x.to_json() for x in sorted(neighbor_evidences, key=lambda x: x.start if x.strand == '+' else x.end) if x.locus.bioent_status == 'Active' or x.locus_id == locus_id], 'start': start, 'end': end}
    return neighbors
def get_go_evidence(locus_id, go_id, reference_id, with_children):
    """Fetch Goevidence by locus, reference, and/or GO term.

    When with_children is truthy the GO filter is expanded to the term's full
    'is a' subtree, fetched in IN-clause chunks of 500 ids.  Returns None
    whenever the (accumulated) row count would exceed query_limit.
    """
    query = DBSession.query(Goevidence)
    if locus_id is not None:
        query = query.filter_by(locus_id=locus_id)
    if reference_id is not None:
        query = query.filter_by(reference_id=reference_id)
    if go_id is not None and with_children:
        # Expand to the whole subtree and accumulate chunk by chunk, bailing
        # out as soon as the running total would pass the limit.
        child_ids = list(get_all_bioconcept_children(go_id))
        collected = []
        for start in range(0, len(child_ids), 500):
            chunk_query = query.filter(Goevidence.go_id.in_(child_ids[start:start + 500]))
            if len(collected) + chunk_query.count() > query_limit:
                return None
            collected.extend(chunk_query.all())
        return collected
    if go_id is not None:
        query = query.filter_by(go_id=go_id)
    return None if query.count() > query_limit else query.all()
def make_alignment(locus_id=None):
    """Build the multi-strain alignment payload for a locus.

    Combines the locus summary, its S288C (strain_id=1) genomic coordinates,
    intron positions translated into alignment coordinates, and the aligned
    DNA/protein sequences per strain (ordered by a fixed strain list), plus
    derived variant data.

    Returns the JSON-ready dict, an {'Error': ...} dict for bad input or
    oversized results, or a partial dict if assembly fails midway (the
    failure is logged; this mirrors the original best-effort behavior).
    """
    if locus_id is None:
        return {'Error': 'No locus_id given.'}
    evidences = get_alignment_evidence(locus_id=locus_id)
    if evidences is None:
        return {'Error': 'Too much data to display.'}
    # BUG FIX: initialize before the try block — the original returned
    # obj_json unconditionally, raising UnboundLocalError if the exception
    # fired before obj_json was first assigned.
    obj_json = {}
    try:
        locus = DBSession.query(Locus).filter_by(id=locus_id).first()
        obj_json = locus.to_min_json()
        dnasequenceevidence = DBSession.query(DNAsequenceevidence).filter_by(strain_id=1).filter_by(dna_type='GENOMIC').filter_by(locus_id=locus_id).first()
        if dnasequenceevidence is not None:
            obj_json['coordinates'] = {'start': dnasequenceevidence.start, 'end': dnasequenceevidence.end}
            obj_json['contig'] = dnasequenceevidence.contig.to_min_json()
            obj_json['strand'] = dnasequenceevidence.strand
            obj_json['introns'] = []
            obj_json['dna_length'] = len(dnasequenceevidence.residues)
        proteinsequenceevidence = DBSession.query(Proteinsequenceevidence).filter_by(strain_id=1).filter_by(locus_id=locus_id).first()
        if proteinsequenceevidence is not None:
            obj_json['protein_length'] = len(proteinsequenceevidence.residues)
        # Alignment data.
        # BUG FIX: the original re-ran get_alignment_evidence() and then
        # checked the stale `evidences` name for None instead of the fresh
        # result. Reuse the evidence already fetched (and checked) above.
        alignment_evidences = evidences
        # Preferred display order; unknown strains sort to the end.
        ordered_strains = ['S288C', 'X2180-1A', 'SEY6210', 'W303', 'JK9-3d', 'FL100', 'CEN.PK', 'D273-10B', 'Sigma1278b', 'RM11-1a', 'SK1', 'Y55']
        alignment_evidences.sort(key=lambda x: float('infinity') if x.strain.display_name not in ordered_strains else ordered_strains.index(x.strain.display_name))
        # The S288C genomic alignment anchors intron coordinate conversion.
        reference_alignment = [x for x in alignment_evidences if x.sequence_type == 'Genomic DNA' and x.strain_id == 1][0]
        if dnasequenceevidence is not None:
            for tag in dnasequenceevidence.tags:
                if tag.class_type == 'INTRON':
                    coords = switch_to_alignment_coord(reference_alignment.residues_with_gaps, [tag.relative_start, tag.relative_end])
                    obj_json['introns'].append({'start': coords[0], 'end': coords[1]})
        obj_json['aligned_dna_sequences'] = [{'strain_id': x.strain_id, 'strain_display_name': x.strain.display_name, 'strain_link': x.strain.link, 'sequence': x.residues_with_gaps} for x in alignment_evidences if x.sequence_type == 'Genomic DNA']
        obj_json['aligned_protein_sequences'] = [{'strain_id': x.strain_id, 'strain_display_name': x.strain.display_name, 'strain_link': x.strain.link, 'sequence': x.residues_with_gaps} for x in alignment_evidences if x.sequence_type == 'Protein']
        obj_json['variant_data_dna'] = calculate_variant_data('DNA', obj_json['aligned_dna_sequences'], obj_json['introns'])
        obj_json['variant_data_protein'] = calculate_variant_data('Protein', obj_json['aligned_protein_sequences'], obj_json['introns'])
    except Exception:
        # Narrowed from a bare `except:`; kept broad deliberately so a partial
        # payload is still returned (best-effort display), but log the locus.
        print(locus_id)
    return obj_json
def make_ontology_graph(bioconcept_id, class_type, filter_f, subtype_f):
    """Build an ontology neighborhood graph around one bioconcept.

    Walks up to four ancestor levels and one child level (SLIM relations
    excluded), hiding excess children behind a synthetic "N more children"
    node.  For root observables (no parents), also attaches the full
    observable ontology.  filter_f decides which related concepts are shown;
    subtype_f supplies each node's display subtype.
    """
    full_ontology = None
    all_children = []
    bioconcept = DBSession.query(Bioconcept).filter_by(id=bioconcept_id).first()
    # Direct parents and children, excluding *SLIM convenience relations.
    parent_relations = [x for x in get_relations(Bioconceptrelation, None, child_ids=[bioconcept_id]) if not x.relation_type.endswith('SLIM')]
    child_relations = [x for x in get_relations(Bioconceptrelation, None, parent_ids=[bioconcept_id]) if not x.relation_type.endswith('SLIM')]
    if len(parent_relations) > 0:
        # Climb three more ancestor generations.
        grandparent_relations = [x for x in get_relations(Bioconceptrelation, None, child_ids=[x.parent_id for x in parent_relations]) if not x.relation_type.endswith('SLIM')]
        greatgrandparent_relations = [x for x in get_relations(Bioconceptrelation, None, child_ids=[x.parent_id for x in grandparent_relations]) if not x.relation_type.endswith('SLIM')]
        greatgreatgrandparent_relations = [x for x in get_relations(Bioconceptrelation, None, child_ids=[x.parent_id for x in greatgrandparent_relations]) if not x.relation_type.endswith('SLIM')]
        nodes = [create_node(bioconcept, True, subtype_f(bioconcept))]
        parents = [x.parent for x in parent_relations]
        parents.extend([x.parent for x in grandparent_relations])
        parents.extend([x.parent for x in greatgrandparent_relations])
        parents.extend([x.parent for x in greatgreatgrandparent_relations])
        viable_ids = set([x.child_id for x in child_relations if filter_f(x.child)])
        #If there are too many children, hide some.
        hidden_children_count = 0
        all_children = sorted([x.child for x in child_relations], key=lambda x: x.display_name.lower())
        if len(viable_ids) > 8:
            # Show only 7 children; the rest go behind a summary node.
            hidden_children_count = len(viable_ids)-7
            viable_ids = set(list(viable_ids)[:7])
        viable_ids.update([x.id for x in parents if filter_f(x)])
        viable_ids.add(bioconcept_id)
        nodes.extend([create_node(x.child, False, subtype_f(x.child)) for x in child_relations if x.child_id in viable_ids])
        nodes.extend([create_node(x, False, subtype_f(x)) for x in parents if x.id in viable_ids])
        # Edges only between nodes that survived the viability filter.
        relations = set()
        relations.update(child_relations)
        relations.update(parent_relations)
        relations.update(grandparent_relations)
        relations.update(greatgrandparent_relations)
        relations.update(greatgreatgrandparent_relations)
        edges = [create_edge(x.child_id, x.parent_id, x.relation_type) for x in relations if x.child_id in viable_ids and x.parent_id in viable_ids]
        if hidden_children_count > 0:
            # Synthetic node/edge summarizing the hidden children.
            nodes.insert(0, {'data':{'id':'NodeMoreChildren', 'name':str(hidden_children_count) + ' more children', 'link': None, 'sub_type':subtype_f(bioconcept)}})
            edges.insert(0, {'data':{'target': 'NodeMoreChildren', 'source': 'Node' + str(bioconcept_id), 'name':None}})
    else:
        # Root concept: show it with all of its children.
        nodes = [create_node(bioconcept, True, subtype_f(bioconcept))]
        nodes.extend([create_node(x.child, False, subtype_f(x.child)) for x in child_relations])
        edges = [create_edge(x.child_id, x.parent_id, x.relation_type) for x in child_relations]
        if bioconcept.class_type == 'OBSERVABLE':
            # Root observables additionally show grandchildren and ship the
            # complete observable ontology for client-side browsing.
            grandchild_relations = [x for x in get_relations(Bioconceptrelation, None, parent_ids=[x.child_id for x in child_relations]) if x.relation_type == 'is a']
            nodes.extend([create_node(x.child, False, subtype_f(x.child)) for x in grandchild_relations])
            edges.extend([create_edge(x.child_id, x.parent_id, x.relation_type) for x in grandchild_relations])
            observables = DBSession.query(Observable).all()
            elements = [x.to_min_json() for x in sorted(observables, key=lambda x: x.display_name)]
            child_to_parent = dict([(x.child_id, x.parent_id) for y in observables for x in y.children])
            full_ontology = {'elements': elements, 'child_to_parent': child_to_parent}
    obj_json = {'nodes': list(nodes), 'edges': edges, 'all_children': [x.to_min_json() for x in all_children]}
    if full_ontology is not None:
        obj_json['full_ontology'] = full_ontology
    return obj_json
def make_lsp_graph(locus_id, node_max=100, edge_max=250):
    """Build the locus-summary-page similarity graph for one locus.

    Loci are related through shared "interactors" (GO terms, phenotypes,
    domains, and bioitems).  Nodes are capped at node_max and edges at
    edge_max by iteratively lowering score cutoffs, and the most informative
    shared interactors are returned for highlighting.
    """
    #Get interactors
    # (interaction_type, interactor_id) pairs attached to the query locus.
    bioconcept_ids = [(x.interaction_type, x.interactor_id) for x in DBSession.query(Bioconceptinteraction).filter_by(bioentity_id=locus_id).all()]
    bioitem_ids = [(x.interaction_type, x.interactor_id) for x in DBSession.query(Bioiteminteraction).filter_by(bioentity_id=locus_id).all()]
    print len(bioconcept_ids) + len(bioitem_ids)
    interactor_to_bioent_ids = dict()      # interactor key -> set of locus ids sharing it
    bioent_id_to_interactor_ids = dict()   # locus id -> set of interactor keys
    #Get next level
    # Fan out to every locus sharing a bioconcept interactor, in IN-clause
    # chunks of 500.
    num_chunks = int(ceil(1.0*len(bioconcept_ids)/500))
    for i in range(0, num_chunks):
        for interaction in DBSession.query(Bioconceptinteraction).filter(Bioconceptinteraction.interactor_id.in_([x[1] for x in bioconcept_ids[i*500:(i+1)*500]])).all():
            key = (interaction.interaction_type, interaction.interactor_id)
            bioentity_id = interaction.bioentity_id
            if key in interactor_to_bioent_ids:
                interactor_to_bioent_ids[key].add(bioentity_id)
            else:
                interactor_to_bioent_ids[key] = set([bioentity_id])
            if bioentity_id in bioent_id_to_interactor_ids:
                bioent_id_to_interactor_ids[bioentity_id].add(key)
            else:
                bioent_id_to_interactor_ids[bioentity_id] = set([key])
    # Same fan-out for bioitem interactors.
    num_chunks = int(ceil(1.0*len(bioitem_ids)/500))
    for i in range(0, num_chunks):
        for interaction in DBSession.query(Bioiteminteraction).filter(Bioiteminteraction.interactor_id.in_([x[1] for x in bioitem_ids[i*500:(i+1)*500]])).all():
            key = (interaction.interaction_type, interaction.interactor_id)
            bioentity_id = interaction.bioentity_id
            if key in interactor_to_bioent_ids:
                interactor_to_bioent_ids[key].add(bioentity_id)
            else:
                interactor_to_bioent_ids[key] = set([bioentity_id])
            if bioentity_id in bioent_id_to_interactor_ids:
                bioent_id_to_interactor_ids[bioentity_id].add(key)
            else:
                bioent_id_to_interactor_ids[bioentity_id] = set([key])
    # Admit loci in decreasing order of shared-interactor count until the
    # next tier would exceed node_max.
    bioent_ids_in_use = set()
    min_cutoff = max(len(y) for y in bioent_id_to_interactor_ids.values())
    while len(bioent_ids_in_use) + len([x for x, y in bioent_id_to_interactor_ids.iteritems() if len(y) == min_cutoff]) < node_max:
        bioent_ids_in_use.update([x for x, y in bioent_id_to_interactor_ids.iteritems() if len(y) == min_cutoff])
        min_cutoff -= 1
    #Pick out cutoff
    # Pairwise similarity score from overlapping interactors (GO weighted
    # twice as much as domains/phenotypes).
    pair_to_score = dict()
    for bioent1_id in bioent_ids_in_use:
        for bioent2_id in bioent_ids_in_use:
            if bioent1_id < bioent2_id:
                overlap = bioent_id_to_interactor_ids[bioent1_id] & bioent_id_to_interactor_ids[bioent2_id]
                domain_count = len([x for x in overlap if x[0] == 'DOMAIN'])
                phenotype_count = len([x for x in overlap if x[0] == 'PHENOTYPE'])
                go_count = len([x for x in overlap if x[0] == 'GO'])
                score = int(ceil(.5*domain_count + .5*phenotype_count + go_count))
                pair_to_score[(bioent1_id, bioent2_id)] = score
    # Fold in direct bioentity-bioentity interactions (>2 pieces of
    # evidence); keys are ordered (smaller id first) to match pair_to_score.
    interactions = DBSession.query(Bioentityinteraction).filter(Bioentityinteraction.bioentity_id.in_(bioent_ids_in_use)).filter(Bioentityinteraction.evidence_count > 2).all()
    pair_to_interactions = dict()
    for interaction in interactions:
        if interaction.bioentity_id < interaction.interactor_id:
            key = (interaction.bioentity_id, interaction.interactor_id)
            if interaction.interaction_type == 'EXPRESSION':
                score = max(0, (interaction.coeff - .75)*20)
            else:
                score = interaction.evidence_count-2
        elif interaction.bioentity_id > interaction.interactor_id:
            key = (interaction.interactor_id, interaction.bioentity_id)
            if interaction.interaction_type == 'EXPRESSION':
                score = max(0, (interaction.coeff - .75)*20)
            else:
                score = interaction.evidence_count-2
        else:
            # Self interaction: half weight.
            key = (interaction.bioentity_id, interaction.interactor_id)
            if interaction.interaction_type == 'EXPRESSION':
                score = max(0, (interaction.coeff - .75)*10)
            else:
                score = 1.0*(interaction.evidence_count-2)/2
        if key in pair_to_score:
            pair_to_score[key] += score
        else:
            pair_to_score[key] = score
        if key in pair_to_interactions:
            pair_to_interactions[key].append(interaction)
        else:
            pair_to_interactions[key] = [interaction]
    # For each admitted locus, record every score tier it qualifies for,
    # based on its pair score with the query locus.
    min_edge_cutoff = max(pair_to_score.values())
    score_to_bioent_ids = dict([(i, set()) for i in range(0, min_edge_cutoff+1)])
    for bioent_id in bioent_ids_in_use:
        if bioent_id < locus_id:
            score = pair_to_score[(bioent_id, locus_id)]
        elif bioent_id > locus_id:
            score = pair_to_score[(locus_id, bioent_id)]
        else:
            # The query locus itself qualifies for every tier.
            score = min_edge_cutoff+1
        for i in range(0, score):
            score_to_bioent_ids[i].add(bioent_id)
    # Lower the edge cutoff until the next tier would exceed edge_max.
    pairs_in_use = set()
    while len([x for x,y in pair_to_score.iteritems() if y>=min_edge_cutoff and x[0] in score_to_bioent_ids[min_edge_cutoff] and x[1] in score_to_bioent_ids[min_edge_cutoff]]) < edge_max and min_edge_cutoff > 0:
        pairs_in_use.update([x for x,y in pair_to_score.iteritems() if y>=min_edge_cutoff and x[0] in score_to_bioent_ids[min_edge_cutoff] and x[1] in score_to_bioent_ids[min_edge_cutoff]])
        min_edge_cutoff -= 1
    new_bioent_ids_in_use = score_to_bioent_ids[min_edge_cutoff+1]
    id_to_nodes = {}
    id_to_nodes.update([(x.id, create_lsp_node(x, x.id==locus_id)) for x in DBSession.query(Locus).filter(Locus.id.in_(new_bioent_ids_in_use)).all()])
    # Build one edge per (pair, interaction type); counts capped at 5.
    edges = []
    for bioent1_id, bioent2_id in pairs_in_use:
        interaction_types = [x[0] for x in bioent_id_to_interactor_ids[bioent1_id] & bioent_id_to_interactor_ids[bioent2_id]]
        interaction_type_to_count = {}
        for interaction_type in interaction_types:
            if interaction_type in interaction_type_to_count:
                interaction_type_to_count[interaction_type] += 1
            else:
                interaction_type_to_count[interaction_type] = 1
        if (bioent1_id, bioent2_id) in pair_to_interactions:
            # Direct interactions override the shared-interactor tally.
            for interaction in pair_to_interactions[(bioent1_id, bioent2_id)]:
                interaction_type_to_count[interaction.interaction_type] = interaction.evidence_count-2
        for interaction_type, count in interaction_type_to_count.iteritems():
            if bioent1_id != bioent2_id:
                edges.append(create_lsp_edge(bioent1_id, bioent2_id, interaction_type, min(count, 5)))
    #Pick out interactors to highlight
    interactor_ids_in_use = set()
    for bioent_id in new_bioent_ids_in_use:
        interactor_ids_in_use.update(bioent_id_to_interactor_ids[bioent_id])
    # Score each interactor by the fraction of its loci that made the graph
    # (zero unless at least two did).
    interactor_id_to_score = dict()
    for interactor_id, bioent_ids in interactor_to_bioent_ids.iteritems():
        overlap = len(bioent_ids & new_bioent_ids_in_use)
        if overlap > 1:
            score = 1.0*overlap/len(bioent_ids)
        else:
            score = 0
        interactor_id_to_score[interactor_id] = score
    top_interactors = [x for x in sorted(interactor_ids_in_use, key=lambda x: interactor_id_to_score[x], reverse=True)][:20]
    top_bioitems = []
    top_bioconcepts = []
    for interactor_id in top_interactors:
        if interactor_id_to_score[interactor_id] > 0:
            if interactor_id in bioconcept_ids:
                top_bioconcepts.append(interactor_id)
            elif interactor_id in bioitem_ids:
                top_bioitems.append(interactor_id)
    top_bioconcept_info = []
    top_bioitem_info = []
    if len(top_bioconcepts) > 0:
        top_bioconcept_info.extend([get_class_too(x) for x in DBSession.query(Bioconcept).filter(Bioconcept.id.in_([x[1] for x in top_bioconcepts])).all()])
    if len(top_bioitems) > 0:
        top_bioitem_info.extend([get_class_too(x) for x in DBSession.query(Bioitem).filter(Bioitem.id.in_([x[1] for x in top_bioitems])).all()])
    # Tag nodes with the highlighted interactors they carry.
    for interactor_id in top_bioconcepts:
        for bioent_id in interactor_to_bioent_ids[interactor_id]:
            if bioent_id in id_to_nodes:
                id_to_nodes[bioent_id]['data']['BIOCONCEPT' + str(interactor_id[1])] = True
    for interactor_id in top_bioitems:
        for bioent_id in interactor_to_bioent_ids[interactor_id]:
            if bioent_id in id_to_nodes:
                id_to_nodes[bioent_id]['data']['BIOITEM' + str(interactor_id[1])] = True
    return {'nodes': id_to_nodes.values(), 'edges': edges, 'top_bioconcepts': top_bioconcept_info, 'top_bioitems': top_bioitem_info}
def make_interaction_graph(locus_ids, interaction_cls, interaction_type, min_evidence_count=0):
    """Assemble a {'nodes', 'edges'} graph for interactions among the given loci.

    Edges come from get_interactions_among (one per deduplicated pair);
    nodes are the Locus records for every requested id.
    """
    interactions = get_interactions_among(locus_ids, interaction_cls, interaction_type, min_evidence_count)
    edges = [create_interaction_graph_edge(interaction) for interaction in interactions]
    loci = DBSession.query(Locus).filter(Locus.id.in_(locus_ids))
    nodes = [create_interaction_graph_node(locus) for locus in loci]
    return {'nodes': list(nodes), 'edges': edges}
def make_graph(bioent_id, interaction_cls, interaction_type, bioentity_type, node_max=50, edge_max=100, bioent_max=30, interactor_max=30):
    """Build a bipartite bioentity/interactor graph centred on bioent_id.

    Starting from the interactors of the focus bioentity, pulls every
    interaction of the same type touching those interactors, then raises an
    integer cutoff until the graph fits within node_max/edge_max/bioent_max/
    interactor_max. Returns a dict with 'nodes', 'edges', 'max_cutoff' and
    'min_cutoff' (empty lists and zero cutoffs when nothing survives).
    """
    # Interactions of the requested type on the focus bioentity.
    interactions = DBSession.query(interaction_cls).filter_by(bioentity_id=bioent_id).filter_by(interaction_type=interaction_type).all()
    interactor_ids = set([x.interactor_id for x in interactions])
    # Lookup tables built while scanning all relevant interactions.
    id_to_bioentity = {}
    id_to_interactor = {}
    interactor_ids_to_bioent_ids = {}
    bioent_ids_to_interactor_ids = {}
    all_relevant_interactions = []
    if len(interactor_ids) > 0:
        # Every interaction (any bioentity) that touches one of the focus
        # bioentity's interactors; eager-load both endpoints.
        all_relevant_interactions = DBSession.query(interaction_cls).filter_by(interaction_type=interaction_type).filter(interaction_cls.interactor_id.in_(interactor_ids)).options(joinedload('bioentity'), joinedload('interactor')).all()
    for interaction in all_relevant_interactions:
        bioentity_id = interaction.bioentity_id
        interactor_id = interaction.interactor_id
        id_to_bioentity[bioentity_id] = interaction.bioentity
        id_to_interactor[interactor_id] = interaction.interactor
        # Only bioentities of the requested class participate in the graph.
        if interaction.bioentity.class_type == bioentity_type:
            if interactor_id in interactor_ids_to_bioent_ids:
                interactor_ids_to_bioent_ids[interactor_id].add(bioentity_id)
            else:
                interactor_ids_to_bioent_ids[interactor_id] = {bioentity_id}
            if bioentity_id in bioent_ids_to_interactor_ids:
                bioent_ids_to_interactor_ids[bioentity_id].add(interactor_id)
            else:
                bioent_ids_to_interactor_ids[bioentity_id] = {interactor_id}
    # Initial (cutoff=1) graph size before any pruning.
    cutoff = 1
    node_count = len(bioent_ids_to_interactor_ids) + len(interactor_ids_to_bioent_ids)
    edge_count = len(all_relevant_interactions)
    bioent_count = len(bioent_ids_to_interactor_ids)
    interactor_count = len(interactor_ids_to_bioent_ids)
    # Bioentities with at least `cutoff` interactors; interactors shared by
    # more than one surviving bioentity.
    bioent_ids_in_use = set([x for x, y in bioent_ids_to_interactor_ids.iteritems() if len(y) >= cutoff])
    interactor_ids_in_use = set([x for x, y in interactor_ids_to_bioent_ids.iteritems() if len(y & bioent_ids_in_use) > 1])
    interactions_in_use = [x for x in all_relevant_interactions]
    # Raise the cutoff until all four size limits are satisfied.
    while node_count > node_max or edge_count > edge_max or bioent_count > bioent_max or interactor_count > interactor_max:
        cutoff = cutoff + 1
        bioent_ids_in_use = set([x for x, y in bioent_ids_to_interactor_ids.iteritems() if len(y) >= cutoff])
        interactor_ids_in_use = set([x for x, y in interactor_ids_to_bioent_ids.iteritems() if len(y & bioent_ids_in_use) > 1])
        interactions_in_use = [x for x in all_relevant_interactions if x.bioentity_id in bioent_ids_in_use and x.interactor_id in interactor_ids_in_use]
        node_count = len(bioent_ids_in_use) + len(interactor_ids_in_use)
        edge_count = len(interactions_in_use)
        bioent_count = len(bioent_ids_in_use)
        interactor_count = len(interactor_ids_in_use)
    # Recomputed once more after the loop; when the loop body never ran this
    # narrows the initial full interaction list to the in-use endpoints.
    interactions_in_use = [x for x in all_relevant_interactions if x.bioentity_id in bioent_ids_in_use and x.interactor_id in interactor_ids_in_use]
    if bioent_count > 0 and interactor_count > 0:
        edges = [create_interaction_edge(interaction, interaction_type) for interaction in interactions_in_use]
        # Score each bioentity by how many in-use interactors it touches;
        # the focus bioentity is forced to 0 so it never dominates.
        bioent_to_score = dict({(x, len(bioent_ids_to_interactor_ids[x]&interactor_ids_in_use)) for x in bioent_ids_in_use})
        bioent_to_score[bioent_id] = 0
        id_to_nodes = dict([(x, create_bioent_node(id_to_bioentity[x], x==bioent_id, bioent_to_score[x])) for x in bioent_ids_in_use])
        # Interactor nodes take the best score among their in-use bioentities.
        id_to_nodes.update([(x, create_interactor_node(id_to_interactor[x], max(bioent_to_score[y] for y in interactor_ids_to_bioent_ids[x]&bioent_ids_in_use))) for x in interactor_ids_in_use])
        max_cutoff = max(bioent_to_score.values())
        id_to_nodes[bioent_id]['data']['gene_count'] = max_cutoff
        return {'nodes': id_to_nodes.values(), 'edges': edges, 'max_cutoff': max_cutoff, 'min_cutoff':cutoff if len(bioent_ids_in_use) == 1 else min([bioent_to_score[x] for x in bioent_ids_in_use if x != bioent_id])}
    else:
        # Pruning removed everything: return an empty graph.
        return {'nodes':[], 'edges':[], 'max_cutoff':0, 'min_cutoff':0}
def make_graph(bioent_id):
    """Build a genetic+physical interaction neighbourhood graph for a bioentity.

    Buckets neighbours and neighbour-neighbour ("tangent") interactions by
    evidence count (capped at 10), then lowers an evidence cutoff to keep the
    graph under roughly 100 nodes and 250 edges. Returns a dict with 'nodes',
    'edges' and the evidence cutoffs used.

    NOTE(review): this redefines the earlier make_graph in the same module,
    shadowing it at import time — confirm this is intentional.
    """
    # Evidence counts for this bioentity's genetic and physical interactions,
    # keyed by the neighbouring interactor id.
    neighbor_id_to_genevidence_count = dict([(x.interactor_id, x.evidence_count) for x in DBSession.query(Bioentityinteraction).filter_by(interaction_type='GENINTERACTION').filter_by(bioentity_id=bioent_id).all()])
    neighbor_id_to_physevidence_count = dict([(x.interactor_id, x.evidence_count) for x in DBSession.query(Bioentityinteraction).filter_by(interaction_type='PHYSINTERACTION').filter_by(bioentity_id=bioent_id).all()])
    all_neighbor_ids = set()
    all_neighbor_ids.update(neighbor_id_to_genevidence_count.keys())
    all_neighbor_ids.update(neighbor_id_to_physevidence_count.keys())
    max_union_count = 0
    max_phys_count = 0
    max_gen_count = 0
    # Index 0..10: neighbours bucketed by (capped) evidence count.
    evidence_count_to_neighbors = [set() for _ in range(11)]
    evidence_count_to_genetic = [set() for _ in range(11)]
    evidence_count_to_physical = [set() for _ in range(11)]
    for neighbor_id in all_neighbor_ids:
        # Counts are capped at 10 so they can index the bucket lists.
        genevidence_count = min(10, 0 if neighbor_id not in neighbor_id_to_genevidence_count else neighbor_id_to_genevidence_count[neighbor_id])
        physevidence_count = min(10, 0 if neighbor_id not in neighbor_id_to_physevidence_count else neighbor_id_to_physevidence_count[neighbor_id])
        gen_and_phys = min(10, max(genevidence_count, physevidence_count))
        # Running maxima (equivalent to max(...) but written as conditionals).
        max_gen_count = max_gen_count if genevidence_count <= max_gen_count else genevidence_count
        max_phys_count = max_phys_count if physevidence_count <= max_phys_count else physevidence_count
        max_union_count = max_union_count if gen_and_phys <= max_union_count else gen_and_phys
        evidence_count_to_neighbors[gen_and_phys].add(neighbor_id)
        evidence_count_to_genetic[genevidence_count].add(neighbor_id)
        evidence_count_to_physical[physevidence_count].add(neighbor_id)
    #Apply 100 node cutoff
    # Lower the evidence threshold until adding the next bucket would exceed
    # ~100 neighbours (or the threshold reaches 0).
    min_evidence_count = 10
    usable_neighbor_ids = set()
    while len(usable_neighbor_ids) + len(evidence_count_to_neighbors[min_evidence_count]) < 100 and min_evidence_count > 0:
        usable_neighbor_ids.update(evidence_count_to_neighbors[min_evidence_count])
        min_evidence_count = min_evidence_count - 1
    # "Tangents" are interactions among the usable neighbours themselves,
    # keyed by the (bioentity_id, interactor_id) pair.
    tangent_to_genevidence_count = dict([((x.bioentity_id, x.interactor_id), x.evidence_count) for x in get_interactions_among(usable_neighbor_ids, Bioentityinteraction, 'GENINTERACTION')])
    tangent_to_physevidence_count = dict([((x.bioentity_id, x.interactor_id), x.evidence_count) for x in get_interactions_among(usable_neighbor_ids, Bioentityinteraction, 'PHYSINTERACTION')])
    evidence_count_to_phys_tangents = [set() for _ in range(11)]
    evidence_count_to_gen_tangents = [set() for _ in range(11)]
    for tangent, evidence_count in tangent_to_genevidence_count.iteritems():
        bioent1_id, bioent2_id = tangent
        # Skip edges touching the focus bioentity; those are direct edges.
        if bioent1_id != bioent_id and bioent2_id != bioent_id:
            # A tangent is bucketed by the weakest of: either endpoint's best
            # direct evidence to the focus, and the tangent's own evidence.
            bioent1_count = max(0 if bioent1_id not in neighbor_id_to_genevidence_count else neighbor_id_to_genevidence_count[bioent1_id], 0 if bioent1_id not in neighbor_id_to_physevidence_count else neighbor_id_to_physevidence_count[bioent1_id])
            bioent2_count = max(0 if bioent2_id not in neighbor_id_to_genevidence_count else neighbor_id_to_genevidence_count[bioent2_id], 0 if bioent2_id not in neighbor_id_to_physevidence_count else neighbor_id_to_physevidence_count[bioent2_id])
            index = min(10, bioent1_count, bioent2_count, evidence_count)
            evidence_count_to_gen_tangents[index].add(tangent)
    for tangent, evidence_count in tangent_to_physevidence_count.iteritems():
        bioent1_id, bioent2_id = tangent
        if bioent1_id != bioent_id and bioent2_id != bioent_id:
            # Same bucketing rule as the genetic tangents above.
            bioent1_count = max(0 if bioent1_id not in neighbor_id_to_genevidence_count else neighbor_id_to_genevidence_count[bioent1_id], 0 if bioent1_id not in neighbor_id_to_physevidence_count else neighbor_id_to_physevidence_count[bioent1_id])
            bioent2_count = max(0 if bioent2_id not in neighbor_id_to_genevidence_count else neighbor_id_to_genevidence_count[bioent2_id], 0 if bioent2_id not in neighbor_id_to_physevidence_count else neighbor_id_to_physevidence_count[bioent2_id])
            index = min(10, bioent1_count, bioent2_count, evidence_count)
            evidence_count_to_phys_tangents[index].add(tangent)
    #Apply 250 edge cutoff
    old_min_evidence_count = min_evidence_count
    min_evidence_count = 10
    edges = []
    # The focus node is always present, flagged True.
    nodes = [create_node(DBSession.query(Bioentity).filter_by(id=bioent_id).first(), True, max_gen_count, max_phys_count)]
    # Lower the cutoff, emitting each bucket's nodes/edges, until adding the
    # next bucket would exceed ~250 edges or we reach the node-cutoff level.
    while len(edges) + len(evidence_count_to_physical[min_evidence_count]) + len(evidence_count_to_genetic[min_evidence_count]) + len(evidence_count_to_phys_tangents[min_evidence_count]) + len(evidence_count_to_gen_tangents[min_evidence_count]) < 250 and min_evidence_count > old_min_evidence_count:
        for neighbor_id in evidence_count_to_neighbors[min_evidence_count]:
            physical_count = 0 if neighbor_id not in neighbor_id_to_physevidence_count else neighbor_id_to_physevidence_count[neighbor_id]
            genetic_count = 0 if neighbor_id not in neighbor_id_to_genevidence_count else neighbor_id_to_genevidence_count[neighbor_id]
            # NOTE(review): one query per neighbour (N+1) — candidate for a
            # single in_() query, left unchanged here.
            nodes.append(create_node(DBSession.query(Bioentity).filter_by(id=neighbor_id).first(), False, genetic_count, physical_count))
        for genetic_id in evidence_count_to_genetic[min_evidence_count]:
            genevidence_count = neighbor_id_to_genevidence_count[genetic_id]
            edges.append(create_edge(bioent_id, genetic_id, genevidence_count, 'GENETIC'))
        for physical_id in evidence_count_to_physical[min_evidence_count]:
            physevidence_count = neighbor_id_to_physevidence_count[physical_id]
            edges.append(create_edge(bioent_id, physical_id, physevidence_count, 'PHYSICAL'))
        for tangent in evidence_count_to_gen_tangents[min_evidence_count]:
            bioent1_id, bioent2_id = tangent
            gen_ev_count = tangent_to_genevidence_count[tangent]
            edges.append(create_edge(bioent1_id, bioent2_id, gen_ev_count, 'GENETIC'))
        for tangent in evidence_count_to_phys_tangents[min_evidence_count]:
            bioent1_id, bioent2_id = tangent
            phys_ev_count = tangent_to_physevidence_count[tangent]
            edges.append(create_edge(bioent1_id, bioent2_id, phys_ev_count, 'PHYSICAL'))
        min_evidence_count = min_evidence_count - 1
    # +1 because the loop decrements once past the last bucket it emitted.
    return {'nodes': nodes, 'edges': edges, 'min_evidence_cutoff':min_evidence_count+1, 'max_evidence_cutoff':max_union_count, 'max_phys_cutoff': max_phys_count, 'max_gen_cutoff': max_gen_count}
def make_contig(contig_id):
    """Return the JSON representation of the contig with the given id.

    Returns None when no contig matches contig_id. (The original called
    to_json() unconditionally, so a missing row raised AttributeError on
    None instead of signalling "not found".)
    """
    contig = DBSession.query(Contig).filter(Contig.id == contig_id).first()
    if contig is None:
        return None
    return contig.to_json()