class ClusterCladeEnrichment(db.Model):
    """Stores, per co-expression cluster, how strongly a phylogenetic clade is
    enriched among the cluster's gene families (hypergeometric test, BH/FDR
    corrected)."""
    __tablename__ = 'cluster_clade_enrichment'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    cluster_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    clade_id = db.Column(db.Integer, db.ForeignKey('clades.id', ondelete='CASCADE'))
    gene_family_method_id = db.Column(
        db.Integer, db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'))

    gene_family_method = db.relationship('GeneFamilyMethod',
                                         backref=db.backref('clade_enrichment',
                                                            lazy='dynamic',
                                                            passive_deletes=True),
                                         lazy='joined')

    cluster = db.relationship('CoexpressionCluster',
                              backref=db.backref('clade_enrichment',
                                                 lazy='dynamic',
                                                 passive_deletes=True),
                              lazy='joined')

    clade = db.relationship('Clade',
                            backref=db.backref('enriched_clusters',
                                               lazy='dynamic',
                                               passive_deletes=True),
                            lazy='joined')

    # Counts required to calculate the enrichment, stored here for quick access
    cluster_count = db.Column(db.Integer)
    cluster_size = db.Column(db.Integer)
    clade_count = db.Column(db.Integer)
    clade_size = db.Column(db.Integer)

    # Enrichment score (log-transformed), p-value and corrected p-value.
    # Calculated using the hypergeometric distribution with FDR (aka. BH) correction.
    enrichment = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    @property
    def cluster_percentage(self):
        """Share (in percent) of the cluster occupied by the clade's families."""
        return 100 * self.cluster_count / self.cluster_size

    @property
    def genome_percentage(self):
        """Share (in percent) of the clade's families found in this cluster."""
        return 100 * self.clade_count / self.clade_size
class CoexpressionClusterSimilarity(db.Model):
    """Pairwise similarity between two co-expression clusters, scored by the
    Jaccard index of their gene-family content (for one family method)."""
    __tablename__ = 'coexpression_cluster_similarity'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    # Both ends of the pair point at coexpression_clusters; a pair is stored
    # directed (source -> target).
    source_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    target_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    # Family method used to compute the overlap; indexed as queries filter on it.
    gene_family_method_id = db.Column('gene_family_method_id', db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    jaccard_index = db.Column(db.Float, index=True)
    p_value = db.Column(db.Float, index=True)
    corrected_p_value = db.Column(db.Float, index=True)

    # Two relationships to the same table require explicit foreign_keys.
    source = db.relationship('CoexpressionCluster',
                             backref=db.backref('similarity_sources',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined',
                             foreign_keys=[source_id])

    target = db.relationship('CoexpressionCluster',
                             backref=db.backref('similarity_targets',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined',
                             foreign_keys=[target_id])

    gene_family_method = db.relationship('GeneFamilyMethod',
                                         backref=db.backref(
                                             'CoexpressionClusterSimilarities',
                                             passive_deletes=True),
                                         lazy='joined')

    @staticmethod
    def empty_table():
        """
        Delete all content from this table. Use carefully !
        """
        CoexpressionClusterSimilarity.query.delete()
class SequenceSequenceCladeAssociation(db.Model):
    """Records, for a pair of sequences in a reconciled gene tree, the clade of
    their last common ancestor and whether that node is a duplication or a
    speciation event."""
    __tablename__ = 'sequence_sequence_clade'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_one_id = db.Column(db.Integer,
                                db.ForeignKey('sequences.id', ondelete='CASCADE'))
    sequence_two_id = db.Column(db.Integer,
                                db.ForeignKey('sequences.id', ondelete='CASCADE'))
    clade_id = db.Column(db.Integer,
                         db.ForeignKey('clades.id', ondelete='CASCADE'),
                         index=True)
    tree_id = db.Column(db.Integer,
                        db.ForeignKey('trees.id', ondelete='CASCADE'),
                        index=True)

    # 1 = duplication node, 0 = speciation node
    duplication = db.Column(db.SmallInteger)
    # Only filled in for duplication nodes
    duplication_consistency_score = db.Column(db.Float)

    tree = db.relationship('Tree',
                           lazy='joined',
                           backref=db.backref('sequence_sequence_clade_associations',
                                              lazy='dynamic',
                                              passive_deletes=True))

    clade = db.relationship('Clade',
                            lazy='joined',
                            backref=db.backref('sequence_sequence_clade_associations',
                                               lazy='dynamic',
                                               passive_deletes=True))

    def __str__(self):
        return "%d" % self.id

    @property
    def readable_type(self):
        """
        Returns type (duplication or speciation) in a human-readable format

        :return: string Duplication or Speciation
        """
        if self.duplication:
            return "Duplication"
        return "Speciation"

    @property
    def readable_score(self):
        """
        Returns the duplication consistency score in a nicer format

        :return: string with the score in %.3f format, or "Not available"
        for speciation nodes.
        """
        if not self.duplication:
            return "Not available"
        return "%.3f" % self.duplication_consistency_score
class SequenceFamilyAssociation(db.Model):
    """Association table linking a sequence to the gene family it belongs to."""
    __tablename__ = 'sequence_family'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    gene_family_id = db.Column(
        db.Integer, db.ForeignKey('gene_families.id', ondelete='CASCADE'))

    sequence = db.relationship('Sequence',
                               backref=db.backref('family_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    family = db.relationship('GeneFamily',
                             backref=db.backref('sequence_associations',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined')
class FamilyGOAssociation(db.Model):
    """Association table linking a gene family to a GO annotation."""
    __tablename__ = 'family_go'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    gene_family_id = db.Column(
        db.Integer, db.ForeignKey('gene_families.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    gene_family = db.relationship('GeneFamily',
                                  backref=db.backref('go_annotations',
                                                     lazy='dynamic',
                                                     passive_deletes=True),
                                  lazy='joined')

    go_term = db.relationship('GO',
                              backref=db.backref('family_associations',
                                                 lazy='dynamic',
                                                 passive_deletes=True),
                              lazy='joined')
class SequenceInterproAssociation(db.Model):
    """Association table linking a sequence to an InterPro domain, including
    the position (start/stop) of the domain on the sequence."""
    __tablename__ = 'sequence_interpro'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    interpro_id = db.Column(db.Integer,
                            db.ForeignKey('interpro.id', ondelete='CASCADE'))
    # Coordinates of the domain hit on the sequence; None when unknown
    start = db.Column(db.Integer, default=None)
    stop = db.Column(db.Integer, default=None)

    sequence = db.relationship('Sequence',
                               backref=db.backref('interpro_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    domain = db.relationship('Interpro',
                             backref=db.backref('sequence_associations',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined')
class SequenceCoexpressionClusterAssociation(db.Model):
    """Association table linking a sequence (through its expression probe) to a
    co-expression cluster."""
    __tablename__ = 'sequence_coexpression_cluster'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    # Name of the microarray probe / transcript identifier used in the network
    probe = db.Column(db.String(50), index=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    coexpression_cluster_id = db.Column(
        db.Integer, db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))

    sequence = db.relationship('Sequence',
                               backref=db.backref(
                                   'coexpression_cluster_associations',
                                   lazy='dynamic',
                                   passive_deletes=True),
                               lazy='joined')

    coexpression_cluster = db.relationship('CoexpressionCluster',
                                           backref=db.backref(
                                               'sequence_associations',
                                               lazy='dynamic',
                                               passive_deletes=True),
                                           lazy='joined')
class SequenceGOAssociation(db.Model):
    """Association table linking a sequence to a GO term, together with the GO
    evidence code, the annotation source and optional prediction details."""
    __tablename__ = 'sequence_go'
    __table_args__ = {'extend_existing': True}
    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    # Standard GO evidence codes (experimental, curated and electronic)
    evidence = db.Column(
        db.Enum('EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'ISS', 'ISO', 'ISA',
                'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD', 'RCA', 'TAS', 'NAS',
                'IC', 'ND', 'IEA', name='evidence'))
    source = db.Column(db.Text)

    # Flag + serialized details for annotations added by a prediction pipeline
    predicted = db.Column(db.SmallInteger, default=False)
    prediction_data = db.Column(db.Text)

    sequence = db.relationship('Sequence',
                               backref=db.backref('go_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    go = db.relationship('GO',
                         backref=db.backref('sequence_associations',
                                            lazy='dynamic',
                                            passive_deletes=True),
                         lazy='joined')

    def __init__(self, sequence_id, go_id, evidence, source,
                 predicted=False, prediction_data=None):
        self.sequence_id = sequence_id
        self.go_id = go_id
        self.evidence = evidence
        self.source = source
        self.predicted = predicted
        self.prediction_data = prediction_data

    @property
    def data(self):
        """
        Property to get the information in the prediction_data as a dict.
        Useful for showing these values in e.g. jinja2 templates

        NOTE(review): raises TypeError when prediction_data is None (i.e. for
        non-predicted annotations) — callers should only use this when
        predicted is set.

        :return: de-serialized prediction_data (json)
        """
        return json.loads(self.prediction_data)
class ExpressionNetworkMethod(db.Model):
    """Describes one co-expression network (how it was built, for which
    species) and implements the ECC (Expression Context Conservation)
    computation between networks."""
    __tablename__ = 'expression_network_methods'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id'), index=True)
    description = db.Column(db.Text)
    # Whether edges carry HRR ranks or raw weights
    edge_type = db.Column(db.Enum("rank", "weight", name='edge_type'))
    # Cached number of probes, maintained by update_count()
    probe_count = db.Column(db.Integer)

    hrr_cutoff = db.Column(db.Integer)
    pcc_cutoff = db.Column(db.Float)
    enable_second_level = db.Column(db.SmallInteger)

    probes = db.relationship('ExpressionNetwork',
                             backref=db.backref('method', lazy='joined'),
                             lazy='dynamic',
                             cascade="all, delete-orphan",
                             passive_deletes=True)

    clustering_methods = db.relationship('CoexpressionClusteringMethod',
                                         backref='network_method',
                                         lazy='dynamic',
                                         cascade='all, delete-orphan',
                                         passive_deletes=True)

    def __init__(self, species_id, description, edge_type="rank"):
        self.species_id = species_id
        self.description = description
        self.edge_type = edge_type
        self.enable_second_level = False

    def __repr__(self):
        return str(self.id) + ". " + self.description + ' [' + str(self.species) + ']'

    @staticmethod
    def update_count():
        """
        To avoid long count queries the number of networks for each method can
        be precalculated and stored in the database using this function
        """
        methods = ExpressionNetworkMethod.query.all()

        for m in methods:
            m.probe_count = m.probes.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    @benchmark
    def calculate_ecc(network_method_ids, gene_family_method_id, max_size=100):
        """
        Function to calculate the ECC scores in and between genes of different
        networks.

        ORM free method for speed !

        :param network_method_ids: array of networks (using their internal id !)
        to compare
        :param gene_family_method_id: internal id of the type of family methods
        to be used for the comparison
        :param max_size: cap on the neighborhood-family count used when looking
        up permutation thresholds
        """
        network_families = {}
        sequence_network = {}
        sequence_network_method = {}
        sequence_family = {}
        family_sequence = {}

        # Get all the network information and store in dictionaries keyed on
        # the (internal) sequence id
        for n in network_method_ids:
            current_network = db.engine.execute(db.select([ExpressionNetwork.__table__.c.sequence_id,
                                                           ExpressionNetwork.__table__.c.network,
                                                           ExpressionNetwork.__table__.c.method_id]).
                                                where(ExpressionNetwork.__table__.c.method_id == n).
                                                where(ExpressionNetwork.__table__.c.sequence_id.isnot(None))
                                                ).fetchall()

            for sequence, network, network_method_id in current_network:
                if sequence is not None:
                    sequence_network[int(sequence)] = network
                    sequence_network_method[int(sequence)] = int(network_method_id)

        # Get family data (for the selected family method) and store in
        # dictionaries for lookups in both directions
        current_families = db.engine.execute(db.select([SequenceFamilyAssociation.__table__.c.sequence_id,
                                                        SequenceFamilyAssociation.__table__.c.gene_family_id,
                                                        GeneFamily.__table__.c.method_id]).
                                             select_from(SequenceFamilyAssociation.__table__.join(GeneFamily.__table__)).
                                             where(GeneFamily.__table__.c.method_id == gene_family_method_id)
                                             ).fetchall()

        for sequence, family, method in current_families:
            sequence_family[int(sequence)] = int(family)

            if family not in family_sequence.keys():
                family_sequence[int(family)] = []

            family_sequence[int(family)].append(int(sequence))

        # Create a dict (key = network) with the families present in that network
        # Families that occur multiple times should be present multiple times as
        # this is used to set thresholds later !
        for sequence, network_method in sequence_network_method.items():
            # ignore sequences without a family, ideally this shouldn't happen
            if network_method not in network_families.keys():
                network_families[network_method] = []
            if sequence in sequence_family.keys():
                family = sequence_family[sequence]
                network_families[network_method].append(family)

        # Determine threshold and p-value:
        # a background model is computed for each combination of networks; an
        # ECC score needs to beat 95 % of the randomly found values to be
        # considered significant
        thresholds = {}
        print("Starting permutation tests")
        for n in network_method_ids:
            thresholds[n] = {}
            for m in network_method_ids:
                thresholds[n][m] = ExpressionNetworkMethod.__set_thresholds(network_families[n],
                                                                            network_families[m],
                                                                            max_size=max_size)

        # Data loaded, start calculating ECCs for every intra-family pair
        new_ecc_scores = []

        for family, sequences in family_sequence.items():
            for i in range(len(sequences) - 1):
                query = sequences[i]
                for j in range(i+1, len(sequences)):
                    target = sequences[j]
                    if query in sequence_network.keys() and target in sequence_network.keys() and query != target:
                        # Ignore genes with overlapping neighborhoods
                        if not ExpressionNetworkMethod.__neighborhoods_overlap(sequence_network[query], sequence_network[target]):
                            ecc, significant = ExpressionNetworkMethod.__ecc(sequence_network[query],
                                                                             sequence_network[target],
                                                                             sequence_family,
                                                                             thresholds[sequence_network_method[query]][sequence_network_method[target]],
                                                                             family,
                                                                             max_size=max_size)
                            if significant:
                                new_ecc_scores.append({
                                    'query_id': query,
                                    'target_id': target,
                                    'ecc': ecc,
                                    'gene_family_method_id': gene_family_method_id,
                                    'query_network_method_id': sequence_network_method[query],
                                    'target_network_method_id': sequence_network_method[target],
                                })

                                # add reciprocal relation
                                new_ecc_scores.append({
                                    'query_id': target,
                                    'target_id': query,
                                    'ecc': ecc,
                                    'gene_family_method_id': gene_family_method_id,
                                    'query_network_method_id': sequence_network_method[target],
                                    'target_network_method_id': sequence_network_method[query],
                                })

                                # Flush to the database in batches of ~400
                                if len(new_ecc_scores) > 400:
                                    db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)
                                    new_ecc_scores = []

        # Write the remaining scores
        db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)

    @staticmethod
    def __neighborhoods_overlap(neighborhood_a, neighborhood_b):
        """
        Checks if two genes have overlapping networks

        :param neighborhood_a: neighborhood for first gene (string as stored in database)
        :param neighborhood_b: neighborhood for second gene (string as stored in database)
        :return: Bool, true if networks overlap
        """
        genes_a = set([n['gene_id'] for n in json.loads(neighborhood_a) if n['gene_id'] is not None])
        genes_b = set([n['gene_id'] for n in json.loads(neighborhood_b) if n['gene_id'] is not None])

        return len(genes_a.intersection(genes_b)) > 0

    @staticmethod
    def __ecc(q_network, t_network, families, thresholds, query_family, max_size=30):
        """
        Takes the networks neighborhoods (as stored in the databases), extracts
        the genes and find the families for each gene. Next the ECC score is
        calculated.

        :param q_network: network for the query gene
        :param t_network: network for the target gene
        :param families: dictionary that links a sequence id (key) to a family id (value)
        :param thresholds: significance thresholds matrix from __set_thresholds
        :param query_family: name of the input gene family (excluded from both
        neighborhoods so the shared family does not inflate the score)
        :return: the ECC score for the two input neighborhoods given the
        families, and a boolean flag if this is significant
        """
        q_data = json.loads(q_network)
        t_data = json.loads(t_network)

        q_genes = [t['gene_id'] for t in q_data if t['gene_id'] is not None]
        t_genes = [t['gene_id'] for t in t_data if t['gene_id'] is not None]

        q_families = [families[q] for q in q_genes if q in families.keys() and families[q] != query_family]
        t_families = [families[t] for t in t_genes if t in families.keys() and families[t] != query_family]

        if len(q_families) == 0 or len(t_families) == 0:
            # No family information on either side: not scorable
            return 0.0, False
        else:
            ecc = jaccard(q_families, t_families)

            # Cap neighborhood sizes so the threshold lookup stays in range
            q_size = len(set(q_families)) if len(set(q_families)) < max_size else max_size
            t_size = len(set(t_families)) if len(set(t_families)) < max_size else max_size

            t = thresholds[q_size-1][t_size-1]

            return ecc, ecc > t

    @staticmethod
    @benchmark
    def __set_thresholds(families_a, families_b, max_size=30, iterations=1000, step=5):
        """
        Empirically determine (permutation test) thresholds for ECC

        :param families_a: families of species_a (list of internal family ids)
        :param families_b: families of species_b (list of internal family ids)
        :param max_size: maximum number of families (default = 30)
        :param iterations: number of permutations done
        :param step: step size (thresholds are computed on a coarse grid and
        replicated step times in each dimension)
        :return: matrix (list of lists) with the thresholds at various family sizes
        """
        thresholds = []

        for i in range(0, max_size, step):
            print("%d done" % i)
            new_threshholds = []
            for j in range(0, max_size, step):
                scores = []
                for _ in range(iterations):
                    if i+1 < len(families_a) and j+1 < len(families_b):
                        i_fams = random.sample(families_a, i+1)
                        j_fams = random.sample(families_b, j+1)
                        scores.append(jaccard(i_fams, j_fams))
                    else:
                        # Cannot calculate threshold with these families, add 1
                        scores.append(1)
                # TODO (maybe?): cutoff is hard coded here, replace ?
                # NOTE(review): scores[int(iterations*0.95)] takes the 95th
                # percentile of the sorted permutation scores.
                print(iterations, len(scores), scores)
                scores = sorted(scores)
                # Replicate the value step times so the matrix has one entry
                # per family size up to max_size
                for _ in range(step):
                    new_threshholds.append(scores[int(iterations*0.95)])
            # NOTE(review): the same row list object is appended step times;
            # harmless as the matrix is only read afterwards.
            for _ in range(step):
                thresholds.append(new_threshholds)
        return thresholds
class Species(db.Model):
    """A species in the database, with display settings (colors), cached
    counts (sequences/profiles/networks) and relationships to all
    species-scoped data."""
    __tablename__ = 'species'
    id = db.Column(db.Integer, primary_key=True)
    code = db.Column(db.String(50, collation=SQL_COLLATION), unique=True)
    name = db.Column(db.String(200, collation=SQL_COLLATION))
    data_type = db.Column(db.Enum('genome', 'transcriptome', name='data_type'))
    # Colors used when rendering this species in plots/pages
    color = db.Column(db.String(7), default="#C7C7C7")
    highlight = db.Column(db.String(7), default="#DEDEDE")
    # Cached counts, maintained by update_counts()
    sequence_count = db.Column(db.Integer)
    network_count = db.Column(db.Integer)
    profile_count = db.Column(db.Integer)
    description = db.Column(db.Text)

    sequences = db.relationship('Sequence', backref='species', lazy='dynamic',
                                cascade="all, delete-orphan", passive_deletes=True)
    networks = db.relationship('ExpressionNetworkMethod', backref='species',
                               lazy='dynamic', cascade="all, delete-orphan",
                               passive_deletes=True)
    profiles = db.relationship('ExpressionProfile', backref='species',
                               lazy='dynamic', cascade="all, delete-orphan",
                               passive_deletes=True)
    expression_specificities = db.relationship('ExpressionSpecificityMethod',
                                               backref='species', lazy='dynamic',
                                               cascade="all, delete-orphan",
                                               passive_deletes=True)
    condition_tissues = db.relationship('ConditionTissue', backref='species',
                                        lazy='dynamic',
                                        cascade="all, delete-orphan",
                                        passive_deletes=True)

    def __init__(self, code, name, data_type='genome', color="#C7C7C7",
                 highlight="#DEDEDE", description=None):
        self.code = code
        self.name = name
        self.data_type = data_type
        self.color = color
        self.highlight = highlight
        self.sequence_count = 0
        self.profile_count = 0
        self.network_count = 0
        self.description = description

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @property
    def has_interpro(self):
        """True if at least one sequence of this species has an InterPro domain."""
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_interpro import SequenceInterproAssociation

        domain = SequenceInterproAssociation.query.join(
            Sequence, Sequence.id == SequenceInterproAssociation.sequence_id).filter(
            Sequence.species_id == self.id).first()

        return domain is not None

    @property
    def has_go(self):
        """True if at least one sequence of this species has a GO annotation."""
        from conekt.models.sequences import Sequence
        from conekt.models.relationships.sequence_go import SequenceGOAssociation

        go = SequenceGOAssociation.query.join(
            Sequence, Sequence.id == SequenceGOAssociation.sequence_id).filter(
            Sequence.species_id == self.id).first()

        return go is not None

    @staticmethod
    def add(code, name, data_type='genome', color="#C7C7C7",
            highlight="#DEDEDE", description=None):
        """
        Add a species to the database if it is not present yet.

        :param code: short (unique) species code
        :param name: full species name
        :param data_type: 'genome' or 'transcriptome'
        :param color: default display color
        :param highlight: highlight display color
        :param description: optional description
        :return: internal id of the (existing or newly added) species
        """
        species = Species.query.filter_by(code=code).first()

        # species is already in the DB, return its id
        if species is not None:
            return species.id

        new_species = Species(code, name, data_type=data_type, color=color,
                              highlight=highlight, description=description)
        try:
            db.session.add(new_species)
            db.session.commit()
        except Exception as e:
            # BUG FIX: original used a bare `except:` calling db.rollback(),
            # which does not exist on the Flask-SQLAlchemy object — the
            # session is the object that can roll back.
            db.session.rollback()
            print(e)

        return new_species.id

    @staticmethod
    def update_counts():
        """
        To avoid long counts the number of sequences, profiles and networks
        can be precalculated and stored in the database using this function.
        """
        species = Species.query.all()

        for s in species:
            s.sequence_count = s.sequences.count()
            s.profile_count = s.profiles.count()
            s.network_count = s.networks.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
class CoexpressionClusteringMethod(db.Model):
    """Describes one clustering of a co-expression network and provides
    importers that build clusters (from neighborhoods, HCCA, or LSTrAP/MCL
    output) and write them to the database."""
    __tablename__ = 'coexpression_clustering_methods'
    id = db.Column(db.Integer, primary_key=True)
    network_method_id = db.Column(db.Integer,
                                  db.ForeignKey('expression_network_methods.id',
                                                ondelete='CASCADE'),
                                  index=True)
    method = db.Column(db.Text)
    # Cached number of clusters, maintained by update_counts()
    cluster_count = db.Column(db.Integer)

    clusters = db.relationship('CoexpressionCluster',
                               backref=db.backref('method', lazy='joined'),
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)

    @staticmethod
    def update_counts():
        """
        To avoid long counts the number of clusters per method can be
        precalculated and stored in the database using this function
        """
        methods = CoexpressionClusteringMethod.query.all()

        for m in methods:
            m.cluster_count = m.clusters.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def clusters_from_neighborhoods(method, network_method_id):
        """
        Turns each sequence's network neighborhood into one cluster (the
        sequence plus its neighbors) and stores the clusters in the database.

        :param method: description for the new clustering method
        :param network_method_id: id of the network to derive clusters from
        """
        probes = ExpressionNetwork.query.filter_by(
            method_id=network_method_id).all()  # Load all probes
        # NOTE(review): clusters is only ever assigned to (never appended via
        # the default factory), so a plain dict would behave identically.
        clusters = defaultdict(list)
        clusters_orm = {}
        sequence_to_probe = {}
        for p in probes:
            # Only consider probes linked with sequences
            if p.sequence_id is not None:
                sequence_to_probe[p.sequence_id] = p.probe
                neighborhood = json.loads(p.network)
                sequence_ids = [
                    n["gene_id"] for n in neighborhood
                    if "gene_id" in n.keys() and n["gene_id"] is not None
                ]

                # check if there are neighbors for this sequence
                if len(sequence_ids) > 0:
                    # cluster = the sequence itself plus all its neighbors
                    clusters[p.sequence.name] = [p.sequence_id] + sequence_ids

        # If there are valid clusters add them to the database
        if len(clusters) > 0:
            # Add new method first
            new_method = CoexpressionClusteringMethod()
            new_method.network_method_id = network_method_id
            new_method.method = method
            new_method.cluster_count = len(clusters)

            db.session.add(new_method)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Add Clusters (committing in batches of 400)
            for cluster in clusters.keys():
                clusters_orm[cluster] = CoexpressionCluster()
                clusters_orm[cluster].method_id = new_method.id
                clusters_orm[cluster].name = cluster
                db.session.add(clusters_orm[cluster])

                if len(clusters_orm) % 400 == 0:
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Add sequence cluster relations (committing every 20 clusters)
            for i, (cluster, members) in enumerate(clusters.items()):
                for sequence_id in members:
                    relation = SequenceCoexpressionClusterAssociation()
                    relation.sequence_id = sequence_id
                    relation.coexpression_cluster_id = clusters_orm[cluster].id
                    relation.probe = sequence_to_probe[
                        sequence_id] if sequence_id in sequence_to_probe.keys(
                    ) else None
                    db.session.add(relation)

                if i % 20 == 0:
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

    @staticmethod
    def build_hcca_clusters(method, network_method_id, step_size=3,
                            hrr_cutoff=30, min_cluster_size=40,
                            max_cluster_size=200):
        """
        method to build HCCA clusters for a certain network

        :param method: Name for the current clustering method
        :param network_method_id: ID for the network to cluster
        :param step_size: desired step_size for the HCCA algorithm
        :param hrr_cutoff: desired hrr_cutoff for the HCCA algorithm
        :param min_cluster_size: minimal cluster size
        :param max_cluster_size: maximum cluster size
        """
        network_data = {}
        sequence_probe = {}

        # Get network from DB
        print("Loading Network data from DB...", sep='')
        ExpressionNetworkMethod.query.get_or_404(
            network_method_id)  # Check if method exists

        probes = ExpressionNetwork.query.filter_by(
            method_id=network_method_id).all()  # Load all probes

        for p in probes:
            # Loop over probes and store hrr for all neighbors
            if p.sequence_id is not None:
                neighborhood = json.loads(p.network)
                network_data[p.sequence_id] = {
                    nb["gene_id"]: nb["hrr"]
                    for nb in neighborhood
                    if "gene_id" in nb.keys() and "hrr" in nb.keys()
                    and nb["gene_id"] is not None
                }
                sequence_probe[p.sequence_id] = p.probe

        # Double check edges are reciprocally defined
        # NOTE(review): this adds keys to network_data while iterating its
        # items(); in Python 3 that raises RuntimeError if a neighbor is
        # missing from the dict — confirm whether that branch can trigger.
        for sequence, data in network_data.items():
            for neighbor, score in data.items():
                if neighbor not in network_data.keys():
                    network_data[neighbor] = {sequence: score}
                else:
                    if sequence not in network_data[neighbor].keys():
                        network_data[neighbor][sequence] = score

        print("Done!\nStarting to build Clusters...\n")

        # Build clusters
        hcca_util = HCCA(step_size=step_size,
                         hrr_cutoff=hrr_cutoff,
                         min_cluster_size=min_cluster_size,
                         max_cluster_size=max_cluster_size)

        hcca_util.load_data(network_data)
        hcca_util.build_clusters()

        # Add new method to DB
        clusters = list(set([t[1] for t in hcca_util.clusters]))
        if len(clusters) > 0:
            print("Done building clusters, adding clusters to DB")

            # Add new method first
            new_method = CoexpressionClusteringMethod()
            new_method.network_method_id = network_method_id
            new_method.method = method
            new_method.cluster_count = len(clusters)

            db.session.add(new_method)
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Add cluster and store as dict
            cluster_dict = {}
            for c in clusters:
                cluster_dict[c] = CoexpressionCluster()
                cluster_dict[c].method_id = new_method.id
                cluster_dict[c].name = c
                db.session.add(cluster_dict[c])

            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

            # Link sequences to clusters
            for i, t in enumerate(hcca_util.clusters):
                gene_id, cluster_name, _ = t

                relation = SequenceCoexpressionClusterAssociation()
                relation.probe = sequence_probe[
                    gene_id] if gene_id in sequence_probe.keys() else None
                relation.sequence_id = gene_id
                relation.coexpression_cluster_id = cluster_dict[
                    cluster_name].id if cluster_name in cluster_dict.keys(
                ) else None

                if relation.coexpression_cluster_id is not None:
                    db.session.add(relation)

                if i > 0 and i % 400 == 0:
                    # Add relations in sets of 400
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
            # Add remaining relations
            try:
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)
        else:
            print("No clusters found! Not adding anything to DB !")

    @staticmethod
    def add_lstrap_coexpression_clusters(cluster_file, description, network_id,
                                         prefix='cluster_', min_size=10):
        """
        Adds MCL clusters, as produced by LSTrAP, to the database

        :param cluster_file: path to file with clusters
        :param description: description to add to database for this set of clusters
        :param network_id: network the clusters are based on
        :param prefix: prefix for individual clsuter names (default 'cluster_')
        :param min_size: minimal size of a cluster (default = 10)
        :return: ID of new clustering method
        """
        # get all sequences from the database and create a dictionary
        sequences = Sequence.query.all()

        sequence_dict = {}
        for member in sequences:
            sequence_dict[member.name.upper()] = member

        # add coexpression clustering method to the database
        clustering_method = CoexpressionClusteringMethod()
        clustering_method.network_method_id = network_id
        clustering_method.method = description

        try:
            db.session.add(clustering_method)
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
            quit()

        with open(cluster_file) as f:
            i = 1
            for line in f:
                probes = [p for p in line.strip().split()]
                # NOTE(review): replace('.1', '') strips the transcript suffix
                # but also removes '.1' occurring mid-name — confirm gene
                # identifiers cannot contain '.1' internally.
                genes = [p.replace('.1', '') for p in probes]
                cluster_id = "%s%04d" % (prefix, i)

                if len(probes) >= min_size:
                    i += 1

                    new_cluster = CoexpressionCluster()
                    new_cluster.method_id = clustering_method.id
                    new_cluster.name = cluster_id

                    db.session.add(new_cluster)
                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)
                        continue

                    for p, g in zip(probes, genes):
                        new_association = SequenceCoexpressionClusterAssociation()
                        new_association.probe = p
                        new_association.sequence_id = None
                        if g.upper() in sequence_dict.keys():
                            new_association.sequence_id = sequence_dict[
                                g.upper()].id
                        new_association.coexpression_cluster_id = new_cluster.id
                        db.session.add(new_association)

                    try:
                        db.session.commit()
                    except Exception as e:
                        db.session.rollback()
                        print(e)

        return clustering_method.id
class TreeMethod(db.Model):
    """Describes how a set of gene trees was built and implements the
    reconciliation of those trees against the species clades."""
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)
    description = db.Column(db.Text)
    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id',
                                                    ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Walks every binary node of every tree of this method, labels it with
        the clade of the species below it plus a duplication/speciation tag,
        stores all sequence-pair/clade associations, and writes the relabeled
        trees back as data_phyloxml.

        Numbered "N.====" prints are left-over debugging output.
        """
        print("\n1.====================Getting into function reconcile_trees")
        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        # Lookup tables: sequence name -> species code / internal id,
        # clade name -> member species / internal id
        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []
        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]
            print("\n3.=========================tree loaded ok")

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        # Warn (but don't abort) on non-binary nodes; only
                        # binary nodes can be reconciled.
                        print("\n5.================Non-Binary-node: " + str(node.is_binary))
                        print("Non-Binary tree: " + t.data_newick)
                    # Otherwise it is a leaf node and can be skipped
                    continue

                # Leaf names below each of the two child branches
                branch_one_seq = [
                    l.name.strip() for l in node.descendants[0].get_leaves()
                ]
                branch_two_seq = [
                    l.name.strip() for l in node.descendants[1].get_leaves()
                ]

                # Species sets per branch; sequences missing from
                # seq_to_species are silently dropped (can yield empty sets
                # when tree leaf names don't match database sequence names)
                branch_one_species = set([
                    seq_to_species[s] for s in branch_one_seq
                    if s in seq_to_species.keys()
                ])
                print("\n8.===============Branch-one-spp: " + ', '.join(branch_one_species))
                branch_two_species = set([
                    seq_to_species[s] for s in branch_two_seq
                    if s in seq_to_species.keys()
                ])
                print("\n9.===============Branch-two-spp: " + ', '.join(branch_two_species))

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species,
                                                   branch_two_species,
                                                   clade_to_species)
                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(
                        branch_one_species, branch_two_species)

                # Relabel the node as "<clade_id>_<D|S>_<consistency>"
                tags = [
                    clade_to_id[clade] if clade is not None else 0,
                    'D' if duplication else 'S',
                    duplication_consistency if duplication else 0
                ]

                node.name = '_'.join([str(t) for t in tags])

                if clade is not None:
                    # Store every cross-branch sequence pair (both directions)
                    for seq_one in branch_one_seq:
                        for seq_two in branch_two_seq:
                            new_associations.append({
                                'sequence_one_id': seq_to_id[seq_one],
                                'sequence_two_id': seq_to_id[seq_two],
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score':
                                    duplication_consistency
                            })
                            new_associations.append({
                                'sequence_one_id': seq_to_id[seq_two],
                                'sequence_two_id': seq_to_id[seq_one],
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score':
                                    duplication_consistency
                            })

                # Flush associations to the database in batches of ~400
                if len(new_associations) > 400:
                    db.engine.execute(
                        SequenceSequenceCladeAssociation.__table__.insert(),
                        new_associations)
                    new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # Write the remaining associations
        db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(),
                          new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data.keys():
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
class GO(db.Model):
    """Gene Ontology term.

    Stores the term itself (label, name, namespace, description), its place in
    the GO hierarchy (is_a, extended_go as ';'-separated label lists) and a
    cached phylo-profile with per-species sequence counts (species_counts,
    JSON string mapping species_id -> count).
    """
    __tablename__ = 'go'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    name = db.Column(db.Text)
    type = db.Column(db.Enum('biological_process', 'molecular_function',
                             'cellular_component', name='go_type'))
    description = db.Column(db.Text)
    obsolete = db.Column(db.SmallInteger)
    is_a = db.Column(db.Text)
    extended_go = db.Column(db.Text)
    species_counts = db.Column(db.Text)

    sequences = db.relationship('Sequence', secondary=sequence_go, lazy='dynamic')

    # Other properties
    #
    # sequence_associations declared in 'SequenceGOAssociation'
    # enriched_clusters declared in 'ClusterGOEnrichment'

    def __init__(self, label, name, go_type, description, obsolete, is_a, extended_go):
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.obsolete = obsolete
        self.is_a = is_a
        self.extended_go = extended_go
        self.species_counts = ""

    def set_all(self, label, name, go_type, description, extended_go):
        """Update the core fields at once; the cached species_counts is reset."""
        self.label = label
        self.name = name
        self.type = go_type
        self.description = description
        self.extended_go = extended_go
        self.species_counts = ""

    @property
    def short_type(self):
        """Two-letter code for the GO namespace (BP/MF/CC, 'UNK' if unknown)."""
        if self.type == 'biological_process':
            return 'BP'
        elif self.type == 'molecular_function':
            return 'MF'
        elif self.type == 'cellular_component':
            return 'CC'
        else:
            return 'UNK'

    @property
    def readable_type(self):
        """Human-readable name for the GO namespace."""
        if self.type == 'biological_process':
            return 'Biological process'
        elif self.type == 'molecular_function':
            return 'Molecular function'
        elif self.type == 'cellular_component':
            return 'Cellular component'
        else:
            return 'Unknown type'

    @property
    def parent_count(self):
        """
        Returns the total number of terms 'above' this term in the GO DAG

        :return: number of (extended) parental terms
        """
        return len(self.extended_go.split(';')) if self.extended_go != '' else 0

    @property
    def interpro_stats(self):
        from conekt.models.interpro import Interpro

        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    def species_occurrence(self, species_id):
        """
        count how many genes have the current GO term in a given species

        :param species_id: internal id of the selected species
        :return: count of sequences with this term associated
        """
        count = 0
        sequences = self.sequences.all()

        for s in sequences:
            if s.species_id == species_id:
                count += 1

        return count

    @staticmethod
    def sequence_stats(sequence_ids, exclude_predicted=True):
        """
        Takes a list of sequence IDs and returns GO stats for those sequences

        :param sequence_ids: list of sequence ids
        :param exclude_predicted: if True (default) predicted GO labels will be excluded
        :return: dict with stats for each GO term linked with any of the input sequences
        """
        query = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids))

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences, exclude_predicted=True):
        """
        Same as sequence_stats but takes a query on sequences instead of a
        list of ids (avoids materialising the id list in Python).
        """
        subquery = sequences.subquery()

        query = SequenceGOAssociation.query

        if exclude_predicted:
            query = query.filter(SequenceGOAssociation.predicted == 0)

        data = query.join(subquery, SequenceGOAssociation.sequence_id == subquery.c.id).all()

        return GO.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        # Fold association rows into, per GO term, the unique sequences and
        # species they occur in plus a raw association count.
        output = {}
        for d in associations:
            if d.go_id not in output.keys():
                output[d.go_id] = {
                    'go': d.go,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.go_id]['count'] += 1
                if d.sequence_id not in output[d.go_id]['sequences']:
                    output[d.go_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.go_id]['species']:
                    output[d.go_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @staticmethod
    def update_species_counts():
        """
        Adds a phylo-profile to each GO label, results are stored in the
        database (species_counts column, JSON). Only non-predicted
        associations are counted.
        """
        # link species to sequences (ORM free for speed)
        sequences = db.engine.execute(db.select([Sequence.__table__.c.id, Sequence.__table__.c.species_id])).fetchall()

        sequence_to_species = {}
        for seq_id, species_id in sequences:
            if species_id is not None:
                sequence_to_species[seq_id] = int(species_id)

        # get GO terms for all genes
        associations = db.engine.execute(
            db.select([SequenceGOAssociation.__table__.c.sequence_id,
                       SequenceGOAssociation.__table__.c.go_id],
                      distinct=True)
            .where(SequenceGOAssociation.__table__.c.predicted == 0))\
            .fetchall()

        count = {}
        for seq_id, go_id in associations:
            species_id = sequence_to_species[seq_id]
            if go_id not in count.keys():
                count[go_id] = {}

            if species_id not in count[go_id]:
                count[go_id][species_id] = 1
            else:
                count[go_id][species_id] += 1

        # update counts
        for go_id, data in count.items():
            db.engine.execute(db.update(GO.__table__)
                              .where(GO.__table__.c.id == go_id)
                              .values(species_counts=json.dumps(data)))

    @staticmethod
    def add_from_obo(filename, empty=True, compressed=False):
        """
        Parses GeneOntology's OBO file and adds it to the database

        :param filename: Path to the OBO file to parse
        :param compressed: load data from .gz file if true (default: False)
        :param empty: Empty the database first when true (default: True)
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(GO).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        obo_parser = OBOParser()
        obo_parser.readfile(filename, compressed=compressed)

        obo_parser.extend_go()

        for i, term in enumerate(obo_parser.terms):
            go = GO(term.id, term.name, term.namespace, term.definition, term.is_obsolete,
                    ";".join(term.is_a), ";".join(term.extended_go))

            db.session.add(go)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_go_from_plaza(filename):
        """
        Adds GO annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        go_parser = GOParser()

        go_parser.read_plaza_go(filename)

        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for term in terms:
                    if term["id"] in go_hash.keys():
                        current_term = go_hash[term["id"]]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": term["evidence"],
                            "source": term["source"]}
                        associations.append(association)
                    else:
                        print(term, "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            if len(associations) > 400:
                db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                associations = []

        # Add extended GOs
        for gene, terms in go_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []
                for term in terms:
                    if term["id"] not in current_terms:
                        current_terms.append(term["id"])
                for term in terms:
                    if term["id"] in go_hash.keys():
                        extended_terms = go_hash[term["id"]].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms and extended_term not in new_terms:
                                new_terms.append(extended_term)
                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                        associations = []

        # flush the remainder; skip when empty (an empty executemany would
        # insert a single all-defaults row)
        if associations:
            db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def add_go_from_tab(filename, species_id, source="Source not provided"):
        """
        Adds GO annotation from a tab-delimited file (gene, term, evidence)
        for one species and derives the extended (parental) annotations.

        :param filename: path to the annotation file
        :param species_id: internal id of the species the genes belong to
        :param source: label stored in the source field of new associations
        """
        gene_hash = {}
        go_hash = {}

        all_sequences = Sequence.query.filter_by(species_id=species_id).all()
        all_go = GO.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for term in all_go:
            go_hash[term.label] = term

        associations = []

        gene_go = defaultdict(list)

        with open(filename, "r") as f:
            for line in f:
                gene, term, evidence = line.strip().split('\t')
                if gene in gene_hash.keys():
                    current_sequence = gene_hash[gene]
                    if term in go_hash.keys():
                        current_term = go_hash[term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": evidence,
                            "source": source}
                        associations.append(association)

                        if term not in gene_go[gene]:
                            gene_go[gene].append(term)

                    else:
                        print(term, "not found in the database.")
                else:
                    print("Gene", gene, "not found in the database.")

                if len(associations) > 400:
                    db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                    associations = []

        # Add extended GOs
        for gene, terms in gene_go.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                new_terms = []
                current_terms = []
                for term in terms:
                    if term not in current_terms:
                        current_terms.append(term)
                for term in terms:
                    if term in go_hash.keys():
                        extended_terms = go_hash[term].extended_go.split(";")
                        for extended_term in extended_terms:
                            if extended_term not in current_terms and extended_term not in new_terms:
                                new_terms.append(extended_term)
                for new_term in new_terms:
                    if new_term in go_hash.keys():
                        current_term = go_hash[new_term]
                        association = {
                            "sequence_id": current_sequence.id,
                            "go_id": current_term.id,
                            "evidence": None,
                            "source": "Extended"}
                        associations.append(association)

                    if len(associations) > 400:
                        db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)
                        associations = []

        # flush the remainder; skip when empty (an empty executemany would
        # insert a single all-defaults row)
        if associations:
            db.engine.execute(SequenceGOAssociation.__table__.insert(), associations)

    @staticmethod
    def predict_from_network(expression_network_method_id, threshold=5, source="PlaNet Prediction"):
        """
        Function to transfer GO terms from neighbors in the network. If n or more (based on threshold) neighbors have a
        GO label (excluding other predicted labels) the term is transferred.

        :param expression_network_method_id: Expression network as input
        :param threshold: number of neighboring genes that should have the label to allow transfer
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        # Get all genes that belong to the network
        probes = expression_network_method.probes.all()

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # If the number of genes in the neighborhood is smaller than the threshold skip (no prediction possible)
            # If there is no sequence associated with the probe skip as well
            if len(sequence_ids) < threshold or probe.sequence_id is None:
                continue

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = set([(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms])

            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # Determine new terms (that occurred equal or more times than the desired threshold)
            new_terms = [{
                'go_id': k,
                'score': v
            } for k, v in go_counts.items() if v >= threshold]

            # Store new terms in a list that can be added to the database
            for nt in new_terms:
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': nt['go_id'],
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'score': nt['score'],
                                                   'threshold': threshold,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighbor counting'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])

    @staticmethod
    def predict_from_network_enrichment(expression_network_method_id, cutoff=0.05, source="PlaNet Prediction"):
        """
        Transfer GO terms that are significantly enriched (hypergeometric
        test + FDR correction) in a gene's first-level network neighborhood.

        :param expression_network_method_id: Expression network as input
        :param cutoff: maximum (uncorrected) p-value for a term to be transferred
        :param source: Value for the source field
        """
        from conekt.models.expression.networks import ExpressionNetworkMethod

        expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

        if expression_network_method is None:
            print("ERROR: Network Method ID %d not found" % expression_network_method_id)
            return

        probes = expression_network_method.probes.all()

        # Get all GO terms and get background
        # Important, counts are obtained from precomputed counts in the species_counts field !!
        go_data = db.engine.execute(db.select([GO.__table__.c.id, GO.__table__.c.species_counts])).fetchall()

        go_background = defaultdict(lambda: 0)

        for go_id, counts_json in go_data:
            # NOTE: was `counts_json is not ""` — identity comparison with a
            # literal is incorrect (and a SyntaxWarning); use equality.
            if counts_json != "":
                counts = json.loads(counts_json)
                if str(expression_network_method.species_id) in counts.keys():
                    go_background[go_id] = counts[str(expression_network_method.species_id)]

        new_associations = []

        for i, probe in enumerate(probes):
            print("Predicting GO for gene: %d, %s (%d out of %d)" %
                  (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

            # Get neighborhood from database
            neighborhood = json.loads(probe.network)

            # Get sequence ids from genes in first level neighborhood
            sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

            # Get own GO terms
            own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
            own_terms = list(set([a.go_id for a in own_associations]))

            # Get GO terms from neighbors
            associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
                filter(SequenceGOAssociation.predicted == 0).all()

            # Make GO terms from neighbors unique and ignore terms the current gene has already
            unique_associations = set([(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms])

            go_counts = defaultdict(lambda: 0)

            for ua in unique_associations:
                go_counts[ua[1]] += 1

            # find significantly enriched GO terms and store them
            enriched_go = []

            for go_id, count in go_counts.items():
                p_value = hypergeo_sf(count, len(sequence_ids), go_background[go_id], len(probes))
                if p_value < cutoff:
                    enriched_go.append((go_id, p_value))

            # apply FDR correction to the p-values
            # (renamed from `corrected_p` — the loop below used the same name
            # for its loop variable, shadowing the list)
            corrected_p_values = fdr_correction([a[1] for a in enriched_go])

            # push new prediction in a dict that will be added to the DB
            for corrected_p, (go_id, p_value) in zip(corrected_p_values, enriched_go):
                new_associations.append({
                    'sequence_id': probe.sequence_id,
                    'go_id': go_id,
                    'evidence': 'IEP',
                    'source': source,
                    'predicted': True,
                    'prediction_data': json.dumps({'p-cutoff': cutoff,
                                                   'p-value': p_value,
                                                   'p-value (FDR)': corrected_p,
                                                   'network_method': expression_network_method_id,
                                                   'prediction_method': 'Neighborhood enrichment'
                                                   })
                })

        # Add new labels to the database in chunks of 400
        for i in range(0, len(new_associations), 400):
            db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])
class ExpressionSpecificityMethod(db.Model):
    """A method (set of conditions/tissues) used to score expression specificity
    of profiles for one species; individual scores live in ExpressionSpecificity."""
    __tablename__ = 'expression_specificity_method'
    id = db.Column(db.Integer, primary_key=True)
    description = db.Column(db.Text)
    conditions = db.Column(db.Text)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)

    specificities = db.relationship('ExpressionSpecificity', backref='method', lazy='dynamic',
                                    cascade="all, delete-orphan", passive_deletes=True)
    condition_tissue = db.relationship('ConditionTissue', backref='expression_specificity_method',
                                       lazy='joined',
                                       cascade="all, delete-orphan", passive_deletes=True,
                                       uselist=False)

    menu_order = db.Column(db.Integer)

    def __repr__(self):
        return str(self.id) + ". " + self.description + ' [' + self.species.name + ']'

    @staticmethod
    def calculate_specificities(species_id, description, remove_background=False):
        """
        Function that calculates condition specificities for each profile. No grouping is applied, each condition
        is used as is

        :param species_id: internal species ID
        :param description: description for the method to determine the specificity
        :param remove_background: when true the lowest value of each profile is subtracted from all values
        (can be of use with noisy data derived from microarrays)
        """
        conditions = []

        # get profiles from the database (ORM free for speed)
        profiles = db.engine.execute(
            db.select([
                ExpressionProfile.__table__.c.id,
                ExpressionProfile.__table__.c.profile
            ]).where(ExpressionProfile.__table__.c.species_id == species_id)).fetchall()

        # detect all conditions
        for profile_id, profile in profiles:
            profile_data = json.loads(profile)
            for condition in profile_data['order']:
                if condition not in conditions:
                    conditions.append(condition)

        # convert list into dictionary (identity mapping) and run function
        conditions_dict = {k: k for k in conditions}

        return ExpressionSpecificityMethod.calculate_tissue_specificities(
            species_id, description, conditions_dict, conditions,
            remove_background=remove_background)

    @staticmethod
    def calculate_tissue_specificities(species_id, description, condition_to_tissue, order,
                                       remove_background=False, use_max=True):
        """
        Function calculates tissue specific genes based on the expression conditions. A dict is required to link
        specific conditions to the correct tissues. This also allows conditions to be excluded in case they are
        unrelated with a specific tissue.

        :param species_id: internal species ID
        :param description: description for the method to determine the specificity
        :param condition_to_tissue: dict to connect a condition to a tissue
        :param order: preferred order of the conditions, will match tissues to it
        :param remove_background: subtracts the lowest value to correct for background noise
        :param use_max: uses the maximum of mean values instead of the mean of all values
        :return: id of the new method
        """
        new_method = ExpressionSpecificityMethod()
        new_method.species_id = species_id
        new_method.description = description
        new_method.menu_order = 0

        # derive the ordered, unique list of tissues from the condition order
        tissues = []
        for c in order:
            if c in condition_to_tissue.keys():
                v = condition_to_tissue[c]
                if v not in tissues:
                    tissues.append(v)

        # get profiles from the database (ORM free for speed)
        profiles = db.engine.execute(
            db.select([
                ExpressionProfile.__table__.c.id,
                ExpressionProfile.__table__.c.profile
            ]).where(ExpressionProfile.__table__.c.species_id == species_id)).fetchall()

        new_method.conditions = json.dumps(tissues)

        db.session.add(new_method)
        db.session.commit()

        # detect specificities and add to the database
        specificities = []

        for profile_id, profile in profiles:
            # prepare profile data for calculation: collapse conditions into
            # one value per tissue
            profile_data = json.loads(profile)
            profile_means = {}
            for t in tissues:
                values = []
                means = []

                valid_conditions = [
                    k for k in profile_data['data']
                    if k in condition_to_tissue and condition_to_tissue[k] == t
                ]

                for k, v in profile_data['data'].items():
                    if k in valid_conditions:
                        values += v
                        means.append(mean(v))

                if not use_max:
                    profile_means[t] = mean(values) if len(values) > 0 else 0
                else:
                    profile_means[t] = max(means) if len(means) > 0 else 0

            # subtract minimum value to remove background
            # experimental code !
            if remove_background:
                minimum = min([v for k, v in profile_means.items()])

                for k in profile_means.keys():
                    profile_means[k] -= minimum

            # determine spm score for each condition
            profile_specificities = []
            profile_tau = tau([v for _, v in profile_means.items()])
            profile_entropy = entropy_from_values([v for _, v in profile_means.items()])

            for t in tissues:
                score = expression_specificity(t, profile_means)
                new_specificity = {
                    'profile_id': profile_id,
                    'condition': t,
                    'score': score,
                    'entropy': profile_entropy,
                    'tau': profile_tau,
                    'method_id': new_method.id,
                }

                profile_specificities.append(new_specificity)

            # sort conditions and add top one; guard against profiles that map
            # to zero tissues (would raise IndexError on [0])
            if profile_specificities:
                profile_specificities = sorted(profile_specificities,
                                               key=lambda x: x['score'],
                                               reverse=True)
                specificities.append(profile_specificities[0])

            # write specificities to db if there are more than 400 (ORM free for speed)
            if len(specificities) > 400:
                db.engine.execute(ExpressionSpecificity.__table__.insert(), specificities)
                specificities = []

        # write remaining specificities to the db; skip when empty (an empty
        # executemany would insert a single all-defaults row)
        if specificities:
            db.engine.execute(ExpressionSpecificity.__table__.insert(), specificities)

        return new_method.id
class TreeMethod(db.Model):
    """Method used to construct a set of phylogenetic trees; owns the trees
    and can reconcile them against the species/clade information."""
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)
    description = db.Column(db.Text)

    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Reconcile all trees of this method: label each internal (binary) node
        with its clade and duplication/speciation status, store pairwise
        sequence-sequence clade associations in the DB and refresh each tree's
        PhyloXML-ready newick data.
        """
        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []
        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        # Print warning in case there is a non-binary node.
                        # NOTE: fixed — this used `tree.id`/`tree.label`, but
                        # `tree` is a newick Node (no such attributes); the
                        # database row `t` carries id and label.
                        print("[%d, %s] Skipping node... Can only reconcile binary nodes ..."
                              % (t.id, t.label))

                    # Otherwise it is a leaf node and can be skipped
                    continue

                branch_one_seq = [l.name.strip() for l in node.descendants[0].get_leaves()]
                branch_two_seq = [l.name.strip() for l in node.descendants[1].get_leaves()]

                branch_one_species = set([seq_to_species[s] for s in branch_one_seq
                                          if s in seq_to_species.keys()])
                branch_two_species = set([seq_to_species[s] for s in branch_two_seq
                                          if s in seq_to_species.keys()])

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species,
                                                   branch_two_species,
                                                   clade_to_species)
                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(
                        branch_one_species, branch_two_species)

                # encode clade id + event type (+ consistency score) into the
                # node name, e.g. "12_D_0.5" or "12_S_0"
                tags = [
                    clade_to_id[clade] if clade is not None else 0,
                    'D' if duplication else 'S',
                    duplication_consistency if duplication else 0
                ]

                node.name = '_'.join([str(tag) for tag in tags])

                if clade is not None:
                    # store the association in both directions so lookups work
                    # from either sequence
                    for seq_one in branch_one_seq:
                        for seq_two in branch_two_seq:
                            new_associations.append({
                                'sequence_one_id': seq_to_id[seq_one],
                                'sequence_two_id': seq_to_id[seq_two],
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score': duplication_consistency
                            })
                            new_associations.append({
                                'sequence_one_id': seq_to_id[seq_two],
                                'sequence_two_id': seq_to_id[seq_one],
                                'tree_id': t.id,
                                'clade_id': clade_to_id[clade],
                                'duplication': 1 if duplication else 0,
                                'duplication_consistency_score': duplication_consistency
                            })

                if len(new_associations) > 400:
                    db.engine.execute(
                        SequenceSequenceCladeAssociation.__table__.insert(),
                        new_associations)
                    new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # flush the remainder; skip when empty (an empty executemany would
        # insert a single all-defaults row)
        if new_associations:
            db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(),
                              new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data.keys():
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
class ExpressionProfile(db.Model):
    """Expression profile of one probe/sequence: per-condition expression
    values stored as JSON in the (deferred) profile column."""
    __tablename__ = 'expression_profiles'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'), index=True)
    profile = db.deferred(db.Column(db.Text))

    specificities = db.relationship('ExpressionSpecificity',
                                    backref=db.backref('profile', lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    def __init__(self, probe, sequence_id, profile):
        self.probe = probe
        self.sequence_id = sequence_id
        self.profile = profile

    @staticmethod
    def __profile_to_table(data):
        """
        Internal function to convert an expression profile (dict) to a tabular text

        :param data: Dict with expression profile
        :return: table (string)
        """
        output = [["condition", "mean", "min", "max"]]
        order = data["order"]

        for o in order:
            try:
                values = data["data"][o]
                output.append([o,
                               str(mean(values)),
                               str(min(values)),
                               str(max(values))])
            except Exception as e:
                print(e)

        return '\n'.join(['\t'.join(l) for l in output])

    @property
    def table(self):
        """
        Returns the condition expression as a tabular text file

        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(json.loads(self.profile))

        return table

    def tissue_table(self, condition_tissue_id, use_means=True):
        """
        Returns the tissue expression as a tabular text file

        :param condition_tissue_id: condition_tissue_id for the conversion
        :param use_means: Use the mean of the condition (recommended)
        :return: table with data (string)
        """
        table = ExpressionProfile.__profile_to_table(
            self.tissue_profile(condition_tissue_id, use_means=use_means))

        return table

    @property
    def low_abundance(self, cutoff=10):
        """
        Checks if the mean expression value in any conditions in the plot is higher than the desired cutoff

        NOTE(review): this is a property, so callers can never actually pass
        `cutoff`; the default of 10 is always used. Kept as-is for interface
        compatibility.

        :param cutoff: cutoff for expression, default = 10
        :return: True in case of low abundance otherwise False
        """
        data = json.loads(self.profile)

        checks = [mean(v) > cutoff for _, v in data["data"].items()]

        return not any(checks)

    @staticmethod
    def convert_profile(condition_to_tissue, profile_data, use_means=True):
        """
        Convert a full, detailed profile into a more general summarized one using conversion table stored in the
        database

        :param condition_to_tissue: dict with conversion instructions
        :param profile_data: profile to convert
        :param use_means: use means of detailed condition if True otherwise use samples independently. Default True
        :return: New profile
        """
        tissues = list(set(condition_to_tissue['conversion'].values()))

        output = {}

        for t in tissues:
            valid_conditions = [
                k for k in profile_data['data']
                if k in condition_to_tissue['conversion'] and condition_to_tissue['conversion'][k] == t
            ]
            valid_values = []
            for k, v in profile_data['data'].items():
                if k in valid_conditions:
                    if use_means:
                        valid_values.append(mean(v))
                    else:
                        valid_values += v

            output[t] = valid_values if len(valid_values) > 0 else [0]

        return {'order': condition_to_tissue['order'],
                'colors': condition_to_tissue['colors'],
                'data': output}

    def tissue_profile(self, condition_tissue_id, use_means=True):
        """
        Applies a conversion to the profile, grouping several condition into one more general feature (e.g. tissue).

        :param condition_tissue_id: identifier of the conversion table
        :param use_means: store the mean of the condition rather than individual values. The matches the spm
        calculations better.
        :return: parsed profile
        """
        ct = ConditionTissue.query.get(condition_tissue_id)

        condition_to_tissue = json.loads(ct.data)
        profile_data = json.loads(self.profile)

        output = ExpressionProfile.convert_profile(condition_to_tissue, profile_data,
                                                   use_means=use_means)

        return output

    @staticmethod
    def get_heatmap(species_id, probes, zlog=True, raw=False):
        """
        Returns a heatmap for a given species (species_id) and a list of probes. It returns a dict with 'order'
        the order of the experiments and 'heatmap' another dict with the actual data. Data is zlog transformed

        :param species_id: species id (internal database id)
        :param probes: a list of probes to include in the heatmap
        :param zlog: enable zlog transformation (otherwise normalization against highest expressed condition)
        :param raw: when True (and zlog False) return raw mean values, no normalization
        """
        profiles = ExpressionProfile.query.options(undefer('profile')).filter_by(species_id=species_id).\
            filter(ExpressionProfile.probe.in_(probes)).all()

        order = []

        output = []

        # track which requested probes had no matching profile (case-insensitive)
        not_found = [p.lower() for p in probes]

        for profile in profiles:
            name = profile.probe
            data = json.loads(profile.profile)
            order = data['order']
            experiments = data['data']

            with contextlib.suppress(ValueError):
                not_found.remove(profile.probe.lower())

            with contextlib.suppress(ValueError):
                not_found.remove(profile.sequence.name.lower())

            values = {}

            for o in order:
                values[o] = mean(experiments[o])

            row_mean = mean(values.values())
            row_max = max(values.values())

            for o in order:
                if zlog:
                    if row_mean == 0 or values[o] == 0:
                        # log2 undefined for zero values; mark as missing
                        values[o] = '-'
                    else:
                        try:
                            values[o] = log(values[o] / row_mean, 2)
                        except ValueError as _:
                            print("Unable to calculate log()", values[o], row_mean)
                            values[o] = '-'
                else:
                    if row_max != 0 and not raw:
                        values[o] = values[o] / row_max

            output.append({"name": name,
                           "values": values,
                           "sequence_id": profile.sequence_id,
                           "shortest_alias": profile.sequence.shortest_alias})

        if len(not_found) > 0:
            flash("Couldn't find profile for: %s" % ", ".join(not_found), "warning")

        return {'order': order, 'heatmap_data': output}

    @staticmethod
    def get_profiles(species_id, probes, limit=1000):
        """
        Gets the data for a set of probes (including the full profiles), a limit can be provided to avoid overly
        long queries

        :param species_id: internal id of the species
        :param probes: probe names to fetch
        :param limit: maximum number of probes to get
        :return: List of ExpressionProfile objects including the full profiles
        """
        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            filter(ExpressionProfile.probe.in_(probes)).\
            filter_by(species_id=species_id).\
            options(joinedload('sequence').load_only('name').noload('xrefs')).\
            limit(limit).all()

        return profiles

    @staticmethod
    def add_profile_from_lstrap(matrix_file, annotation_file, species_id, order_color_file=None):
        """
        Function to convert an (normalized) expression matrix (lstrap output) into a profile

        :param matrix_file: path to the expression matrix
        :param annotation_file: path to the file assigning samples to conditions
        :param species_id: internal id of the species
        :param order_color_file: tab delimited file that contains the order and color of conditions
        """
        annotation = {}

        with open(annotation_file, 'r') as fin:
            # get rid of the header
            _ = fin.readline()

            for line in fin:
                parts = line.strip().split('\t')
                if len(parts) > 1:
                    run, description = parts
                    annotation[run] = description

        order, colors = [], []
        if order_color_file is not None:
            with open(order_color_file, 'r') as fin:
                for line in fin:
                    try:
                        o, c = line.strip().split('\t')
                        order.append(o)
                        colors.append(c)
                    except Exception as _:
                        pass

        # build conversion table for sequences
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        sequence_dict = {}  # key = sequence name uppercase, value internal id
        for s in sequences:
            sequence_dict[s.name.upper()] = s.id

        with open(matrix_file) as fin:
            # read header
            _, *colnames = fin.readline().rstrip().split()

            colnames = [c.replace('.htseq', '') for c in colnames]

            # determine order when the order/color file did not provide one.
            # NOTE: fixed — `order` is initialized to [] above and is never
            # None, so the original `if order is None:` fallback was dead code.
            if not order:
                order = []

                for c in colnames:
                    if c in annotation.keys():
                        if annotation[c] not in order:
                            order.append(annotation[c])

                order.sort()

            # read each line and build profile
            new_probes = []
            for line in fin:
                transcript, *values = line.rstrip().split()
                profile = defaultdict(list)

                for c, v in zip(colnames, values):
                    if c in annotation.keys():
                        condition = annotation[c]
                        profile[condition].append(float(v))

                new_probe = {"species_id": species_id,
                             "probe": transcript,
                             "sequence_id": sequence_dict[transcript.upper()]
                             if transcript.upper() in sequence_dict.keys() else None,
                             "profile": json.dumps({"order": order,
                                                    "colors": colors,
                                                    "data": profile})
                             }

                new_probes.append(new_probe)

                if len(new_probes) > 400:
                    db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
                    new_probes = []

            # flush the remainder; skip when empty (an empty executemany would
            # insert a single all-defaults row)
            if new_probes:
                db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
class Sequence(db.Model):
    """
    ORM model for a sequence (gene/transcript) of a species, with links to its
    annotation (GO, InterPro), gene families, expression profiles, networks,
    coexpression clusters, ECC associations and cross-references.
    """
    __tablename__ = 'sequences'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    description = db.Column(db.Text)
    coding_sequence = db.deferred(db.Column(db.Text))
    type = db.Column(db.Enum('protein_coding', 'TE', 'RNA', name='sequence_type'), default='protein_coding')
    is_mitochondrial = db.Column(db.SmallInteger, default=False)
    is_chloroplast = db.Column(db.SmallInteger, default=False)

    expression_profiles = db.relationship('ExpressionProfile',
                                          backref=db.backref('sequence', lazy='joined'),
                                          lazy='dynamic',
                                          cascade="all, delete-orphan",
                                          passive_deletes=True)

    network_nodes = db.relationship('ExpressionNetwork',
                                    backref=db.backref('sequence', lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    # Other properties
    #
    # coexpression_cluster_associations declared in 'SequenceCoexpressionClusterAssociation'
    # interpro_associations declared in 'SequenceInterproAssociation'
    # go_associations declared in 'SequenceGOAssociation'
    # family_associations declared in 'SequenceFamilyAssociation'

    go_labels = db.relationship('GO', secondary=sequence_go, lazy='dynamic')
    interpro_domains = db.relationship('Interpro', secondary=sequence_interpro, lazy='dynamic')
    families = db.relationship('GeneFamily', secondary=sequence_family, lazy='dynamic')

    coexpression_clusters = db.relationship(
        'CoexpressionCluster',
        secondary=sequence_coexpression_cluster,
        backref=db.backref('sequences', lazy='dynamic'),
        lazy='dynamic')

    ecc_query_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.query_id == Sequence.id",
        backref=db.backref('query_sequence', lazy='joined'),
        lazy='dynamic')

    ecc_target_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.target_id == Sequence.id",
        backref=db.backref('target_sequence', lazy='joined'),
        lazy='dynamic')

    clade_associations_one = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin="SequenceSequenceCladeAssociation.sequence_one_id == Sequence.id",
        backref=db.backref('sequence_one', lazy='joined'),
        lazy='dynamic')

    clade_associations_two = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin="SequenceSequenceCladeAssociation.sequence_two_id == Sequence.id",
        backref=db.backref('sequence_two', lazy='joined'),
        lazy='dynamic')

    xrefs = db.relationship('XRef', secondary=sequence_xref, lazy='joined')

    def __init__(self, species_id, name, coding_sequence, type='protein_coding',
                 is_chloroplast=False, is_mitochondrial=False, description=None):
        self.species_id = species_id
        self.name = name
        self.description = description
        self.coding_sequence = coding_sequence
        self.type = type
        self.is_chloroplast = is_chloroplast
        self.is_mitochondrial = is_mitochondrial

    @property
    def protein_sequence(self):
        """
        Function to translate the coding sequence to the amino acid sequence. Will start at the first start codon and
        break after adding a stop codon (indicated by '*')

        :return: The amino acid sequence based on the coding sequence
        """
        return translate(self.coding_sequence)

    @property
    def aliases(self):
        """
        Returns a readable string with the aliases or tokens stored for this sequence in the table xrefs

        :return: human readable string with aliases or None
        """
        t = [x.name for x in self.xrefs if x.platform == 'token']

        return ", ".join(t) if len(t) > 0 else None

    @property
    def shortest_alias(self):
        """
        Returns the shortest alias

        :return: string with shortest alias or None (in case no aliases exist)
        """
        t = [x.name for x in self.xrefs if x.platform == 'token']

        return min(t, key=len) if len(t) > 0 else None

    @property
    def display_name(self):
        """
        Returns a name to display (from xrefs with display) if available otherwise return name

        :return: display name
        """
        t = [x.name for x in self.xrefs if x.platform == 'display']

        return t[0] if len(t) > 0 else self.name

    @property
    def best_name(self):
        """
        Checks if there is a display name, if not checks the shortest alias, otherwise returns name. To be used in e.g.
        graphs

        :return: string with best name to show in graphs, ...
        """
        # NOTE: `is not` (identity) is intentional here — display_name returns
        # the very same `self.name` object as its fallback, so an identity test
        # detects "a display xref exists" even when its text equals the name.
        if self.display_name is not self.name:
            return self.display_name
        elif self.shortest_alias is not None:
            return self.shortest_alias
        else:
            return self.name

    @property
    def readable_type(self):
        """
        Converts the type table to a readable string

        :return: string with readable version of the sequence type
        """
        conversion = {'protein_coding': 'protein coding',
                      'TE': 'transposable element',
                      'RNA': 'RNA'}

        if self.type in conversion.keys():
            return conversion[self.type]
        else:
            return 'other'

    @staticmethod
    def add_from_fasta(filename, species_id, compressed=False):
        """
        Bulk-adds all sequences from a FASTA file for one species.

        :param filename: path to the FASTA file
        :param species_id: internal id of the species
        :param compressed: set to True when the file is gzip-compressed
        :return: number of sequences in the input file
        """
        fasta_data = Fasta()
        fasta_data.readfile(filename, compressed=compressed)

        new_sequences = []

        # Loop over sequences, sorted by name (key here) and add to db
        for name, sequence in sorted(fasta_data.sequences.items(), key=operator.itemgetter(0)):
            new_sequence = {"species_id": species_id,
                            "name": name,
                            "description": None,
                            "coding_sequence": sequence,
                            "type": "protein_coding",
                            "is_mitochondrial": False,
                            "is_chloroplast": False}

            new_sequences.append(new_sequence)

            # add 400 sequences at the time, more can cause problems with some database engines
            if len(new_sequences) > 400:
                db.engine.execute(Sequence.__table__.insert(), new_sequences)
                new_sequences = []

        # add the last set of sequences
        db.engine.execute(Sequence.__table__.insert(), new_sequences)

        return len(fasta_data.sequences.keys())

    @staticmethod
    def add_descriptions(filename, species_id):
        """
        Reads a tab-delimited file (name<TAB>description) and stores the
        descriptions on the matching sequences of the given species.

        :param filename: path to the tab-delimited description file
        :param species_id: internal id of the species to update
        """
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        seq_dict = {}

        for s in sequences:
            seq_dict[s.name] = s

        with open(filename, "r") as f_in:
            for i, line in enumerate(f_in):
                try:
                    name, description = line.strip().split('\t')
                except ValueError:
                    print("Cannot parse line %d: \"%s\"" % (i, line), file=sys.stderr)
                else:
                    # BUGFIX: this block used to sit in a `finally` clause, which
                    # also ran after a parse error — re-using `name`/`description`
                    # from the previous line, or raising NameError when the very
                    # first line was malformed. `else` only runs on success.
                    if name in seq_dict.keys():
                        seq_dict[name].description = description

                # commit in batches to keep transactions small
                if i % 400 == 0:
                    db.session.commit()

        db.session.commit()

    @staticmethod
    def export_cds(filename):
        """
        Writes all coding sequences to a FASTA file.

        :param filename: path of the output file
        """
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.coding_sequence), file=f_out)

    @staticmethod
    def export_protein(filename):
        """
        Writes all translated (protein) sequences to a FASTA file.

        :param filename: path of the output file
        """
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.protein_sequence), file=f_out)
class Interpro(db.Model):
    """
    ORM model for an InterPro domain, with helpers to compute per-species
    statistics and to populate the table from InterPro XML / PLAZA /
    InterProScan files.
    """
    __tablename__ = 'interpro'
    id = db.Column(db.Integer, primary_key=True)
    label = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    description = db.Column(db.Text)
    clade_id = db.Column(db.Integer, db.ForeignKey('clades.id', ondelete='SET NULL'), index=True)

    sequences = db.relationship('Sequence', secondary=sequence_interpro, lazy='dynamic')

    # Other properties
    # sequence_associations = defined in SequenceInterproRelationship

    def __init__(self, label, description):
        self.label = label
        self.description = description

    @property
    def species_codes(self):
        """
        Finds all species the family has genes from

        :return: a list of all species (codes)
        """
        sequences = self.sequences.options(joinedload('species')).all()

        output = []

        for s in sequences:
            if s.species.code not in output:
                output.append(s.species.code)

        return output

    @property
    def species_counts(self):
        """
        Generates a phylogenetic profile of a gene family

        :return: a dict with counts per species (codes are keys)
        """
        sequences = self.sequences.options(joinedload('species')).all()

        output = {}

        for s in sequences:
            if s.species.code not in output:
                output[s.species.code] = 1
            else:
                output[s.species.code] += 1

        return output

    @staticmethod
    def sequence_stats(sequence_ids):
        """
        Takes a list of sequence IDs and returns InterPro stats for those sequences

        :param sequence_ids: list of sequence ids
        :return: dict with for each InterPro domain linked with any of the input sequences stats
        """
        data = SequenceInterproAssociation.query.filter(SequenceInterproAssociation.sequence_id.in_(sequence_ids)).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def sequence_stats_subquery(sequences):
        """
        Same as sequence_stats but starting from a query (joined against its
        subquery instead of an explicit id list).

        :param sequences: query yielding sequences
        :return: dict with stats per InterPro domain
        """
        subquery = sequences.subquery()

        data = SequenceInterproAssociation.query.join(subquery, SequenceInterproAssociation.sequence_id == subquery.c.id).all()

        return Interpro.__sequence_stats_associations(data)

    @staticmethod
    def __sequence_stats_associations(associations):
        # Aggregates associations into per-domain counts plus distinct
        # sequence and species lists.
        output = {}

        for d in associations:
            if d.interpro_id not in output.keys():
                output[d.interpro_id] = {
                    'domain': d.domain,
                    'count': 1,
                    'sequences': [d.sequence_id],
                    'species': [d.sequence.species_id]
                }
            else:
                output[d.interpro_id]['count'] += 1
                if d.sequence_id not in output[d.interpro_id]['sequences']:
                    output[d.interpro_id]['sequences'].append(d.sequence_id)
                if d.sequence.species_id not in output[d.interpro_id]['species']:
                    output[d.interpro_id]['species'].append(d.sequence.species_id)

        for k, v in output.items():
            v['species_count'] = len(v['species'])
            v['sequence_count'] = len(v['sequences'])

        return output

    @property
    def interpro_stats(self):
        """
        InterPro stats for all sequences carrying this domain.
        """
        # BUGFIX/perf: a full `self.sequences.all()` query used to be run here
        # only to build an unused list of ids — that dead query was removed.
        return Interpro.sequence_stats_subquery(self.sequences)

    @property
    def go_stats(self):
        """GO stats for all sequences carrying this domain."""
        from conekt.models.go import GO

        return GO.sequence_stats_subquery(self.sequences)

    @property
    def family_stats(self):
        """Gene family stats for all sequences carrying this domain."""
        from conekt.models.gene_families import GeneFamily

        return GeneFamily.sequence_stats_subquery(self.sequences)

    @staticmethod
    def add_from_xml(filename, empty=True):
        """
        Populates interpro table with domains and descriptions from the official website's XML file

        :param filename: path to XML file
        :param empty: If True the interpro table will be cleared before uploading the new domains, default = True
        """
        # If required empty the table first
        if empty:
            try:
                db.session.query(Interpro).delete()
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                print(e)

        interpro_parser = InterproParser()

        interpro_parser.readfile(filename)

        for i, domain in enumerate(interpro_parser.domains):
            interpro = Interpro(domain.label, domain.description)

            db.session.add(interpro)

            if i % 40 == 0:
                # commit to the db frequently to allow WHOOSHEE's indexing function to work without timing out
                try:
                    db.session.commit()
                except Exception as e:
                    db.session.rollback()
                    print(e)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_interpro_from_plaza(filename):
        """
        Adds InterPro domain annotation from PLAZA 3.0 to the database

        :param filename: Path to the annotation file
        :return:
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_plaza_interpro(filename)

        gene_hash = {}
        domain_hash = {}

        all_sequences = Sequence.query.all()
        all_domains = Interpro.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for domain in all_domains:
            domain_hash[domain.label] = domain

        new_domains = []

        for gene, domains in interpro_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for domain in domains:
                    if domain["id"] in domain_hash.keys():
                        current_domain = domain_hash[domain["id"]]

                        new_domain = {"sequence_id": current_sequence.id,
                                      "interpro_id": current_domain.id,
                                      "start": domain["start"],
                                      "stop": domain["stop"]}

                        new_domains.append(new_domain)
                    else:
                        print(domain["id"], "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            # insert in batches of ~400 rows to keep statements small
            if len(new_domains) > 400:
                db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
                new_domains = []

        db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)

    @staticmethod
    def add_interpro_from_interproscan(filename, species_id):
        """
        Adds InterPro domain annotation from InterProScan output

        :param filename: Path to the annotation file
        :param species_id: internal id of the species the annotation belongs to
        :return:
        """
        interpro_parser = InterproDomainParser()

        interpro_parser.read_interproscan(filename)

        gene_hash = {}
        domain_hash = {}

        all_sequences = Sequence.query.filter_by(species_id=species_id)
        all_domains = Interpro.query.all()

        for sequence in all_sequences:
            gene_hash[sequence.name] = sequence

        for domain in all_domains:
            domain_hash[domain.label] = domain

        new_domains = []

        for gene, domains in interpro_parser.annotation.items():
            if gene in gene_hash.keys():
                current_sequence = gene_hash[gene]
                for domain in domains:
                    if domain["id"] in domain_hash.keys():
                        current_domain = domain_hash[domain["id"]]

                        new_domain = {"sequence_id": current_sequence.id,
                                      "interpro_id": current_domain.id,
                                      "start": domain["start"],
                                      "stop": domain["stop"]}

                        new_domains.append(new_domain)
                    else:
                        print(domain["id"], "not found in the database.")
            else:
                print("Gene", gene, "not found in the database.")

            # insert in batches of ~400 rows to keep statements small
            if len(new_domains) > 400:
                db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
                new_domains = []

        db.engine.execute(SequenceInterproAssociation.__table__.insert(), new_domains)
class Clade(db.Model):
    """
    ORM model for a phylogenetic clade: a named set of species (stored as a
    JSON list of species codes) with an associated Newick tree. Gene families
    and InterPro domains are assigned to the smallest clade covering all
    species they occur in.
    """
    __tablename__ = 'clades'
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), unique=True, index=True)
    species = db.Column(db.Text(collation=SQL_COLLATION))
    species_count = db.Column(db.Integer)
    newick_tree = db.Column(db.Text)

    families = db.relationship('GeneFamily', backref='clade', lazy='dynamic')
    interpro = db.relationship('Interpro', backref='clade', lazy='dynamic')

    def __init__(self, name, species, tree):
        self.name = name
        self.species = json.dumps(species)
        self.species_count = len(species)
        self.newick_tree = tree

    def __repr__(self):
        return str(self.id) + ". " + self.name

    @staticmethod
    def add_clade(name, species, tree):
        """
        Add a clade to the database

        :param name: name of the clade
        :param species: list with codes (!) of the species in the clade
        :param tree: newick tree for this clade. Will be stored in the database and used for visualizations
        """
        new_clade = Clade(name, species, tree)
        db.session.add(new_clade)

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def add_clades_from_json(data):
        """
        Adds clades from a dict with clade details

        :param data: dict mapping clade names to details ('species' and 'tree')
        """
        # loop variables renamed: the original rebound `data` inside the loop,
        # shadowing the parameter that is being iterated
        for clade_name, details in data.items():
            Clade.add_clade(clade_name, details['species'], details['tree'])

    @staticmethod
    def update_clades():
        """
        Loop over all families and determine what clade they belong too. Results are stored in the database
        """
        clades = Clade.query.all()
        families = GeneFamily.query.all()

        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        for f in families:
            family_species = f.species_codes

            # skip for families without members
            if len(family_species) == 0:
                f.clade_id = None
                continue

            # find the clade with the fewest species that contains all the codes
            selected_clade, _ = get_clade(family_species, clade_to_species)

            if selected_clade is None:
                f.clade_id = None
            else:
                f.clade_id = clade_to_id[selected_clade]

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    def update_clades_interpro():
        """
        Loop over all InterPro domains and determine what clade they belong too
        """
        clades = Clade.query.all()
        interpro = Interpro.query.all()

        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        for i in interpro:
            interpro_species = i.species_codes

            # skip for domains without members
            if len(interpro_species) == 0:
                i.clade_id = None
                continue

            # find the clade with the fewest species that contains all the codes
            selected_clade, _ = get_clade(interpro_species, clade_to_species)

            if selected_clade is None:
                i.clade_id = None
            else:
                i.clade_id = clade_to_id[selected_clade]

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @property
    def newick_tree_species(self):
        """
        Returns a Newick tree with the species present in the current clade.

        :return: Newick tree (string) with species for the current clade
        """
        species = {s.code: s.name for s in Species.query.all()}

        tree = newick.loads(self.newick_tree)[0]

        # replace species codes in the tree with full species names
        for code, name in species.items():
            node = tree.get_node(code)
            if node is not None:
                node.name = name

        return newick.dumps([tree])
class SequenceSequenceECCAssociation(db.Model):
    """
    ORM model linking two sequences through an Expression Context Conservation
    (ECC) score, plus helpers to build Cytoscape-compatible network dicts.
    """
    __tablename__ = 'sequence_sequence_ecc'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    query_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))
    target_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))

    ecc = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    gene_family_method_id = db.Column(
        db.Integer, db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'))
    query_network_method_id = db.Column(
        db.Integer, db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))
    target_network_method_id = db.Column(
        db.Integer, db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))

    gene_family_method = db.relationship(
        'GeneFamilyMethod', lazy='joined',
        backref=db.backref('ecc_as_family_method', lazy='dynamic', passive_deletes=True))

    query_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[query_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_query_method', lazy='dynamic', passive_deletes=True))

    target_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[target_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_target_method', lazy='dynamic', passive_deletes=True))

    @staticmethod
    def get_ecc_network(sequence, network, family):
        """
        Get network connecting a specific sequence to all genes with significant Expression Context Conservation.

        :param sequence: internal ID of sequence
        :param network: network method ID to consider
        :param family: kind of gene families used to detect ECC
        :return: network dict (can be made compatible using CytoscapeHelper)
        """
        data = SequenceSequenceECCAssociation.query.filter(
            and_(SequenceSequenceECCAssociation.query_id == sequence,
                 SequenceSequenceECCAssociation.query_network_method_id == network,
                 SequenceSequenceECCAssociation.gene_family_method_id == family)).all()

        # return an empty dict in case there are no hits for this query
        if len(data) < 1:
            return {'nodes': [], 'edges': []}

        # add the query node
        d = data[0]
        nodes = [{"id": d.query_sequence.name,
                  "name": d.query_sequence.name,
                  "species_id": d.query_sequence.species_id,
                  "species_name": d.query_sequence.species.name,
                  "gene_id": d.query_id,
                  "gene_name": d.query_sequence.name,
                  "network_method_id": network,
                  "node_type": "query"}]
        edges = []

        # group target ids by their network method to fetch inter-target edges below
        networks = {}

        for d in data:
            nodes.append({"id": d.target_sequence.name,
                          "name": d.target_sequence.name,
                          "species_id": d.target_sequence.species_id,
                          "species_name": d.target_sequence.species.name,
                          "gene_id": d.target_id,
                          "network_method_id": d.target_network_method_id,
                          "gene_name": d.target_sequence.name})

            if d.target_network_method_id not in networks.keys():
                networks[d.target_network_method_id] = []
            networks[d.target_network_method_id].append(d.target_id)

            # TODO: add p-value and corrected p once implemented
            edges.append({"source": d.query_sequence.name,
                          "target": d.target_sequence.name,
                          "ecc_score": d.ecc,
                          "edge_type": 0})

        # add ECC edges between the targets themselves (within one network method)
        for n, sequences in networks.items():
            new_data = SequenceSequenceECCAssociation.query.filter(
                and_(SequenceSequenceECCAssociation.query_id.in_(sequences),
                     SequenceSequenceECCAssociation.target_id.in_(sequences),
                     SequenceSequenceECCAssociation.target_network_method_id == n,
                     SequenceSequenceECCAssociation.query_network_method_id == n,
                     SequenceSequenceECCAssociation.gene_family_method_id == family,
                     SequenceSequenceECCAssociation.query_id != SequenceSequenceECCAssociation.target_id)).all()

            for nd in new_data:
                # TODO: add p-value and corrected p once implemented
                # make sure the connection doesn't exist already
                if not any(d['source'] == nd.target_sequence.name and
                           d['target'] == nd.query_sequence.name for d in edges):
                    edges.append({"source": nd.query_sequence.name,
                                  "target": nd.target_sequence.name,
                                  "ecc_score": nd.ecc,
                                  "edge_type": 1})

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def get_ecc_pair_network(ecc_id):
        """
        Get all data for an SequenceSequenceECCAssociation to make a ECC graph, similar to the pairwise comparisons
        in Movahedi et al.

        :param ecc_id: internal id of the SequenceSequenceECCAssociation
        :return: ecc pair with neighborhood as graph dict, plus the gene family method id
        """
        association = SequenceSequenceECCAssociation.query.get_or_404(ecc_id)

        nodes = [
            {"id": association.query_sequence.name,
             "name": association.query_sequence.name,
             "species_id": association.query_sequence.species_id,
             "species_name": association.query_sequence.species.name,
             "gene_id": association.query_id,
             "gene_name": association.query_sequence.name,
             "network_method_id": association.query_network_method_id,
             "node_type": "query"},
            {"id": association.target_sequence.name,
             "name": association.target_sequence.name,
             "species_id": association.target_sequence.species_id,
             "species_name": association.target_sequence.species.name,
             "gene_id": association.target_id,
             "gene_name": association.target_sequence.name,
             "network_method_id": association.target_network_method_id,
             "node_type": "query"},
        ]

        edges = [{"source": association.query_sequence.name,
                  "target": association.target_sequence.name,
                  "ecc_score": association.ecc,
                  'ecc_pair_color': "#D33",
                  "edge_type": "ecc"}]

        query_network = association.query_sequence.network_nodes.filter_by(
            method_id=association.query_network_method_id).first_or_404().network
        target_network = association.target_sequence.network_nodes.filter_by(
            method_id=association.target_network_method_id).first_or_404().network

        query_network_data = json.loads(query_network)
        target_network_data = json.loads(target_network)

        # track ids already added to avoid duplicate nodes
        sequences = [association.query_sequence.id, association.target_sequence.id]

        for n in query_network_data:
            gene_id = n['gene_id'] if 'gene_id' in n.keys() else None
            gene_name = n['gene_name'] if 'gene_name' in n.keys() else None

            if gene_id not in sequences:
                nodes.append({"id": gene_name,
                              "name": gene_name,
                              "species_id": association.query_sequence.species_id,
                              "species_name": association.query_sequence.species.name,
                              "gene_id": gene_id,
                              "gene_name": gene_name,
                              "network_method_id": association.query_network_method_id,
                              "node_type": "target"})
                sequences.append(gene_id)

            edges.append({"source": association.query_sequence.name,
                          "target": gene_name,
                          "link_score": n['link_score'] if 'link_score' in n else 0,
                          "edge_type": "expression",
                          'ecc_pair_color': "#3D3"})

        for n in target_network_data:
            gene_id = n['gene_id'] if 'gene_id' in n.keys() else None
            gene_name = n['gene_name'] if 'gene_name' in n.keys() else None

            if gene_id not in sequences:
                sequences.append(gene_id)
                nodes.append({"id": gene_name,
                              "name": gene_name,
                              "species_id": association.target_sequence.species_id,
                              "species_name": association.target_sequence.species.name,
                              "gene_id": gene_id,
                              "gene_name": gene_name,
                              "network_method_id": association.target_network_method_id,
                              "node_type": "target"})

            edges.append({"source": association.target_sequence.name,
                          "target": gene_name,
                          "link_score": n['link_score'] if 'link_score' in n else 0,
                          "edge_type": "expression",
                          'ecc_pair_color': "#3D3"})

        return {"nodes": nodes, "edges": edges}, association.gene_family_method_id

    @staticmethod
    def get_ecc_multi_network(gf_method_id, sequence_ids):
        """
        Creates an ECC network for multiple genes, the resulting network will contain all ECC partners of the input
        genes. Pruning this network keeping only genes with non-unique label co-occurances is recommended !

        :param gf_method_id: gene family method used to detect ECC
        :param sequence_ids: sequences to include as the core of the network
        :return: network dict, plus the gene family method id
        """
        associations = SequenceSequenceECCAssociation.query.\
            filter(SequenceSequenceECCAssociation.gene_family_method_id == gf_method_id).\
            filter(and_(SequenceSequenceECCAssociation.query_id.in_(sequence_ids),
                        SequenceSequenceECCAssociation.target_id.in_(sequence_ids))).\
            all()

        nodes, edges = [], []
        node_sequence_ids = []

        networks = []

        for a in associations:
            query_network = a.query_sequence.network_nodes.filter_by(
                method_id=a.query_network_method_id).first_or_404().network
            target_network = a.target_sequence.network_nodes.filter_by(
                method_id=a.target_network_method_id).first_or_404().network

            # BUGFIX: the original tested `query_network not in networks`, which
            # compares a JSON *string* against a list of *tuples* and is therefore
            # always True — so the same neighborhood was appended (and expanded
            # below) once per association. Deduplicate on the full tuple instead;
            # output is unchanged (downstream guards already skipped duplicates),
            # only the redundant work is removed.
            query_entry = (a.query_id, a.query_sequence.name,
                           a.query_sequence.species_id, a.query_sequence.species.name,
                           a.query_network_method_id, query_network)
            if query_entry not in networks:
                networks.append(query_entry)

            target_entry = (a.target_id, a.target_sequence.name,
                            a.target_sequence.species_id, a.target_sequence.species.name,
                            a.target_network_method_id, target_network)
            if target_entry not in networks:
                networks.append(target_entry)

            if a.query_id not in node_sequence_ids:
                node_sequence_ids.append(a.query_id)
                nodes.append({"id": a.query_sequence.name,
                              "name": a.query_sequence.name,
                              "species_id": a.query_sequence.species_id,
                              "species_name": a.query_sequence.species.name,
                              "gene_id": a.query_id,
                              "gene_name": a.query_sequence.name,
                              "network_method_id": a.query_network_method_id,
                              "node_type": "query"})

            if a.target_id not in node_sequence_ids:
                node_sequence_ids.append(a.target_id)
                nodes.append({"id": a.target_sequence.name,
                              "name": a.target_sequence.name,
                              "species_id": a.target_sequence.species_id,
                              "species_name": a.target_sequence.species.name,
                              "gene_id": a.target_id,
                              "gene_name": a.target_sequence.name,
                              "network_method_id": a.target_network_method_id,
                              "node_type": "query"})

            edges.append({"source": a.query_sequence.name,
                          "target": a.target_sequence.name,
                          "ecc_score": a.ecc,
                          'ecc_pair_color': "#D33",
                          "edge_type": "ecc"})

        new_edges = []

        for sequence_id, sequence_name, species_id, species_name, network_method_id, n in networks:
            network_data = json.loads(n)

            for node in network_data:
                gene_id = node['gene_id'] if 'gene_id' in node.keys() else None
                gene_name = node['gene_name'] if 'gene_name' in node.keys() else None

                if gene_id not in node_sequence_ids:
                    node_sequence_ids.append(gene_id)
                    nodes.append({"id": gene_name,
                                  "name": gene_name,
                                  "species_id": species_id,
                                  "species_name": species_name,
                                  "gene_id": gene_id,
                                  "gene_name": gene_name,
                                  "network_method_id": network_method_id,
                                  "node_type": "target"})

                # record the edge in both directions so the reverse pair is not added again
                if (sequence_name, gene_name) not in new_edges:
                    new_edges.append((sequence_name, gene_name))
                    new_edges.append((gene_name, sequence_name))

                    edges.append({"source": sequence_name,
                                  "target": gene_name,
                                  "link_score": node['link_score'] if 'link_score' in node else 0,
                                  "edge_type": "expression",
                                  'ecc_pair_color': "#3D3"})

        return {"nodes": nodes, "edges": edges}, gf_method_id