class ClusterCladeEnrichment(db.Model):
    """
    Enrichment of one clade in one coexpression cluster, for a given gene family method.

    Besides the three foreign keys, the row keeps the raw counts and the statistics of the enrichment.
    """
    __tablename__ = 'cluster_clade_enrichment'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    cluster_id = db.Column(db.Integer,
                           db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    clade_id = db.Column(db.Integer,
                         db.ForeignKey('clades.id', ondelete='CASCADE'))
    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'))

    # Each relationship is loaded eagerly (joined) and exposes a dynamic backref on the other side
    gene_family_method = db.relationship('GeneFamilyMethod',
                                         backref=db.backref('clade_enrichment',
                                                            lazy='dynamic',
                                                            passive_deletes=True),
                                         lazy='joined')

    cluster = db.relationship('CoexpressionCluster',
                              backref=db.backref('clade_enrichment',
                                                 lazy='dynamic',
                                                 passive_deletes=True),
                              lazy='joined')

    clade = db.relationship('Clade',
                            backref=db.backref('enriched_clusters',
                                               lazy='dynamic',
                                               passive_deletes=True),
                            lazy='joined')

    # Counts required to calculate the enrichment, stored here for quick access
    cluster_count = db.Column(db.Integer)
    cluster_size = db.Column(db.Integer)
    clade_count = db.Column(db.Integer)
    clade_size = db.Column(db.Integer)

    # Enrichment score (log-transformed), p-value and corrected p-value.
    # Calculated using the hypergeometric distribution and applying FDR correction (aka. BH)
    enrichment = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    @property
    def cluster_percentage(self):
        """
        Expresses cluster_count as a percentage of cluster_size

        :return: cluster_count * 100 / cluster_size
        """
        scaled_count = self.cluster_count * 100
        return scaled_count / self.cluster_size

    @property
    def genome_percentage(self):
        """
        Expresses clade_count as a percentage of clade_size

        :return: clade_count * 100 / clade_size
        """
        scaled_count = self.clade_count * 100
        return scaled_count / self.clade_size
class CoexpressionClusterSimilarity(db.Model):
    """
    Similarity between two coexpression clusters (source and target) for a given gene family method.

    Stores the Jaccard index together with a p-value and a corrected p-value for the pair.
    """
    __tablename__ = 'coexpression_cluster_similarity'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    # Both ends of the comparison point to the same table (coexpression_clusters)
    source_id = db.Column(db.Integer,
                          db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))
    target_id = db.Column(db.Integer,
                          db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))

    gene_family_method_id = db.Column('gene_family_method_id', db.Integer,
                                      db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'),
                                      index=True)

    # All three score columns are indexed
    jaccard_index = db.Column(db.Float, index=True)
    p_value = db.Column(db.Float, index=True)
    corrected_p_value = db.Column(db.Float, index=True)

    # foreign_keys is given explicitly because two columns reference coexpression_clusters
    source = db.relationship('CoexpressionCluster',
                             backref=db.backref('similarity_sources',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined',
                             foreign_keys=[source_id])

    target = db.relationship('CoexpressionCluster',
                             backref=db.backref('similarity_targets',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined',
                             foreign_keys=[target_id])

    gene_family_method = db.relationship('GeneFamilyMethod',
                                         backref=db.backref('CoexpressionClusterSimilarities',
                                                            passive_deletes=True),
                                         lazy='joined')

    @staticmethod
    def empty_table():
        """
        Delete all rows from this table. Use carefully!

        Issues a bulk delete through the model's query; no commit is performed here.
        """
        CoexpressionClusterSimilarity.query.delete()
class SequenceSequenceCladeAssociation(db.Model):
    """
    Relation between two sequences within a tree, annotated with a clade.

    The row also records whether the relation is a duplication and, if so, its consistency score.
    """
    __tablename__ = 'sequence_sequence_clade'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    sequence_one_id = db.Column(db.Integer,
                                db.ForeignKey('sequences.id', ondelete='CASCADE'))
    sequence_two_id = db.Column(db.Integer,
                                db.ForeignKey('sequences.id', ondelete='CASCADE'))

    clade_id = db.Column(db.Integer,
                         db.ForeignKey('clades.id', ondelete='CASCADE'),
                         index=True)
    tree_id = db.Column(db.Integer,
                        db.ForeignKey('trees.id', ondelete='CASCADE'),
                        index=True)

    duplication = db.Column(db.SmallInteger)
    duplication_consistency_score = db.Column(db.Float)

    tree = db.relationship('Tree',
                           lazy='joined',
                           backref=db.backref('sequence_sequence_clade_associations',
                                              lazy='dynamic',
                                              passive_deletes=True))

    clade = db.relationship('Clade',
                            lazy='joined',
                            backref=db.backref('sequence_sequence_clade_associations',
                                               lazy='dynamic',
                                               passive_deletes=True))

    def __str__(self):
        """
        :return: the internal id of the association, formatted as an integer string
        """
        return "%d" % self.id

    @property
    def readable_type(self):
        """
        Human-readable type of the relation

        :return: "Duplication" when the duplication flag is truthy, "Speciation" otherwise
        """
        if self.duplication:
            return "Duplication"

        return "Speciation"

    @property
    def readable_score(self):
        """
        Human-readable duplication consistency score

        :return: the score formatted with three decimals (%.3f) for duplications, "Not available" otherwise
        """
        if not self.duplication:
            return "Not available"

        return "%.3f" % self.duplication_consistency_score
class SequenceFamilyAssociation(db.Model):
    """
    Association table linking a sequence to a gene family.
    """
    __tablename__ = 'sequence_family'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    gene_family_id = db.Column(db.Integer,
                               db.ForeignKey('gene_families.id', ondelete='CASCADE'))

    # Exposed on Sequence as the dynamic collection 'family_associations'
    sequence = db.relationship('Sequence',
                               backref=db.backref('family_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    # Exposed on GeneFamily as the dynamic collection 'sequence_associations'
    family = db.relationship('GeneFamily',
                             backref=db.backref('sequence_associations',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined')
class FamilyGOAssociation(db.Model):
    """
    Association table linking a gene family to a GO term.
    """
    __tablename__ = 'family_go'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    gene_family_id = db.Column(db.Integer,
                               db.ForeignKey('gene_families.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    # Exposed on GeneFamily as the dynamic collection 'go_annotations'
    gene_family = db.relationship('GeneFamily',
                                  backref=db.backref('go_annotations',
                                                     lazy='dynamic',
                                                     passive_deletes=True),
                                  lazy='joined')

    # Exposed on GO as the dynamic collection 'family_associations'
    go_term = db.relationship('GO',
                              backref=db.backref('family_associations',
                                                 lazy='dynamic',
                                                 passive_deletes=True),
                              lazy='joined')
class SequenceInterproAssociation(db.Model):
    """
    Association table linking a sequence to an InterPro entry, with optional start and stop values.
    """
    __tablename__ = 'sequence_interpro'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    interpro_id = db.Column(db.Integer,
                            db.ForeignKey('interpro.id', ondelete='CASCADE'))

    # presumably the position of the domain on the sequence -- TODO confirm units/indexing with the importer
    start = db.Column(db.Integer, default=None)
    stop = db.Column(db.Integer, default=None)

    # Exposed on Sequence as the dynamic collection 'interpro_associations'
    sequence = db.relationship('Sequence',
                               backref=db.backref('interpro_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    # Exposed on Interpro as the dynamic collection 'sequence_associations'
    domain = db.relationship('Interpro',
                             backref=db.backref('sequence_associations',
                                                lazy='dynamic',
                                                passive_deletes=True),
                             lazy='joined')
class SequenceCoexpressionClusterAssociation(db.Model):
    """
    Association table linking a sequence (and the probe name it was measured with) to a coexpression cluster.
    """
    __tablename__ = 'sequence_coexpression_cluster'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    # Probe name, at most 50 characters, indexed
    probe = db.Column(db.String(50), index=True)

    sequence_id = db.Column(db.Integer,
                            db.ForeignKey('sequences.id', ondelete='CASCADE'))
    coexpression_cluster_id = db.Column(db.Integer,
                                        db.ForeignKey('coexpression_clusters.id', ondelete='CASCADE'))

    # Exposed on Sequence as the dynamic collection 'coexpression_cluster_associations'
    sequence = db.relationship('Sequence',
                               backref=db.backref('coexpression_cluster_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    # Exposed on CoexpressionCluster as the dynamic collection 'sequence_associations'
    coexpression_cluster = db.relationship('CoexpressionCluster',
                                           backref=db.backref('sequence_associations',
                                                              lazy='dynamic',
                                                              passive_deletes=True),
                                           lazy='joined')
class CoexpressionClusteringMethod(db.Model):
    """
    A clustering method applied to one expression network; parent of the coexpression clusters it produced.
    """
    __tablename__ = 'coexpression_clustering_methods'
    id = db.Column(db.Integer, primary_key=True)
    network_method_id = db.Column(db.Integer,
                                  db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'),
                                  index=True)
    method = db.Column(db.Text)
    cluster_count = db.Column(db.Integer)

    clusters = db.relationship('CoexpressionCluster',
                               backref=db.backref('method', lazy='joined'),
                               lazy='dynamic',
                               cascade="all, delete-orphan",
                               passive_deletes=True)

    @staticmethod
    def _commit_or_rollback():
        """
        Commits the current session; on failure rolls back and prints the error.

        :return: True if the commit succeeded, False if it was rolled back
        """
        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
            return False

        return True

    @staticmethod
    def update_counts():
        """
        To avoid long counts the number of clusters per method can be precalculated and stored in the database
        using this function
        """
        for m in CoexpressionClusteringMethod.query.all():
            m.cluster_count = m.clusters.count()

        CoexpressionClusteringMethod._commit_or_rollback()

    @staticmethod
    def clusters_from_neighborhoods(method, network_method_id):
        """
        Creates one cluster per probe neighborhood: the probe's own sequence plus all its neighbors.

        :param method: Name for the new clustering method
        :param network_method_id: ID of the network whose neighborhoods are converted into clusters
        """
        commit = CoexpressionClusteringMethod._commit_or_rollback

        probes = ExpressionNetwork.query.filter_by(method_id=network_method_id).all()  # Load all probes

        clusters = defaultdict(list)
        clusters_orm = {}
        sequence_to_probe = {}

        for p in probes:
            # Only consider probes linked with sequences
            if p.sequence_id is None:
                continue

            sequence_to_probe[p.sequence_id] = p.probe
            neighborhood = json.loads(p.network)
            sequence_ids = [n["gene_id"] for n in neighborhood
                            if "gene_id" in n and n["gene_id"] is not None]

            # Only neighborhoods with at least one neighbor become a cluster
            if sequence_ids:
                clusters[p.sequence.name] = [p.sequence_id] + sequence_ids

        # Nothing to store
        if not clusters:
            return

        # Add new method first
        new_method = CoexpressionClusteringMethod()
        new_method.network_method_id = network_method_id
        new_method.method = method
        new_method.cluster_count = len(clusters)

        db.session.add(new_method)

        # Without a stored method the clusters would end up without a parent, so stop here
        if not commit():
            return

        method_id = new_method.id

        # Add clusters, committing every 400
        for name in clusters:
            cluster = CoexpressionCluster()
            cluster.method_id = method_id
            cluster.name = name
            db.session.add(cluster)
            clusters_orm[name] = cluster

            if len(clusters_orm) % 400 == 0:
                commit()

        commit()

        # Add sequence cluster relations, committing every 20 clusters
        for i, (name, members) in enumerate(clusters.items()):
            for sequence_id in members:
                relation = SequenceCoexpressionClusterAssociation()
                relation.sequence_id = sequence_id
                relation.coexpression_cluster_id = clusters_orm[name].id
                # Neighbors without a probe of their own get no probe name
                relation.probe = sequence_to_probe.get(sequence_id)

                db.session.add(relation)

            if i % 20 == 0:
                commit()

        commit()

    @staticmethod
    def build_hcca_clusters(method, network_method_id, step_size=3, hrr_cutoff=30, min_cluster_size=40,
                            max_cluster_size=200):
        """
        method to build HCCA clusters for a certain network

        :param method: Name for the current clustering method
        :param network_method_id: ID for the network to cluster
        :param step_size: desired step_size for the HCCA algorithm
        :param hrr_cutoff: desired hrr_cutoff for the HCCA algorithm
        :param min_cluster_size: minimal cluster size
        :param max_cluster_size: maximum cluster size
        """
        commit = CoexpressionClusteringMethod._commit_or_rollback

        network_data = {}
        sequence_probe = {}

        # Get network from DB
        print("Loading Network data from DB...", sep='')
        ExpressionNetworkMethod.query.get_or_404(network_method_id)  # Check if method exists

        probes = ExpressionNetwork.query.filter_by(method_id=network_method_id).all()  # Load all probes

        for p in probes:
            # Loop over probes and store hrr for all neighbors
            if p.sequence_id is not None:
                neighborhood = json.loads(p.network)
                network_data[p.sequence_id] = {nb["gene_id"]: nb["hrr"] for nb in neighborhood
                                               if "gene_id" in nb
                                               and "hrr" in nb
                                               and nb["gene_id"] is not None}

                sequence_probe[p.sequence_id] = p.probe

        # Double check edges are reciprocally defined.
        # Iterate over a snapshot: new keys are added to network_data below and adding keys to a dict while
        # iterating it raises a RuntimeError. Entries added here are reciprocal by construction, so they
        # don't need to be visited themselves.
        for sequence, data in list(network_data.items()):
            for neighbor, score in data.items():
                if neighbor not in network_data:
                    network_data[neighbor] = {sequence: score}
                elif sequence not in network_data[neighbor]:
                    network_data[neighbor][sequence] = score

        print("Done!\nStarting to build Clusters...\n")

        # Build clusters
        hcca_util = HCCA(
            step_size=step_size,
            hrr_cutoff=hrr_cutoff,
            min_cluster_size=min_cluster_size,
            max_cluster_size=max_cluster_size
        )

        hcca_util.load_data(network_data)
        hcca_util.build_clusters()

        # Unique cluster names (second element of each tuple reported by HCCA)
        clusters = list({t[1] for t in hcca_util.clusters})

        if not clusters:
            print("No clusters found! Not adding anything to DB !")
            return

        print("Done building clusters, adding clusters to DB")

        # Add new method first
        new_method = CoexpressionClusteringMethod()
        new_method.network_method_id = network_method_id
        new_method.method = method
        new_method.cluster_count = len(clusters)

        db.session.add(new_method)

        # Without a stored method the clusters would end up without a parent, so stop here
        if not commit():
            return

        method_id = new_method.id

        # Add cluster and store as dict
        cluster_dict = {}
        for c in clusters:
            cluster = CoexpressionCluster()
            cluster.method_id = method_id
            cluster.name = c
            db.session.add(cluster)
            cluster_dict[c] = cluster

        commit()

        # Link sequences to clusters
        for i, (gene_id, cluster_name, _) in enumerate(hcca_util.clusters):
            cluster = cluster_dict.get(cluster_name)
            cluster_id = cluster.id if cluster is not None else None

            # Relations that cannot be tied to a stored cluster are dropped
            if cluster_id is not None:
                relation = SequenceCoexpressionClusterAssociation()
                relation.probe = sequence_probe.get(gene_id)
                relation.sequence_id = gene_id
                relation.coexpression_cluster_id = cluster_id
                db.session.add(relation)

            if i > 0 and i % 400 == 0:
                # Add relations in sets of 400
                commit()

        # Add remaining relations
        commit()

    @staticmethod
    def add_lstrap_coexpression_clusters(cluster_file, description, network_id, prefix='cluster_', min_size=10):
        """
        Adds MCL clusters, as produced by LSTrAP, to the database

        :param cluster_file: path to file with clusters
        :param description: description to add to database for this set of clusters
        :param network_id: network the clusters are based on
        :param prefix: prefix for individual cluster names (default 'cluster_')
        :param min_size: minimal size of a cluster (default = 10)
        :return: ID of new clustering method
        """
        # get all sequences from the database and create a dictionary (key = upper case name)
        sequence_dict = {member.name.upper(): member for member in Sequence.query.all()}

        # add coexpression clustering method to the database
        clustering_method = CoexpressionClusteringMethod()
        clustering_method.network_method_id = network_id
        clustering_method.method = description

        try:
            db.session.add(clustering_method)
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)
            # NOTE(review): terminates the interpreter (SystemExit); kept as callers may rely on it
            quit()

        with open(cluster_file) as f:
            i = 1
            for line in f:
                probes = line.strip().split()
                # NOTE(review): removes every '.1' in the name, not only a trailing one -- confirm intended
                genes = [p.replace('.1', '') for p in probes]
                cluster_id = "%s%04d" % (prefix, i)

                # Clusters below the minimal size are skipped and don't consume a cluster number
                if len(probes) < min_size:
                    continue

                i += 1

                new_cluster = CoexpressionCluster()
                new_cluster.method_id = clustering_method.id
                new_cluster.name = cluster_id

                db.session.add(new_cluster)

                # The cluster needs an id before its members can be linked
                if not CoexpressionClusteringMethod._commit_or_rollback():
                    continue

                for p, g in zip(probes, genes):
                    sequence = sequence_dict.get(g.upper())

                    new_association = SequenceCoexpressionClusterAssociation()
                    new_association.probe = p
                    # Probes without a matching sequence are stored with sequence_id None
                    new_association.sequence_id = sequence.id if sequence is not None else None
                    new_association.coexpression_cluster_id = new_cluster.id
                    db.session.add(new_association)

                CoexpressionClusteringMethod._commit_or_rollback()

        return clustering_method.id
class TreeMethod(db.Model):
    """
    A method used to build trees for the gene families of one gene family method; parent of those trees.
    """
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)

    description = db.Column(db.Text)

    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Reconciles all trees of this method with the clades in the database.

        For every binary internal node the clade, the event type (duplication or speciation) and, for
        duplications, the consistency score are determined. The node is renamed to '<clade id>_<D|S>_<score>',
        pairwise sequence-sequence-clade associations (in both directions) are inserted in batches and the
        annotated tree is written to data_phyloxml of the tree.

        Leaves whose name doesn't match a sequence in the database are ignored.
        """
        print("\n1.====================Getting into function reconcile_trees")

        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []
        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]
            print("\n3.=========================tree loaded ok")

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        print("\n5.================Non-Binary-node: " + str(node.is_binary))
                        # The upstream warning formatted tree.id / tree.label, which stopped the process on
                        # non-binary trees. Print the offending Newick string instead so the tree that is
                        # skipped can be identified.
                        print("Non-Binary tree: " + t.data_newick)
                    # Otherwise it is a leaf node and can be skipped
                    continue

                branch_one_seq = [l.name.strip() for l in node.descendants[0].get_leaves()]
                branch_two_seq = [l.name.strip() for l in node.descendants[1].get_leaves()]

                # TODO: when leaf names don't match the sequence names in the database these sets are empty
                branch_one_species = {seq_to_species[s] for s in branch_one_seq if s in seq_to_species}
                print("\n8.===============Branch-one-spp: " + ', '.join(branch_one_species))

                branch_two_species = {seq_to_species[s] for s in branch_two_seq if s in seq_to_species}
                print("\n9.===============Branch-two-spp: " + ', '.join(branch_two_species))

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species, branch_two_species, clade_to_species)

                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(branch_one_species,
                                                                            branch_two_species)

                tags = [clade_to_id[clade] if clade is not None else 0,
                        'D' if duplication else 'S',
                        duplication_consistency if duplication else 0]

                node.name = '_'.join([str(tag) for tag in tags])

                if clade is not None:
                    # Leaves unknown to the database have no id to store; skip them (as the species lookup
                    # above already does) rather than aborting with a KeyError
                    branch_one_ids = [seq_to_id[s] for s in branch_one_seq if s in seq_to_id]
                    branch_two_ids = [seq_to_id[s] for s in branch_two_seq if s in seq_to_id]

                    for id_one in branch_one_ids:
                        for id_two in branch_two_ids:
                            # Store the association in both directions
                            for first, second in ((id_one, id_two), (id_two, id_one)):
                                new_associations.append({
                                    'sequence_one_id': first,
                                    'sequence_two_id': second,
                                    'tree_id': t.id,
                                    'clade_id': clade_to_id[clade],
                                    'duplication': 1 if duplication else 0,
                                    'duplication_consistency_score': duplication_consistency
                                })

                # Insert in batches to keep memory usage in check
                if len(new_associations) > 400:
                    db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(), new_associations)
                    new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # Only flush when something is left: executing an insert with an empty parameter list is not a
        # no-op (SQLAlchemy may emit a single INSERT of default values)
        if new_associations:
            db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(), new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data:
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
class TreeMethod(db.Model):
    """
    A method used to build trees for the gene families of one gene family method; parent of those trees.
    """
    __tablename__ = 'tree_methods'
    id = db.Column(db.Integer, primary_key=True)

    description = db.Column(db.Text)

    gene_family_method_id = db.Column(db.Integer,
                                      db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'),
                                      index=True)

    trees = db.relationship('Tree',
                            backref=db.backref('method', lazy='joined'),
                            lazy='dynamic',
                            passive_deletes=True)

    def reconcile_trees(self):
        """
        Reconciles all trees of this method with the clades in the database.

        For every binary internal node the clade, the event type (duplication or speciation) and, for
        duplications, the consistency score are determined. The node is renamed to '<clade id>_<D|S>_<score>',
        pairwise sequence-sequence-clade associations (in both directions) are inserted in batches and the
        annotated tree is written to data_phyloxml of the tree.

        Leaves whose name doesn't match a sequence in the database are ignored.
        """
        # Fetch required data from the database
        sequences = Sequence.query.all()
        clades = Clade.query.all()

        seq_to_species = {s.name: s.species.code for s in sequences}
        seq_to_id = {s.name: s.id for s in sequences}
        clade_to_species = {c.name: json.loads(c.species) for c in clades}
        clade_to_id = {c.name: c.id for c in clades}

        new_associations = []
        phyloxml_data = {}

        for t in self.trees:
            # Load tree from Newick string and start reconciliating
            tree = newick.loads(t.data_newick)[0]

            for node in tree.walk():
                if len(node.descendants) != 2:
                    if not node.is_binary:
                        # Print warning in case there is a non-binary node. Report the database tree (t) and
                        # the node itself: `tree` is the parsed newick node, not the Tree row, so formatting
                        # tree.id / tree.label here would abort the whole reconciliation.
                        print("[%d, %s] Skipping node... Can only reconcile binary nodes ..."
                              % (t.id, node.name))
                    # Otherwise it is a leaf node and can be skipped
                    continue

                branch_one_seq = [l.name.strip() for l in node.descendants[0].get_leaves()]
                branch_two_seq = [l.name.strip() for l in node.descendants[1].get_leaves()]

                branch_one_species = {seq_to_species[s] for s in branch_one_seq if s in seq_to_species}
                branch_two_species = {seq_to_species[s] for s in branch_two_seq if s in seq_to_species}

                all_species = branch_one_species.union(branch_two_species)

                clade, _ = phylo.get_clade(all_species, clade_to_species)
                duplication = phylo.is_duplication(branch_one_species, branch_two_species, clade_to_species)

                duplication_consistency = None
                if duplication:
                    duplication_consistency = phylo.duplication_consistency(branch_one_species,
                                                                            branch_two_species)

                tags = [clade_to_id[clade] if clade is not None else 0,
                        'D' if duplication else 'S',
                        duplication_consistency if duplication else 0]

                node.name = '_'.join([str(tag) for tag in tags])

                if clade is not None:
                    # Leaves unknown to the database have no id to store; skip them (as the species lookup
                    # above already does) rather than aborting with a KeyError
                    branch_one_ids = [seq_to_id[s] for s in branch_one_seq if s in seq_to_id]
                    branch_two_ids = [seq_to_id[s] for s in branch_two_seq if s in seq_to_id]

                    for id_one in branch_one_ids:
                        for id_two in branch_two_ids:
                            # Store the association in both directions
                            for first, second in ((id_one, id_two), (id_two, id_one)):
                                new_associations.append({
                                    'sequence_one_id': first,
                                    'sequence_two_id': second,
                                    'tree_id': t.id,
                                    'clade_id': clade_to_id[clade],
                                    'duplication': 1 if duplication else 0,
                                    'duplication_consistency_score': duplication_consistency
                                })

                # Insert in batches to keep memory usage in check
                if len(new_associations) > 400:
                    db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(), new_associations)
                    new_associations = []

            # add newick tree to memory
            phyloxml_data[t.id] = newick.dumps([tree])

        # Only flush when something is left: executing an insert with an empty parameter list is not a
        # no-op (SQLAlchemy may emit a single INSERT of default values)
        if new_associations:
            db.engine.execute(SequenceSequenceCladeAssociation.__table__.insert(), new_associations)

        # Update PhyloXML data file for all trees
        for t in self.trees:
            if t.id in phyloxml_data:
                t.data_phyloxml = phyloxml_data[t.id]

        db.session.commit()
class ExpressionProfile(db.Model):
    """
    Expression profile of one probe in one species, optionally linked to a sequence.

    The profile itself is stored as a JSON string (deferred column) with the keys 'order', 'colors' and 'data',
    where 'data' maps a condition to the list of values measured for it.
    """
    __tablename__ = 'expression_profiles'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id', ondelete='CASCADE'), index=True)
    probe = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'), index=True)
    profile = db.deferred(db.Column(db.Text))

    specificities = db.relationship('ExpressionSpecificity',
                                    backref=db.backref('profile', lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    def __init__(self, probe, sequence_id, profile):
        """
        :param probe: name of the probe
        :param sequence_id: internal id of the linked sequence
        :param profile: the profile (as stored in the profile column)
        """
        self.probe = probe
        self.sequence_id = sequence_id
        self.profile = profile

    @staticmethod
    def __profile_to_table(data):
        """
        Internal function to convert an expression profile (dict) to a tabular text

        Conditions that cannot be summarized (e.g. missing from the data) are reported on stdout and skipped.

        :param data: Dict with expression profile
        :return: table (string)
        """
        output = [["condition", "mean", "min", "max"]]

        for o in data["order"]:
            try:
                values = data["data"][o]
                output.append([o,
                               str(mean(values)),
                               str(min(values)),
                               str(max(values))
                               ])
            except Exception as e:
                print(e)

        return '\n'.join('\t'.join(row) for row in output)

    @property
    def table(self):
        """
        Returns the condition expression as a tabular text file

        :return: table with data (string)
        """
        return ExpressionProfile.__profile_to_table(json.loads(self.profile))

    def tissue_table(self, condition_tissue_id, use_means=True):
        """
        Returns the tissue expression as a tabular text file

        :param condition_tissue_id: condition_tissue_id for the conversion
        :param use_means: Use the mean of the condition (recommended)
        :return: table with data (string)
        """
        return ExpressionProfile.__profile_to_table(self.tissue_profile(condition_tissue_id,
                                                                        use_means=use_means))

    @property
    def low_abundance(self, cutoff=10):
        """
        Checks if the mean expression value in any conditions in the plot is higher than the desired cutoff

        As this is a property it is always evaluated with the default cutoff.

        :param cutoff: cutoff for expression, default = 10
        :return: True in case of low abundance otherwise False
        """
        data = json.loads(self.profile)

        return not any(mean(v) > cutoff for v in data["data"].values())

    @staticmethod
    def convert_profile(condition_to_tissue, profile_data, use_means=True):
        """
        Convert a full, detailed profile into a more general summarized one using conversion table stored in the
        database

        :param condition_to_tissue: dict with conversion instructions
        :param profile_data: profile to convert
        :param use_means: use means of detailed condition if True otherwise use samples independently. Default True
        :return: New profile
        """
        conversion = condition_to_tissue['conversion']
        tissues = list(set(conversion.values()))

        output = {}

        for t in tissues:
            valid_values = []

            # Collect the values of all conditions that are converted into this tissue
            for k, v in profile_data['data'].items():
                if k in conversion and conversion[k] == t:
                    if use_means:
                        valid_values.append(mean(v))
                    else:
                        valid_values += v

            # Tissues without any data get a single zero
            output[t] = valid_values if len(valid_values) > 0 else [0]

        return {'order': condition_to_tissue['order'],
                'colors': condition_to_tissue['colors'],
                'data': output}

    def tissue_profile(self, condition_tissue_id, use_means=True):
        """
        Applies a conversion to the profile, grouping several condition into one more general feature (e.g. tissue).

        :param condition_tissue_id: identifier of the conversion table
        :param use_means: store the mean of the condition rather than individual values. The matches the spm
        calculations better.
        :return: parsed profile
        """
        ct = ConditionTissue.query.get(condition_tissue_id)

        condition_to_tissue = json.loads(ct.data)
        profile_data = json.loads(self.profile)

        return ExpressionProfile.convert_profile(condition_to_tissue, profile_data, use_means=use_means)

    @staticmethod
    def get_heatmap(species_id, probes, zlog=True, raw=False):
        """
        Returns a heatmap for a given species (species_id) and a list of probes. It returns a dict with 'order'
        the order of the experiments and 'heatmap_data' a list with one dict per profile containing the actual
        data. Probes for which no profile was found are reported to the user using flash.

        :param species_id: species id (internal database id)
        :param probes: a list of probes to include in the heatmap
        :param zlog: enable zlog transformation (otherwise normalization against highest expressed condition)
        :param raw: when zlog is disabled, return the mean values as is (skip the normalization)
        :return: dict with the keys 'order' and 'heatmap_data'
        """
        profiles = ExpressionProfile.query.options(undefer('profile')).filter_by(species_id=species_id).\
            filter(ExpressionProfile.probe.in_(probes)).all()

        order = []
        output = []

        not_found = [p.lower() for p in probes]

        for profile in profiles:
            name = profile.probe
            data = json.loads(profile.profile)
            order = data['order']
            experiments = data['data']

            # A profile can be requested by probe or by sequence name, cross off both
            with contextlib.suppress(ValueError):
                not_found.remove(profile.probe.lower())

            with contextlib.suppress(ValueError):
                not_found.remove(profile.sequence.name.lower())

            values = {o: mean(experiments[o]) for o in order}

            row_mean = mean(values.values())
            row_max = max(values.values())

            for o in order:
                if zlog:
                    # log is undefined for zero, those cells are marked with '-'
                    if row_mean == 0 or values[o] == 0:
                        values[o] = '-'
                    else:
                        try:
                            values[o] = log(values[o] / row_mean, 2)
                        except ValueError as _:
                            print("Unable to calculate log()", values[o], row_mean)
                            values[o] = '-'
                else:
                    if row_max != 0 and not raw:
                        values[o] = values[o] / row_max

            output.append({"name": name,
                           "values": values,
                           "sequence_id": profile.sequence_id,
                           "shortest_alias": profile.sequence.shortest_alias})

        if len(not_found) > 0:
            flash("Couldn't find profile for: %s" % ", ".join(not_found), "warning")

        return {'order': order, 'heatmap_data': output}

    @staticmethod
    def get_profiles(species_id, probes, limit=1000):
        """
        Gets the data for a set of probes (including the full profiles), a limit can be provided to avoid overly
        long queries

        :param species_id: internal id of the species
        :param probes: probe names to fetch
        :param limit: maximum number of probes to get
        :return: List of ExpressionProfile objects including the full profiles
        """
        profiles = ExpressionProfile.query.\
            options(undefer('profile')).\
            filter(ExpressionProfile.probe.in_(probes)).\
            filter_by(species_id=species_id).\
            options(joinedload('sequence').load_only('name').noload('xrefs')).\
            limit(limit).all()

        return profiles

    @staticmethod
    def add_profile_from_lstrap(matrix_file, annotation_file, species_id, order_color_file=None):
        """
        Function to convert an (normalized) expression matrix (lstrap output) into a profile

        When no order/color file is given (or it contains no valid line) the conditions are ordered
        alphabetically and no colors are stored.

        :param matrix_file: path to the expression matrix
        :param annotation_file: path to the file assigning samples to conditions
        :param species_id: internal id of the species
        :param order_color_file: tab delimited file that contains the order and color of conditions
        """
        annotation = {}

        with open(annotation_file, 'r') as fin:
            # get rid of the header
            _ = fin.readline()

            for line in fin:
                parts = line.strip().split('\t')
                if len(parts) > 1:
                    # Only the first two columns are used, additional columns are ignored
                    run, description = parts[:2]
                    annotation[run] = description

        order, colors = [], []
        if order_color_file is not None:
            with open(order_color_file, 'r') as fin:
                for line in fin:
                    try:
                        o, c = line.strip().split('\t')
                    except ValueError:
                        # Lines that don't have exactly two columns are ignored
                        continue

                    order.append(o)
                    colors.append(c)

        # build conversion table for sequences (key = sequence name uppercase, value internal id)
        sequences = Sequence.query.filter_by(species_id=species_id).all()
        sequence_dict = {s.name.upper(): s.id for s in sequences}

        with open(matrix_file) as fin:
            # read header
            _, *colnames = fin.readline().rstrip().split()

            colnames = [c.replace('.htseq', '') for c in colnames]

            # Determine the order from the annotation if it is not defined. The list is initialized empty
            # above (never None), so test for emptiness.
            if not order:
                order = sorted({annotation[c] for c in colnames if c in annotation})

            # read each line and build profile
            new_probes = []
            for line in fin:
                fields = line.rstrip().split()

                # Skip blank lines (e.g. at the end of the file)
                if not fields:
                    continue

                transcript, *values = fields
                profile = defaultdict(list)

                for c, v in zip(colnames, values):
                    if c in annotation:
                        profile[annotation[c]].append(float(v))

                new_probes.append({"species_id": species_id,
                                   "probe": transcript,
                                   "sequence_id": sequence_dict.get(transcript.upper()),
                                   "profile": json.dumps({"order": order,
                                                          "colors": colors,
                                                          "data": profile})
                                   })

                # Insert in batches to keep memory usage in check
                if len(new_probes) > 400:
                    db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
                    new_probes = []

            # Only flush when something is left: executing an insert with an empty parameter list is not a
            # no-op (SQLAlchemy may emit a single INSERT of default values)
            if new_probes:
                db.engine.execute(ExpressionProfile.__table__.insert(), new_probes)
class SequenceSequenceECCAssociation(db.Model):
    """
    Association between two sequences (query and target) that stores their Expression Context Conservation
    (ECC) score, together with the gene family method and the network methods the score refers to.
    """
    __tablename__ = 'sequence_sequence_ecc'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)

    query_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))
    target_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))

    ecc = db.Column(db.Float)
    p_value = db.Column(db.Float)
    corrected_p_value = db.Column(db.Float)

    gene_family_method_id = db.Column(
        db.Integer,
        db.ForeignKey('gene_family_methods.id', ondelete='CASCADE'))
    query_network_method_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))
    target_network_method_id = db.Column(
        db.Integer,
        db.ForeignKey('expression_network_methods.id', ondelete='CASCADE'))

    gene_family_method = db.relationship('GeneFamilyMethod',
                                         lazy='joined',
                                         backref=db.backref(
                                             'ecc_as_family_method',
                                             lazy='dynamic',
                                             passive_deletes=True))

    query_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[query_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_query_method',
                           lazy='dynamic',
                           passive_deletes=True))
    target_expression_network_method = db.relationship(
        'ExpressionNetworkMethod',
        foreign_keys=[target_network_method_id],
        lazy='joined',
        backref=db.backref('ecc_as_target_method',
                           lazy='dynamic',
                           passive_deletes=True))

    @staticmethod
    def get_ecc_network(sequence, network, family):
        """
        Get network connecting a specific sequence to all genes with significant Expression Context Conservation.

        Edges with edge_type 0 connect the query with its ECC partners, edges with edge_type 1 connect
        ECC partners (within the same network method) with each other.

        :param sequence: internal ID of sequence
        :param network: network method ID to consider
        :param family: kind of gene families used to detect ECC
        :return: network dict (can be made compatible using CytoscapeHelper)
        """
        data = SequenceSequenceECCAssociation.query.filter(
            and_(
                SequenceSequenceECCAssociation.query_id == sequence,
                SequenceSequenceECCAssociation.query_network_method_id == network,
                SequenceSequenceECCAssociation.gene_family_method_id == family)).all()

        # return an empty dict in case there are no hits for this query
        if not data:
            return {'nodes': [], 'edges': []}

        # add the query node
        first = data[0]
        nodes = [{
            "id": first.query_sequence.name,
            "name": first.query_sequence.name,
            "species_id": first.query_sequence.species_id,
            "species_name": first.query_sequence.species.name,
            "gene_id": first.query_id,
            "gene_name": first.query_sequence.name,
            "network_method_id": network,
            "node_type": "query"
        }]
        edges = []

        # (source name, target name) of every edge added so far, allows a constant time check
        # for the reverse edge below
        edge_pairs = set()

        # key = network method of the targets, value = list of target sequence ids
        networks = {}

        for d in data:
            nodes.append({
                "id": d.target_sequence.name,
                "name": d.target_sequence.name,
                "species_id": d.target_sequence.species_id,
                "species_name": d.target_sequence.species.name,
                "gene_id": d.target_id,
                "network_method_id": d.target_network_method_id,
                "gene_name": d.target_sequence.name
            })

            networks.setdefault(d.target_network_method_id, []).append(d.target_id)

            # TODO: add p-value and corrected p once implemented
            edges.append({
                "source": d.query_sequence.name,
                "target": d.target_sequence.name,
                "ecc_score": d.ecc,
                "edge_type": 0
            })
            edge_pairs.add((d.query_sequence.name, d.target_sequence.name))

        for n, sequences in networks.items():
            new_data = SequenceSequenceECCAssociation.query.filter(
                and_(
                    SequenceSequenceECCAssociation.query_id.in_(sequences),
                    SequenceSequenceECCAssociation.target_id.in_(sequences),
                    SequenceSequenceECCAssociation.target_network_method_id == n,
                    SequenceSequenceECCAssociation.query_network_method_id == n,
                    SequenceSequenceECCAssociation.gene_family_method_id == family,
                    SequenceSequenceECCAssociation.query_id != SequenceSequenceECCAssociation.target_id)).all()

            for nd in new_data:
                # TODO: add p-value and corrected p once implemented
                source_name = nd.query_sequence.name
                target_name = nd.target_sequence.name

                # make sure the connection doesn't exist already (in the opposite direction)
                if (target_name, source_name) not in edge_pairs:
                    edges.append({
                        "source": source_name,
                        "target": target_name,
                        "ecc_score": nd.ecc,
                        "edge_type": 1
                    })
                    edge_pairs.add((source_name, target_name))

        return {"nodes": nodes, "edges": edges}

    @staticmethod
    def _add_neighborhood(nodes, edges, seen_ids, center, network_method_id, network_data):
        """
        Adds the neighbors of one gene to a graph under construction (nodes, edges and seen_ids are
        modified in place).

        A node is only added for genes whose id is not in seen_ids yet, an expression edge from the
        center gene to the neighbor is always added.

        :param nodes: list of node dicts to extend
        :param edges: list of edge dicts to extend
        :param seen_ids: set with the gene ids that already have a node
        :param center: sequence the neighborhood belongs to
        :param network_method_id: network method the neighborhood was taken from
        :param network_data: de-serialized neighborhood (list of dicts)
        """
        for n in network_data:
            gene_id = n.get('gene_id')
            gene_name = n.get('gene_name')

            if gene_id not in seen_ids:
                seen_ids.add(gene_id)
                nodes.append({
                    "id": gene_name,
                    "name": gene_name,
                    "species_id": center.species_id,
                    "species_name": center.species.name,
                    "gene_id": gene_id,
                    "gene_name": gene_name,
                    "network_method_id": network_method_id,
                    "node_type": "target"
                })

            edges.append({
                "source": center.name,
                "target": gene_name,
                "link_score": n.get('link_score', 0),
                "edge_type": "expression",
                'ecc_pair_color': "#3D3"
            })

    @staticmethod
    def get_ecc_pair_network(ecc_id):
        """
        Get all data for an SequenceSequenceECCAssociation to make a ECC graph, similar to the pairwise comparisons
        in Movahedi et al.

        Aborts with a 404 if the association, or the stored neighborhood of one of its two genes, is missing.

        :param ecc_id: internal id of the SequenceSequenceECCAssociation
        :return: tuple with the ecc pair with neighborhood as graph dict and the gene family method id
        """
        association = SequenceSequenceECCAssociation.query.get_or_404(ecc_id)
        query_sequence = association.query_sequence
        target_sequence = association.target_sequence

        nodes = [
            {
                "id": query_sequence.name,
                "name": query_sequence.name,
                "species_id": query_sequence.species_id,
                "species_name": query_sequence.species.name,
                "gene_id": association.query_id,
                "gene_name": query_sequence.name,
                "network_method_id": association.query_network_method_id,
                "node_type": "query"
            },
            {
                "id": target_sequence.name,
                "name": target_sequence.name,
                "species_id": target_sequence.species_id,
                "species_name": target_sequence.species.name,
                "gene_id": association.target_id,
                "gene_name": target_sequence.name,
                "network_method_id": association.target_network_method_id,
                "node_type": "query"
            },
        ]

        edges = [{
            "source": query_sequence.name,
            "target": target_sequence.name,
            "ecc_score": association.ecc,
            'ecc_pair_color': "#D33",
            "edge_type": "ecc"
        }]

        query_network = query_sequence.network_nodes.filter_by(
            method_id=association.query_network_method_id).first_or_404().network
        target_network = target_sequence.network_nodes.filter_by(
            method_id=association.target_network_method_id).first_or_404().network

        query_network_data = json.loads(query_network)
        target_network_data = json.loads(target_network)

        # ids of the genes that have a node already, starts with the two genes of the pair
        seen_ids = {query_sequence.id, target_sequence.id}

        SequenceSequenceECCAssociation._add_neighborhood(nodes, edges, seen_ids, query_sequence,
                                                         association.query_network_method_id,
                                                         query_network_data)
        SequenceSequenceECCAssociation._add_neighborhood(nodes, edges, seen_ids, target_sequence,
                                                         association.target_network_method_id,
                                                         target_network_data)

        return {
            "nodes": nodes,
            "edges": edges
        }, association.gene_family_method_id

    @staticmethod
    def get_ecc_multi_network(gf_method_id, sequence_ids):
        """
        Creates an ECC network for multiple genes, the resulting network will contain all ECC partners of the input
        genes. Pruning this network keeping only genes with non-unique label co-occurances is recommended !

        Aborts with a 404 if the stored neighborhood of one of the genes is missing.

        :param gf_method_id: gene family method used to detect ECC
        :param sequence_ids: sequences to include as the core of the network
        :return: tuple with the network dict and the gene family method id
        """
        associations = SequenceSequenceECCAssociation.query.\
            filter(SequenceSequenceECCAssociation.gene_family_method_id == gf_method_id).\
            filter(and_(SequenceSequenceECCAssociation.query_id.in_(sequence_ids),
                        SequenceSequenceECCAssociation.target_id.in_(sequence_ids))).\
            all()

        nodes, edges = [], []
        node_sequence_ids = set()

        # neighborhoods to add later on, each (sequence, network method) combination is fetched only once
        networks = []
        seen_networks = set()

        for a in associations:
            endpoints = ((a.query_id, a.query_sequence, a.query_network_method_id),
                         (a.target_id, a.target_sequence, a.target_network_method_id))

            for sequence_id, sequence, network_method_id in endpoints:
                if (sequence_id, network_method_id) not in seen_networks:
                    seen_networks.add((sequence_id, network_method_id))
                    network = sequence.network_nodes.filter_by(
                        method_id=network_method_id).first_or_404().network
                    networks.append((sequence.name, sequence.species_id, sequence.species.name,
                                     network_method_id, network))

                if sequence_id not in node_sequence_ids:
                    node_sequence_ids.add(sequence_id)
                    nodes.append({
                        "id": sequence.name,
                        "name": sequence.name,
                        "species_id": sequence.species_id,
                        "species_name": sequence.species.name,
                        "gene_id": sequence_id,
                        "gene_name": sequence.name,
                        "network_method_id": network_method_id,
                        "node_type": "query"
                    })

            edges.append({
                "source": a.query_sequence.name,
                "target": a.target_sequence.name,
                "ecc_score": a.ecc,
                'ecc_pair_color': "#D33",
                "edge_type": "ecc"
            })

        # expression edges added so far, stored in both directions so each pair is connected only once
        new_edges = set()

        for sequence_name, species_id, species_name, network_method_id, n in networks:
            network_data = json.loads(n)
            for node in network_data:
                gene_id = node.get('gene_id')
                gene_name = node.get('gene_name')

                if gene_id not in node_sequence_ids:
                    node_sequence_ids.add(gene_id)
                    nodes.append({
                        "id": gene_name,
                        "name": gene_name,
                        "species_id": species_id,
                        "species_name": species_name,
                        "gene_id": gene_id,
                        "gene_name": gene_name,
                        "network_method_id": network_method_id,
                        "node_type": "target"
                    })

                if (sequence_name, gene_name) not in new_edges:
                    new_edges.add((sequence_name, gene_name))
                    new_edges.add((gene_name, sequence_name))

                    edges.append({
                        "source": sequence_name,
                        "target": gene_name,
                        "link_score": node.get('link_score', 0),
                        "edge_type": "expression",
                        'ecc_pair_color': "#3D3"
                    })

        return {"nodes": nodes, "edges": edges}, gf_method_id
class ExpressionNetworkMethod(db.Model):
    """
    Method used to build a co-expression network for a species. Apart from the stored settings it offers
    the functions to (re-)count the probes in a network and to calculate ECC scores between networks.
    """
    __tablename__ = 'expression_network_methods'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer, db.ForeignKey('species.id'), index=True)
    description = db.Column(db.Text)
    edge_type = db.Column(db.Enum("rank", "weight", name='edge_type'))
    probe_count = db.Column(db.Integer)

    hrr_cutoff = db.Column(db.Integer)
    pcc_cutoff = db.Column(db.Float)
    enable_second_level = db.Column(db.SmallInteger)

    probes = db.relationship('ExpressionNetwork',
                             backref=db.backref('method', lazy='joined'),
                             lazy='dynamic',
                             cascade="all, delete-orphan",
                             passive_deletes=True)

    clustering_methods = db.relationship('CoexpressionClusteringMethod',
                                         backref='network_method',
                                         lazy='dynamic',
                                         cascade='all, delete-orphan',
                                         passive_deletes=True)

    def __init__(self, species_id, description, edge_type="rank"):
        self.species_id = species_id
        self.description = description
        self.edge_type = edge_type
        self.enable_second_level = False

    def __repr__(self):
        # string formatting (instead of concatenation) so a missing description cannot raise a TypeError
        return "%s. %s [%s]" % (self.id, self.description, self.species)

    @staticmethod
    def update_count():
        """
        To avoid long count queries the number of networks for each method can be precalculated and stored in the
        database using this function
        """
        methods = ExpressionNetworkMethod.query.all()

        for m in methods:
            m.probe_count = m.probes.count()

        try:
            db.session.commit()
        except Exception as e:
            db.session.rollback()
            print(e)

    @staticmethod
    @benchmark
    def calculate_ecc(network_method_ids, gene_family_method_id, max_size=100):
        """
        Function to calculate the ECC scores in and between genes of different networks

        ORM free method for speed !

        Only significant scores (better than the permutation based threshold) are stored, each of them in
        both directions.

        :param network_method_ids: array of networks (using their internal id !) to compare
        :param gene_family_method_id: internal id of the type of family methods to be used for the comparison
        :param max_size: maximum number of families considered when determining thresholds
        """
        network_families = {}
        sequence_network = {}
        sequence_network_method = {}
        sequence_family = {}
        family_sequence = {}

        # Get all the network information and store in dictionary
        for n in network_method_ids:
            current_network = db.engine.execute(db.select([ExpressionNetwork.__table__.c.sequence_id,
                                                           ExpressionNetwork.__table__.c.network,
                                                           ExpressionNetwork.__table__.c.method_id]).
                                                where(ExpressionNetwork.__table__.c.method_id == n).
                                                where(ExpressionNetwork.__table__.c.sequence_id.isnot(None))
                                                ).fetchall()

            for sequence, network, network_method_id in current_network:
                if sequence is not None:
                    sequence_network[int(sequence)] = network
                    sequence_network_method[int(sequence)] = int(network_method_id)

        # Get family data and store in dictionary
        current_families = db.engine.execute(db.select([SequenceFamilyAssociation.__table__.c.sequence_id,
                                                        SequenceFamilyAssociation.__table__.c.gene_family_id,
                                                        GeneFamily.__table__.c.method_id]).
                                             select_from(SequenceFamilyAssociation.__table__.join(GeneFamily.__table__)).
                                             where(GeneFamily.__table__.c.method_id == gene_family_method_id)
                                             ).fetchall()

        for sequence, family, _ in current_families:
            sequence_family[int(sequence)] = int(family)
            family_sequence.setdefault(int(family), []).append(int(sequence))

        # Create a dict (key = network) with the families present in that network
        # Families that occur multiple times should be present multiple times as this is used
        # to set threshholds later !
        for sequence, network_method in sequence_network_method.items():
            families = network_families.setdefault(network_method, [])

            # ignore sequences without a family, ideally this shouldn't happen
            if sequence in sequence_family:
                families.append(sequence_family[sequence])

        # Determine threshold and p-value
        # A background model will be computed for each combination of networks, an ECC score will need to be better
        # than 95 % of the randomly found values to be considered significant
        thresholds = {}
        print("Starting permutation tests")
        for n in network_method_ids:
            thresholds[n] = {}
            for m in network_method_ids:
                # a network without any stored neighborhood has no families, use an empty list in that case
                thresholds[n][m] = ExpressionNetworkMethod.__set_thresholds(network_families.get(n, []),
                                                                            network_families.get(m, []),
                                                                            max_size=max_size)

        # Data loaded start calculating ECCs
        new_ecc_scores = []

        for family, sequences in family_sequence.items():
            # compare each pair of sequences within the family once
            for i, query in enumerate(sequences):
                for target in sequences[i + 1:]:
                    if query not in sequence_network or target not in sequence_network or query == target:
                        continue

                    # Ignore genes with overlapping neighborhoods
                    if ExpressionNetworkMethod.__neighborhoods_overlap(sequence_network[query],
                                                                       sequence_network[target]):
                        continue

                    query_method = sequence_network_method[query]
                    target_method = sequence_network_method[target]

                    ecc, significant = ExpressionNetworkMethod.__ecc(sequence_network[query],
                                                                     sequence_network[target],
                                                                     sequence_family,
                                                                     thresholds[query_method][target_method],
                                                                     family,
                                                                     max_size=max_size)
                    if not significant:
                        continue

                    new_ecc_scores.append({
                        'query_id': query,
                        'target_id': target,
                        'ecc': ecc,
                        'gene_family_method_id': gene_family_method_id,
                        'query_network_method_id': query_method,
                        'target_network_method_id': target_method,
                    })

                    # add reciprocal relation
                    new_ecc_scores.append({
                        'query_id': target,
                        'target_id': query,
                        'ecc': ecc,
                        'gene_family_method_id': gene_family_method_id,
                        'query_network_method_id': target_method,
                        'target_network_method_id': query_method,
                    })

                    # insert in batches to keep the statements small
                    if len(new_ecc_scores) > 400:
                        db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)
                        new_ecc_scores = []

        # add the remaining scores, never execute the insert with an empty parameter list
        if new_ecc_scores:
            db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)

    @staticmethod
    def __neighborhoods_overlap(neighborhood_a, neighborhood_b):
        """
        Checks if two genes have overlapping networks

        Entries without a gene_id (missing or None) are ignored.

        :param neighborhood_a: neighborhood for first gene (string as stored in database)
        :param neighborhood_b: neighborhood for second gene (string as stored in database)
        :return: Bool, true if networks overlap
        """
        genes_a = {n.get('gene_id') for n in json.loads(neighborhood_a)} - {None}
        genes_b = {n.get('gene_id') for n in json.loads(neighborhood_b)} - {None}

        return not genes_a.isdisjoint(genes_b)

    @staticmethod
    def __ecc(q_network, t_network, families, thresholds, query_family, max_size=30):
        """
        Takes the networks neighborhoods (as stored in the databases), extracts the genes and find the families for
        each gene. Next the ECC score is calculated

        :param q_network: network for the query gene
        :param t_network: network for the target gene
        :param families: dictionary that links a sequence id (key) to a family id (value)
        :param thresholds: matrix (list of lists) with the thresholds, indexed by the number of distinct families
        (minus one) in the query and target neighborhood
        :param query_family: name of the input gene family
        :param max_size: sizes above this value are capped to it before looking up the threshold
        :return: the ECC score for the two input neighborhoods given the families, a boolean flag if this is significant
        """
        q_genes = [g.get('gene_id') for g in json.loads(q_network)]
        t_genes = [g.get('gene_id') for g in json.loads(t_network)]

        # families in the neighborhoods, the family of the input genes itself is excluded
        q_families = [families[q] for q in q_genes
                      if q is not None and q in families and families[q] != query_family]
        t_families = [families[t] for t in t_genes
                      if t is not None and t in families and families[t] != query_family]

        if not q_families or not t_families:
            return 0.0, False

        ecc = jaccard(q_families, t_families)

        q_size = min(len(set(q_families)), max_size)
        t_size = min(len(set(t_families)), max_size)

        return ecc, ecc > thresholds[q_size - 1][t_size - 1]

    @staticmethod
    @benchmark
    def __set_thresholds(families_a, families_b, max_size=30, iterations=1000, step=5, percentile=0.95):
        """
        Empirically determine (permutation test) thresholds for ECC

        Thresholds are only computed every step sizes, the value found is re-used for the sizes in between.
        When one of the family lists is too small to draw a sample from, the threshold is set to 1.

        :param families_a: families of species_a (list of internal family ids)
        :param families_b: families of species_b (list of internal family ids)
        :param max_size: maximum number of families (default = 30)
        :param iterations: number of permutations done
        :param step: step size
        :param percentile: fraction of the random scores a threshold has to exceed (default = 0.95)
        :return: matrix (list of lists) with the thresholds at various family sizes
        """
        # position of the threshold in the sorted list of scores, kept within bounds for percentile = 1
        cutoff_index = min(int(iterations * percentile), iterations - 1)

        thresholds = []

        for i in range(0, max_size, step):
            print("%d done" % i)
            new_threshholds = []
            for j in range(0, max_size, step):
                # whether samples of this size can be drawn doesn't change between iterations
                if i + 1 < len(families_a) and j + 1 < len(families_b):
                    scores = sorted(jaccard(random.sample(families_a, i + 1),
                                            random.sample(families_b, j + 1))
                                    for _ in range(iterations))
                else:
                    # Cannot calculate threshold with these families, add 1
                    scores = [1] * iterations

                new_threshholds.extend([scores[cutoff_index]] * step)

            for _ in range(step):
                thresholds.append(new_threshholds)

        return thresholds
class Sequence(db.Model):
    """
    A sequence (gene) of a species, with its coding sequence and links to expression, network,
    functional annotation, family, cluster, ECC and clade data.
    """
    __tablename__ = 'sequences'
    id = db.Column(db.Integer, primary_key=True)
    species_id = db.Column(db.Integer,
                           db.ForeignKey('species.id', ondelete='CASCADE'),
                           index=True)
    name = db.Column(db.String(50, collation=SQL_COLLATION), index=True)
    description = db.Column(db.Text)
    # deferred: only loaded when accessed or when explicitly undeferred in a query
    coding_sequence = db.deferred(db.Column(db.Text))
    type = db.Column(db.Enum('protein_coding', 'TE', 'RNA', name='sequence_type'),
                     default='protein_coding')
    is_mitochondrial = db.Column(db.SmallInteger, default=False)
    is_chloroplast = db.Column(db.SmallInteger, default=False)

    expression_profiles = db.relationship('ExpressionProfile',
                                          backref=db.backref('sequence', lazy='joined'),
                                          lazy='dynamic',
                                          cascade="all, delete-orphan",
                                          passive_deletes=True)
    network_nodes = db.relationship('ExpressionNetwork',
                                    backref=db.backref('sequence', lazy='joined'),
                                    lazy='dynamic',
                                    cascade="all, delete-orphan",
                                    passive_deletes=True)

    # Other properties
    #
    # coexpression_cluster_associations declared in 'SequenceCoexpressionClusterAssociation'
    # interpro_associations declared in 'SequenceInterproAssociation'
    # go_associations declared in 'SequenceGOAssociation'
    # family_associations declared in 'SequenceFamilyAssociation'

    go_labels = db.relationship('GO', secondary=sequence_go, lazy='dynamic')
    interpro_domains = db.relationship('Interpro', secondary=sequence_interpro, lazy='dynamic')
    families = db.relationship('GeneFamily', secondary=sequence_family, lazy='dynamic')

    coexpression_clusters = db.relationship(
        'CoexpressionCluster',
        secondary=sequence_coexpression_cluster,
        backref=db.backref('sequences', lazy='dynamic'),
        lazy='dynamic')

    ecc_query_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.query_id == Sequence.id",
        backref=db.backref('query_sequence', lazy='joined'),
        lazy='dynamic')

    ecc_target_associations = db.relationship(
        'SequenceSequenceECCAssociation',
        primaryjoin="SequenceSequenceECCAssociation.target_id == Sequence.id",
        backref=db.backref('target_sequence', lazy='joined'),
        lazy='dynamic')

    clade_associations_one = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin="SequenceSequenceCladeAssociation.sequence_one_id == Sequence.id",
        backref=db.backref('sequence_one', lazy='joined'),
        lazy='dynamic')

    clade_associations_two = db.relationship(
        'SequenceSequenceCladeAssociation',
        primaryjoin="SequenceSequenceCladeAssociation.sequence_two_id == Sequence.id",
        backref=db.backref('sequence_two', lazy='joined'),
        lazy='dynamic')

    xrefs = db.relationship('XRef', secondary=sequence_xref, lazy='joined')

    def __init__(self,
                 species_id,
                 name,
                 coding_sequence,
                 type='protein_coding',
                 is_chloroplast=False,
                 is_mitochondrial=False,
                 description=None):
        self.species_id = species_id
        self.name = name
        self.description = description
        self.coding_sequence = coding_sequence
        self.type = type
        self.is_chloroplast = is_chloroplast
        self.is_mitochondrial = is_mitochondrial

    @property
    def protein_sequence(self):
        """
        Function to translate the coding sequence to the amino acid sequence. Will start at the first start codon and
        break after adding a stop codon (indicated by '*')

        :return: The amino acid sequence based on the coding sequence
        """
        return translate(self.coding_sequence)

    @property
    def aliases(self):
        """
        Returns a readable string with the aliases or tokens stored for this sequence in the table xrefs

        :return: human readable string with aliases or None
        """
        tokens = [x.name for x in self.xrefs if x.platform == 'token']

        return ", ".join(tokens) if tokens else None

    @property
    def shortest_alias(self):
        """
        Returns the shortest alias

        :return: string with shortest alias or None (in case no aliases exist)
        """
        tokens = [x.name for x in self.xrefs if x.platform == 'token']

        return min(tokens, key=len) if tokens else None

    @property
    def display_name(self):
        """
        Returns a name to display (from xrefs with display) if available otherwise return name

        :return: display name
        """
        display = [x.name for x in self.xrefs if x.platform == 'display']

        return display[0] if display else self.name

    @property
    def best_name(self):
        """
        Checks if there is a display name, if not checks the shortest alias, otherwise returns name. To be used in
        e.g. graphs

        :return: string with best name to show in graphs, ...
        """
        # Look for a display xref directly: comparing display_name with name by identity (is / is not)
        # depends on whether two equal strings happen to be the same object.
        display = [x.name for x in self.xrefs if x.platform == 'display']
        if display:
            return display[0]

        alias = self.shortest_alias
        if alias is not None:
            return alias

        return self.name

    @property
    def readable_type(self):
        """
        Converts the type table to a readable string

        :return: string with readable version of the sequence type
        """
        conversion = {
            'protein_coding': 'protein coding',
            'TE': 'transposable element',
            'RNA': 'RNA'
        }

        return conversion.get(self.type, 'other')

    @staticmethod
    def add_from_fasta(filename, species_id, compressed=False):
        """
        Adds all sequences from a fasta file to the database, as protein coding sequences of the given species

        :param filename: path to the fasta file
        :param species_id: internal id of the species the sequences belong to
        :param compressed: passed on to the fasta reader
        :return: number of sequences in the fasta file
        """
        fasta_data = Fasta()
        fasta_data.readfile(filename, compressed=compressed)

        new_sequences = []

        # Loop over sequences, sorted by name (key here) and add to db
        for name, sequence in sorted(fasta_data.sequences.items(), key=operator.itemgetter(0)):
            new_sequences.append({
                "species_id": species_id,
                "name": name,
                "description": None,
                "coding_sequence": sequence,
                "type": "protein_coding",
                "is_mitochondrial": False,
                "is_chloroplast": False
            })

            # add 400 sequences at the time, more can cause problems with some database engines
            if len(new_sequences) > 400:
                db.engine.execute(Sequence.__table__.insert(), new_sequences)
                new_sequences = []

        # add the last set of sequences, never execute the insert with an empty parameter list
        if new_sequences:
            db.engine.execute(Sequence.__table__.insert(), new_sequences)

        return len(fasta_data.sequences)

    @staticmethod
    def add_descriptions(filename, species_id):
        """
        Sets the description of sequences from a tab delimited file (sequence name, description)

        Lines that cannot be parsed are reported on stderr and skipped, names that don't match a sequence of
        the species are ignored.

        :param filename: path to the file with descriptions
        :param species_id: internal id of the species the sequences belong to
        """
        sequences = Sequence.query.filter_by(species_id=species_id).all()

        seq_dict = {s.name: s for s in sequences}

        with open(filename, "r") as f_in:
            for i, line in enumerate(f_in):
                try:
                    name, description = line.strip().split('\t')
                except ValueError:
                    print("Cannot parse line %d: \"%s\"" % (i, line), file=sys.stderr)
                else:
                    # only reached for lines that were parsed, so name and description are always
                    # those of the current line
                    if name in seq_dict:
                        seq_dict[name].description = description

                # commit at regular intervals
                if i % 400 == 0:
                    db.session.commit()

        db.session.commit()

    @staticmethod
    def export_cds(filename):
        """
        Writes the coding sequence of all sequences to a file in fasta format

        :param filename: path to the output file
        """
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.coding_sequence), file=f_out)

    @staticmethod
    def export_protein(filename):
        """
        Writes the translated (protein) sequence of all sequences to a file in fasta format

        :param filename: path to the output file
        """
        sequences = Sequence.query.options(undefer('coding_sequence')).all()

        with open(filename, "w") as f_out:
            for s in sequences:
                print(">%s\n%s" % (s.name, s.protein_sequence), file=f_out)
class SequenceGOAssociation(db.Model):
    """
    Association between a sequence and a GO term, including the evidence code, the source of the
    annotation and (for predicted annotations) the serialized details of the prediction.
    """
    __tablename__ = 'sequence_go'
    __table_args__ = {'extend_existing': True}

    id = db.Column(db.Integer, primary_key=True)
    sequence_id = db.Column(db.Integer, db.ForeignKey('sequences.id', ondelete='CASCADE'))
    go_id = db.Column(db.Integer, db.ForeignKey('go.id', ondelete='CASCADE'))

    evidence = db.Column(
        db.Enum('EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP',
                'ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'IBA',
                'IBD', 'IKR', 'IRD', 'RCA', 'TAS', 'NAS',
                'IC', 'ND', 'IEA',
                name='evidence'))
    source = db.Column(db.Text)

    predicted = db.Column(db.SmallInteger, default=False)
    prediction_data = db.Column(db.Text)

    sequence = db.relationship('Sequence',
                               backref=db.backref('go_associations',
                                                  lazy='dynamic',
                                                  passive_deletes=True),
                               lazy='joined')

    go = db.relationship('GO',
                         backref=db.backref('sequence_associations',
                                            lazy='dynamic',
                                            passive_deletes=True),
                         lazy='joined')

    def __init__(self,
                 sequence_id,
                 go_id,
                 evidence,
                 source,
                 predicted=False,
                 prediction_data=None):
        self.sequence_id = sequence_id
        self.go_id = go_id
        self.evidence = evidence
        self.source = source
        self.predicted = predicted
        self.prediction_data = prediction_data

    @property
    def data(self):
        """
        Property to get the information in the prediction_data as a dict. Useful for showing these values in e.g.
        jinja2 templates

        :return: de-serialized prediction_data (json), None if no prediction_data is stored
        """
        # prediction_data defaults to None, json.loads would raise a TypeError on that
        if self.prediction_data is None:
            return None

        return json.loads(self.prediction_data)