def export_protein_sequences(SEQUENCE_PATH):
    """
    Exports amino acid sequences for protein_coding transcripts as gzipped fasta files to the desired path

    One file named "<species code>.aa.fasta.gz" is written for every species returned by Species.query.all().
    Only sequences whose type equals "protein_coding" are written, their coding sequence is passed through
    translate() first.

    :param SEQUENCE_PATH: directory the files are written to, it is created when it does not exist yet
    """
    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(SEQUENCE_PATH, exist_ok=True)

    sequence_table = Sequence.__table__

    for s in Species.query.all():
        filename = os.path.join(SEQUENCE_PATH, s.code + ".aa.fasta.gz")

        # ORM free query, only the columns needed for the export are fetched
        sequences = db.engine.execute(
            db.select([sequence_table.c.name,
                       sequence_table.c.type,
                       sequence_table.c.coding_sequence]).
            where(sequence_table.c.species_id == s.id)).fetchall()

        with gzip.open(filename, 'wb') as f:
            for name, sequence_type, sequence in sequences:
                if sequence_type == "protein_coding":
                    # gzip file is opened in binary mode, encode the fasta record before writing
                    f.write(bytes(">" + name + '\n' + translate(sequence) + '\n', 'UTF-8'))
def generate(selected_species):
    """
    Generator producing the coding sequences linked to one species as fasta records

    :param selected_species: value the species_id column of the sequence table is matched against
    :return: yields one string per sequence: a ">name" header line followed by the coding sequence line
    """
    table = Sequence.__table__
    query = db.select([table.c.name, table.c.coding_sequence]).where(table.c.species_id == selected_species)
    rows = db.engine.execute(query).fetchall()

    for header, cds in rows:
        yield "".join((">", header, '\n', cds, '\n'))
def update_species_counts():
    """
    Adds phylo-profile to each go-label, results are stored in the database

    For every GO term the number of distinct (sequence, GO) associations is counted per species. The counts are
    written as a json object to the species_counts field of the GO table (json turns the integer species ids
    into string keys). Only associations with predicted == 0 are counted, GO terms without any such association
    are not updated.
    """
    # link species to sequences, sequences without a species are left out
    sequences = db.engine.execute(db.select([Sequence.__table__.c.id, Sequence.__table__.c.species_id])).fetchall()

    sequence_to_species = {seq_id: int(species_id) for seq_id, species_id in sequences if species_id is not None}

    # get go for all genes
    associations = db.engine.execute(
        db.select([SequenceGOAssociation.__table__.c.sequence_id,
                   SequenceGOAssociation.__table__.c.go_id], distinct=True)
        .where(SequenceGOAssociation.__table__.c.predicted == 0))\
        .fetchall()

    count = {}

    for seq_id, go_id in associations:
        species_id = sequence_to_species.get(seq_id)

        if species_id is None:
            # sequence has no species (it was skipped above), it cannot contribute to a species count
            continue

        per_species = count.setdefault(go_id, {})
        per_species[species_id] = per_species.get(species_id, 0) + 1

    # update counts
    for go_id, data in count.items():
        db.engine.execute(db.update(GO.__table__)
                          .where(GO.__table__.c.id == go_id)
                          .values(species_counts=json.dumps(data)))
def calculate_specificities(species_id, description, remove_background=False):
    """
    Function that calculates condition specificities for each profile. No grouping is applied, each condition is
    used as is

    The conditions are collected from the 'order' list of every expression profile of the species (first
    occurrence decides the position), the actual calculation is delegated to
    ExpressionSpecificityMethod.calculate_tissue_specificities with every condition mapped onto itself.

    :param species_id: internal species ID
    :param description: description for the method to determine the specificity
    :param remove_background: when true the lowest value of each profile is subtracted from all values (can be of
    use with noisy data derived from microarrays)
    :return: value returned by ExpressionSpecificityMethod.calculate_tissue_specificities
    """
    # get profile from the database (ORM free for speed)
    profiles = db.engine.execute(
        db.select([ExpressionProfile.__table__.c.id,
                   ExpressionProfile.__table__.c.profile]).
        where(ExpressionProfile.__table__.c.species_id == species_id)).fetchall()

    # detect all conditions, the set makes the duplicate check cheap while the list keeps the order
    conditions = []
    seen = set()

    for _, profile in profiles:
        for condition in json.loads(profile)['order']:
            if condition not in seen:
                seen.add(condition)
                conditions.append(condition)

    # convert list into dictionary and run function
    conditions_dict = {k: k for k in conditions}

    return ExpressionSpecificityMethod.calculate_tissue_specificities(species_id, description, conditions_dict,
                                                                      conditions,
                                                                      remove_background=remove_background)
def species_download_coding(species_id):
    """
    Generates a fasta file with all coding sequences for a given species

    :param species_id: Internal ID of the species
    :return: Response with the fasta file, served as a plain text attachment named "<species code>.cds.fasta"
    """
    # unknown species result in a 404 instead of an AttributeError on None further down
    current_species = Species.query.get_or_404(species_id)

    sequences = db.engine.execute(db.select([Sequence.__table__.c.name, Sequence.__table__.c.coding_sequence]).
                                  where(Sequence.__table__.c.species_id == current_species.id)).\
        fetchall()

    # two lines per sequence: the fasta header and the coding sequence
    output = []
    for name, coding_sequence in sequences:
        output.append(">" + name)
        output.append(coding_sequence)

    response = make_response("\n".join(output))
    response.headers["Content-Disposition"] = "attachment; filename=" + current_species.code + ".cds.fasta"
    response.headers['Content-type'] = 'text/plain'

    return response
def calculate_similarities(gene_family_method_id=1, percentile_pass=0.95):
    """
    This function will calculate ALL similarities between clusters in the database. Results will be added to the
    DB

    Each cluster is reduced to the set of gene families (of the selected method) its sequences belong to. For
    every pair of clusters that both contain more than four families the jaccard() score of their family lists
    is computed. Pairs scoring at least the value found at the percentile_pass position of the sorted scores are
    inserted into the CoexpressionClusterSimilarity table (p_value and corrected_p_value are stored as 0).

    :param gene_family_method_id: Internal ID of gene family method to use to calculate the scores (default = 1)
    :param percentile_pass: percentile based cutoff (default = 0.95)
    """
    from itertools import combinations

    cluster_table = SequenceCoexpressionClusterAssociation.__table__
    family_table = SequenceFamilyAssociation.__table__
    gene_family_table = GeneFamily.__table__

    # sqlalchemy to fetch cluster associations
    # note: .isnot(None) builds the SQL "IS NOT NULL", a python "is not None" would just evaluate to True
    fields = [cluster_table.c.sequence_id, cluster_table.c.coexpression_cluster_id]
    condition = cluster_table.c.sequence_id.isnot(None)
    cluster_associations = db.engine.execute(db.select(fields).where(condition)).fetchall()

    # sqlalchemy to fetch sequence family associations
    fields = [family_table.c.sequence_id, family_table.c.gene_family_id, gene_family_table.c.method_id]
    condition = gene_family_table.c.method_id == gene_family_method_id
    table = join(family_table, gene_family_table, family_table.c.gene_family_id == gene_family_table.c.id)
    sequence_families = db.engine.execute(db.select(fields).select_from(table).where(condition)).fetchall()

    # convert sqlalchemy results into dictionaries
    sequence_to_family = {seq_id: fam_id for seq_id, fam_id, _ in sequence_families}

    cluster_to_sequences = {}
    for seq_id, cluster_id in cluster_associations:
        cluster_to_sequences.setdefault(cluster_id, []).append(seq_id)

    cluster_to_families = {}
    for cluster_id, sequences in cluster_to_sequences.items():
        families = list({sequence_to_family[s] for s in sequences if s in sequence_to_family})

        if families:
            cluster_to_families[cluster_id] = families

    # only clusters with more than four families take part in the comparison, drop the others before pairing
    eligible = [(cluster_id, families) for cluster_id, families in cluster_to_families.items()
                if len(families) > 4]

    data = [[source_id, target_id, jaccard(source_families, target_families)]
            for (source_id, source_families), (target_id, target_families) in combinations(eligible, 2)]

    if not data:
        print("No similar clusters found!")
        return

    ordered_j = sorted(d[2] for d in data)

    # clamp the index so a percentile_pass of 1.0 selects the highest score instead of running off the list
    cutoff_index = max(0, min(int(len(ordered_j) * percentile_pass), len(ordered_j) - 1))
    percentile_cutoff = ordered_j[cutoff_index]

    database = [{'source_id': d[0],
                 'target_id': d[1],
                 'gene_family_method_id': gene_family_method_id,
                 'jaccard_index': d[2],
                 'p_value': 0,
                 'corrected_p_value': 0} for d in data if d[2] >= percentile_cutoff]

    db.engine.execute(CoexpressionClusterSimilarity.__table__.insert(), database)
def predict_from_network_enrichment(expression_network_method_id, cutoff=0.05, source="PlaNet Prediction"):
    """
    Predicts GO terms for the genes of a network based on the GO terms found in their neighborhood

    For every probe of the network method the non-predicted GO terms of its neighbors (entries of the probe's
    network with a 'gene_id') are counted, terms the gene itself already has are ignored. Each count is passed
    to hypergeo_sf() together with the neighborhood size, the precomputed number of genes with that term in the
    species (species_counts field of GO) and the number of probes. Terms with a p-value below the cutoff are
    passed through fdr_correction() and stored as predicted SequenceGOAssociations with evidence 'IEP'.

    Prints an error and returns when the network method cannot be found.

    :param expression_network_method_id: internal ID of the network method to use
    :param cutoff: terms need a p-value below this value to be stored (default = 0.05)
    :param source: value for the source field of the new associations
    """
    from conekt.models.expression.networks import ExpressionNetworkMethod

    expression_network_method = ExpressionNetworkMethod.query.get(expression_network_method_id)

    if expression_network_method is None:
        print("ERROR: Network Method ID %d not found" % expression_network_method_id)
        return

    probes = expression_network_method.probes.all()

    # species ids are strings in the json stored in species_counts
    species_key = str(expression_network_method.species_id)

    # Get all GO terms and get background
    # Important, counts are obtained from precomputed counts in the species_counts field !!
    go_data = db.engine.execute(db.select([GO.__table__.c.id, GO.__table__.c.species_counts])).fetchall()

    go_background = defaultdict(lambda: 0)

    for go_id, counts_json in go_data:
        # skip terms without counts, this covers both an empty string and NULL
        if counts_json:
            counts = json.loads(counts_json)
            if species_key in counts:
                go_background[go_id] = counts[species_key]

    new_associations = []

    for i, probe in enumerate(probes):
        print("Predicting GO for gene: %d, %s (%d out of %d)" %
              (probe.sequence_id, probe.sequence.name, i, expression_network_method.probe_count))

        # Get neighborhood from database
        neighborhood = json.loads(probe.network)

        # Get sequence ids from genes in first level neighborhood
        sequence_ids = [n['gene_id'] for n in neighborhood if 'gene_id' in n]

        if not sequence_ids:
            # no neighbors, nothing can be enriched (also avoids a query with an empty IN clause)
            continue

        # Get own GO terms
        own_associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id == probe.sequence_id)
        own_terms = {a.go_id for a in own_associations}

        # Get GO terms from neighbors
        associations = SequenceGOAssociation.query.filter(SequenceGOAssociation.sequence_id.in_(sequence_ids)).\
            filter(SequenceGOAssociation.predicted == 0).all()

        # Make GO terms from neighbors unique and ignore terms the current gene has already
        unique_associations = {(a.sequence_id, a.go_id) for a in associations if a.go_id not in own_terms}

        go_counts = defaultdict(lambda: 0)

        for _, go_id in unique_associations:
            go_counts[go_id] += 1

        # find significantly enriched GO terms and store them
        enriched_go = []

        for go_id, count in go_counts.items():
            p_value = hypergeo_sf(count, len(sequence_ids), go_background[go_id], len(probes))
            if p_value < cutoff:
                enriched_go.append((go_id, p_value))

        # apply FDR correction to the p-values
        corrected_p_values = fdr_correction([p_value for _, p_value in enriched_go])

        # push new prediction in a dict that will be added to the DB
        for corrected_p, (go_id, p_value) in zip(corrected_p_values, enriched_go):
            new_associations.append({
                'sequence_id': probe.sequence_id,
                'go_id': go_id,
                'evidence': 'IEP',
                'source': source,
                'predicted': True,
                'prediction_data': json.dumps({'p-cutoff': cutoff,
                                               'p-value': p_value,
                                               'p-value (FDR)': corrected_p,
                                               'network_method': expression_network_method_id,
                                               'prediction_method': 'Neighborhood enrichment'
                                               })
            })

    # Add new labels to the database in chunks of 400
    for i in range(0, len(new_associations), 400):
        db.engine.execute(SequenceGOAssociation.__table__.insert(), new_associations[i: i + 400])
def calculate_tissue_specificities(species_id, description, condition_to_tissue, order,
                                   remove_background=False, use_max=True):
    """
    Function calculates tissue specific genes based on the expression conditions. A dict is required to link
    specific conditions to the correct tissues. This also allows conditions to be excluded in case they are
    unrelated with a specific tissue.

    A new ExpressionSpecificityMethod is committed first. For every expression profile of the species only the
    tissue with the highest expression_specificity() score is stored, together with the tau and entropy of the
    profile.

    :param species_id: internal species ID
    :param description: description for the method to determine the specificity
    :param condition_to_tissue: dict to connect a condition to a tissue
    :param order: preferred order of the conditions, will match tissues to it
    :param remove_background: subtracts the lowest value to correct for background noise
    :param use_max: uses the maximum of mean values instead of the mean of all values
    :return id of the new method
    """
    new_method = ExpressionSpecificityMethod()
    new_method.species_id = species_id
    new_method.description = description
    new_method.menu_order = 0

    # tissues in the order their conditions occur in order, without duplicates
    tissues = []
    for c in order:
        if c in condition_to_tissue:
            v = condition_to_tissue[c]
            if v not in tissues:
                tissues.append(v)

    # get profile from the database (ORM free for speed)
    profiles = db.engine.execute(
        db.select([ExpressionProfile.__table__.c.id,
                   ExpressionProfile.__table__.c.profile]).
        where(ExpressionProfile.__table__.c.species_id == species_id)).fetchall()

    new_method.conditions = json.dumps(tissues)

    db.session.add(new_method)
    db.session.commit()

    method_id = new_method.id

    # detect specificities and add to the database
    specificities = []

    for profile_id, profile in profiles:
        # prepare profile data for calculation
        profile_data = json.loads(profile)
        profile_means = {}

        for t in tissues:
            values = []     # all values of all conditions of this tissue
            means = []      # mean of each condition of this tissue

            for k, v in profile_data['data'].items():
                if k in condition_to_tissue and condition_to_tissue[k] == t:
                    values += v
                    means.append(mean(v))

            if use_max:
                profile_means[t] = max(means) if means else 0
            else:
                profile_means[t] = mean(values) if values else 0

        # subtract minimum value to remove background
        # experimental code !
        if remove_background:
            minimum = min(profile_means.values())

            for k in profile_means:
                profile_means[k] -= minimum

        # determine spm score for each condition
        profile_tau = tau(list(profile_means.values()))
        profile_entropy = entropy_from_values(list(profile_means.values()))

        profile_specificities = [{
            'profile_id': profile_id,
            'condition': t,
            'score': expression_specificity(t, profile_means),
            'entropy': profile_entropy,
            'tau': profile_tau,
            'method_id': method_id,
        } for t in tissues]

        # only the top scoring condition is stored (the first one in case of a tie)
        specificities.append(max(profile_specificities, key=lambda x: x['score']))

        # write specificities to db if there are more than 400 (ORM free for speed)
        if len(specificities) > 400:
            db.engine.execute(ExpressionSpecificity.__table__.insert(), specificities)
            specificities = []

    # write remaining specificities to the db, an insert with an empty parameter list is not a no-op so skip it
    if specificities:
        db.engine.execute(ExpressionSpecificity.__table__.insert(), specificities)

    return new_method.id
def calculate_ecc(network_method_ids, gene_family_method_id, max_size=100):
    """
    Function to calculate the ECC scores in and between genes of different networks

    ORM free method for speed !

    All pairs of sequences that belong to the same gene family and are both present in one of the networks are
    compared, pairs with overlapping neighborhoods are ignored. Significant scores are stored in both directions
    (query -> target and target -> query) in the SequenceSequenceECCAssociation table.

    :param network_method_ids: array of networks (using their internal id !) to compare
    :param gene_family_method_id: internal id of the type of family methods to be used for the comparison
    :param max_size: passed on to the threshold and ECC helpers (default = 100)
    """
    from itertools import combinations

    network_families = {}
    sequence_network = {}
    sequence_network_method = {}
    sequence_family = {}
    family_sequence = {}

    # Get all the network information and store in dictionary
    for n in network_method_ids:
        current_network = db.engine.execute(db.select([ExpressionNetwork.__table__.c.sequence_id,
                                                       ExpressionNetwork.__table__.c.network,
                                                       ExpressionNetwork.__table__.c.method_id]).
                                            where(ExpressionNetwork.__table__.c.method_id == n).
                                            where(ExpressionNetwork.__table__.c.sequence_id.isnot(None))
                                            ).fetchall()

        for sequence, network, network_method_id in current_network:
            if sequence is not None:
                sequence_network[int(sequence)] = network
                sequence_network_method[int(sequence)] = int(network_method_id)

    # Get family data and store in dictionary
    current_families = db.engine.execute(db.select([SequenceFamilyAssociation.__table__.c.sequence_id,
                                                    SequenceFamilyAssociation.__table__.c.gene_family_id,
                                                    GeneFamily.__table__.c.method_id]).
                                         select_from(SequenceFamilyAssociation.__table__.join(GeneFamily.__table__)).
                                         where(GeneFamily.__table__.c.method_id == gene_family_method_id)
                                         ).fetchall()

    for sequence, family, method in current_families:
        # use the int value for both the lookup and the key so a family never gets its list reset
        sequence_family[int(sequence)] = int(family)
        family_sequence.setdefault(int(family), []).append(int(sequence))

    # Create a dict (key = network) with the families present in that network
    # Families that occur multiple times should be present multiple times as this is used
    # to set thresholds later !
    for sequence, network_method in sequence_network_method.items():
        families = network_families.setdefault(network_method, [])

        # ignore sequences without a family, ideally this shouldn't happen
        if sequence in sequence_family:
            families.append(sequence_family[sequence])

    # Determine threshold and p-value
    # A background model will be computed for each combination of networks, an ECC score will need to be better
    # than 95 % of the randomly found values to be considered significant
    # NOTE: a network method without any sequence has no entry in network_families, the lookup below will
    # raise a KeyError in that case
    thresholds = {}
    print("Starting permutation tests")
    for n in network_method_ids:
        thresholds[n] = {}
        for m in network_method_ids:
            thresholds[n][m] = ExpressionNetworkMethod.__set_thresholds(network_families[n],
                                                                         network_families[m],
                                                                         max_size=max_size)

    # Data loaded start calculating ECCs
    new_ecc_scores = []

    for family, sequences in family_sequence.items():
        # every unordered pair of sequences within the family, in the order they were loaded
        for query, target in combinations(sequences, 2):
            if query not in sequence_network or target not in sequence_network or query == target:
                continue

            # Ignore genes with overlapping neighborhoods
            if ExpressionNetworkMethod.__neighborhoods_overlap(sequence_network[query], sequence_network[target]):
                continue

            ecc, significant = ExpressionNetworkMethod.__ecc(
                sequence_network[query],
                sequence_network[target],
                sequence_family,
                thresholds[sequence_network_method[query]][sequence_network_method[target]],
                family,
                max_size=max_size)

            if not significant:
                continue

            new_ecc_scores.append({
                'query_id': query,
                'target_id': target,
                'ecc': ecc,
                'gene_family_method_id': gene_family_method_id,
                'query_network_method_id': sequence_network_method[query],
                'target_network_method_id': sequence_network_method[target],
            })

            # add reciprocal relation
            new_ecc_scores.append({
                'query_id': target,
                'target_id': query,
                'ecc': ecc,
                'gene_family_method_id': gene_family_method_id,
                'query_network_method_id': sequence_network_method[target],
                'target_network_method_id': sequence_network_method[query],
            })

            # write scores to the db in batches to limit memory usage
            if len(new_ecc_scores) > 400:
                db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)
                new_ecc_scores = []

    # write remaining scores to the db, an insert with an empty parameter list is not a no-op so skip it
    if new_ecc_scores:
        db.engine.execute(SequenceSequenceECCAssociation.__table__.insert(), new_ecc_scores)