def __ecc(q_network, t_network, families, thresholds, query_family, max_size=30):
    """
    Takes the networks' neighborhoods (as stored in the databases), extracts
    the genes and finds the family for each gene. Next the ECC score is
    calculated and compared against the pre-computed significance threshold.

    :param q_network: JSON-encoded network for the query gene
    :param t_network: JSON-encoded network for the target gene
    :param families: dictionary that links a sequence id (key) to a family id (value)
    :param thresholds: matrix (list of lists) of significance thresholds,
        indexed by (query family count - 1, target family count - 1)
    :param query_family: name of the input gene family; members of this family
        are excluded so the query itself doesn't inflate the score
    :param max_size: cap on the family-set size used to index the threshold
        matrix (default = 30)
    :return: tuple (ECC score, boolean flag indicating whether it is significant)
    """
    q_genes = [n['gene_id'] for n in json.loads(q_network) if n['gene_id'] is not None]
    t_genes = [n['gene_id'] for n in json.loads(t_network) if n['gene_id'] is not None]

    # Map genes to families, skipping genes without a known family and
    # skipping the query family itself.
    q_families = [families[g] for g in q_genes if g in families and families[g] != query_family]
    t_families = [families[g] for g in t_genes if g in families and families[g] != query_family]

    # Without families on both sides no meaningful score can be computed.
    if not q_families or not t_families:
        return 0.0, False

    ecc = jaccard(q_families, t_families)

    # Clamp the (unique) family counts so they index into the
    # max_size x max_size threshold matrix.
    q_size = min(len(set(q_families)), max_size)
    t_size = min(len(set(t_families)), max_size)

    return ecc, ecc > thresholds[q_size - 1][t_size - 1]
def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """
    Score a span prediction against the ground-truth span with the Jaccard index.

    The predicted span is taken from the argmax of the start/end logits; when
    the predicted start lands after the predicted end the whole text is used
    as the prediction (a common fallback for inconsistent spans).

    :param text: the full input text
    :param start_idx: ground-truth start token index
    :param end_idx: ground-truth end token index
    :param start_logits: per-token logits for the span start
    :param end_logits: per-token logits for the span end
    :param offsets: token-to-character offset mapping used to slice the text
    :return: Jaccard similarity between the true and predicted selections
    """
    pred_start = np.argmax(start_logits)
    pred_end = np.argmax(end_logits)

    prediction = (
        text
        if pred_start > pred_end
        else get_selected_text(text, pred_start, pred_end, offsets)
    )
    ground_truth = get_selected_text(text, start_idx, end_idx, offsets)

    return jaccard(ground_truth, prediction)
def __set_thresholds(families_a, families_b, max_size=30, iterations=1000, step=5):
    """
    Empirically determine (permutation test) thresholds for ECC.

    For each pair of sample sizes (stepped up to max_size) the Jaccard index
    of random family samples is computed `iterations` times and the 95th
    percentile is taken as the significance threshold.

    :param families_a: families of species_a (list of internal family ids)
    :param families_b: families of species_b (list of internal family ids)
    :param max_size: maximum number of families (default = 30)
    :param iterations: number of permutations done
    :param step: step size
    :return: matrix (list of lists) with the thresholds at various family sizes
    """
    # Index of the 95th-percentile score in the sorted permutation scores.
    # TODO (maybe?): cutoff is hard coded here, replace ?
    cutoff_index = int(iterations * 0.95)

    thresholds = []
    for i in range(0, max_size, step):
        print("%d done" % i)
        row = []
        for j in range(0, max_size, step):
            # random.sample requires the sample size to be <= the population
            # size, so <= (not <) is the correct guard here.
            if i + 1 <= len(families_a) and j + 1 <= len(families_b):
                scores = sorted(
                    jaccard(random.sample(families_a, i + 1),
                            random.sample(families_b, j + 1))
                    for _ in range(iterations)
                )
                threshold = scores[cutoff_index]
            else:
                # Cannot calculate a threshold with these families: use the
                # maximal value 1 so nothing at this size passes.
                threshold = 1
            # Fill `step` columns with the same threshold.
            row.extend([threshold] * step)
        # Fill `step` rows with the same threshold row.
        for _ in range(step):
            thresholds.append(row)
    return thresholds
# STEP 1: Train the classifier and select the candidate mass images.
classifier.extract_features()
classifier.train_classifier()
predicted_mass, path_predicted_mass = classifier.prediction()

# STEP 2: Pre-processing of the images to enhance internal structures,
# before to give them to the Neural Net.
predicted_mass = data_preprocessing.preprocessing(predicted_mass)
predicted_mass = data_preprocessing.cropping(mask_path, predicted_mass, path_predicted_mass)

# STEP 3: Loading the U-Net model and predicting masses of test set
unet = UNet()
predictions = unet.unet_predict(predicted_mass)

# STEP 4: Segmentation process and final output
segmented_images = drawer.clean_unet_images(predicted_mass, predictions)
outcomes, pred_groundtruth = drawer.my_draw_contours(segmented_images, ground_path, path_predicted_mass)

# STEP 5: Evaluating performance via the Jaccard index over all test images.
# NOTE(review): if jaccard() ever returns an empty list, the division and the
# min()/max() calls below will raise — presumably the test set is never
# empty; confirm upstream.
jaccard_list = jaccard(pred_groundtruth, path_predicted_mass, ground_test_path)
average = sum(jaccard_list) / len(jaccard_list)
minimum = min(jaccard_list)
maximum = max(jaccard_list)
print("Average Jaccard index: ", average)
print("--------------------------------")
print("Minimum Jaccard index: ", minimum)
print("--------------------------------")
print("Maximum Jaccard index: ", maximum)
def calculate_similarities(gene_family_method_id=1, percentile_pass=0.95):
    """
    This function will calculate ALL similarities between clusters in the
    database. Results will be added to the DB.

    :param gene_family_method_id: Internal ID of gene family method to use to
        calculate the scores (default = 1)
    :param percentile_pass: percentile based cutoff (default = 0.95)
    """
    # sqlalchemy to fetch cluster associations
    fields = [
        SequenceCoexpressionClusterAssociation.__table__.c.sequence_id,
        SequenceCoexpressionClusterAssociation.__table__.c.coexpression_cluster_id
    ]
    # BUG FIX: the original used the Python operator `is not None` on the
    # Column object, which evaluates to the constant True instead of
    # generating SQL `IS NOT NULL`; isnot(None) builds the intended clause.
    condition = SequenceCoexpressionClusterAssociation.__table__.c.sequence_id.isnot(None)
    cluster_associations = db.engine.execute(
        db.select(fields).where(condition)).fetchall()

    # sqlalchemy to fetch sequence-family associations for the chosen method
    fields = [
        SequenceFamilyAssociation.__table__.c.sequence_id,
        SequenceFamilyAssociation.__table__.c.gene_family_id,
        GeneFamily.__table__.c.method_id
    ]
    condition = GeneFamily.__table__.c.method_id == gene_family_method_id
    table = join(
        SequenceFamilyAssociation.__table__, GeneFamily.__table__,
        SequenceFamilyAssociation.__table__.c.gene_family_id == GeneFamily.__table__.c.id)
    sequence_families = db.engine.execute(
        db.select(fields).select_from(table).where(condition)).fetchall()

    # convert sqlalchemy results into a sequence -> family lookup
    sequence_to_family = {seq_id: fam_id for seq_id, fam_id, _ in sequence_families}

    # group sequences per cluster
    cluster_to_sequences = {}
    for seq_id, cluster_id in cluster_associations:
        cluster_to_sequences.setdefault(cluster_id, []).append(seq_id)

    # reduce each cluster to its unique families, dropping clusters without any
    cluster_to_families = {}
    for cluster_id, sequences in cluster_to_sequences.items():
        families = list({sequence_to_family[s] for s in sequences
                         if s in sequence_to_family})
        if len(families) > 0:
            cluster_to_families[cluster_id] = families

    keys = list(cluster_to_families.keys())
    data = []
    for i in range(len(keys) - 1):
        for j in range(i + 1, len(keys)):
            families_a = cluster_to_families[keys[i]]
            families_b = cluster_to_families[keys[j]]
            # only compare clusters that have more than 4 families each
            if len(families_a) > 4 and len(families_b) > 4:
                # BUG FIX: the original rebound the loop variable `j` to the
                # jaccard score inside the loop body; use a distinct name.
                score = jaccard(families_a, families_b)
                data.append([keys[i], keys[j], score])

    ordered_j = sorted(a[2] for a in data)
    if len(ordered_j) > 0:
        # clamp the index so percentile_pass == 1.0 cannot overflow the list
        cutoff_index = min(int(len(ordered_j) * percentile_pass), len(ordered_j) - 1)
        percentile_cutoff = ordered_j[cutoff_index]

        database = [{
            'source_id': d[0],
            'target_id': d[1],
            'gene_family_method_id': gene_family_method_id,
            'jaccard_index': d[2],
            'p_value': 0,
            'corrected_p_value': 0
        } for d in data if d[2] >= percentile_cutoff]

        db.engine.execute(CoexpressionClusterSimilarity.__table__.insert(), database)
    else:
        print("No similar clusters found!")
def test_jaccard(self):
    """Jaccard index on character sequences: partial overlap, disjoint, identical."""
    cases = [
        ('ab', 'bc', 1 / 3),  # one shared char out of three distinct
        ('ab', 'cd', 0),      # no overlap at all
        ('ab', 'ab', 1),      # identical inputs
    ]
    for left, right, expected in cases:
        self.assertEqual(jaccard(left, right), expected)