Esempio n. 1
0
    def __ecc(q_network, t_network, families, thresholds, query_family, max_size=30):
        """
        Compute the ECC score between two stored network neighborhoods.

        Both neighborhoods are decoded from JSON, their gene ids are mapped to gene families (genes without a
        family and members of the query family itself are skipped) and the Jaccard index of the two family
        lists is returned together with a significance flag.

        :param q_network: JSON string with the neighborhood of the query gene (a list of nodes with a 'gene_id')
        :param t_network: JSON string with the neighborhood of the target gene (same layout)
        :param families: dictionary that links a sequence id (key) to a family id (value)
        :param thresholds: matrix of cutoffs, read as thresholds[q_size - 1][t_size - 1] where the sizes are
                           the number of distinct families in each neighborhood (capped at max_size)
        :param query_family: family of the input gene, excluded from both neighborhoods
        :param max_size: cap applied to the neighborhood sizes before the threshold lookup (default = 30)
        :return: tuple (ECC score, True if the score exceeds the matching threshold); (0.0, False) when either
                 neighborhood has no usable families
        """
        query_nodes = json.loads(q_network)
        target_nodes = json.loads(t_network)

        # Nodes without a gene attached carry a None gene_id and are ignored
        query_genes = [gene for gene in (node['gene_id'] for node in query_nodes) if gene is not None]
        target_genes = [gene for gene in (node['gene_id'] for node in target_nodes) if gene is not None]

        query_fams = [families[gene] for gene in query_genes
                      if gene in families and families[gene] != query_family]
        target_fams = [families[gene] for gene in target_genes
                       if gene in families and families[gene] != query_family]

        if not query_fams or not target_fams:
            return 0.0, False

        score = jaccard(query_fams, target_fams)

        # The threshold matrix is indexed by the number of *distinct* families, capped at max_size
        query_size = min(max_size, len(set(query_fams)))
        target_size = min(max_size, len(set(target_fams)))
        cutoff = thresholds[query_size - 1][target_size - 1]

        return score, score > cutoff
def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """
    Score a predicted text span against the reference span with the Jaccard index.

    The predicted span boundaries are the argmax of ``start_logits`` and ``end_logits``. When the predicted
    start lies after the predicted end, the whole ``text`` is used as the prediction instead.

    :param text: text the spans are selected from
    :param start_idx: start index of the reference span
    :param end_idx: end index of the reference span
    :param start_logits: scores for each candidate start position
    :param end_logits: scores for each candidate end position
    :param offsets: offsets forwarded to ``get_selected_text`` to turn indices into text
    :return: result of ``jaccard(reference_text, predicted_text)``
    """
    predicted_start = np.argmax(start_logits)
    predicted_end = np.argmax(end_logits)

    if predicted_start <= predicted_end:
        predicted_text = get_selected_text(text, predicted_start, predicted_end, offsets)
    else:
        # Inverted span: fall back to the full text
        predicted_text = text

    reference_text = get_selected_text(text, start_idx, end_idx, offsets)
    return jaccard(reference_text, predicted_text)
Esempio n. 3
0
    def __set_thresholds(families_a, families_b, max_size=30, iterations=1000, step=5):
        """
        Empirically determine (permutation test) the ECC thresholds.

        For every ``step``-th pair of neighborhood sizes, ``iterations`` random family samples are drawn from
        both species and their Jaccard index is calculated. The score found at position
        ``int(iterations * 0.95)`` of the sorted scores becomes the threshold, which is repeated ``step`` times
        along both axes so the matrix can be indexed by size directly.

        :param families_a: families of species_a (list of internal family ids)
        :param families_b: families of species_b (list of internal family ids)
        :param max_size: maximum number of families (default = 30)
        :param iterations: number of permutations done
        :param step: step size
        :return: matrix (list of lists) with the thresholds at various family sizes; the ``step`` rows of one
                 band are the same list object
        """
        matrix = []

        for size_a in range(0, max_size, step):
            print("%d done" % size_a)
            row = []
            for size_b in range(0, max_size, step):
                # The sample sizes do not change between permutations, so test them once per cell
                can_sample = size_a+1 < len(families_a) and size_b+1 < len(families_b)
                if can_sample:
                    scores = [
                        jaccard(random.sample(families_a, size_a+1),
                                random.sample(families_b, size_b+1))
                        for _ in range(iterations)
                    ]
                else:
                    # Cannot calculate threshold with these families, add 1
                    scores = [1] * iterations

                # TODO (maybe?): cutoff is hard coded here, replace ?
                print(iterations, len(scores), scores)
                scores.sort()
                cutoff = scores[int(iterations*0.95)]
                row.extend([cutoff] * step)
            matrix.extend([row] * step)

        return matrix
Esempio n. 4
0
# Script fragment: `classifier`, `data_preprocessing`, `drawer`, `UNet`, `jaccard`, `mask_path`,
# `ground_path` and `ground_test_path` are defined outside this excerpt.

# STEP 1:    Extract features, train the classifier and run its prediction.
classifier.extract_features()
classifier.train_classifier()
predicted_mass, path_predicted_mass = classifier.prediction()

# STEP 2:    Pre-process the images to enhance internal structures before passing them to the neural net.
predicted_mass = data_preprocessing.preprocessing(predicted_mass)
predicted_mass = data_preprocessing.cropping(mask_path, predicted_mass,
                                             path_predicted_mass)

# STEP 3:    Load the U-Net model and predict the masses of the test set.
unet = UNet()
predictions = unet.unet_predict(predicted_mass)

# STEP 4:    Segmentation process and final output.
segmented_images = drawer.clean_unet_images(predicted_mass, predictions)
outcomes, pred_groundtruth = drawer.my_draw_contours(segmented_images,
                                                     ground_path,
                                                     path_predicted_mass)

# STEP 5:    Evaluate performance.
# `jaccard` here takes three arguments and its result is summed and min/max-ed below, so it is
# presumably a list with one Jaccard index per image - TODO confirm against its definition.
jaccard_list = jaccard(pred_groundtruth, path_predicted_mass, ground_test_path)
# NOTE(review): an empty jaccard_list raises ZeroDivisionError here (and ValueError in min/max).
average = sum(jaccard_list) / len(jaccard_list)
minimum = min(jaccard_list)
maximum = max(jaccard_list)

print("Average Jaccard index: ", average)
print("--------------------------------")
print("Minimum Jaccard index: ", minimum)
print("--------------------------------")
print("Maximum Jaccard index: ", maximum)
Esempio n. 5
0
    def calculate_similarities(gene_family_method_id=1, percentile_pass=0.95):
        """
        Calculate the Jaccard similarity between ALL pairs of clusters in the database and store the most
        similar pairs in the DB.

        Each cluster is reduced to the set of gene families (for the selected method) of its sequences. Only
        clusters with more than four families are compared. Pairs whose Jaccard index is at or above the
        requested percentile of all calculated scores are inserted into CoexpressionClusterSimilarity with
        p_value and corrected_p_value set to 0.

        :param gene_family_method_id: Internal ID of gene family method to use to calculate the scores (default = 1)
        :param percentile_pass: percentile based cutoff (default = 0.95); 1.0 keeps only the top-scoring pairs
        """
        # Function-scope import: the file's import block is not touched by this change
        from itertools import combinations

        # Clusters need more than this many families to be compared
        min_families = 4

        # sqlalchemy to fetch cluster associations
        association_table = SequenceCoexpressionClusterAssociation.__table__
        fields = [
            association_table.c.sequence_id,
            association_table.c.coexpression_cluster_id
        ]
        # `column is not None` is evaluated by Python itself (always True) and never reaches the database;
        # isnot(None) renders the intended "IS NOT NULL" clause.
        condition = association_table.c.sequence_id.isnot(None)
        cluster_associations = db.engine.execute(
            db.select(fields).where(condition)).fetchall()

        # sqlalchemy to fetch sequence family associations
        family_table = SequenceFamilyAssociation.__table__
        fields = [
            family_table.c.sequence_id,
            family_table.c.gene_family_id,
            GeneFamily.__table__.c.method_id
        ]
        condition = GeneFamily.__table__.c.method_id == gene_family_method_id
        table = join(
            family_table, GeneFamily.__table__,
            family_table.c.gene_family_id == GeneFamily.__table__.c.id)
        sequence_families = db.engine.execute(
            db.select(fields).select_from(table).where(condition)).fetchall()

        # convert sqlalchemy results into dictionaries
        sequence_to_family = {
            seq_id: fam_id
            for seq_id, fam_id, _ in sequence_families
        }

        cluster_to_sequences = {}
        for seq_id, cluster_id in cluster_associations:
            cluster_to_sequences.setdefault(cluster_id, []).append(seq_id)

        # Distinct families per cluster; clusters without any family are dropped
        cluster_to_families = {}
        for cluster_id, sequences in cluster_to_sequences.items():
            families = list({
                sequence_to_family[s] for s in sequences
                if s in sequence_to_family
            })
            if families:
                cluster_to_families[cluster_id] = families

        # Filtering before pairing yields the same pairs, in the same order, as testing both sizes per pair
        eligible = [
            cluster_id for cluster_id, families in cluster_to_families.items()
            if len(families) > min_families
        ]

        data = [
            [source_id, target_id,
             jaccard(cluster_to_families[source_id], cluster_to_families[target_id])]
            for source_id, target_id in combinations(eligible, 2)
        ]

        ordered_scores = sorted(score for _, _, score in data)
        if ordered_scores:
            # Clamp the index so percentile_pass == 1.0 selects the highest score instead of running off the end
            cutoff_index = min(int(len(ordered_scores) * percentile_pass),
                               len(ordered_scores) - 1)
            percentile_cutoff = ordered_scores[cutoff_index]

            database = [{
                'source_id': source_id,
                'target_id': target_id,
                'gene_family_method_id': gene_family_method_id,
                'jaccard_index': score,
                'p_value': 0,
                'corrected_p_value': 0
            } for source_id, target_id, score in data if score >= percentile_cutoff]

            db.engine.execute(CoexpressionClusterSimilarity.__table__.insert(),
                              database)
        else:
            print("No similar clusters found!")
Esempio n. 6
0
 def test_jaccard(self):
     """Check jaccard on partially overlapping, disjoint and identical inputs."""
     cases = (
         ('ab', 'bc', 1 / 3),  # one shared element out of three distinct ones
         ('ab', 'cd', 0),      # nothing shared
         ('ab', 'ab', 1),      # identical inputs
     )
     for left, right, expected in cases:
         self.assertEqual(jaccard(left, right), expected)