Example #1
def run_bench_time_alg_exacts_vldb(path_dataset: str,
                                   raw_data=False,
                                   figures=False):
    # get the scoring schemes (the KCFs)
    kcf1 = ScoringScheme.get_unifying_scoring_scheme_p(1.)
    kcf2 = ScoringScheme.get_extended_measure_scoring_scheme()
    kcf3 = ScoringScheme.get_induced_measure_scoring_scheme_p(1.)
    kcf4 = ScoringScheme.get_pseudodistance_scoring_scheme_p(1.)
    kcfs = [kcf1, kcf2, kcf3, kcf4]

    # optimize = optim1, preprocess = optim2
    ea = get_algorithm(alg=Algorithm.Exact,
                       parameters={
                           "optimize": False,
                           "preprocess": False
                       })
    ea_optim1 = get_algorithm(alg=Algorithm.Exact,
                              parameters={
                                  "optimize": True,
                                  "preprocess": False
                              })
    ea_optim1_optim2 = get_algorithm(alg=Algorithm.Exact,
                                     parameters={
                                         "optimize": True,
                                         "preprocess": True
                                     })
    algorithms_for_bench = [ea, ea_optim1, ea_optim1_optim2]
    # run experiment for each scoring scheme (KCF)
    for kcf in kcfs:
        bench = BenchTime(
            dataset_folder=path_dataset,
            # algorithms for the bench time
            algs=algorithms_for_bench,
            # the scoring scheme that is the kcf to consider
            scoring_scheme=kcf,
            # to select tuples of rankings with number of elements between 30 and 119 and at least 3 rankings
            dataset_selector_exp=DatasetSelector(nb_elem_min=30,
                                                 nb_elem_max=119,
                                                 nb_rankings_min=3),
            # range of size of datasets for the output
            steps=10,
            # re-compute the consensus until the total computation time exceeds 1 sec.;
            # the average computation time is then returned
            repeat_time_computation_until=1.)

        # run the experiment and print the results. If raw_data is True, also print all the parameters of the
        # experiment (readme) and the raw data used to compute the final results; otherwise only the final results are displayed
        bench.run(raw_data, figures=figures)
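
A minimal invocation sketch for the benchmark above; the dataset folder path is hypothetical and should point to your own collection of ranking files:

# hypothetical path; raw_data / figures as in the function signature above
run_bench_time_alg_exacts_vldb("/path/to/vldb_datasets", raw_data=False, figures=False)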
Example #2
def run_experiment_bio_orphanet(dataset_path: str,
                                raw_data=False,
                                figures=False):
    # sets the values of b5-b4 to consider (note that b4 is set to 0)
    values_b5 = [0.0, 0.25, 0.5, 0.75, 1, 2]
    kcfs = []
    # creation of the scoring schemes (the KCFs)
    for value_b5 in values_b5:
        kcfs.append(
            ScoringScheme([[0., 1., 1., 0., value_b5, 0.],
                           [1., 1., 0., value_b5, value_b5, 0]]))

    exp1 = ExperimentOrphanet(
        dataset_folder=dataset_path,
        # the kcfs to consider
        scoring_schemes=kcfs,
        # the top-k to consider
        top_k_to_test=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
        # algorithm to compute the consensus
        algo=get_algorithm(alg=Algorithm.ParCons,
                           parameters={
                               "bound_for_exact": 150,
                               "auxiliary_algorithm": get_algorithm(alg=Algorithm.BioConsert)
                           }),
        # selects all the tuples of rankings with at least 100 elements and at least 3 rankings
        dataset_selector=DatasetSelector(nb_elem_min=100, nb_rankings_min=3),
    )

    # run the experiment and print the results. If raw_data is True, also print all the parameters of the
    # experiment (readme) and the raw data used to compute the final results; otherwise only the final results are displayed
    exp1.run(raw_data, figures=figures)
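
A hypothetical call to the experiment above; the path is illustrative, not taken from the original code:

# hypothetical path to the folder containing the Orphanet ranking datasets
run_experiment_bio_orphanet("/path/to/biological_dataset", raw_data=True, figures=False)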
Example #3
    def compute_consensus_rankings(self,
                                   dataset: Dataset,
                                   scoring_scheme: ScoringScheme,
                                   return_at_most_one_ranking=False,
                                   bench_mode=False) -> Consensus:
        """
        :param dataset: A dataset containing the rankings to aggregate
        :type dataset: Dataset (class Dataset in package 'datasets')
        :param scoring_scheme: The penalty vectors to consider
        :type scoring_scheme: ScoringScheme (class ScoringScheme in package 'distances')
        :param return_at_most_one_ranking: if True, the algorithm should return at most one ranking
        :type return_at_most_one_ranking: bool
        :param bench_mode: whether bench mode is activated. If False, the algorithm may return more information
        :type bench_mode: bool
        :return: one or more rankings, if the underlying algorithm can find several equivalent consensus rankings.
        If the algorithm cannot provide several consensus rankings, or if return_at_most_one_ranking is True,
        it should return a list made of the only / the first consensus found.
        In all scenarios, the algorithm returns a list of consensus rankings
        :raise ScoringSchemeNotHandledException: when the algorithm cannot compute the consensus because its
        implementation does not fit the scoring scheme
        """

        if not dataset.is_complete and not self.is_scoring_scheme_relevant_when_incomplete_rankings(
                scoring_scheme):
            raise ScoringSchemeNotHandledException

        if scoring_scheme.is_equivalent_to([[0, 1, 1, 0, 1, 1],
                                            [1, 1, 0, 1, 1, 0]]):
            rankings_to_use = dataset.unified_rankings()
        else:
            rankings_to_use = dataset.rankings

        nb_rankings = len(rankings_to_use)
        rankings_copy = list(rankings_to_use)
        shuffle(rankings_copy)
        h = {}
        id_ranking = 0
        for ranking in rankings_copy:
            id_bucket = 0
            for bucket in ranking:
                for element in bucket:
                    if element not in h:
                        h[element] = zeros(nb_rankings, dtype=int) - 1
                    h[element][id_ranking] = id_bucket
                id_bucket += 1
            id_ranking += 1

        res = []
        for el in sorted(h.items(), key=cmp_to_key(RepeatChoice.__compare)):
            res.append([el[0]])

        return Consensus(
            consensus_rankings=[res],
            dataset=dataset,
            scoring_scheme=scoring_scheme,
            att={ConsensusFeature.AssociatedAlgorithm: self.get_full_name()})
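
To illustrate the intermediate structure built by this method, here is a standalone toy sketch (independent of the class) of the element-to-bucket-position map h, where -1 marks an element missing from a ranking; the rankings are made up:

from numpy import zeros

# two toy rankings over elements 1..3; the second ranking does not contain element 3
rankings = [[[1], [2, 3]], [[2], [1]]]
h = {}
for id_ranking, ranking in enumerate(rankings):
    for id_bucket, bucket in enumerate(ranking):
        for element in bucket:
            if element not in h:
                h[element] = zeros(len(rankings), dtype=int) - 1
            h[element][id_ranking] = id_bucket
print(h)  # e.g. {1: array([0, 1]), 2: array([1, 0]), 3: array([1, -1])}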
Example #4
def run_bench_exact_optimized_scoring_scheme_vldb(path_dataset: str,
                                                  raw_data=False,
                                                  figures=False):
    # get the scoring schemes (the KCFs)
    kcf1 = ScoringScheme.get_unifying_scoring_scheme_p(1.)
    kcf2 = ScoringScheme.get_extended_measure_scoring_scheme()
    kcf3 = ScoringScheme.get_induced_measure_scoring_scheme_p(1.)
    kcf4 = ScoringScheme.get_pseudodistance_scoring_scheme_p(1.)
    kcfs = [kcf1, kcf2, kcf3, kcf4]

    # optimize = optim1, preprocess = optim2
    ea_optim1_optim2 = get_algorithm(alg=Algorithm.Exact,
                                     parameters={
                                         "optimize": True,
                                         "preprocess": True
                                     })

    # set up the benchmark; the experiment is run for each scoring scheme (KCF)
    bench = BenchScalabilityScoringScheme(
        dataset_folder=path_dataset,
        # the algorithm to consider
        alg=ea_optim1_optim2,
        # the kcfs to consider
        scoring_schemes=kcfs,
        # the dataset selector for selection according to the size
        dataset_selector_exp=DatasetSelector(nb_elem_min=130,
                                             nb_elem_max=300,
                                             nb_rankings_min=3),
        # range of size of datasets for the output
        steps=10,
        # maximum computation time allowed. For each kcf, the computation stops
        # as soon as the computation time for a tuple of rankings exceeds this value
        max_time=600,
        # re-compute the consensus until the total computation time exceeds this threshold (in seconds);
        # the average computation time is then returned
        repeat_time_computation_until=0)

    # run the experiment and print the results. If raw_data is True, also print all the parameters of the
    # experiment (readme) and the raw data used to compute the final results; otherwise only the final results are displayed
    bench.run(raw_data, figures)
Example #5
def run_experiment_students_vldb(raw_data=False, figures=False):
    # seed 1 is set for python and numpy
    random.seed(1)
    np.random.seed(1)
    # sets the values of b5-b4 to consider (note that b4 is set to 0)
    #values_b5 = [0., 0.25, 0.5, 0.75, 1., 2]
    values_b5 = [0.]
    kcfs = []
    # creation of the scoring schemes (the KCFs)
    for value_b5 in values_b5:
        kcfs.append(
            ScoringScheme([[0., 1., 1., 0., value_b5, 0.],
                           [1., 1., 0., value_b5, value_b5, 0]]))
    """"
    the parameters are all the ones detailled in the research paper. 100 student classes, each student class
    has 280 students from track 1 and 20 from track 2. In tract 1: choose uniformly 14 classes over 17 and in track
    2: choose uniformly 9 classes over the same 17. The marks obtained by students of track 1: N(10, 5*5) and by 
    students of track 2 : N(16, 4*4). Evaluation is made using top-20 of the consensuses
    """
    exp = MarksExperiment(
        # number of tuples of rankings to create
        nb_years=100,
        # number of students in track1
        nb_students_track1=280,
        # number of students in track2
        nb_students_track2=20,
        # number of classes the students can choose
        nb_classes_total=17,
        # number of classes the students of track1 choose (uniformly)
        nb_classes_track1=14,
        # number of classes the students of track2 choose (uniformly)
        nb_classes_track2=9,
        # mean marks for students in track1 for each class (normal distribution)
        mean_track1=10,
        # square of standard deviation of students marks in track1 for each class
        variance_track1=5,
        # mean marks for students in track2 for each class (normal distribution)
        mean_track2=16,
        # square of standard deviation of students marks in track2 for each class
        variance_track2=4,
        # top-k to consider for the experiment (comparison consensus and overall average)
        topk=20,
        # kcfs to consider
        scoring_schemes=kcfs,
        # algorithm to compute consensus
        algo=get_algorithm(Algorithm.CopelandMethod))

    # run the experiment and print the results. If raw_data is True, also print all the parameters of the
    # experiment (readme) and the raw data used to compute the final results; otherwise only the final results are displayed
    exp.run(raw_data, figures)
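
The docstring above describes the simulated marks; the following standalone sketch (not part of MarksExperiment, all names are illustrative) draws one simulated class with numpy, using the two normal distributions N(10, 5*5) and N(16, 4*4) mentioned there, and keeps its top-20 students by mark:

import numpy as np

rng = np.random.default_rng(1)
# track 1: 280 students with marks ~ N(10, 5*5); track 2: 20 students with marks ~ N(16, 4*4)
marks_track1 = rng.normal(loc=10, scale=5, size=280)
marks_track2 = rng.normal(loc=16, scale=4, size=20)
all_marks = np.concatenate([marks_track1, marks_track2])
# indices of the 20 students with the highest marks (track-2 students are indices 280..299)
top20 = np.argsort(all_marks)[::-1][:20]
print(top20)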
Example #6
def run_count_subproblems_t_vldb(path_dataset: str, raw_data=False):
    kcfs = []
    penalties_t = [0.0, 0.25, 0.5, 0.75, 1.]
    for penalty in penalties_t:
        kcfs.append(
            ScoringScheme([[0., 1., 1., 0., 1., 0],
                           [penalty, penalty, 0., penalty, penalty, penalty]]))
    bench = BenchPartitioningScoringScheme(
        dataset_folder=path_dataset,
        # the kcfs to consider
        scoring_schemes_exp=kcfs,
        # all the files (tuples of rankings) are considered
        dataset_selector_exp=DatasetSelector(),
        # position of the changing coefficient (here in the second penalty vector), for printing the changing value of T
        changing_coeff=(1, 0),
        # range of number of elements to consider for the output
        intervals=[(30, 59), (60, 99), (100, 299), (300, 1121)])

    # run the experiment and print the results. If raw_data is True, also print all the parameters of the
    # experiment (readme) and the raw data used to compute the final results; otherwise only the final results are displayed
    bench.run(raw_data)
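
As a quick sanity check of what changing_coeff=(1, 0) points at, one can print the corresponding coefficient of each KCF built above; this sketch assumes the penalty_vectors attribute (used elsewhere in this code) is an indexable pair of penalty vectors:

for kcf in kcfs:
    # second penalty vector, first coefficient: the varying penalty t of the scheme
    print(kcf.penalty_vectors[1][0])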
Example #7
    def rankaggr_brute(preferences):
        """
        For each sample, compute the score of every candidate from the Kemeny consensus
        ranking of the models' preferences.
        """
        n_candidates = len(preferences[0][0])
        n_model = len(preferences[0])
        n_samples = len(preferences)
        scores = np.zeros((n_samples, n_candidates), dtype="f4")

        for l in range(n_samples):
            profile = []
            # unique preference columns of the sample, in reversed order
            temp_ordered = np.flip(np.unique(preferences[l], axis=1), axis=1)
            for i in range(n_model):
                temp = temp_ordered[i]
                # each bucket groups the indices of the candidates sharing the value temp[j] in model i
                temp = [
                    np.where(preferences[l][i] == temp[j])[0]
                    for j in range(len(temp))
                ]
                profile.append(temp)

            # aggregate the models' rankings into a Kemeny consensus
            ranks = Dataset(profile)
            sc = ScoringScheme()
            # ParCons for larger instances, the exact algorithm otherwise
            if len(profile[0]) > 5:
                consensus = KemRankAgg.compute_consensus(
                    ranks, sc, Algorithm.ParCons)
            else:
                consensus = KemRankAgg.compute_consensus(
                    ranks, sc, Algorithm.Exact)

            # Borda-like scoring: the candidate heading bucket c gets n_candidates - c points
            for c in range(len(consensus.consensus_rankings[0])):
                candidate = consensus.consensus_rankings[0][c][0]
                scores[l][candidate] = n_candidates - c

        return scores
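
A hypothetical call sketch for the function above, assuming it is reachable as a plain function (it is shown without self) and that numpy and the corankco classes it uses are imported; the preference values are made up:

# preferences[sample][model][candidate] = preference value given to that candidate by that model
preferences = np.array([[[0.2, 0.5, 0.3],
                         [0.1, 0.6, 0.3]]])
scores = rankaggr_brute(preferences)
print(scores)  # one row of Borda-like scores per sample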
Example #8
from corankco.dataset import Dataset
from corankco.scoringscheme import ScoringScheme
from corankco.algorithms.algorithmChoice import get_algorithm
from corankco.algorithms.algorithmChoice import Algorithm
from corankco.kemeny_computation import KemenyComputingFactory

dataset = Dataset([[[1], [2, 3]], [[3, 1], [4]], [[1], [5], [3, 2]]])
# or d = Dataset.get_rankings_from_file(file_path), where file_path is the path to the file
# to import a list of datasets from the same folder: Dataset.get_rankings_from_folder(path_folder)

# print information about the dataset
print(dataset.description())
# choose your scoring scheme (or sc = ScoringScheme() for default scoring scheme)
sc = ScoringScheme([[0., 1., 1., 0., 1., 1.], [1., 1., 0., 1., 1., 0.]])

print("scoring scheme : " + str(sc))
# scoring scheme description
print(sc.description())

print("\n### Consensus computation ###\n")

algorithm = get_algorithm(alg=Algorithm.ParCons,
                          parameters={"bound_for_exact": 90})
# compute consensus ranking
consensus = algorithm.compute_consensus_rankings(
    dataset=dataset, scoring_scheme=sc, return_at_most_one_ranking=False)

print(consensus.description())

# if you want the consensus ranking only : print(consensus)
# to get the consensus rankings : consensus.consensus_rankings
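
For example, to work with the rankings directly (a small follow-up to the snippet above):

# list of consensus rankings; each ranking is a list of buckets (tied elements)
all_rankings = consensus.consensus_rankings
print(len(all_rankings), "consensus ranking(s) found")
print(all_rankings[0])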
Example #9
    def is_scoring_scheme_relevant_when_incomplete_rankings(
            self, scoring_scheme: ScoringScheme) -> bool:
        return scoring_scheme.is_equivalent_to(ScoringScheme.get_induced_measure_scoring_scheme().penalty_vectors) or \
            scoring_scheme.is_equivalent_to(ScoringScheme.get_unifying_scoring_scheme().penalty_vectors) or \
            scoring_scheme.is_equivalent_to(ScoringScheme.get_induced_measure_scoring_scheme_p(0.5).penalty_vectors) or \
            scoring_scheme.is_equivalent_to(ScoringScheme.get_unifying_scoring_scheme_p(0.5).penalty_vectors)
Example #10
    def compute_consensus_rankings(self,
                                   dataset: Dataset,
                                   scoring_scheme: ScoringScheme,
                                   return_at_most_one_ranking=False,
                                   bench_mode=False) -> Consensus:
        """
        :param dataset: A dataset containing the rankings to aggregate
        :type dataset: Dataset (class Dataset in package 'datasets')
        :param scoring_scheme: The penalty vectors to consider
        :type scoring_scheme: ScoringScheme (class ScoringScheme in package 'distances')
        :param return_at_most_one_ranking: if True, the algorithm should return at most one ranking
        :type return_at_most_one_ranking: bool
        :param bench_mode: whether bench mode is activated. If False, the algorithm may return more information
        :type bench_mode: bool
        :return: one or more rankings, if the underlying algorithm can find several equivalent consensus rankings.
        If the algorithm cannot provide several consensus rankings, or if return_at_most_one_ranking is True,
        it should return a list made of the only / the first consensus found.
        In all scenarios, the algorithm returns a list of consensus rankings
        :raise ScoringSchemeNotHandledException: when the algorithm cannot compute the consensus because its
        implementation does not fit the scoring scheme
        """

        if not dataset.is_complete and not self.is_scoring_scheme_relevant_when_incomplete_rankings(
                scoring_scheme):
            raise ScoringSchemeNotHandledException

        if scoring_scheme.is_equivalent_to(ScoringScheme.get_unifying_scoring_scheme().penalty_vectors) or \
                scoring_scheme.is_equivalent_to(ScoringScheme.get_unifying_scoring_scheme_p(0.5).penalty_vectors):
            rankings_to_use = dataset.unified_rankings()
        else:
            rankings_to_use = dataset.rankings

        points = {}
        for ranking in rankings_to_use:
            id_bucket = 1
            for bucket in ranking:
                for elem in bucket:
                    if elem not in points:
                        points[elem] = {}
                        points[elem][0] = 0
                        points[elem][1] = 0

                    points[elem][0] += id_bucket
                    points[elem][1] += 1
                if self.useBucketIdAndNotBucketSize:
                    id_bucket += 1
                else:
                    id_bucket += len(bucket)
        lis = []
        for elem in points.keys():
            lis.append((elem, points[elem][0] * 1.0 / points[elem][1]))
        tri = sorted(lis, key=lambda col: col[1])
        consensus = []
        bucket = []
        last = -1
        for duo in tri:
            if duo[1] != last:
                last = duo[1]
                bucket = []
                consensus.append(bucket)
            bucket.append(duo[0])
        return Consensus(
            consensus_rankings=[consensus],
            dataset=dataset,
            scoring_scheme=scoring_scheme,
            att={ConsensusFeature.AssociatedAlgorithm: self.get_full_name()})
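
A standalone toy sketch of the average-position idea implemented above (independent of the class, using the bucket-id variant; the rankings are made up): each element is ranked by the average id of the buckets in which it appears, and elements with equal averages end up tied:

rankings = [[[1], [2], [3]], [[2], [1], [3]]]
points = {}
for ranking in rankings:
    for id_bucket, bucket in enumerate(ranking, start=1):
        for elem in bucket:
            total, count = points.get(elem, (0, 0))
            points[elem] = (total + id_bucket, count + 1)
# sort elements by average bucket position, then group equal averages into the same bucket
averages = sorted(((elem, total / count) for elem, (total, count) in points.items()),
                  key=lambda pair: pair[1])
consensus = []
last = None
for elem, avg in averages:
    if avg != last:
        consensus.append([])
        last = avg
    consensus[-1].append(elem)
print(consensus)  # [[1, 2], [3]]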
Example #11
    def compute_consensus_rankings(self,
                                   dataset: Dataset,
                                   scoring_scheme: ScoringScheme,
                                   return_at_most_one_ranking=False,
                                   bench_mode=False) -> Consensus:
        """
        :param dataset: A dataset containing the rankings to aggregate
        :type dataset: Dataset (class Dataset in package 'datasets')
        :param scoring_scheme: The penalty vectors to consider
        :type scoring_scheme: ScoringScheme (class ScoringScheme in package 'distances')
        :param return_at_most_one_ranking: if True, the algorithm should return at most one ranking
        :type return_at_most_one_ranking: bool
        :param bench_mode: whether bench mode is activated. If False, the algorithm may return more information
        :type bench_mode: bool
        :return: one or more rankings, if the underlying algorithm can find several equivalent consensus rankings.
        If the algorithm cannot provide several consensus rankings, or if return_at_most_one_ranking is True,
        it should return a list made of the only / the first consensus found.
        In all scenarios, the algorithm returns a list of consensus rankings
        :raise ScoringSchemeNotHandledException: when the algorithm cannot compute the consensus because its
        implementation does not fit the scoring scheme
        """

        if not dataset.is_complete and not self.is_scoring_scheme_relevant_when_incomplete_rankings(
                scoring_scheme):
            raise ScoringSchemeNotHandledException

        if scoring_scheme.is_equivalent_to(ScoringScheme.get_unifying_scoring_scheme().penalty_vectors) or \
                scoring_scheme.is_equivalent_to(ScoringScheme.get_unifying_scoring_scheme_p(0.5).penalty_vectors):
            rankings_to_use = dataset.unified_rankings()
        else:
            rankings_to_use = dataset.rankings
        has = {}

        nb_rankings_needed = {}
        already_put = set()

        for ranking in rankings_to_use:
            for bucket in ranking:
                for element in bucket:
                    if element not in nb_rankings_needed:
                        nb_rankings_needed[element] = self.__h
                    else:
                        nb_rankings_needed[element] += self.__h

        bucket_res = []
        ranking_res = []

        for reorganized in zip_longest(*rankings_to_use):
            for bucket in reorganized:
                if bucket is not None:
                    for element in bucket:
                        if element not in already_put:
                            if element not in has:
                                has[element] = 1
                                if nb_rankings_needed[element] <= 1:
                                    bucket_res.append(element)
                                    already_put.add(element)
                            else:
                                has[element] += 1
                                if has[element] >= nb_rankings_needed[element]:
                                    bucket_res.append(element)
                                    already_put.add(element)
            if len(bucket_res) > 0:
                ranking_res.append(bucket_res)
                bucket_res = []

        rankings_consensus = [ranking_res] if len(ranking_res) > 0 else [[]]
        return Consensus(
            consensus_rankings=rankings_consensus,
            dataset=dataset,
            scoring_scheme=scoring_scheme,
            att={ConsensusFeature.AssociatedAlgorithm: self.get_full_name()})
                h_gene_list_scores[element] = []
            shuffle(dataset.rankings)
            for i in to_test:
                dataset_new = Dataset(dataset.rankings[0:i])
                dataset_new.name = dataset.name
                consensus = self._algo.compute_consensus_rankings(dataset_new, self._scoring_cheme, True)
                copeland_scores = consensus.copeland_scores
                for element in dataset_new.elements:
                    cop_score_element = copeland_scores.get(element)
                    h_gene_list_scores[element].append(cop_score_element)
            for element in dataset.elements:
                res += dataset.name + ";" + str(element) + ";" + str(h_gene_list_scores[element]) + "\n"
        return res

algor = get_algorithm(Algorithm.CopelandMethod)
scoring_scheme_exp = ScoringScheme.get_pseudodistance_scoring_scheme_p(1.)

"""
rates_presence_min = [0.2]
ic_rates = [0.05]

for rate_presence_minimal in rates_presence_min:
    for ic_rate in ic_rates:
        print(ic_rate)
        print(rate_presence_minimal)
        b = BootstrapExperimentBiologicalIC(dataset_folder="/home/pierre/Bureau/vldb_data/datasets/biological_dataset",
                                          algo=algor,
                                          scoring_scheme=scoring_scheme_exp,
                                          nb_bootstrap=10000,
                                          dataset_selector=DatasetSelector(
                                              nb_rankings_min=20, nb_elem_min=200, nb_elem_max=219),