コード例 #1
0
    def summarise(self):

        """
        Compute the correlation between each of the four methods and the
        two authoritative trees, displaying the results in a PrettyTable.

        Two CSV files are written as a side effect:

        * calibration_results/all_methods_full.csv holds, per method, every
          entry of the Austronesian matrix followed by every entry of the
          Indo-European matrix, flattened row by row.
        * calibration_results/all_methods_common.csv holds the combined
          authoritative vector ("auth") next to each method's vector as
          returned by compute_method_vector for the common languages.

        Nothing is returned; the correlation table is printed to stdout.
        """

        # One row per method: table label, CSV column name, matrix builder.
        methods = (
            ("GEOGRAPHIC", "geo", distance.build_optimal_geographic_matrix),
            ("GENETIC", "gen", distance.build_optimal_genetic_matrix),
            ("FEATURE", "feat", distance.build_optimal_feature_matrix),
            ("COMBINATION", "combo", distance.build_optimal_combination_matrix),
        )

        # Build every matrix exactly once and reuse it for both outputs.
        # NOTE(review): this assumes the build_optimal_* functions are
        # deterministic for a given language list -- confirm.
        matrices = {}
        full = {}
        for _label, key, build in methods:
            austro_matrix = build(self.austrolangs)
            indo_matrix = build(self.indolangs)
            matrices[key] = (austro_matrix, indo_matrix)
            # Save full pairwise distance lists: each method's column is
            # flattened from that method's own matrices.
            full[key] = list(itertools.chain(
                itertools.chain.from_iterable(austro_matrix),
                itertools.chain.from_iterable(indo_matrix)))
        pd.DataFrame(full).to_csv("calibration_results/all_methods_full.csv")

        # Do PrettyTable
        pt = prettytable.PrettyTable(["Method", "Austronesian correl", "Indo-European correl"])

        # The authoritative vectors do not depend on the method.
        austro_auth = self.common_auth_austro_vector
        indo_auth = self.common_auth_indo_vector

        common = {"auth": self.common_auth_combo_vector}
        for label, key, _build in methods:
            austro_matrix, indo_matrix = matrices[key]
            austro_method = self.compute_method_vector(
                austro_matrix, self.common_austro_langs, self.wals_austro_trans)
            indo_method = self.compute_method_vector(
                indo_matrix, self.common_indo_langs, self.wals_indo_trans)

            # corrcoef returns a 2x2 matrix; [0, 1] is the cross-correlation.
            austro_correl = np.corrcoef(austro_method, austro_auth)[0, 1]
            indo_correl = np.corrcoef(indo_method, indo_auth)[0, 1]
            pt.add_row([label, austro_correl, indo_correl])

            common[key] = np.concatenate([austro_method, indo_method])

        pd.DataFrame(common).to_csv("calibration_results/all_methods_common.csv")

        # Single-argument parenthesised form prints identically on Python 2.
        print(pt)
コード例 #2
0
    def optimise_combination(self):
        """
        Use multiple linear regression to determine the optimal weighted
        combination of the GEOGRAPHIC, GENETIC and FEATURE methods.

        A weighted least squares model ``auth ~ geo + gen + feat`` is fitted
        (weights taken from self.weights) on the method vectors for the
        common Austronesian and Indo-European languages; the regression
        input is also saved to calibration_results/feature_data.csv.

        The file calibration_results/optimal_combination_weights is written
        twice: first with the raw regression coefficients and a zero
        intercept, then, after self.fit_models has been run on the
        combination matrices, with the intercept it returns and the
        coefficients multiplied by the multiplier it returns.

        The fit statistic D, the final intercept and the three final
        weights are printed to stdout.  Returns None.
        """

        weights_path = "calibration_results/optimal_combination_weights"

        names = ("geo", "gen", "feat")
        funcs = (distance.build_optimal_geographic_matrix,
                 distance.build_optimal_genetic_matrix,
                 distance.build_optimal_feature_matrix)

        df = {"auth": self.common_auth_combo_vector}
        for name, func in zip(names, funcs):
            austro_method = self.compute_method_vector(
                func(self.austrolangs), self.common_austro_langs, self.wals_austro_trans)
            indo_method = self.compute_method_vector(
                func(self.indolangs), self.common_indo_langs, self.wals_indo_trans)
            df[name] = np.concatenate([austro_method, indo_method])

        df = pd.DataFrame(df)
        df.to_csv("calibration_results/feature_data.csv")
        model = smf.wls('auth ~ geo + gen + feat', data=df, weights=self.weights).fit()

        # Provisional weights.  The fitted intercept is deliberately not
        # written at this stage; a zero intercept is stored instead.
        # NOTE(review): this write precedes the combination-matrix build
        # below, which presumably reads this file -- confirm before
        # reordering or dropping it.
        with open(weights_path, "w") as fp:
            fp.write("intercept\t%f\n" % 0.0)
            for name in names:
                fp.write("%s\t%f\n" % (name, model.params[name]))

        combo_austro = distance.build_optimal_combination_matrix(self.austrolangs)
        combo_indo = distance.build_optimal_combination_matrix(self.indolangs)
        D, intt, mult = self.fit_models(combo_austro, combo_indo, "combo")
        # Two spaces after the colon reproduce the Python 2 output of
        # ``print "best combo D: ", D``.
        print("best combo D:  %s" % (D,))

        # Final weights: regression coefficients rescaled by the multiplier
        # returned by fit_models, with the intercept it returned.
        scaled = [(name, mult * model.params[name]) for name in names]
        with open(weights_path, "w") as fp:
            fp.write("intercept\t%f\n" % intt)
            for name, value in scaled:
                fp.write("%s\t%f\n" % (name, value))

        print(intt)
        for _name, value in scaled:
            print(value)