def summarise(self):
    """
    Compute the correlation between each of the four methods and the two
    authoritative trees, displaying the results in a PrettyTable.

    Side effects:
      * writes "calibration_results/all_methods_full.csv", containing the
        flattened Austronesian + Indo-European matrices of every method;
      * writes "calibration_results/all_methods_common.csv", containing the
        authoritative vector and each method's vector as produced by
        ``compute_method_vector``;
      * prints the correlation table to stdout.

    Returns None.
    """

    def flatten(austro_matrix, indo_matrix):
        # Row-major flattening of both matrices, Austronesian first.
        return list(itertools.chain(
            itertools.chain.from_iterable(austro_matrix),
            itertools.chain.from_iterable(indo_matrix)))

    # Save full pairwise distance lists
    geo_austro = distance.build_optimal_geographic_matrix(self.austrolangs)
    geo_indo = distance.build_optimal_geographic_matrix(self.indolangs)
    gen_austro = distance.build_optimal_genetic_matrix(self.austrolangs)
    gen_indo = distance.build_optimal_genetic_matrix(self.indolangs)
    feat_austro = distance.build_optimal_feature_matrix(self.austrolangs)
    feat_indo = distance.build_optimal_feature_matrix(self.indolangs)
    combo_austro = distance.build_optimal_combination_matrix(self.austrolangs)
    combo_indo = distance.build_optimal_combination_matrix(self.indolangs)

    # The "geo" column must come from the geographic matrices; it was
    # previously built from the genetic ones and so duplicated "gen".
    # NOTE(review): assumes all four methods yield matrices of the same
    # shape for a given language list, otherwise DataFrame() will raise.
    full = {
        "geo": flatten(geo_austro, geo_indo),
        "gen": flatten(gen_austro, gen_indo),
        "feat": flatten(feat_austro, feat_indo),
        "combo": flatten(combo_austro, combo_indo),
    }
    pd.DataFrame(full).to_csv("calibration_results/all_methods_full.csv")

    # Do PrettyTable
    pt = prettytable.PrettyTable(
        ["Method", "Austronesian correl", "Indo-European correl"])
    names = ("GEOGRAPHIC", "GENETIC", "FEATURE", "COMBINATION")
    little_names = ("geo", "gen", "feat", "combo")
    funcs = (distance.build_optimal_geographic_matrix,
             distance.build_optimal_genetic_matrix,
             distance.build_optimal_feature_matrix,
             distance.build_optimal_combination_matrix)

    common = {"auth": self.common_auth_combo_vector}
    for name, little_name, func in zip(names, little_names, funcs):
        # NOTE(review): the matrices are rebuilt here rather than reusing
        # the ones computed above; kept as-is because it is not visible
        # from this method whether the builders are safe to share.
        austro_method = self.compute_method_vector(
            func(self.austrolangs),
            self.common_austro_langs,
            self.wals_austro_trans)
        austro_auth = self.common_auth_austro_vector
        indo_method = self.compute_method_vector(
            func(self.indolangs),
            self.common_indo_langs,
            self.wals_indo_trans)
        indo_auth = self.common_auth_indo_vector

        # Off-diagonal entry of the 2x2 correlation matrix is the
        # correlation between the method and the authoritative vector.
        austro_correl = np.corrcoef(austro_method, austro_auth)[0, 1]
        indo_correl = np.corrcoef(indo_method, indo_auth)[0, 1]
        pt.add_row([name, austro_correl, indo_correl])

        common[little_name] = np.concatenate([austro_method, indo_method])

    pd.DataFrame(common).to_csv("calibration_results/all_methods_common.csv")

    # Single-argument parenthesised form prints identically under the
    # Python 2 print statement.
    print(pt)
def optimise_combination(self):
    """
    Use multiple linear regression to determine the optimal weighted
    combination of the GEOGRAPHIC, GENETIC and FEATURE methods.

    Steps:
      1. Build the per-method vectors and save them, together with the
         authoritative vector, to "calibration_results/feature_data.csv".
      2. Fit a weighted least squares model ``auth ~ geo + gen + feat``
         using ``self.weights``.
      3. Write the raw regression weights (intercept forced to 0.0) to
         "calibration_results/optimal_combination_weights".
      4. Build the combination matrices, pass them to ``self.fit_models``
         and overwrite the weights file with the intercept it returns and
         the regression weights scaled by the multiplier it returns.

    The scaled values are also printed to stdout.

    Returns None.
    """
    weights_path = "calibration_results/optimal_combination_weights"
    names = ("geo", "gen", "feat")
    funcs = (distance.build_optimal_geographic_matrix,
             distance.build_optimal_genetic_matrix,
             distance.build_optimal_feature_matrix)

    df = {"auth": self.common_auth_combo_vector}
    for name, func in zip(names, funcs):
        austro_method = self.compute_method_vector(
            func(self.austrolangs),
            self.common_austro_langs,
            self.wals_austro_trans)
        indo_method = self.compute_method_vector(
            func(self.indolangs),
            self.common_indo_langs,
            self.wals_indo_trans)
        df[name] = np.concatenate([austro_method, indo_method])
    df = pd.DataFrame(df)
    df.to_csv("calibration_results/feature_data.csv")

    model = smf.wls('auth ~ geo + gen + feat', data=df,
                    weights=self.weights).fit()

    # Stage 1: provisional weights with a zero intercept (the fitted
    # intercept is deliberately not used).
    # NOTE(review): this write is kept even though the file is rewritten
    # below; presumably build_optimal_combination_matrix reads it -- confirm.
    # `with` guarantees the handle is closed even if a write raises.
    with open(weights_path, "w") as fp:
        fp.write("intercept\t%f\n" % 0.0)
        for name in names:
            fp.write("%s\t%f\n" % (name, model.params[name]))

    combo_austro = distance.build_optimal_combination_matrix(self.austrolangs)
    combo_indo = distance.build_optimal_combination_matrix(self.indolangs)
    D, intt, mult = self.fit_models(combo_austro, combo_indo, "combo")
    # Two spaces reproduce the output of the former
    # `print "best combo D: ", D` statement.
    print("best combo D:  %s" % (D,))

    # Stage 2: final weights -- intercept from fit_models, regression
    # coefficients scaled by the multiplier from fit_models.
    with open(weights_path, "w") as fp:
        fp.write("intercept\t%f\n" % intt)
        print(intt)
        for name in names:
            scaled = mult * model.params[name]
            fp.write("%s\t%f\n" % (name, scaled))
            print(scaled)

    # The code that used to follow the bare `return` here (a second
    # `return` and a random-search loop over weights) was unreachable and
    # referenced undefined names, so it has been removed.
    return