def main(): splits = [] with open('data/results.json') as fin: for line in fin: result = json.loads(line) splits += result.get('split', []) print(len(splits)) ss = splits[:] splits = [] for split in ss: area = split['child1.area'] + split['child2.area'] assert area >= 1, split assert 0 <= split['actual_effect'] <= area, split slope = split['parent.pop'] / area if slope >= split['slope']: splits.append(split) print(len(splits)) #splits = [Struct(**split) for split in splits] features = collections.defaultdict(list) for split in splits: for k, v in split.items(): features[k].append(v) effect = features['actual_effect'] del features['actual_effect'] del features['hz'] del features['expected_effect'] features = { k: linear_regression.RawFeature(k, numpy.array(v, dtype=numpy.double)) for k, v in features.items()} #features = Struct(**features) #print(features) #features['inv_slope'] = linear_regression.RawFeature( # '1/slope', 1.0/features['slope'].xs) area = features['child1.area'].xs + features['child2.area'].xs effect /= area #features['parent.pop'].xs /= area # for k in [ # 'child1.stdev_x', # 'child1.stdev_y', # 'child2.stdev_x', # 'child2.stdev_y', # 'parent.stdev_x', # 'parent.stdev_x', # ]: # features[k].xs /= numpy.sqrt(area) ones = linear_regression.ConstantOneFeature(len(effect)) features = [linear_regression.standardize_feature(f) for _, f in sorted(features.items())] features.append(ones) features = linear_regression.polynomial_features(features, 3) #print(features) print(len(features), 'features') lr = linear_regression.LinearRegression.create( features, effect, weights=area) lr.solve() #print(lr.solution) #for k, f in sorted(zip(lr.solution, lr.features), key=lambda q: abs(q[0])): # print(k, f.get_expression()) print('penalty', sqrt(lr.penalty / len(effect))) xs = [] ys = [] colors = [] for x, y, a in zip(lr.predicted_results(), effect, area): xs.append(x) ys.append(y) colors.append(a) def quantilize(xs): sx = sorted(xs) return [sx.index(x) for x in xs] #colors = quantilize(colors) pylab.plot([0, 0.3], [0, 0.3], color='red') pylab.scatter( xs, ys, c=colors, s=0.5, linewidths=(0,), cmap='cool') pylab.axis([-0.1, 0.4, 0, 0.4]) pylab.savefig('linear_regression.png', dpi=140) #print(features) model = ' + '.join( '{:.7}*{}'.format(a, f.get_expression()) for a, f in zip(lr.solution, lr.features)) model = 'return (child1.area + child2.area) * ({});\n'.format(model) with open('model.txt', 'w') as fout: fout.write(model)
def main(): results = [] with open('data/results.json') as fin: for line in fin: result = json.loads(line) #frac = result['land_area'] / (result['W'] * result['H']) #if frac > 0.2: results.append(result) print(len(results), 'data points') features = collections.defaultdict(list) for result in results: for k, v in result.items(): features[k].append(v) features = Struct(**{ k: linear_regression.RawFeature(k, numpy.array(v, dtype=numpy.double)) for k, v in features.items()}) #print(collections.Counter(features.land_density_bucket.xs)) #print(collections.Counter(features.percentage_bucket.xs)) goal = 100 * numpy.log(features.score.xs / features.land_area.xs) land_fraction = linear_regression.RawFeature( 'land_fraction', (features.land_area.xs / (features.variance_x.xs + features.variance_y.xs))) #land_fraction = RawFeature('land_fraction', features.land_area / (features.w * features.h)) #print(sorted(land_fraction.xs)[::len(land_fraction.xs) // 5]) q = sorted(land_fraction.xs) print(q[-1]) print([q[i * (len(q) - 1) // 5] for i in range(5 + 1)]) log_area = linear_regression.RawFeature( 'log_area', numpy.log(features.land_area.xs)) #fs = land_fraction #print(land_fraction) ones = linear_regression.ConstantOneFeature(len(goal)) elongation = linear_regression.RawFeature( 'elongation', (numpy.log(features.W.xs / features.H.xs) ** 2)) density = linear_regression.RawFeature( 'density', (features.total_population.xs / features.land_area.xs)) adjusted_percentage = linear_regression.RawFeature( 'adjusted_percentage', (0.01 * features.max_percentage.xs) ** 0.1) #land_fraction.xs **= 2 #features.variance_x.xs **= 0.5 #features.variance_y.xs **= 0.5 fs = [land_fraction, adjusted_percentage] fs = [ones] + [linear_regression.standardize_feature(f) for f in fs] fs = linear_regression.polynomial_features(fs, 3) lr = linear_regression.LinearRegression.create(fs, goal) lr.solve() print(lr.solution) for k, f in sorted(zip(lr.solution, lr.features), key=lambda q: abs(q[0])): print('+', k, '*', f.get_expression()) print('penalty', sqrt(lr.penalty / len(goal))) print(len(goal))