Ejemplo n.º 1
0
def examples_with_aggregated_gauss(subset):
    """Cache, per planet, the matrices averaged over the Gaussian-noise files.

    For every planet returned by ``get_planets(subset)`` that has no pickle in
    ``gauss_aggregated/`` yet, the 10 ``gaus`` files of each of the 10
    ``spot`` instances are read with ``parse_input`` and averaged.  ``1 - mean``
    is stored per spot in a ``(10, 55, 300)`` float32 array, which replaces
    ``data['matrix']`` of the last parsed input.  If a matching file exists in
    ``../database/params_train/`` its parsed content is merged into the dict.

    :param subset: name used both for ``get_planets`` and for the
        ``../database/noisy_<subset>/`` input folder.
    :return: None; one pickle per planet is written to ``gauss_aggregated/``.
    """
    planets = get_planets(subset)
    os.makedirs('gauss_aggregated', exist_ok=True)
    for planet in tqdm(planets):
        pickle_path = 'gauss_aggregated/{}.pickle'.format(planet)
        if os.path.exists(pickle_path):
            # Already cached by a previous run.
            continue

        data = None
        planet_matrix = np.zeros((10, 55, 300), dtype=np.float32)
        for spot in range(1, 11):
            spot_matrix = np.zeros((55, 300), dtype=np.float32)
            for gaus in range(1, 11):
                data = parse_input(
                    '../database/noisy_{}/{}_{:0>2}_{:0>2}.txt'.format(
                        subset, planet, spot, gaus))
                spot_matrix += data['matrix']

            # Mean over the 10 noise instances of this spot.
            spot_matrix /= 10
            planet_matrix[spot - 1] = 1 - spot_matrix

        # ``data`` is the dict of the last parsed file; only its matrix is
        # replaced, the remaining entries are kept as parsed.
        data['matrix'] = planet_matrix
        out_path = '../database/params_train/{}_01_01.txt'.format(planet)
        if os.path.exists(out_path):
            data.update(parse_output(out_path))
        data['planet'] = planet

        # Context manager guarantees the file is flushed and closed.
        with open(pickle_path, 'wb') as f:
            pickle.dump(data, f)
Ejemplo n.º 2
0
def create_meta_dataset(subset):
    """Write ``meta_dataset_<subset>.csv`` with per-planet meta features.

    Each row holds the planet id, a noise estimate, the spread of the
    per-channel peak values, the entries of ``data['misc_inputs']`` and one
    target label per threshold in ``eps``.

    * noise: mean over the 55 channels of ``1 - max(lag-1 autocorrelation, 0)``.
    * std: standard deviation over channels of the mean of the 10 largest
      values of the 3-point-smoothed series.
    * targets (only for ``subset == "train"``): ``"any"`` when the two models'
      errors differ by less than the threshold, otherwise the name of the
      model with the smaller error; ``"?"`` for every other subset.

    :param subset: planet subset passed to ``get_planets``; also used in the
        output file name.
    :return: None; the CSV is written to the current directory.
    """
    planets = get_planets(subset)
    fcnn_errors = pnd.read_csv('per_planet_wmae.csv', index_col=1)
    bagging_errors = pnd.read_csv('bagging_errors.csv', index_col=0)
    eps = [0.01, 0.005, 0.001, 0.0005]
    models = ",".join("model{}".format(e) for e in eps)
    # The header previously listed 'star_logg' twice, producing a duplicate
    # column name.  The third star column is named 'star_rad' here.
    # NOTE(review): assumes misc_inputs is ordered temp, logg, rad, mass,
    # k_mag, period -- confirm against parse_input.  'star_k_magg' is kept
    # as-is because downstream readers may rely on that spelling.
    rows = [
        'planet,noise,std,star_temp,star_logg,star_rad,star_mass,star_k_magg,period,{}'
        .format(models)
    ]
    for planet in tqdm(planets):
        with open('completely_aggregated/{}.pickle'.format(planet), 'rb') as f:
            data = pickle.load(f)

        # Centered moving average (window truncated at the left edge).
        window_size = 1
        denoised = np.zeros((55, 300))
        for j in range(300):
            denoised[:, j] = np.mean(data['matrix'][:,
                                                    max(0, j - window_size):j +
                                                    window_size + 1],
                                     axis=1)
        # Mean of the 10 largest smoothed values per channel.
        maxes = np.mean(np.partition(denoised, -10, axis=1)[:, -10:], axis=1)
        std = np.std(maxes)

        # Lag-1 autocorrelation per channel as a smoothness measure.
        offset = 1
        noises = []
        for c in range(55):
            r = data['matrix'][c]
            matrix = np.corrcoef(r[offset:], r[:-offset])
            assert abs(matrix[0, 1] - matrix[1, 0]) < 10**-10
            corr = max(matrix[0, 1], 0)  # 0 or negative --> 0
            noises.append(1 - corr)

        if subset == "train":
            diff = bagging_errors.loc[
                int(planet), 'baggingError'] - fcnn_errors.loc[int(planet),
                                                               'mean_wmae']
            targets = []
            for e in eps:
                if abs(diff) < e:
                    targets.append("any")
                elif diff > 0:
                    # Bagging error is larger, so fcnn is the better model.
                    targets.append("fcnn")
                else:
                    targets.append("bagging")
        else:
            targets = ["?"] * len(eps)

        row = [planet, str(np.mean(noises)), str(std)]
        row += [str(x) for x in data['misc_inputs']]
        row += targets
        rows.append(','.join(row))

    with open('meta_dataset_{}.csv'.format(subset), 'w') as f:
        for row in rows:
            print(row, file=f)
Ejemplo n.º 3
0
def aggregated_examples_with_tsfresh():
    """Attach the tsfresh feature vector to every aggregated planet pickle.

    Reads ``tsfresh_all.pki``, removes every column containing a missing value
    and every column that takes a single value on the training planets, then
    copies each pickle from ``completely_aggregated/`` to
    ``aggregated_tsfresh/`` with the planet's remaining feature row stored
    under ``data['tsfresh']``.

    :return: None; one pickle per train/test planet is written.
    """
    tsfresh_df = pd.read_pickle('tsfresh_all.pki')
    train_planets = get_planets('train')
    test_planets = get_planets('test')

    # Constant columns are determined on the training planets only.
    train_df = tsfresh_df[tsfresh_df.index.isin(train_planets)]
    singleton_columns = [
        c for c in train_df.columns if len(set(train_df[c])) == 1
    ]

    tsfresh_df.dropna(axis=1, inplace=True)
    # dropna may already have removed some of the singleton columns; dropping
    # a missing label would raise KeyError, so only drop the survivors.
    remaining_singletons = [
        c for c in singleton_columns if c in tsfresh_df.columns
    ]
    tsfresh_df.drop(remaining_singletons, axis=1, inplace=True)

    os.makedirs('aggregated_tsfresh', exist_ok=True)
    for planet in tqdm(train_planets | test_planets):
        with open('completely_aggregated/{}.pickle'.format(planet),
                  'rb') as f:
            data = pickle.load(f)
        data['tsfresh'] = tsfresh_df.loc[[planet]].values[0]
        with open('aggregated_tsfresh/{}.pickle'.format(planet), 'wb') as f:
            pickle.dump(data, f)
Ejemplo n.º 4
0
def calculate_tsfresh_features():
    """Extract tsfresh features for all train and test planets.

    Builds one long table with a row per (planet, timestep) and one column per
    channel (``channel_1`` .. ``channel_55``) from the matrices stored in
    ``completely_aggregated/``, runs ``tsfresh.extract_features`` on it and
    pickles the result to ``tsfresh_all.pki``.

    :return: None.
    """
    n_channels = 55
    n_steps = 300
    planets = get_planets('train') | get_planets('test')
    folder = 'completely_aggregated'

    channel_names = ['channel_{}'.format(c + 1) for c in range(n_channels)]
    values = {'planet_id': [], 'timestep': []}
    values.update((name, []) for name in channel_names)

    for planet in tqdm(planets):
        with open('{}/{}.pickle'.format(folder, planet), 'rb') as handle:
            matrix = pickle.load(handle)['matrix']

        # One block of n_steps rows per planet, timesteps in ascending order.
        values['planet_id'].extend([planet] * n_steps)
        values['timestep'].extend(range(n_steps))
        for c, name in enumerate(channel_names):
            values[name].extend(matrix[c, t] for t in range(n_steps))

    long_table = pd.DataFrame(values)
    extracted = tsfresh.extract_features(long_table,
                                         column_id='planet_id',
                                         column_sort='timestep',
                                         n_jobs=4)
    extracted.to_pickle('tsfresh_all.pki')
Ejemplo n.º 5
0
def examples_with_aggregated_matrices(subset):
    """Build one fully aggregated example per planet and pickle it.

    For each planet of ``get_planets(subset)``:

    * the 10 ``gaus`` files of every spot are averaged, the median over the
      10 spots is taken, and ``1 - median`` becomes ``data['matrix']``
      (shape ``(55, 300)``);
    * a moving average over columns ``[i - 3, i + 3)`` yields
      ``data['maxes']`` (per-channel maximum) and ``data['relative_means']``
      (per-channel mean divided by that maximum);
    * ``data['radiation']`` holds ``black_body_radiation`` evaluated for the
      frequencies selected by ``NOISE_ORDER`` and ``data['misc_inputs'][0]``;
    * the parsed content of ``../database/params_train/<planet>_01_01.txt``
      is merged in when that file exists.

    :param subset: name used for ``get_planets`` and for the
        ``../database/noisy_<subset>/`` input folder.
    :return: None; pickles go to ``../database/completely_aggregated/``.
    """
    planets = get_planets(subset)
    os.makedirs('../database/completely_aggregated', exist_ok=True)
    for planet in tqdm(planets):
        data = None
        planet_matrix = np.zeros((10, 55, 300), dtype=np.float64)
        for spot in range(1, 11):
            spot_matrix = np.zeros((55, 300), dtype=np.float64)
            for gaus in range(1, 11):
                data = parse_input(
                    '../database/noisy_{}/{}_{:0>2}_{:0>2}.txt'.format(
                        subset, planet, spot, gaus))
                spot_matrix += data['matrix']

            # Mean over the 10 noise instances of this spot.
            spot_matrix /= 10
            planet_matrix[spot - 1] = spot_matrix

        # Median over spots; ``data`` is the dict of the last parsed file.
        planet_matrix = 1 - np.median(planet_matrix, axis=0)
        data['matrix'] = planet_matrix

        # NOTE(review): the window excludes column i + window_size, so it is
        # not symmetric around i -- confirm this is intended.
        window_size = 3
        temp = np.zeros((55, 300))
        for i in range(300):
            temp[:, i] = np.mean(planet_matrix[:,
                                               max(0, i - window_size):i +
                                               window_size],
                                 axis=1)
        data['maxes'] = np.max(temp, axis=1)
        data['relative_means'] = np.mean(temp, axis=1) / data['maxes']

        frequencies = create_ariel_frequencies(equidistant_frequency=True)
        radiations = [
            black_body_radiation(frequencies[i], data['misc_inputs'][0])
            for i in NOISE_ORDER
        ]
        data['radiation'] = np.array(radiations)

        out_path = '../database/params_train/{}_01_01.txt'.format(planet)
        if os.path.exists(out_path):
            data.update(parse_output(out_path))
        data['planet'] = planet

        # Context manager guarantees the file is flushed and closed.
        pickle_path = '../database/completely_aggregated/{}.pickle'.format(
            planet)
        with open(pickle_path, 'wb') as f:
            pickle.dump(data, f)
0
def examples_with_maximums(subset, window_size=3):
    """Store the per-channel maximum of the smoothed signal for each planet.

    Loads every planet pickle from ``completely_aggregated/``, smooths each
    channel with a moving average over columns ``[i - window_size,
    i + window_size)``, stores the per-channel maximum of the smoothed series
    in ``data['maxes']``, removes ``data['matrix']`` and writes the result to
    ``engineered/``.

    :param subset: planet subset passed to ``get_planets``.
    :param window_size: half-width of the moving-average window.
    :return: None.
    """
    planets = get_planets(subset)
    os.makedirs('engineered', exist_ok=True)
    for planet in tqdm(planets):
        with open('completely_aggregated/{}.pickle'.format(planet), 'rb') as f:
            data = pickle.load(f)

        matrix = data['matrix']
        # Write the moving average into a separate buffer.  Smoothing in
        # place would feed already-averaged columns into later windows.
        smoothed = np.zeros((matrix.shape[0], 300))
        for i in range(300):
            smoothed[:, i] = np.mean(matrix[:,
                                            max(0, i - window_size):i +
                                            window_size],
                                     axis=1)

        data['maxes'] = np.max(smoothed, axis=1)
        del data['matrix']

        with open('engineered/{}.pickle'.format(planet), 'wb') as f:
            pickle.dump(data, f)
Ejemplo n.º 7
0
def stats_per_planet(subset):
    """Compute simple per-planet statistics of the smoothed signals.

    Every channel of the planet's matrix (read from
    ``completely_aggregated/``) is smoothed with a 3-point moving average
    truncated at the left edge.  The per-channel "peak" is the mean of the 10
    largest smoothed values.

    :param subset: planet subset passed to ``get_planets``.
    :return: dict mapping planet to a tuple ``(std of peaks over channels,
        mean of (peak - channel mean), data['radii'] or '?' when absent)``.
    """
    half_window = 1
    top_k = 10
    source_dir = 'completely_aggregated'
    result = {}

    for planet in tqdm(get_planets(subset)):
        with open('{}/{}.pickle'.format(source_dir, planet), 'rb') as handle:
            record = pickle.load(handle)

        smoothed = np.zeros((55, 300))
        for col in range(300):
            start = max(0, col - half_window)
            stop = col + half_window + 1
            smoothed[:, col] = np.mean(record['matrix'][:, start:stop],
                                       axis=1)

        # The last top_k entries after partitioning are the largest values.
        largest = np.partition(smoothed, -top_k, axis=1)[:, -top_k:]
        peaks = np.mean(largest, axis=1)
        channel_means = np.mean(smoothed, axis=1)

        spread = np.std(peaks)
        mean_gap = np.mean(peaks - channel_means)
        result[planet] = (spread, mean_gap, record.get('radii', '?'))

    return result
Ejemplo n.º 8
0
    def __init__(self, all_folds, included_folds, train_or_test='train'):
        """Load the aggregated examples belonging to the selected folds.

        Planets are sorted and assigned to folds round-robin by their position
        (``position % all_folds``).  For every planet whose fold is in
        ``included_folds`` the pickle from ``completely_aggregated/`` is
        loaded and extended with ``'maxes'`` (per-channel maximum of a moving
        average over columns ``[t - 5, t + 5)``) and ``'relative_means'``
        (per-channel mean of that moving average divided by the maximum).

        :param all_folds: total number of folds.
        :param included_folds: container of fold indices to load.
        :param train_or_test: planet subset passed to ``get_planets``.
        """
        planets = sorted(get_planets(train_or_test))
        window_size = 5
        # Alternative inputs used in experiments: 'aggregated_tsfresh',
        # 'gauss_aggregated'.
        folder = 'completely_aggregated'
        self.rows = []
        # Wrap the list (not the enumerate object) so tqdm knows the total.
        for position, planet in enumerate(tqdm(planets)):
            if position % all_folds not in included_folds:
                continue

            with open('{}/{}.pickle'.format(folder, planet), 'rb') as f:
                data = pickle.load(f)

            # A distinct name for the time index avoids shadowing the fold
            # position of the outer loop.
            temp = np.zeros((55, 300))
            for t in range(300):
                temp[:, t] = np.mean(
                    data['matrix'][:, max(0, t - window_size):t + window_size],
                    axis=1)
            data['maxes'] = np.max(temp, axis=1)
            data['relative_means'] = np.mean(temp, axis=1) / data['maxes']

            self.rows.append(data)

        self.size = len(self.rows)
Ejemplo n.º 9
0
def histogram_arffs(data_type="train", bins=50, window_size=5):
    """Write ``custom_<data_type>.arff`` with one row per planet channel.

    Each row contains the planet id, the channel index, three unknown values
    (``?`` for sma, incl and radius), the channel's radiation, the maximum of
    the moving-averaged channel signal, its mean divided by that maximum, and
    a histogram of the raw channel values over the range ``(0, 0.1)``.

    Called without arguments it behaves as before (train set, 50 bins,
    window of 5).

    :param data_type: planet subset passed to ``get_planets``; also used in
        the output file name.
    :param bins: number of histogram bins (and ``histo_*`` attributes).
    :param window_size: half-width of the moving-average window, which covers
        columns ``[i - window_size, i + window_size)``.
    :return: None.
    """
    attributes = [
        ('planet', 'string'),
        ('channel', 'string'),
        ('sma', 'numeric'),
        ('incl', 'numeric'),
        ('radius', 'numeric'),
        ('radiation', 'numeric'),
        ('max', 'numeric'),
        ('avg', 'numeric'),
    ]
    with open('custom_{}.arff'.format(data_type), 'w') as f:
        print('@relation planets', file=f)
        for name, kind in attributes:
            print('@attribute {} {}'.format(name, kind), file=f)
        for j in range(bins):
            print('@attribute histo_{} numeric'.format(j), file=f)

        print('\n@data', file=f)
        for planet in tqdm(get_planets(data_type)):
            with open(
                    '../database/completely_aggregated/{}.pickle'.format(
                        planet), 'rb') as g:
                data = pickle.load(g)
            matrix = data['matrix']

            for c in range(55):
                # sma, incl and radius are written as unknown ('?').
                features = [
                    data['planet'], c, '?', '?', '?', data['radiation'][c]
                ]
                temp = np.zeros(300)
                for i in range(300):
                    temp[i] = np.mean(matrix[c,
                                             max(0, i - window_size):i +
                                             window_size])
                m = np.max(temp)
                avg = np.mean(temp) / m
                features.append(m)
                features.append(avg)
                h, _ = np.histogram(matrix[c], bins=bins, range=(0, 0.1))
                features += list(h)

                print(','.join([str(x) for x in features]), file=f)
Ejemplo n.º 10
0
def create_fully_aggregated_csv(gauss_aggregation, spot_aggregation,
                                file_name):
    """
    Write a csv with the aggregated time series of all planets (train, then test).

    The csv has one row per planet channel:
    ID,star_temp,star_logg,star_rad,star_mass,star_k_mag,period,sma,incl,m1,...,m300,r
    0001_1,...
    ...
    0001_55,...
    ...

    Values that are not available (e.g. sma, incl and the target radius for
    the test set) are stored as None and therefore end up empty in the csv.
    :param gauss_aggregation: function that aggregates the
    <planet>_<spot>_<gauss> matrices into a single <planet>_<spot> matrix.
    Its signature is f(Iterable[float]) -> float.
    :param spot_aggregation: function that aggregates the <planet>_<spot>
    matrices into a single <planet> matrix.
    Its signature is f(Iterable[float]) -> float.
    :param file_name: the name of the output file (no path, just name, e.g., 'mean_mean.csv').
    :return: None
    """

    out_dir = "../database/csv"
    os.makedirs(out_dir, exist_ok=True)

    id_column = "ID"
    parameter_columns = ("star_temp,star_logg,star_rad,star_mass,star_k_mag,"
                         "period,sma,incl").split(',')
    series_columns = ["m{}".format(i) for i in range(1, 301)]
    radius_column = "r"
    all_columns = ([id_column] + parameter_columns + series_columns +
                   [radius_column])
    table = {name: [] for name in all_columns}

    planets = []
    for planet_type in ["train", "test"]:
        for planet in sorted(get_planets(planet_type)):
            planets.append((planet_type, planet))

    for planet_type, planet in tqdm(planets):
        per_spot = []
        for spot in range(1, 11):
            parsed_inputs = [
                parse_input('../database/noisy_{}/{}_{:0>2}_{:0>2}.txt'.format(
                    planet_type, planet, spot, gauss))
                for gauss in range(1, 11)
            ]
            gauss_stack = np.array([p['matrix'] for p in parsed_inputs])
            per_spot.append(aggregate_matrices(gauss_stack, gauss_aggregation))

        planet_matrix = 1 - aggregate_matrices(
            per_spot, spot_aggregation)  # type: np.ndarray

        # The misc inputs are taken from the last parsed file of the planet.
        last_input = parsed_inputs[-1]
        additional_parameters = dict(
            zip(last_input['misc_inputs_names'], last_input['misc_inputs']))
        if planet_type == "train":
            parsed_output = parse_output(
                '../database/params_{}/{}_01_01.txt'.format(
                    planet_type, planet),
                join_misc_params=True)
            additional_parameters.update(
                zip(parsed_output['misc_outputs_names'],
                    parsed_output['misc_outputs']))
            additional_parameters[radius_column] = parsed_output['radii']

        has_radius = radius_column in additional_parameters
        for row in range(55):
            table[id_column].append("{}_{}".format(planet, row + 1))
            for name in parameter_columns:
                table[name].append(additional_parameters.get(name))
            for name, value in zip(series_columns, planet_matrix[row]):
                table[name].append(value)
            if has_radius:
                table[radius_column].append(
                    additional_parameters[radius_column][row])
            else:
                table[radius_column].append(None)

    df = pd.DataFrame(table, columns=all_columns)
    df.to_csv(os.path.join(out_dir, file_name), index=False)
Ejemplo n.º 11
0
def examples_with_custom_features(subset):
    """Fit a ramp curve to every channel of every planet and pickle the result.

    For each planet all 10 x 10 input files are parsed, ``1 - matrix`` is
    collected into a ``(10, 10, 55, 300)`` array and, per channel, the
    three-parameter ``ramp`` model is fitted twice with
    ``scipy.optimize.curve_fit``:

    * ``agg_curves``: fit to the median over both instance axes;
    * ``raw_curves``: fit to all 100 raw series concatenated.

    The x axis is "folded": it rises from 0 to 149/300 over the first 150
    timesteps and falls back to 0 over the last 150.  When a fit does not
    converge the parameters are stored as zeros.

    :param subset: name used for ``get_planets`` and for the
        ``../database/noisy_<subset>/`` input folder.
    :return: None; pickles are written to ``custom/``.
    """
    n_params = 3  # peak, a, b

    def ramp(x, peak, a, b):
        # 0 before a, linear rise between a and b, constant peak after b.
        return np.piecewise(x, [x < a, (a < x) & (x < b), b < x],
                            [0, lambda x: peak * (x - a) / (b - a), peak])

    # Loop-invariant x axes.  ``xdata * 100`` is list repetition: it repeats
    # the folded axis once for each of the 100 concatenated raw series.
    xdata = [i / 300 for i in range(150)
             ] + [i / 300 for i in range(149, -1, -1)]
    xdata_raw = xdata * 100

    planets = get_planets(subset)
    os.makedirs('custom', exist_ok=True)
    for planet in tqdm(planets):
        planet_matrix = np.zeros((10, 10, 55, 300), dtype=np.float64)
        data = {'planet': planet}
        for spot in range(1, 11):
            for gaus in range(1, 11):
                parsed = parse_input(
                    '../database/noisy_{}/{}_{:0>2}_{:0>2}.txt'.format(
                        subset, planet, spot, gaus))
                data.update(parsed)
                planet_matrix[gaus - 1, spot - 1] = 1 - parsed['matrix']

        # Only the fitted curves are stored, not the raw matrix.
        del data['matrix']

        frequencies = create_ariel_frequencies(equidistant_frequency=True)
        radiations = [
            black_body_radiation(frequencies[i], data['misc_inputs'][0])
            for i in NOISE_ORDER
        ]
        data['radiation'] = np.array(radiations)

        gaus_agg = np.median(planet_matrix, axis=0)
        spot_agg = np.median(gaus_agg, axis=0)
        raw_curves = np.zeros((55, n_params))
        agg_curves = np.zeros((55, n_params))

        for channel in range(55):
            ydata_agg = spot_agg[channel]
            try:
                popt_agg, _ = opt.curve_fit(ramp,
                                            xdata,
                                            ydata_agg,
                                            p0=(0, 0.2, 0.4),
                                            maxfev=2 * 10**3)
            except RuntimeError:
                print('Failed agg for planet {} channel {}'.format(
                    planet, channel))
                # The fallback must match the row width of the curve arrays;
                # a 4-tuple here raised ValueError on assignment.
                popt_agg = np.zeros(n_params)
            agg_curves[channel] = popt_agg

            # All 100 series of this channel, concatenated gaus-major.
            ydata_raw = planet_matrix[:, :, channel, :].reshape(-1)
            try:
                popt_raw, _ = opt.curve_fit(ramp,
                                            xdata_raw,
                                            ydata_raw,
                                            p0=(0, 0.2, 0.4),
                                            maxfev=2 * 10**3)
            except RuntimeError:
                print('Failed raw for planet {} channel {}'.format(
                    planet, channel))
                popt_raw = np.zeros(n_params)
            raw_curves[channel] = popt_raw

        data['raw_curves'] = raw_curves
        data['agg_curves'] = agg_curves

        out_path = '../database/params_train/{}_01_01.txt'.format(planet)
        if os.path.exists(out_path):
            data.update(parse_output(out_path))

        # Context manager guarantees the file is flushed and closed.
        with open('custom/{}.pickle'.format(planet), 'wb') as f:
            pickle.dump(data, f)
Ejemplo n.º 12
0
# Plot styling and data loading for the script.
#
# Alternative font setups (kept for reference, currently disabled):
#rc('font',**{'family':'sans-serif','sans-serif':['Arial']})
# Latex fonts, quick:
#matplotlib.rcParams['mathtext.fontset'] = 'stix'
#matplotlib.rcParams['font.family'] = 'STIXGeneral'
# Latex fonts, slow (but accurate):
# (usetex=True makes matplotlib render all text through LaTeX.)
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
rc('text', usetex=True)
matplotlib.rcParams.update({'font.size': 12})
plt.rc('legend', **{'fontsize': 7})

# Ticks to the outside:
rcParams['axes.linewidth'] = 1.2
rcParams['xtick.direction'] = 'out'
rcParams['ytick.direction'] = 'out'

# Load the planet arrays; all seven are indexed in parallel below.
# NOTE(review): presumably mass, mass error, radius, radius error, stellar
# effective temperature, stellar radius and separation -- confirm against
# utils.get_planets.
M, Merr, R, Rerr, Teff, Rstar, sep = utils.get_planets()

# Erase datapoints with no data:
# (-1 marks a missing mass or radius; keep only entries where both exist.)
idx = np.where((M != -1) & (R != -1))[0]
M = M[idx]
Merr = Merr[idx]
R = R[idx]
Rerr = Rerr[idx]
Teff = Teff[idx]
sep = sep[idx]
Rstar = Rstar[idx]

# Erase datapoints with zero mass:
idx = np.where((M != 0.))[0]
M = M[idx]
Merr = Merr[idx]