Example #1
# Imports used by this snippet; get_missing_dates, simple_model and
# generate_calendar_matrix are project-local helpers whose module is not shown.
import json

import numpy as np
from flask import request


def get_recommendations():
    if request.method == 'POST':
        postdata = request.json
        user = postdata['user']
        availability = postdata['availability']
        if user['extentFrustratedInTraffic'] == 3:
            user['extentFrustratedInTraffic'] = 4
        weights = {'traffic': 1, 'events': 1}
        weights['traffic'] = -1 * (user['extentFrustratedInTraffic'] - 3) / 2

        dates = sorted(list(availability) + get_missing_dates(availability))

        available_times = []
        for date in dates:
            if date in availability:
                available_times.append(availability[date])
            else:
                available_times.append([0] * 24)

        CALENDAR_SIZE = (24, len(dates))
        for day in available_times:
            for idx in range(len(day)):
                if (idx < user['earliestHourWillingToWork']
                        or idx > user['latestHourWillingToWork']):
                    day[idx] = 0

        mask = 1 - np.asarray(available_times)
        mask = np.transpose(mask)
        # return json.dumps(available_times) #FOR DEBUGGING available_times
        # return json.dumps(mask.tolist()) #FOR DEBUGGING mask
        # return json.dumps( [mask.shape] ) #FOR DEBUGGING

        model_output = simple_model(weights,
                                    mask.astype(bool),
                                    user['hoursPerWeek'],
                                    CALENDAR_SIZE,
                                    dates,
                                    verbose=False)
        # return json.dumps((np.transpose(model_output).astype(int)).tolist())  # FOR DEBUGGING: generate_calendar_matrix transposes this matrix to get the true recommendation

        # simple_model --> transpose (24, x) to (x, 24) --> json
        df = generate_calendar_matrix(model_output, dates, CALENDAR_SIZE)
        df_dict = df.to_dict('dict')

        final_dic = {}
        for key, val in df_dict.items():
            final_dic[key] = [1 if v else 0 for v in val.values()]

        return json.dumps(final_dic, sort_keys=True)

    return "Error"
Example #2
# Imports used by this snippet; mo, cm, read_ranges_2 and get_seq are
# project-local helpers whose modules are not shown here.
import copy
import gc
import math
import os
import pickle
import random
from datetime import datetime
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from scipy import stats
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam


def train():
    # Apply smoothing to output bins? Ask Hon how to do it best
    # transformer_layers = 8 Try 1 instead of 8
    model_folder = "model1"
    model_name = "expression_model_1.h5"
    figures_folder = "figures_1"
    input_size = 100000
    half_size = input_size // 2
    max_shift = 20
    bin_size = 1000
    num_regions = int(input_size / bin_size)
    mid_bin = math.floor(num_regions / 2)
    BATCH_SIZE = 1
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    GLOBAL_BATCH_SIZE = BATCH_SIZE * strategy.num_replicas_in_sync
    STEPS_PER_EPOCH = 3000
    out_stack_num = 1000
    num_epochs = 10000

    Path(figures_folder + "/" + "attribution").mkdir(parents=True,
                                                     exist_ok=True)
    Path(figures_folder + "/" + "tracks").mkdir(parents=True, exist_ok=True)

    chromosomes = ["chrX", "chrY"]
    # our_model = mo.simple_model(input_size, num_regions, 3000)
    for i in range(2, 23):
        chromosomes.append("chr" + str(i))

    if Path("pickle/genome.p").is_file():
        genome = pickle.load(open("pickle/genome.p", "rb"))
        ga = pickle.load(open("pickle/ga.p", "rb"))
    else:
        genome, ga = cm.parse_genome("hg19.fa", bin_size)
        pickle.dump(genome,
                    open("pickle/genome.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(ga,
                    open("pickle/ga.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)

    if Path("pickle/input_sequences_long.p").is_file():
        input_sequences_long = pickle.load(
            open("pickle/input_sequences_long.p", "rb"))
        test_input_sequences = pickle.load(
            open("pickle/test_input_sequences.p", "rb"))
        test_output = pickle.load(open("pickle/test_output.p", "rb"))
        test_class = pickle.load(open("pickle/test_class.p", "rb"))
        test_info = pickle.load(open("pickle/test_info.p", "rb"))
        # gas = joblib.load("pickle/gas.gz")
        # kk = list(gas.keys())
        # for key in kk:
        #     gas[key.replace("/", "_")] = gas.pop(key)
        # # for key in gas.keys():
        # #     joblib.dump(gas[key], "parsed_tracks/" + key, compress=3)
        # joblib.dump(kk, "pickle/keys.gz", compress=3)
        # exit()
        # gas_keys = []
        # for filename in os.listdir("parsed_tracks"):
        #     gas_keys.append(filename)
        # gas_keys.remove("IPSC")
        # gas_keys.remove("DMFB")
        # joblib.dump(gas_keys, "pickle/keys.gz", compress=3)
        gas_keys = joblib.load("pickle/keys.gz")
        output_info = pickle.load(open("pickle/output_info.p", "rb"))
        counts = pickle.load(open("pickle/counts.p", "rb"))
        cells = list(sorted(counts.keys()))
    else:
        print("Parsing tracks")
        # gas = joblib.load("pickle/gas.gz")
        gas = {}

        #
        counts = {}
        directory = "count"
        for filename in os.listdir(directory):
            if not filename.endswith(".tsv"):
                continue
            cell = filename.split(".")[0]
            df = pd.read_csv(os.path.join(directory, filename), sep="\t")
            df["count"] = np.log1p(df["count"])  # log(1 + count)
            df["count"] = df["count"] / df["count"].max()
            counts[cell] = dict(zip(df["countRegion_ID"], df["count"]))
        pickle.dump(counts,
                    open("pickle/counts.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
        cells = list(sorted(counts.keys()))
        df = pd.read_csv("DMFB_IPSC.CRE.info.tsv", sep="\t")
        enhancers_ids = df.query("classc == 'distal'")['CREID'].to_list()
        promoters_ids = df.query("classc != 'distal'")['CREID'].to_list()
        coding_ids = df.query("classc == 'coding'")['CREID'].to_list()

        ranges_file = "DMFB_IPSC.CRE.coord.bed"
        promoters, enhancers = read_ranges_2(ranges_file, chromosomes,
                                             promoters_ids, enhancers_ids)
        test_promoters, test_enhancers = read_ranges_2(ranges_file, ["chr1"],
                                                       promoters_ids,
                                                       enhancers_ids)

        for cell in cells:
            gas[cell] = copy.deepcopy(ga)
        # joblib.dump(gas, "pickle/gas.gz", compress=3)

        over = 0
        for chr in chromosomes:
            for p in promoters[chr] + enhancers[chr]:
                for cell in cells:
                    pos = int(p[0] / bin_size)
                    if gas[cell][chr][pos] != 0:
                        over += 1
                    gas[cell][chr][pos] += counts[cell][p[1]]
                    if pos - 1 >= 0:
                        gas[cell][chr][pos - 1] += counts[cell][p[1]]
                    # Bound by the chromosome's bin count (num_regions is the
                    # model window size, not the array length).
                    if pos + 1 < len(gas[cell][chr]):
                        gas[cell][chr][pos + 1] += counts[cell][p[1]]

        for chr in ["chr1"]:
            for p in test_promoters[chr] + test_enhancers[chr]:
                for cell in cells:
                    pos = int(p[0] / bin_size)
                    if gas[cell][chr][pos] != 0:
                        over += 1
                    gas[cell][chr][pos] += counts[cell][p[1]]
                    if pos - 1 >= 0:
                        gas[cell][chr][pos - 1] += counts[cell][p[1]]
                    if pos + 1 < len(gas[cell][chr]):
                        gas[cell][chr][pos + 1] += counts[cell][p[1]]

        for cell in cells:
            joblib.dump(gas[cell], "parsed_tracks/" + cell, compress=3)
        print("Overlap: " + str(over))
        test_input_sequences = []
        test_output = []
        test_class = []
        test_info = []
        for chr, chr_cres in test_promoters.items():
            for i in range(len(chr_cres)):
                tss = chr_cres[i][0]
                seq = get_seq(genome, chr, tss, input_size)
                if len(seq) != input_size:
                    continue
                test_input_sequences.append(seq)
                start = int((tss - half_size) / bin_size)
                scores = []
                for key in cells:
                    scores.append(gas[key][chr][start:start + num_regions])
                test_output.append(scores)
                if chr_cres[i][1] in coding_ids:
                    test_class.append(1)
                else:
                    test_class.append(0)
                test_info.append(chr_cres[i][1])

        test_input_sequences = np.asarray(test_input_sequences)
        test_output = np.asarray(test_output)
        print("Test set completed")
        input_sequences_long = []
        output_info = []
        for chr, chr_cres in promoters.items():
            for i in range(len(chr_cres)):
                tss = chr_cres[i][0]
                seq = get_seq(genome, chr, tss, input_size + max_shift)
                if len(seq) != input_size + max_shift:
                    continue
                input_sequences_long.append(seq)
                output_info.append([chr, tss])

        input_sequences_long = np.asarray(input_sequences_long)
        print("Training set completed")
        pickle.dump(input_sequences_long,
                    open("pickle/input_sequences_long.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(test_input_sequences,
                    open("pickle/test_input_sequences.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(test_output,
                    open("pickle/test_output.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(test_class,
                    open("pickle/test_class.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(test_info,
                    open("pickle/test_info.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(output_info,
                    open("pickle/output_info.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
        del counts
        del gas
        gc.collect()

    # gas_keys = []
    # directory = "tracks"
    # for filename in os.listdir(directory):
    #     if filename.endswith(".gz"):
    #         start = time.time()
    #         fn = os.path.join(directory, filename)
    #         t_name = fn.replace("/", "_")
    #         gas_keys.append(t_name)
    #         gast = copy.deepcopy(ga)
    #         df = pd.read_csv(fn, sep="\t", names=["chr", "start", "end", "m", "score", "strand"], header=None, index_col=False)
    #         chrd = list(df["chr"].unique())
    #         df["mid"] = (df["start"] + (df["end"] - df["start"]) / 2) / bin_size
    #         df = df.astype({"mid": int})
    #
    #         # group the scores over `key` and gather them in a list
    #         grouped_scores = df.groupby("chr").agg(list)
    #
    #         # for each key, value in the dictionary...
    #         for key, val in gast.items():
    #             if key not in chrd:
    #                 continue
    #             # first lookup the positions to update and the corresponding scores
    #             pos, score = grouped_scores.loc[key, ["mid", "score"]]
    #             # fancy indexing
    #             gast[key][pos] += score
    #
    #         max_val = -1
    #         for key in gast.keys():
    #             gast[key] = np.log(gast[key] + 1)
    #             max_val = max(np.max(gast[key]), max_val)
    #         for key in gast.keys():
    #             gast[key] = gast[key] / max_val
    #         joblib.dump(gast, "parsed_tracks/" + t_name, compress=3)
    #         end = time.time()
    #         print("Parsed " + fn + ". Elapsed time: " + str(end - start) + ". Max value: " + str(max_val))
    #
    # # tf tracks #############################################
    # our_model = mo.simple_model(input_size, num_regions, 100)
    # dfa = pd.read_csv("tf_tracks.bed.gz", sep="\t", names=["chr", "start", "end", "m", "score", "strand"],
    #                   header=None,
    #                   index_col=False)
    #
    # # dfa["m"] = dfa["m"].apply(lambda x: x[x.find('.') + 1:]).copy()
    #
    #
    # dfa["mid"] = (dfa["start"] + (dfa["end"] - dfa["start"]) / 2) / bin_size
    # print(dfa["score"].min())
    # dfa['score'] = dfa['score'].replace(0.0, 1.0)
    # print(dfa["score"].min())
    # dfa = dfa.astype({"mid": int})
    # # tf_tracks = list(dfa["m"].unique())
    # # print(len(tf_tracks))
    # # dfa = dfa.groupby('m').filter(lambda x: len(x) <= 30000)
    # tf_tracks = list(dfa["m"].unique())
    # print("After filtering " + str(len(tf_tracks)))
    # for t in tf_tracks:
    #     t_name = "chip_" + t
    #     gast = copy.deepcopy(ga)
    #     if t_name not in gas_keys:
    #         gas_keys.append(t_name)
    #     df = dfa.loc[dfa['m'] == t]
    #     chrd = list(df["chr"].unique())
    #     # group the scores over `key` and gather them in a list
    #     grouped_scores = df.groupby("chr").agg(list)
    #
    #     # for each key, value in the dictionary...
    #     for key, val in gast.items():
    #         if key not in chrd:
    #             continue
    #         # first lookup the positions to update and the corresponding scores
    #         pos, score = grouped_scores.loc[key, ["mid", "score"]]
    #         gast[key][pos] += score
    #
    #     max_val = -1
    #     for key in gast.keys():
    #         gast[key] = np.log(gast[key] + 1)
    #         max_val = max(np.max(gast[key]), max_val)
    #     for key in gast.keys():
    #         gast[key] = gast[key] / max_val
    #
    #     joblib.dump(gast, "parsed_tracks/" + t_name, compress=3)
    #     if not Path(model_folder + "/" + t_name).is_file():
    #         joblib.dump(our_model.get_layer("out_row_0").get_weights(), model_folder + "/" + t_name, compress=3)
    #     print(t_name + " " + str(df.shape[0]) + " " + str(max_val))
    # joblib.dump(gas_keys, "pickle/keys.gz", compress=3)

    ########################################################################
    # hic_keys = []
    # directory = "hic"
    # hic_data = {}
    # for filename in os.listdir(directory):
    #     if filename.endswith(".gz"):
    #         fn = os.path.join(directory, filename)
    #         t_name = fn.replace("/", "_")
    #         hic_keys.append(t_name)
    #         df = pd.read_csv(fn, sep="\t", index_col=False)
    #         df.drop(['relCoverage1', 'relCoverage2', 'relCoverage1',
    #                  'probability', 'expected', 'logObservedOverExpected',
    #                  "locus2_chrom", "locus1_end", "locus2_end"], axis=1, inplace=True)
    #         df.drop(df[df.readCount < 5].index, inplace=True)
    #         df.drop(df[df.qvalue > 0.05].index, inplace=True)
    #         df["score"] = 1.0
    #         # df["score"] = -1 * np.log(df["pvalue"])
    #         # df["score"] = df["score"] / df["score"].max()
    #         df.drop(['readCount', 'qvalue', 'pvalue'], axis=1, inplace=True)
    #         # df.to_csv("parsed_hic/" + t_name,index=False,compression="gzip")
    #         chrd = list(df["locus1_chrom"].unique())
    #         for chr in chrd:
    #             hic_data[t_name + chr] = df.loc[df['locus1_chrom'] == chr].sort_values(by=['locus1_start'])
    #         print(t_name)
    # joblib.dump(hic_data, "pickle/hic_data.gz", compress=3)
    # joblib.dump(hic_keys, "pickle/hic_keys.gz", compress=3)
    hic_data = joblib.load("pickle/hic_data.gz")
    hic_keys = joblib.load("pickle/hic_keys.gz")
    print("Number of tracks: " + str(len(gas_keys)))
    with strategy.scope():
        if Path(model_folder + "/" + model_name).is_file():
            our_model = tf.keras.models.load_model(
                model_folder + "/" + model_name,
                custom_objects={'PatchEncoder': mo.PatchEncoder})
            print('Loaded existing model')
        else:
            our_model = mo.simple_model(input_size, num_regions, out_stack_num)
            Path(model_folder).mkdir(parents=True, exist_ok=True)
            our_model.save(model_folder + "/" + model_name)
            print("Model saved")
            for i, key in enumerate(gas_keys):
                joblib.dump(our_model.get_layer("out_row_0").get_weights(),
                            model_folder + "/" + key,
                            compress=3)
                if i == 0:
                    print(
                        our_model.get_layer("out_row_0").get_weights()
                        [0].shape)
                    print(
                        our_model.get_layer("out_row_0").get_weights()
                        [1].shape)
                if i % 50 == 0:
                    print(i, end=" ")
                    gc.collect()
            print("\nWeights saved")
    # print("0000000000000000000000000000")
    # our_model_new = mo.simple_model(input_size, num_regions, 200)
    # for l in our_model_new.layers:
    #     if "out_row" not in l.name:
    #         try:
    #             l.set_weights(our_model.get_layer(l.name).get_weights())
    #         except Exception as e:
    #             print(l.name)
    # our_model = our_model_new
    # print("0000000000000000000000000000")
    del genome
    del ga
    gc.collect()
    for k in range(num_epochs):
        print("Epoch " + str(k) + datetime.now().strftime(' %H:%M:%S'))
        if k > 0:
            with strategy.scope():
                our_model = tf.keras.models.load_model(
                    model_folder + "/" + model_name,
                    custom_objects={'PatchEncoder': mo.PatchEncoder})
        input_sequences = []
        output_scores = []
        print("Preparing sequences" + datetime.now().strftime(' %H:%M:%S'))
        chosen_tracks = random.sample(
            gas_keys, out_stack_num - len(cells) - len(hic_keys))
        chip_picks = sum(1 for ct in chosen_tracks if ct.startswith("chip_"))
        print("Chip tracks: " + str(chip_picks) +
              datetime.now().strftime(' %H:%M:%S'))
        gas = {}
        for i, key in enumerate(chosen_tracks):
            our_model.get_layer("out_row_" + str(i)).set_weights(
                joblib.load(model_folder + "/" + key))
            gas[key] = joblib.load("parsed_tracks/" + key)
        for i, cell in enumerate(cells):
            # our_model.get_layer("out_row_" + str(-2 + i)).set_weights(joblib.load(model_folder + "/" + cell))
            gas[cell] = joblib.load("parsed_tracks/" + cell)
        print("Loaded the tracks" + datetime.now().strftime(' %H:%M:%S'))
        err = 0
        for i, seq in enumerate(input_sequences_long):
            if i >= GLOBAL_BATCH_SIZE * STEPS_PER_EPOCH:
                break
            if i % 100 == 0:
                print(i, end=" ")
                gc.collect()
            try:
                rand_var = random.randint(0, max_shift)
                # rand_var = 5
                ns = seq[rand_var:rand_var + input_size, :]
                info = output_info[i]
                start = int(
                    (info[1] +
                     (rand_var - max_shift / 2) - half_size) / bin_size)
                scores = []
                for key in chosen_tracks:
                    scores.append(gas[key][info[0]][start:start + num_regions])
                for key in hic_keys:
                    hic_mat = np.zeros((10, 10))
                    # hd = hic_data[key].loc[hic_data[key]['locus1_chrom'] == info[0]]
                    hd = hic_data[key + info[0]]
                    start_hic = int(
                        (info[1] + (rand_var - max_shift / 2) - half_size))
                    end_hic = start_hic + input_size
                    start_hic = start_hic - start_hic % 10000
                    start_row = hd['locus1_start'].searchsorted(start_hic,
                                                                side='left')
                    end_row = hd['locus1_start'].searchsorted(end_hic,
                                                              side='right')
                    hd = hd.iloc[start_row:end_row]
                    l1 = ((hd["locus1_start"].values - start_hic) /
                          10000).astype(int)
                    l2 = ((hd["locus2_start"].values - start_hic) /
                          10000).astype(int)
                    lix = l2 < len(hic_mat)
                    l1 = l1[lix]
                    l2 = l2[lix]
                    hic_mat[l1, l2] += 1  # row["score"]
                    hic_mat = hic_mat + hic_mat.T - np.diag(np.diag(hic_mat))
                    if hic_mat.size != 100:
                        print("Unexpected Hi-C matrix size: " +
                              str(hic_mat.shape))
                    scores.append(hic_mat.flatten().astype(np.float32))
                for cell in cells:
                    scores.append(gas[cell][info[0]][start:start +
                                                     num_regions])
                input_sequences.append(ns)
                output_scores.append(scores)
            except Exception as e:
                print(e)
                err += 1
        print("\nProblems: " + str(err) + datetime.now().strftime(' %H:%M:%S'))
        output_scores = np.asarray(output_scores)
        input_sequences = np.asarray(input_sequences)

        rng_state = np.random.get_state()
        np.random.shuffle(input_sequences)
        np.random.set_state(rng_state)
        np.random.shuffle(output_scores)

        input_sequences = input_sequences[:GLOBAL_BATCH_SIZE * STEPS_PER_EPOCH]
        output_scores = output_scores[:GLOBAL_BATCH_SIZE * STEPS_PER_EPOCH]

        print("Compiling model" + datetime.now().strftime(' %H:%M:%S'))
        # if k < 300:
        #     lr = 0.0001
        # elif k < 600:
        #     lr = 0.00005
        # else:
        #     lr = 0.00002
        lr = 0.0001
        fit_epochs = 1
        with strategy.scope():
            # if k % 9 != 0:
            #     freeze = True
            #     fit_epochs = 4
            # else:
            #     freeze = False
            #     fit_epochs = 2
            for l in our_model.layers:
                # if "out_row" not in l.name and freeze:
                #     l.trainable = False
                # else:
                l.trainable = True
            our_model.compile(loss="mse", optimizer=Adam(learning_rate=lr))

        # if k != 0:
        print("Training" + datetime.now().strftime(' %H:%M:%S'))
        try:
            our_model.fit(input_sequences,
                          output_scores,
                          epochs=fit_epochs,
                          batch_size=GLOBAL_BATCH_SIZE)
            our_model.save(model_folder + "/" + model_name)
            for i, key in enumerate(chosen_tracks):
                joblib.dump(our_model.get_layer("out_row_" +
                                                str(i)).get_weights(),
                            model_folder + "/" + key,
                            compress=3)
        except Exception as e:
            print(e)
            print("Error while training. Loading previous model." +
                  datetime.now().strftime(' %H:%M:%S'))
            with strategy.scope():
                our_model = tf.keras.models.load_model(
                    model_folder + "/" + model_name,
                    custom_objects={'PatchEncoder': mo.PatchEncoder})
            # Cleanup happens at the end of the epoch loop; deleting the
            # arrays here would break the evaluation block and the later dels
            # (and `predictions` may not even exist yet at this point).
            gc.collect()

        if k % 10 == 0:  # and k != 0
            print("Training set")
            predictions = our_model.predict(input_sequences[0:1000],
                                            batch_size=GLOBAL_BATCH_SIZE)

            for c, cell in enumerate(cells):
                ci = -2 + c
                a = []
                b = []
                for i in range(len(predictions)):
                    # if output_scores[i][c][mid_bin] == 0:
                    #     continue
                    a.append(predictions[i][ci][mid_bin])
                    b.append(output_scores[i][ci][mid_bin])
                corr = stats.spearmanr(a, b)[0]
                print("Correlation " + cell + ": " + str(corr))

            pic_count = 0
            for it, ct in enumerate(chosen_tracks):
                if ct.startswith("chip_"):
                    for i in range(len(predictions)):
                        if np.sum(output_scores[i][it]) == 0:
                            continue
                        fig, axs = plt.subplots(2, 1, figsize=(12, 8))
                        vector1 = predictions[i][it]
                        vector2 = output_scores[i][it]
                        x = range(num_regions)
                        d1 = {'bin': x, 'expression': vector1}
                        df1 = pd.DataFrame(d1)
                        d2 = {'bin': x, 'expression': vector2}
                        df2 = pd.DataFrame(d2)
                        sns.lineplot(data=df1,
                                     x='bin',
                                     y='expression',
                                     ax=axs[0])
                        axs[0].set_title("Prediction")
                        sns.lineplot(data=df2,
                                     x='bin',
                                     y='expression',
                                     ax=axs[1])
                        axs[1].set_title("Ground truth")
                        fig.tight_layout()
                        plt.savefig(figures_folder + "/chip/track_" +
                                    str(i + 1) + "_" + str(ct) + ".png")
                        plt.close(fig)
                        pic_count += 1
                        break
                if pic_count > 10:
                    break

            for h in range(len(hic_keys)):
                pic_count = 0
                it = len(chosen_tracks) + h
                for i in range(500, 800, 1):
                    if np.sum(output_scores[i][it]) == 0:
                        continue
                    mat_gt = np.reshape(output_scores[i][it], (10, 10))
                    mat_pred = np.reshape(predictions[i][it], (10, 10))
                    fig, axs = plt.subplots(2, 1, figsize=(8, 8))
                    sns.heatmap(mat_pred, linewidth=0.0, ax=axs[0])
                    axs[0].set_title("Prediction")
                    sns.heatmap(mat_gt, linewidth=0.0, ax=axs[1])
                    axs[1].set_title("Ground truth")
                    plt.tight_layout()
                    plt.savefig(figures_folder + "/hic/track_" + str(i + 1) +
                                "_" + str(hic_keys[h]) + ".png")
                    plt.close(fig)
                    pic_count += 1
                    if pic_count > 4:
                        break

            print("Test set")
            predictions = our_model.predict(test_input_sequences,
                                            batch_size=GLOBAL_BATCH_SIZE)
            for c, cell in enumerate(cells):
                ci = -2 + c
                a = []
                b = []
                ap = []
                bp = []
                for i in range(len(predictions)):
                    # if test_output[i][c][mid_bin] == 0:
                    #     continue
                    a.append(predictions[i][ci][mid_bin])
                    b.append(test_output[i][c][mid_bin])
                    if test_class[i] == 0:
                        continue
                    ap.append(predictions[i][ci][mid_bin])
                    bp.append(test_output[i][c][mid_bin])
                corr = stats.spearmanr(a, b)[0]
                print("Correlation " + cell + ": " + str(corr) + " [" +
                      str(len(a)) + "]")
                corr = stats.spearmanr(ap, bp)[0]
                print("Correlation coding " + cell + ": " + str(corr) + " [" +
                      str(len(ap)) + "]")

            del predictions
            # print("Drawing")
            # for c, cell in enumerate(cells):
            #     ci = -2 + c
            #     for i in range(1200, 1250, 1):
            #         fig, axs = plt.subplots(2, 1, figsize=(12, 8))
            #         vector1 = predictions[i][ci]
            #         vector2 = test_output[i][ci]
            #         x = range(num_regions)
            #         d1 = {'bin': x, 'expression': vector1}
            #         df1 = pd.DataFrame(d1)
            #         d2 = {'bin': x, 'expression': vector2}
            #         df2 = pd.DataFrame(d2)
            #         sns.lineplot(data=df1, x='bin', y='expression', ax=axs[0])
            #         axs[0].set_title("Prediction")
            #         sns.lineplot(data=df2, x='bin', y='expression', ax=axs[1])
            #         axs[1].set_title("Ground truth")
            #         fig.tight_layout()
            #         plt.savefig(figures_folder + "/tracks/track_" + str(i + 1) + "_" + str(cell) + "_" + test_info[i] + ".png")
            #         plt.close(fig)
            #
            # # Marks
            # for m in range(10):
            #     for i in range(1200, 1250, 1):
            #         fig, axs = plt.subplots(2, 1, figsize=(12, 8))
            #         vector1 = predictions[i][m]
            #         vector2 = test_output[i][m]
            #         x = range(num_regions)
            #         d1 = {'bin': x, 'expression': vector1}
            #         df1 = pd.DataFrame(d1)
            #         d2 = {'bin': x, 'expression': vector2}
            #         df2 = pd.DataFrame(d2)
            #         sns.lineplot(data=df1, x='bin', y='expression', ax=axs[0])
            #         axs[0].set_title("Prediction")
            #         sns.lineplot(data=df2, x='bin', y='expression', ax=axs[1])
            #         axs[1].set_title("Ground truth")
            #         fig.tight_layout()
            #         plt.savefig(figures_folder + "/marks/track_" + str(i + 1) + "_" + str(m) + "_" + test_info[i] + ".png")
            #         plt.close(fig)

            # Gene regplot
            # for c, cell in enumerate(cells):
            #     ci = -2 + c
            #     a = []
            #     b = []
            #     for i in range(len(predictions)):
            #         if test_class[i] == 0:
            #             continue
            #         a.append(predictions[i][ci][mid_bin])
            #         b.append(test_output[i][ci][mid_bin])
            #
            #     pickle.dump(a, open(figures_folder + "/" + str(cell) + "_a" + str(k) + ".p", "wb"),
            #                 protocol=pickle.HIGHEST_PROTOCOL)
            #     pickle.dump(b, open(figures_folder + "/" + str(cell) + "_b" + str(k) + ".p", "wb"),
            #                 protocol=pickle.HIGHEST_PROTOCOL)
            #
            #     fig, ax = plt.subplots(figsize=(6, 6))
            #     r, p = stats.spearmanr(a, b)
            #
            #     sns.regplot(x=a, y=b,
            #                 ci=None, label="r = {0:.2f}; p = {1:.2e}".format(r, p)).legend(loc="best")
            #
            #     ax.set(xlabel='Predicted', ylabel='Ground truth')
            #     plt.title("Gene expression prediction")
            #     fig.tight_layout()
            #     plt.savefig(figures_folder + "/corr_" + str(k) + "_" + str(cell) + ".svg")
            #     plt.close(fig)

            # attribution
            # for c, cell in enumerate(cells):
            #     for i in range(1200, 1210, 1):
            #         baseline = tf.zeros(shape=(input_size, 4))
            #         image = test_input_sequences[i].astype('float32')
            #         ig_attributions = attribution.integrated_gradients(our_model, baseline=baseline,
            #                                                            image=image,
            #                                                            target_class_idx=[mid_bin, c],
            #                                                            m_steps=40)
            #
            #         attribution_mask = tf.squeeze(ig_attributions).numpy()
            #         attribution_mask = (attribution_mask - np.min(attribution_mask)) / (
            #                     np.max(attribution_mask) - np.min(attribution_mask))
            #         attribution_mask = np.mean(attribution_mask, axis=-1, keepdims=True)
            #         attribution_mask[int(input_size / 2) - 2000 : int(input_size / 2) + 2000, :] = np.nan
            #         attribution_mask = skimage.measure.block_reduce(attribution_mask, (100, 1), np.mean)
            #         attribution_mask = np.transpose(attribution_mask)
            #
            #         fig, ax = plt.subplots(figsize=(60, 6))
            #         sns.heatmap(attribution_mask, linewidth=0.0, ax=ax)
            #         plt.tight_layout()
            #         plt.savefig(figures_folder + "/attribution/track_" + str(i + 1) + "_" + str(cell) + "_" + test_info[i] + ".jpg")
            #         plt.close(fig)
        print("Cleaning" + datetime.now().strftime(' %H:%M:%S'))
        # Needed to prevent Keras memory leak
        del input_sequences
        del output_scores
        del our_model
        del gas
        gc.collect()
        K.clear_session()
        tf.compat.v1.reset_default_graph()
        print("Epoch " + str(k) + " finished. ")
Example #3
    CALENDAR_SIZE = (24, len(dates))
    for day in available_times:
        for idx in range(len(day)):
            if (idx < user['earliestHourWillingToWork']
                    or idx > user['latestHourWillingToWork']):
                day[idx] = 0

    mask = 1 - np.asarray(available_times)
    mask = np.transpose(mask)
    # return json.dumps(available_times) #FOR DEBUGGING available_times
    # return json.dumps(mask.tolist()) #FOR DEBUGGING mask
    # return json.dumps( [mask.shape] ) #FOR DEBUGGING

    model_output = simple_model(weights,
                                mask.astype(bool),
                                user['hoursPerWeek'],
                                CALENDAR_SIZE,
                                dates,
                                verbose=False)
    # return json.dumps((np.transpose(model_output).astype(int)).tolist())  # FOR DEBUGGING: generate_calendar_matrix transposes this matrix to get the true recommendation

    # simple_model --> transpose (24, x) to (x, 24) --> json
    df = generate_calendar_matrix(model_output, dates, CALENDAR_SIZE)
    df_dict = df.to_dict('dict')

    final_dic = {}
    one_user = []
    for key, val in df_dict.items():
        final_dic[key] = [1 if v else 0 for v in val.values()]
Example #4
# Imports used by this snippet; split_data, simple_model and w_simple_model
# are project-local helpers. The Keras imports follow the legacy 0.x/1.x API
# this code targets.
import gc
import sys

import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator


def train_hard(config):
    print('-' * 50)
    print('Load data')

    X = np.load(config.x_train).astype(np.float32, copy=False)
    X /= 255.0

    y = np.load(config.y_train)

    if config.weighted:
        weights = []
        with open(config.meta_train, 'r') as fin:
            fin.readline()  # skip the header line
            for line in fin:
                w = float(line.rstrip('\n').split(',')[2])
                weights.append(w)
        weights = np.array(weights).reshape(-1, 1)
        y = np.hstack([y, weights])
    print('Done.')
    print('Shuffle and split')

    # Re-seeding before each shuffle permutes X and y in the same order.
    np.random.seed(config.seed)
    np.random.shuffle(X)
    np.random.seed(config.seed)
    np.random.shuffle(y)

    X_train, y_train, X_test, y_test = split_data(X, y, split_ratio=0.2)

    gc.collect()
    print('Load model')
    # model = w_simple_model()
    model = simple_model()

    nb_iter = 200
    epochs_per_iter = 1
    batch_size = 32

    min_val = sys.float_info.max

    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=45,  # randomly rotate images in the range (degrees, 0 to 180)
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=True,  # randomly flip images
        vertical_flip=True)  # randomly flip images

    print('-'*50)
    print('Training...')
    print('-'*50)

    datagen.fit(X_train)

    checkpointer_best = ModelCheckpoint(filepath=config.weights_prefix + 'weights_best.hdf5', verbose=1, save_best_only=True)
    checkpointer = ModelCheckpoint(filepath=config.weights_prefix + 'weights.hdf5', verbose=1, save_best_only=False)

    def weight_wrapper(data_flow):
        for X_batch, y_batch in data_flow:
            yield X_batch, y_batch[:, 0], y_batch[:, 1]

    if config.weighted:
        dataFlow = weight_wrapper(datagen.flow(X_train, 
                                               y_train[:, [config.col, -1]],
                                               batch_size=batch_size))
    else:
        dataFlow = datagen.flow(X_train, y_train[:, config.col],
                                batch_size=batch_size)


    # Legacy Keras 0.x generator API (samples_per_epoch, nb_epoch,
    # show_accuracy, nb_worker); see the modern equivalent sketched below.
    hist = model.fit_generator(dataFlow,
                               samples_per_epoch=X_train.shape[0],
                               nb_epoch=nb_iter, show_accuracy=False,
                               validation_data=(X_test, y_test[:, config.col]),
                               callbacks=[checkpointer, checkpointer_best],
                               nb_worker=config.nb)

    with open(config.weights_prefix + 'val_loss.txt', mode='w+') as f:
        f.write(str(min(hist.history['val_loss'])))

    print('Make train predict')
    pred = model.predict(X, batch_size=batch_size, verbose=1)
    np.save(config.pred_prefix + 'y-train.npy', pred)

    X = np.load(config.x_test).astype(np.float32, copy=False)
    X /= 255.0

    print('Make test predict')
    pred = model.predict(X, batch_size=batch_size, verbose=1)
    np.save(config.pred_prefix + 'y-test.npy', pred)
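
The fit_generator call above targets the legacy Keras API. As a reference point, here is a hedged sketch of the same training step against the modern tf.keras API (variable names carried over from the snippet; only the non-weighted path is shown):

# Sketch only: in modern tf.keras, fit() accepts generators directly;
# samples_per_epoch becomes steps_per_epoch and nb_epoch becomes epochs.
hist = model.fit(
    datagen.flow(X_train, y_train[:, config.col], batch_size=batch_size),
    steps_per_epoch=X_train.shape[0] // batch_size,
    epochs=nb_iter,
    validation_data=(X_test, y_test[:, config.col]),
    callbacks=[checkpointer, checkpointer_best],
)
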
Example #5
                                   vertical_flip=True,
                                   brightness_range=(0.3, 1.)).flow(
                                       X_train,
                                       y=y_train,
                                       batch_size=batch_size))
            #print(X_train.shape)
            #print(y_train.shape)
            yield X_train, y_train


import gc
import psutil

from model import find_patches_from_slide, predict_from_model, simple_model

model = simple_model(pretrained_weights='s_1.h5')


# # Data Path Load
def read_data_path():
    image_paths = []
    with open('train.txt', 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            image_paths.append(line)
    #print('image_path # : ',len(image_paths))

    tumor_mask_paths = []

    with open('train_mask.txt', 'r') as f:
        for line in f:
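
The truncated fragment at the top of this example wraps an ImageDataGenerator flow inside a custom generator. A minimal sketch of that pattern; the augmentation arguments are taken from the fragment, the rest is an assumption:

from tensorflow.keras.preprocessing.image import ImageDataGenerator

def augmented_batches(X_train, y_train, batch_size=32):
    # Endlessly yield augmented (X, y) batches from the flipped/brightened flow.
    flow = ImageDataGenerator(vertical_flip=True,
                              brightness_range=(0.3, 1.0)).flow(
                                  X_train, y=y_train, batch_size=batch_size)
    while True:
        X_batch, y_batch = next(flow)
        yield X_batch, y_batch
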
Example #6
        # Factor to C and B matrices.
        C, B = factor_S_sharp_to_C_and_B(S_sharp, n_basis)

        # Build linear model.
        scene = model.BasisShapeModel(Rs,
                                      Bs=B.reshape(n_basis, 3, B.shape[1]),
                                      C=C,
                                      Ts=Ts)

    return scene


if __name__ == '__main__':

    # Set the seed.
    np.random.seed(0)

    # Generate some synthetic data.
    n_frames = 200
    gt_model = model.simple_model(n_frames)
    W = gt_model.W

    # Use the Dai algorithm.
    inf_model = factor(W, use_method_1=0)

    # Register to ground truth
    inf_model.register(gt_model)

    model.compare(inf_model, gt_model, visualize=False)
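
factor_S_sharp_to_C_and_B is project-local, but the step it performs in this family of non-rigid structure-from-motion code is a low-rank factorization of the recovered shape matrix. A generic NumPy illustration of a rank-3K truncated SVD split into coefficients and basis (one common convention for distributing the singular values; not the project's actual implementation):

import numpy as np

def factor_low_rank(S_sharp, n_basis):
    # Truncate to rank 3 * n_basis and split so that S_sharp ~= C @ B.
    U, s, Vt = np.linalg.svd(S_sharp, full_matrices=False)
    k = 3 * n_basis
    root = np.sqrt(s[:k])
    C = U[:, :k] * root         # per-frame coefficients
    B = root[:, None] * Vt[:k]  # shape basis rows
    return C, B
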
Example #7
import skimage.transform as trans
from datetime import datetime

import math
from PIL import Image
from xml.etree.ElementTree import ElementTree, Element, SubElement
from io import BytesIO
import skimage.io as io

from sklearn import metrics
from model import find_patches_from_slide, predict_from_model, InceptionV3, simple_model

print('****************************INFERENCE FILE*******************************')
# model = simple_model(pretrained_weights='/data/model/u_1.h5')
# model = InceptionV3(pretrained_weights='/data/model/i_1.h5')
model = simple_model(pretrained_weights='/data/model/i_1.h5')


PATCH_SIZE = 256
NUM_CLASSES = 2 # not_tumor, tumor

file_handles = []

from PIL import ImageEnhance as ie
import gc

def gen_imgs_test(slide_path, truth_path, samples, batch_size,
                  patch_size=PATCH_SIZE, num_epoch=1, shuffle=True):
    """This function returns a generator that 
    yields tuples of (
        X: tensor, float - [batch_size, patch_size, patch_size, 3]
        y: tensor, int32 - [batch_size, patch_size, patch_size, NUM_CLASSES]
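
The docstring above is cut off by the snippet, but it describes a generator yielding (X, y) batch tensors of fixed shape. A minimal hedged sketch of a generator with those shapes, using random data in place of the project's slide-reading helpers:

import numpy as np

def gen_random_patches(batch_size, patch_size=PATCH_SIZE,
                       num_classes=NUM_CLASSES):
    # Yields (X, y) batches shaped as the docstring describes.
    while True:
        X = np.random.rand(batch_size, patch_size, patch_size,
                           3).astype(np.float32)
        labels = np.random.randint(num_classes,
                                   size=(batch_size, patch_size, patch_size))
        y = np.eye(num_classes, dtype=np.int32)[labels]  # one-hot last axis
        yield X, y
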
Example #8
    else:
        
        # Recover S_sharp
        S_sharp = S_sharp_from_Rs(W_cent, Rs)

        # Factor to C and B matrices.
        C, B = factor_S_sharp_to_C_and_B(S_sharp, n_basis)

        # Build linear model.
        scene = model.BasisShapeModel(Rs, Bs=B.reshape(n_basis, 3, B.shape[1]), C=C, Ts=Ts)

    return scene

if __name__ == '__main__':
    
    # Set the seed.
    np.random.seed(0)
    
    # Generate some synthetic data.
    n_frames = 200
    gt_model = model.simple_model(n_frames)
    W = gt_model.W

    # Use the Dai algorithm.
    inf_model = factor(W, use_method_1=0)
    
    # Register to ground truth
    inf_model.register(gt_model)
    
    model.compare(inf_model, gt_model, visualize=False)
Example #9
print "train shape X", X_train.shape
print "train shape y", y_train.shape

# np.save("X_train.data", X_train)
# np.save("y_train.data", y_train)
# np.save("X_test.data", X_test)
# np.save("y_test.data", y_test)

# X_train = np.load("X_train.data.npy")
# y_train = np.load("y_train.data.npy")
# X_test = np.load("X_test.data.npy")
# y_test = np.load("y_test.data.npy")

label_binarizer = LabelBinarizer()
input_shape = (size, size, 1)
model = simple_model(input_shape)

sgd = optimizers.SGD(lr=0.01, decay=1e-5, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd,
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train,
                    y_train,
                    batch_size=batch_size,
                    epochs=10,
                    validation_split=0.2)

y_one_hot_test = label_binarizer.fit_transform(y_test)
# Note: y_one_hot_test is never used below; evaluating against raw y_test
# only works if the labels were already one-hot encoded upstream.
metrics = model.evaluate(X_test, y_test)
for i in range(len(model.metrics_names)):
    metric_name = model.metrics_names[i]
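
A caveat on the LabelBinarizer usage above: calling fit_transform on the test labels learns the class mapping from test data, and the result is never fed to evaluate. If the labels arrive as class ids, the usual pattern fits on the training labels and reuses the mapping; a short sketch:

from sklearn.preprocessing import LabelBinarizer

label_binarizer = LabelBinarizer()
y_train_oh = label_binarizer.fit_transform(y_train)  # learn classes on train
y_test_oh = label_binarizer.transform(y_test)        # reuse the same mapping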