def window_fst_sup(Windows, ref_labels, labels1, Chr=1, ncomp=4,
                   range_sample=None, rand_sample=0):
    """Compute pairwise Fst between labelled groups across genomic windows.

    Parameters
    ----------
    Windows : dict
        {chromosome: {window_id: genotype matrix (samples x sites)}}.
    ref_labels : iterable
        Label values defining the groups compared within each window.
    labels1 : list
        Per-sample class labels; positions index rows of each window matrix.
    Chr : hashable
        Chromosome key into ``Windows``.
    ncomp : int
        Number of PCA components (PCA is fitted for parity with the sibling
        pipeline; its projection is not used downstream here).
    range_sample : list or None
        ``[start, stop]`` slice over the sorted window keys; takes
        precedence over ``rand_sample`` when both are given.
    rand_sample : int
        If non-zero, draw this many windows at random (with replacement,
        so duplicate keys collapse in the dict).

    Returns
    -------
    list of list of float
        One list of pairwise Fst values per analysed window.

    Raises
    ------
    ValueError
        If neither ``range_sample`` nor ``rand_sample`` is provided.
    """
    kde_class_labels = labels1
    # map each class label -> row indices of the samples carrying it
    kde_label_dict = {
        z: [x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z]
        for z in set(kde_class_labels)
    }

    Freq_extract = None

    if rand_sample:
        chosen = np.random.choice(list(Windows[Chr].keys()),
                                  rand_sample, replace=True)
        Freq_extract = {Chr: {bl: Windows[Chr][bl] for bl in chosen}}

    if range_sample:
        # deterministic slice of the sorted window keys; overrides rand_sample
        keys = sorted(Windows[Chr].keys())[range_sample[0]:range_sample[1]]
        Freq_extract = {Chr: {bl: Windows[Chr][bl] for bl in keys}}

    if Freq_extract is None:
        # Originally this fell through to a NameError on Freq_extract;
        # fail early with a clear message instead.
        raise ValueError('provide either range_sample or rand_sample')

    sim_fst = []

    for c in Freq_extract[Chr].keys():
        Sequences = Windows[Chr][c]

        # Too few sites for a meaningful frequency estimate; skip the window.
        # (The original indexed an undefined `Results` dict here and crashed.)
        if Sequences.shape[1] <= 3:
            continue

        Sequences = np.nan_to_num(Sequences)

        # Fitted for parity with the sibling window_analysis pipeline; the
        # projection itself is unused in this function.
        PCA(n_components=ncomp, whiten=False,
            svd_solver='randomized').fit(Sequences)

        these_freqs = []
        for hill in ref_labels:
            cl_seqs = Sequences[kde_label_dict[hill], :]
            # allele frequency = allele count / 2N (diploid samples)
            freq_vector = [float(x) / (cl_seqs.shape[0] * 2)
                           for x in np.sum(cl_seqs, axis=0)]
            these_freqs.append(freq_vector)

        Pairwise = return_fsts2(np.array(these_freqs))
        sim_fst.append(list(Pairwise.fst))

    return sim_fst
# Example #2
def window_analysis(Windows,
                    ref_labels,
                    labels1,
                    Chr=1,
                    ncomp=4,
                    amova=True,
                    supervised=True,
                    include_who=[],
                    range_sample=[130, 600],
                    rand_sample=0,
                    clsize=15,
                    cl_freqs=5,
                    Bandwidth_split=20,
                    quantile=0.1,
                    centre_d=True,
                    PC_sel=0):
    """Per-window PCA / KDE / clustering / AMOVA analysis over genomic windows.

    For each selected window: run PCA on the genotype matrix, estimate the
    density of one principal component, cluster samples with MeanShift (or
    use the supplied labels when ``supervised``), extract per-group allele
    frequencies and pairwise Fst, score per-cluster KDE membership, and
    optionally compute an AMOVA statistic.

    Parameters
    ----------
    Windows : dict
        {chromosome: {window_id: genotype matrix (samples x sites)}}.
    ref_labels : iterable
        Group labels used in the supervised branch.
    labels1 : list
        Per-sample class labels aligned with the rows of each window matrix.
    Chr : hashable
        Chromosome key into ``Windows``.
    ncomp : int
        Number of PCA components.
    amova : bool
        Compute AMOVA per window.
    supervised : bool
        Use ``labels1``/``ref_labels`` groups instead of MeanShift clusters
        for frequency extraction and AMOVA.
    include_who : list
        Optional subset of class labels to restrict the analysis to.
    range_sample : list
        ``[start, stop]`` slice over sorted window keys; overrides
        ``rand_sample`` when both are truthy.
    rand_sample : int
        If non-zero, randomly sample this many windows (with replacement).
    clsize : int
        Minimum MeanShift cluster size kept.
    cl_freqs : int
        Minimum group size for allele-frequency extraction.
    Bandwidth_split : int
        Grid size for the KDE bandwidth search.
    quantile : float
        Quantile passed to ``estimate_bandwidth``.
    centre_d : bool
        Mean-centre the selected PC before density estimation.
    PC_sel : int
        Index of the principal component used for the density track.

    Returns
    -------
    tuple
        ``(Frequencies, sim_fst, Results, Construct, pc_density, pc_coords,
        fig)`` where ``fig`` is a plotly Figure, or ``[]`` when fewer than
        four Fst values were collected.

    Raises
    ------
    ValueError
        If neither ``range_sample`` nor ``rand_sample`` is provided.
    """
    # hoisted out of the per-window loop (was re-imported every iteration)
    from sklearn.preprocessing import scale

    kde_class_labels = labels1
    # class label -> row indices of the samples carrying it
    kde_label_dict = {
        z:
        [x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z]
        for z in list(set(kde_class_labels))
    }

    if include_who:
        # restrict the analysis to the requested labels; all indices below
        # become relative to the order of `include`
        include = [
            x for x in range(len(kde_class_labels))
            if kde_class_labels[x] in include_who
        ]
        ref_labels = include_who
        kde_class_labels = [kde_class_labels[x] for x in include]

        kde_label_dict = {
            z: [
                x for x in range(len(kde_class_labels))
                if kde_class_labels[x] == z
            ]
            for z in include_who
        }

    Freq_extract = None

    if rand_sample:
        # random draw of window keys (with replacement; duplicates collapse)
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in np.random.choice(
                    list(Windows[Chr].keys()), rand_sample, replace=True)
            }
        }

    if range_sample:
        # deterministic slice of the sorted window keys; overrides rand_sample
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in sorted(Windows[Chr].keys())
                [range_sample[0]:range_sample[1]]
            }
        }

    if Freq_extract is None:
        # previously fell through to a NameError on Freq_extract
        raise ValueError('provide either range_sample or rand_sample')

    Results = {'header': ['Chr', 'window'], 'info': [], 'coords': []}

    Frequencies = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}

    Construct = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}

    PC_var = {'header': ['Chr', 'window'], 'coords': [], 'info': []}

    pc_density = []
    pc_coords = []

    sim_fst = []

    for c in Freq_extract[Chr].keys():
        Sequences = Windows[Chr][c]

        # Too few sites for a meaningful analysis; skip the window.
        # (The original indexed Results[Chr], a missing key, and crashed.)
        if Sequences.shape[1] <= 3:
            continue

        Sequences = np.nan_to_num(Sequences)

        pca = PCA(n_components=ncomp, whiten=False,
                  svd_solver='randomized').fit(Sequences)
        data = pca.transform(Sequences)

        if include_who:
            data = data[include, :]

        ##### PC density track for the selected component
        pc_places = data[:, PC_sel]

        if centre_d:
            # mean-centre only; keep the original spread
            pc_places = scale(pc_places, with_std=False)

        X_plot = np.linspace(-8, 8, 100)

        Focus_labels = list(range(data.shape[0]))

        bandwidth_pc = estimate_bandwidth(pc_places.reshape(-1, 1),
                                          quantile=quantile,
                                          n_samples=len(pc_places))
        if bandwidth_pc <= 1e-3:
            bandwidth_pc = 0.01  # floor keeps the KDE numerically stable

        bandwidth = estimate_bandwidth(data,
                                       quantile=quantile,
                                       n_samples=len(Focus_labels))
        if bandwidth <= 1e-3:
            bandwidth = 0.01

        kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth_pc).fit(
            np.array(pc_places).reshape(-1, 1))

        log_dens = kde.score_samples(X_plot.reshape(-1, 1))

        pc_density.append(np.exp(log_dens))
        pc_coords.append(pc_places)

        PC_var['coords'].append([Chr, c])
        PC_var['info'].append(list(pca.explained_variance_))

        # bandwidth grid-search estimator, re-fitted per cluster below
        params = {
            'bandwidth': np.linspace(np.min(data), np.max(data),
                                     Bandwidth_split)
        }
        grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                          breadth_first=False),
                            params,
                            verbose=0)

        #### Mean Shift clustering of the PCA projection
        ms = MeanShift(bandwidth=bandwidth,
                       cluster_all=False,
                       min_bin_freq=clsize)
        ms.fit(data[Focus_labels, :])
        labels = ms.labels_

        # cluster id -> member row indices, dropping MeanShift noise (-1)
        Tree = {
            x: [Focus_labels[y] for y in range(len(labels)) if labels[y] == x]
            for x in [g for g in list(set(labels)) if g != -1]
        }
        Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize]

        Tree = {x: Tree[x] for x in Keep}
        Ngps = len(Tree)
        SpaceX = {x: data[Tree[x], :] for x in Tree.keys()}

        these_freqs = []
        ### per-cluster KDE membership scores
        ### (plus allele frequencies when unsupervised)
        for hill in SpaceX.keys():

            if len(Tree[hill]) >= cl_freqs:
                if supervised == False:
                    cl_seqs = Sequences[Tree[hill], :]

                    # allele frequency = allele count / 2N (diploid)
                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

            grid.fit(data[Tree[hill], :])

            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_

            P_dist = kde.score_samples(data[Tree[hill], :])
            Dist = kde.score_samples(data)
            P_dist = np.nan_to_num(P_dist)
            Dist = np.nan_to_num(Dist)

            if np.std(P_dist) == 0:
                # degenerate cluster density: binary membership fallback
                Dist = np.array(
                    [int(Dist[x] in P_dist) for x in range(len(Dist))])
            else:
                # normalise sample scores against the cluster's own scores
                Dist = scipy.stats.norm(np.mean(P_dist),
                                        np.std(P_dist)).cdf(Dist)
            Dist = np.nan_to_num(Dist)

            Construct['coords'].append([Chr, c, hill])
            Construct['info'].append(Dist)

        #########################################
        ############# AMOVA inputs ##############
        #########################################

        if supervised:
            labels = [x for x in kde_class_labels if x in ref_labels]
            Who = [
                z for z in it.chain(*[kde_label_dict[x] for x in ref_labels])
            ]
            # NOTE(review): `labels` keeps sample order while `Who` is grouped
            # by label; the two align only when samples are sorted by label.
            # Preserved as-is — verify against AMOVA_FM42's expectations.
            Ngps = len(ref_labels)

            for hill in ref_labels:

                if len(kde_label_dict[hill]) >= cl_freqs:
                    # was unbound when include_who was empty (NameError);
                    # without a subset, group indices refer to all samples
                    if include_who:
                        Seq_specific = Sequences[include, :]
                    else:
                        Seq_specific = Sequences

                    cl_seqs = Seq_specific[kde_label_dict[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

        else:
            Who = [
                x for x in range(len(labels))
                if labels[x] != -1 and labels[x] in Keep
            ]
            labels = [labels[x] for x in Who]
            Who = [Focus_labels[x] for x in Who]

        if len(these_freqs) > 1:
            Pairwise = return_fsts2(np.array(these_freqs))
            sim_fst.extend(Pairwise.fst)

        if len(list(set(labels))) == 1:
            # a single group: AMOVA undefined, record a null row
            Results['info'].append([Chr, c, 0, 1])
            continue

        if amova:
            clear_output()
            AMOVA, Cig = AMOVA_FM42(data[Who, :],
                                    labels,
                                    n_boot=0,
                                    metric='euclidean')
            print('counting: {}, Ngps: {}'.format(AMOVA, Ngps))
            Results['info'].append([Chr, c, AMOVA, Ngps])

    Results['info'] = pd.DataFrame(
        np.array(Results['info']),
        columns=['chrom', 'window', 'AMOVA', 'Ngps'])

    if len(sim_fst) > 3:
        # KDE of the pooled pairwise Fst values, rendered as a plotly figure
        X_plot = np.linspace(0, .3, 100)

        freq_kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(
            np.array(sim_fst).reshape(-1, 1))

        log_dens = freq_kde.score_samples(X_plot.reshape(-1, 1))

        fig_roost_dens = [
            go.Scatter(x=X_plot,
                       y=np.exp(log_dens),
                       mode='lines',
                       fill='tozeroy',
                       name='',
                       line=dict(color='blue', width=2))
        ]

        layout = go.Layout(
            title='allele frequency distribution across clusters',
            yaxis=dict(title='density'),
            xaxis=dict(title='fst'))

        fig = go.Figure(data=fig_roost_dens, layout=layout)

    else:
        fig = []

    return Frequencies, sim_fst, Results, Construct, pc_density, pc_coords, fig