Beispiel #1
0
def full_train(model, haplotypes, group_keys, iteration):

    """ Fit and score `model` on the complete data set, keeping the best run.

        The same data is used for training and for testing (no hold-out).
        The fit is repeated `iteration` times and the run with the highest
        harmonic mean of the 'MIN' and 'MEAN' rows of the F column of the
        score table is kept.

        model: selector passed to fit_and_predict(); must provide `.code`
            and `.get_loglines()`
        haplotypes: training (and test) data handed to fit_and_predict()
        group_keys: labels matching `haplotypes`
        iteration: number of fit/predict repetitions

        NOTE(review): the previous docstring promised
        "return (matrix_profiles, snp_list)", but the visible body has no
        return statement.  The body also reads `k`, `results`, `snps`,
        `simid` and `k_fold`, none of which are parameters or locals --
        presumably module globals or leftovers from validator_worker();
        confirm before calling this function.
    """

    # train and test on the very same samples
    X_train = X_test = haplotypes
    y_train = y_test = group_keys
    # (f_score, scores, orig_scores, snplist) of the best run seen so far
    best_score = (-1, None, None, None)

    for i in range(iteration):
        # the iteration here is used for stochastic models where each iteration can yield
        # different result
        # NOTE(review): `k` is not defined in this function, and only three
        # values are unpacked here -- TODO confirm against fit_and_predict()
        lk_predictions, snplist, orig_predictions = fit_and_predict(model, X_train, y_train, X_test, k)
        scores = lkprof.calculate_scores(y_test,  lk_predictions, len(snplist), model='lk'
                    , selector=model.code, iter = i)
        # a second score table is produced only when a second set of predictions is returned
        if orig_predictions is not None:
            orig_scores = lkprof.calculate_scores(y_test, orig_predictions
                    , len(snplist), model=model.code, selector=model.code, iter = i)
        else:
            orig_scores = None

        # harmonic mean of the 'MIN' and 'MEAN' F values ranks the runs
        f_min = scores.loc[ scores['REG'] == 'MIN', 'F'].values[0]
        f_mean = scores.loc[ scores['REG'] == 'MEAN', 'F'].values[0]
        # NOTE(review): undefined when f_min + f_mean == 0 -- confirm how a
        # zero score should be ranked
        f_score = 2 * f_min * f_mean / (f_min + f_mean)
        if f_score > best_score[0]:
            best_score = (f_score, scores, orig_scores, snplist.tolist())

    # NOTE(review): `results`, `snps`, `simid` and `k_fold` are not defined in
    # this function; best_score[1] / best_score[3] are still None here when no
    # run scored above -1 (e.g. iteration <= 0)
    results.append( best_score[1] )
    if best_score[2] is not None:
        results.append( best_score[2] )
    # key layout: simid/k_fold/k/number-of-selected-SNPs
    snps['%d/%d/%d/%d' % (simid, k_fold, k, len(best_score[3]))] = best_score[3]

    # reformat model log
    log = [ '[I - {%d} %s]' % (simid, line) for line in model.get_loglines()]
Beispiel #2
0
    def select_2(self, haplotypes1, haplotypes2):
        """ return (snplist, F) from the best of three decision-tree fits:
            snplist - list of the feature (SNP) indices used by the fitted tree
            F - 'MIN' F score obtained with that SNP set on the training data

            (None, -1) is returned when no fit scored above -1 """

        # stack both populations and label them 1 and 2 respectively
        samples = np.append(haplotypes1, haplotypes2, axis=0)
        labels = np.array([1] * len(haplotypes1) + [2] * len(haplotypes2))

        top_f, top_features = -1, None
        for _ in range(3):

            tree = DecisionTreeClassifier(
                class_weight='balanced',
                random_state=self.randomstate,
                min_samples_leaf=2,
            ).fit(samples, labels)

            # keep each split feature once; negative entries are not positions
            used = tree.tree_.feature
            used = np.unique(used[used >= 0])

            # score a fixed-SNP model restricted to the tree's features,
            # predicting back on the training samples
            lk_predictions, _snps, _orig, _params = fit_and_predict(
                FixSNPSelector(used), samples, labels, samples, len(used))
            table = lkprof.calculate_scores(labels, lk_predictions)

            min_f = table.loc[table['REG'] == 'MIN', 'F'].values[0]
            if min_f > top_f:
                top_f, top_features = min_f, used.tolist()

        return top_features, top_f
Beispiel #3
0
def _best_of_iterations(model, X_train, y_train, X_test, y_test, k, iteration,
                        simid, k_fold):
    """ fit/predict `iteration` times for a single k and keep the best run

        The best run is the one with the highest 'MIN' F score of the 'lk'
        predictions.

        returns (f_score, scores, orig_scores, snplist), or
        (-1, None, None, None) when no run scored above -1
        (e.g. iteration <= 0)
    """

    best_score = (-1, None, None, None)
    for _ in range(iteration):
        # repeated because stochastic models can yield a different result
        # on every run
        lk_predictions, snplist, orig_predictions, params = fit_and_predict(
            model, X_train, y_train, X_test, k)

        scores = lkprof.calculate_scores(
            y_test, lk_predictions,
            k=len(snplist), _k=k, EST='lk', SELECTOR=model.code,
            SIMID=simid, FOLD=k_fold, **params)

        # a second score table is produced only when the model returns its
        # own predictions as well
        orig_scores = None
        if orig_predictions is not None:
            orig_scores = lkprof.calculate_scores(
                y_test, orig_predictions,
                k=len(snplist), _k=k, EST=model.code, SELECTOR=model.code,
                SIMID=simid, FOLD=k_fold, **params)

        f_score = scores.loc[scores['REG'] == 'MIN', 'F'].values[0]
        if f_score > best_score[0]:
            best_score = (f_score, scores, orig_scores, snplist.tolist())

    return best_score


def validator_worker( args ):

    """ validator: returns (r, scores, snplist, log)
        where:
            r: repeat identifier (simid)
            scores: Panda dataframe containing all scores (empty when no
                run produced a score)
            snplist: a dictionary of 'simid/fold/k/n_snps': snplist
            log: list of log message

        args is the tuple (model, y, k_list, fold, iteration, simid).
        With fold <= 0 the model is trained and tested on the whole data
        set (reported as fold -1); otherwise a shuffled stratified
        `fold`-fold cross-validation is run, folds numbered from 0.
    """

    model, y, k_list, fold, iteration, simid = args
    pid = os.getpid()

    cerr('[I - pid %d: validator_worker() started]' % pid)

    np.random.seed( simid % pid )
    model.reseed( simid )

    # var_dict is defined outside this function; a shape of None means
    # var_dict['X'] is used as-is, otherwise it is a raw int8 buffer that
    # has to be viewed with the stored shape
    if var_dict['X_shape'] is None:
        X = var_dict['X']
    else:
        cerr('[I - pid %d: validator_worker() is mapping numpy array]' % pid)
        X = np.frombuffer(var_dict['X'], dtype=np.int8).reshape(var_dict['X_shape'])

    if fold <= 0:
        # no cross-validation: a single pseudo-fold numbered -1 that trains
        # and tests on the same data
        splits = [(-1, X, y, X, y)]
    else:
        # check for sample size suitability for k-folding
        X, y = prepare_stratified_samples( X, y, fold )

        skf = StratifiedKFold(n_splits = fold, shuffle=True,
                              random_state = np.random.randint(1e8))
        # lazy on purpose: every fold is sliced right before it is used
        splits = (
            (k_fold, X[train_index], y[train_index], X[test_index], y[test_index])
            for k_fold, (train_index, test_index) in enumerate(skf.split(X, y))
        )

    results = []
    snps = {}

    for k_fold, X_train, y_train, X_test, y_test in splits:
        for k in k_list:

            # best score will be based on highest min F score
            _, scores, orig_scores, snplist = _best_of_iterations(
                model, X_train, y_train, X_test, y_test, k, iteration,
                simid, k_fold)

            if scores is None:
                # nothing to report for this k (no run scored above -1)
                continue

            results.append( scores )
            if orig_scores is not None:
                results.append( orig_scores )
            snps['%d/%d/%d/%d' % (simid, k_fold, k, len(snplist))] = snplist

    # reformat model log
    log = [ '[I - {%d} %s]' % (simid, line) for line in model.get_loglines()]

    # pd.concat() raises on an empty list, so hand back an empty frame instead
    if not results:
        return (simid, pd.DataFrame(), snps, log)
    return (simid, pd.concat( results ), snps, log)
Beispiel #4
0
    def score(self, genotype_train, group_train, genotype_test, group_test,
              simid, k_fold):
        """ score the model for every k in self.k_list

            For each k the fit is repeated self.iteration times and only the
            run with the highest 'MIN' F score is reported.

            returns (scores, snps, log):
                scores - dataframe of the best score tables (empty dataframe
                         when nothing could be scored)
                snps - dict of 'model_id/simid/fold/k/n_snps': snplist
                log - flushed model log lines, prefixed with simid and
                      model id
        """

        def evaluate(prediction, estimator, k, snplist, params):
            # one score table labelled with the run's identifying columns
            return lkprof.calculate_scores(group_test,
                                           prediction,
                                           EST=estimator,
                                           k=len(snplist),
                                           _k=k,
                                           SELECTOR=self.code,
                                           MODELID=self.model_id,
                                           SIMID=simid,
                                           FOLD=k_fold,
                                           **params)

        frames = []
        selected = {}

        for k in self.k_list:

            # best run so far for this k
            top_f, top_scores, top_orig, top_snps = -1, None, None, None

            for _ in range(self.iteration):

                lk_pred, snplist, orig_pred, params = self.fit_and_predict(
                    genotype_train, group_train, genotype_test, k)
                if lk_pred is None:
                    # this run produced no prediction
                    continue

                scores = evaluate(lk_pred, 'lk', k, snplist, params)

                if orig_pred is None:
                    orig_scores = None
                else:
                    orig_scores = evaluate(orig_pred, self.code, k, snplist,
                                           params)

                f_score = scores.loc[scores['REG'] == 'MIN', 'F'].values[0]
                if f_score > top_f:
                    top_f = f_score
                    top_scores = scores
                    top_orig = orig_scores
                    top_snps = snplist.tolist()

            if top_f < 0:
                # no run scored for this k
                continue

            frames.append(top_scores)
            if top_orig is not None:
                frames.append(top_orig)

            key = '%s/%d/%d/%d/%d' % (self.model_id, simid, k_fold, k,
                                      len(top_snps))
            selected[key] = top_snps

        log = [
            '[I - {%d|%s}: %s]' % (simid, self.model_id, line)
            for line in self.flush_log()
        ]

        if not frames:
            return (pd.DataFrame(), selected, log)
        return (pd.concat(frames, sort=False), selected, log)
Beispiel #5
0
    def select(self, haplotypes, groups, haplotest, k=None):
        """ select SNP positions by walking self.guide_tree

            For every (level, pop1, pop2) yielded by traverse(), the rows of
            `haplotypes` whose group is in pop1 / pop2 are compared with
            Hudson's FST and k positions are taken from the top of the FST
            ordering.  When the best of those FST values is below
            self.min_fst, two alternatives are tried (a growing number of
            top-FST positions, then self.select_2()) and, if either yields a
            SNP list, that list is used instead of the FST positions.

            haplotypes: data indexed by the boolean mask built from `groups`
            groups: group label per row of `haplotypes`
            haplotest: not used in this method
            k: number of positions taken per tree node ("redundancy");
               0 or None is treated as 1

            returns (L, None, {}) where L is the sorted array of unique
            selected positions
        """

        # we use k for redundancy parameters
        if k == 0 or k is None:
            k = 1

        candidate_L = []  # [ (pos, rank, no_actual_pops)]
        # we traverse through the tree
        for (level, pop1, pop2) in traverse(self.guide_tree):

            n_pops = len(pop1) + len(pop2)
            haplotypes1 = haplotypes[np.isin(groups, pop1)]
            haplotypes2 = haplotypes[np.isin(groups, pop2)]

            # small populations are only reported; processing continues
            if len(haplotypes1) < 4:
                cerr('[I - insufficient population size for %s]' % pop1)
            if len(haplotypes2) < 4:
                cerr('[I - insufficient population size for %s]' % pop2)

            # convert haplotypes to allele counts
            ac1 = count_allele(haplotypes1)
            ac2 = count_allele(haplotypes2)

            # calculate highest FST
            # NOTE(review): FST is assigned but never read in this method
            FST = []
            num, den = allel.hudson_fst(ac1, ac2)

            # NOTE: the line below avoids warning (invalid value in true_divide)
            # when den == 0, which should be perfectly ok for FST calculation
            den[den == 0] = -1
            fst = num / den

            # check for FST == 1.0
            ultimate_fst_pos = np.nonzero(fst == 1.0)[0]
            if len(ultimate_fst_pos) > 0:
                self.log('FST: 1.0 at %s for pop %s <> %s' %
                         (str(ultimate_fst_pos), pop1, pop2))

            if len(ultimate_fst_pos) > k and self.priority is not None:
                # get ultimate_fst based on priority
                # (more FST == 1.0 positions than needed: order only those
                # positions, by ascending self.priority)

                ultimate_priority = self.priority[ultimate_fst_pos]
                sortidx = ultimate_fst_pos[np.argsort(ultimate_priority)]

                #import IPython; IPython.embed()

            else:
                # otherwise order every position by ascending FST
                #fst[ np.isnan(fst) ] = 0
                sortidx = np.argsort(fst)

            # get highest FST
            # NOTE(review): the slice stops before the last entry of sortidx,
            # so the position that sorts last is never included -- confirm
            # this is intended and not an off-by-one
            highest_fst_pos = sortidx[-(k + 1):-1]
            highest_fst_val = fst[highest_fst_pos]
            #cerr('[I - highest FST: %5.4f at %d for pops %s and %s' % (highest_fst_val, highest_fst_pos, pop1, pop2))

            # check suitability of SNPs
            snplist, F = None, -1
            if highest_fst_val.max() < self.min_fst:

                if self.max_leaf_snp > k:
                    # 1st alternative: use i = k .. max_leaf_snp - 1 positions
                    # from the top of the ordering and keep the SNP set with
                    # the best 'MIN' F score on the training data

                    X_train = np.append(haplotypes1, haplotypes2, axis=0)
                    y_train = np.array([1] * len(haplotypes1) +
                                       [2] * len(haplotypes2))

                    best_iteration = (-1, None)
                    for i in range(k, self.max_leaf_snp):
                        features = sortidx[-(i + 1):-1]

                        model = FixSNPSelector('dummy', snpindex=features)
                        lk_predictions, snplist, _, params = model.fit_and_predict(
                            X_train, y_train, X_train, len(features))
                        scores = lkprof.calculate_scores(
                            y_train, lk_predictions)

                        F = scores.loc[scores['REG'] == 'MIN', 'F'].values[0]
                        if best_iteration[0] < F:
                            best_iteration = (F, snplist)

                    snplist, F = best_iteration[1], best_iteration[0]

                # 2nd alternative: select_2(); taken only if it scores higher
                snplist_2, F_2 = self.select_2(haplotypes1, haplotypes2)
                if F_2 > F:
                    snplist, F = snplist_2, F_2

                if snplist is not None:
                    self.log('F: %5.4f SNP: %d for pop %s <> %s => %s' %
                             (F, len(snplist), pop1, pop2, snplist))

                    # the alternative SNP set replaces the FST positions for
                    # this node, hence the `continue`
                    for p in snplist:
                        candidate_L.append((p, level, n_pops))
                    continue

                # TODO: 2nd approach: find 2 SNPs with highest r^2(st) eg r^2 subpopulation vs r^2 total population

                # if snplist is None, just provide warning notice !
                # (execution then falls through and the low-FST positions are
                # still appended below)
                else:
                    self.log('low FST = %5.4f for %s vs %s' %
                             (highest_fst_val.max(), pop1, pop2))

            # append to candidate_L
            for p in highest_fst_pos:
                candidate_L.append((p, level, n_pops))
            self.log('FST: %s SNP: %d for pop %s <> %s => %s' %
                     (str(highest_fst_val), len(highest_fst_pos), pop1, pop2,
                      str(highest_fst_pos)))

        # process candidate_L
        # only the positions are kept; level and n_pops are discarded here
        L = np.unique(np.array(sorted([x[0] for x in candidate_L])))

        # return snp position
        return (L, None, {})