Example 1
def pheno_general_stats(obj, model):
    info = ""
    obj_ = load_cv_obj(obj)
    info += "%s\nPehnotype: %s\n" % (sep_str, obj_.Y_name)
    # number of samples (after sample intersection)
    info += "\tNumber of samples without missing pheno: %d\n" % len(
        set(read_id_list(obj_.samples)))
    # pheno. stats (after sample intersection)
    info += "\t0:%d,1:%d | 0:%.2f,1:%.2f\n" % (
        obj_.Y_stat['class_num']['0'], obj_.Y_stat['class_num']['1'],
        obj_.Y_stat['class_pct']['0'], obj_.Y_stat['class_pct']['1'])
    # pheno check
    if obj_.Y_check:
        # k and lambda
        k = read_k(obj_.full['eig']['eig.k'])
        l = read_lambda(obj_.full['eig']['eig.lambda'])
        info += "EIGENSTRAT\n\tFull: k=%d, lambda=%.5f\n" % (k, l)
        for rep in obj_.cv.keys():
            info += "\tRep %d" % (rep)
            for fold in obj_.cv[rep].keys():
                k = read_k(obj_.cv[rep][fold]['eig']['eig.k'])
                l = read_lambda(obj_.cv[rep][fold]['eig']['eig.lambda'])
                info += " | fold %d: k=%d, lambda=%.5f" % (fold, k, l)
            info += "\n"
        # CV selected features
        n = len(set(read_id_list(obj_.full['features_sel'])))
        info += "Selected features:\n\tFull: %d\n" % n
        for rep in obj_.cv.keys():
            info += "\tRep %d" % (rep)
            for fold in obj_.cv[rep].keys():
                n = len(set(read_id_list(obj_.cv[rep][fold]['features_sel'])))
                info += " | fold %d: %d" % (fold, n)
            info += "\n"
        # CV model
        s = ';'.join(list(set(read_id_list(obj_.full[model]["combi.txt"]))))
        info += "Models\n\tFull: %s\n" % s
        for rep in obj_.cv.keys():
            for fold in obj_.cv[rep].keys():
                s = ';'.join(
                    list(
                        set(
                            read_id_list(
                                obj_.cv[rep][fold][model]["combi.txt"]))))
                info += "\tRep %d, fold %d: %s\n" % (rep, fold, s)
        # CV eval. reps
        info += 'Missing fold pred. results:\n'
        for rep in obj_.cv.keys():
            for fold in obj_.cv[rep].keys():
                f = obj_.cv[rep][fold][model]['pred.csv']
                if not os.path.isfile(f) or os.stat(f).st_size == 0:
                    info += "Rep %d, fold %d: no model prediction results\n" % (
                        rep, fold)
        info += '\n'
    else:
        info += 'Check not passed\n'
    info += "%s\n" % sep_str
    return info
Example 2
def bin_to_eig_pheno(pheno_file,
                     pheno_name,
                     pheno_ofile,
                     sample_file=None,
                     miss='NA'):
    samples = None
    if sample_file is not None:
        samples = set(read_id_list(sample_file))
    # Pheno
    header = True
    pheno_col = 0
    with open(pheno_file, "r") as pheno, open(pheno_ofile, "w") as pheno_o:
        for line in pheno:
            line = line.rstrip('\n')
            line = line.split('\t')
            if header:
                header = False
                for i in range(0, len(line)):
                    if line[i] == pheno_name:
                        pheno_col = i + 1  # +1 because 1st field is sampleID
                        break
                continue
            assert pheno_col > 0
            sampleID = line[0]
            if samples is not None and sampleID not in samples:
                continue
            samplePh = line[pheno_col]
            if samplePh == '1': samplePh = 'Case'
            elif samplePh == '0': samplePh = 'Control'
            elif samplePh == miss: samplePh = 'Ignore'
            else:
                sys.exit('Unknown phenotype: %s: %s: %s' %
                         (pheno_name, sampleID, samplePh))
            pheno_o.write("\t".join([sampleID, 'U', samplePh]) +
                          "\n")  #; pheno_o.flush()
Example 3
def cv_pheno_stats(cv_obj):
    # already done
    if cv_obj.done['y_stats']:
        return "SKIP: Pheno stat.s already computed"
    cv_obj.Y_stat['class_num'] = {'0': 0, '1': 0}
    cv_obj.Y_stat['class_pct'] = {'0': 0, '1': 0}
    cv_obj.Y_stat['miss'] = 0  # default in case the stat file has no 'miss' row
    with open(cv_obj.Y_stat['file']) as infile:
        for line in infile:
            line = line.rstrip("\n")
            line = line.split('\t')
            if line[0] == 'miss':
                cv_obj.Y_stat['miss'] = int(line[1])
            elif line[0] == '0':
                cv_obj.Y_stat['class_num']['0'] = int(line[1])
            elif line[0] == '1':
                cv_obj.Y_stat['class_num']['1'] = int(line[1])
            else:
                sys.exit("In Y stat.s file: unknown value %s" % line[0])
    cv_obj.Y_stat['total'] = sum([
        cv_obj.Y_stat['miss'], cv_obj.Y_stat['class_num']['0'],
        cv_obj.Y_stat['class_num']['1']
    ])
    samples = set(read_id_list(cv_obj.samples))
    assert len(samples) == cv_obj.Y_stat[
        'total'], "Have %d samples in %s but only %d as total from Y stat.s file %s" % (
            len(samples), cv_obj.samples, cv_obj.Y_stat['total'],
            cv_obj.Y_stat['file'])
    assert cv_obj.Y_stat['miss'] == 0, "Missing pheno for %s" % cv_obj.Y_name
    if (cv_obj.Y_stat['total'] - cv_obj.Y_stat['miss']) > 0:
        cv_obj.Y_stat['class_pct'] = {
            '0': 100.0 * float(cv_obj.Y_stat['class_num']['0']) /
            float(cv_obj.Y_stat['total'] - cv_obj.Y_stat['miss']),
            '1': 100.0 * float(cv_obj.Y_stat['class_num']['1']) /
            float(cv_obj.Y_stat['total'] - cv_obj.Y_stat['miss'])
        }
    cv_obj.done['y_stats'] = True
    return "Computed pheno stat.s"
Example 4
def cv_check_samples(in_samples_f, ex_samples_f, cv_obj):
    if cv_obj.done['sample_check']:
        return "SKIP: Samples already checked"
    info = ""
    in_samples = set(read_id_list(in_samples_f))
    ex_samples = set(read_id_list(ex_samples_f))

    # CV full:
    cv_samples = set(read_id_list(cv_obj.full['samples']))
    # no intersection with excluded
    assert len(
        cv_samples.intersection(ex_samples)
    ) == 0, "Intersection with excluded: Assertion error in %s" % cv_samples
    # all in included
    assert len(
        cv_samples.difference(in_samples)
    ) == 0, "Set diff. with included: Assertion error in %s" % cv_samples
    info += "CV %s (full): Sample list %s: %d entries\n" % (
        cv_obj.Y_name, cv_obj.full['samples'], len(cv_samples))

    # CV folds: only samples from samples in CV obj., print number
    for rep in cv_obj.cv.keys():
        for fold in cv_obj.cv[rep].keys():
            train_samples = set(
                read_id_list(cv_obj.cv[rep][fold]['samples_train']))
            test_samples = set(
                read_id_list(cv_obj.cv[rep][fold]['samples_test']))
            # no intersection of train and test
            assert len(train_samples.intersection(test_samples)
                       ) == 0, "Assertion error in %s and %s" % (train_samples,
                                                                 test_samples)
            # together they should be the same as in full set
            assert cv_samples == train_samples.union(
                test_samples), "Assertion error in %s and %s" % (train_samples,
                                                                 test_samples)
            # number of samples
            info += "CV %s (rep %d, fold %d): Sample list %s: %d entries\n" % (
                cv_obj.Y_name, rep, fold,
                cv_obj.cv[rep][fold]['samples_train'], len(train_samples))
            info += "CV %s (rep %d, fold %d): Sample list %s: %d entries\n" % (
                cv_obj.Y_name, rep, fold, cv_obj.cv[rep][fold]['samples_test'],
                len(test_samples))
    cv_obj.done['sample_check'] = True
    return info
Example 5
def cv_eig_convert(cv_obj, rep=None, fold=None, skip=False):
    eig_geno = eig_snps = eig_pheno = eig_snps_rm = None
    samples = features = features_pr = None
    cv_obj_sub = None
    if rep is None or fold is None:
        cv_obj_sub = cv_obj.full
    else:
        cv_obj_sub = cv_obj.cv[rep][fold]
    eig_geno = cv_obj_sub['eig']['eig.geno']
    eig_snps = cv_obj_sub['eig']['eig.snps']
    eig_snps_rm = cv_obj_sub['eig']['eig.snps.rm']
    eig_pheno = cv_obj_sub['eig']['eig.pheno']
    if rep is None or fold is None:
        samples = cv_obj_sub['samples']
    else:
        samples = cv_obj_sub['samples_train']
    features = cv_obj_sub['features']
    features_pr = cv_obj_sub['features_pr']
    if not skip:
        bin_to_eig_geno_snp(
            geno_file=cv_obj.X_bin,
            geno_ofile=eig_geno, snp_ofile=eig_snps,
            sample_file=samples, feature_file=features,
            miss='NA'
        )
        bin_to_eig_pheno(
            pheno_file=cv_obj.Y_file,
            pheno_name=cv_obj.Y_name,
            pheno_ofile=eig_pheno,
            sample_file=samples,
            miss='NA'
        )
        snps = set(read_id_list(features))
        snps_pr = set(read_id_list(features_pr))
        snps_rm = snps.difference(snps_pr)
        with open(eig_snps_rm, 'w') as ofile:
            for snp in snps_rm:
                ofile.write("rs%s\n" % snp)
Example 6
def add_model_annot(obj, annot, odir, obname, model, f_type):
    fields = None
    if f_type == 'gff':
        fields = gene_annot_fields
    elif f_type == 'vcf':
        fields = vcf_annot_fields

    obj_ = load_cv_obj(obj)
    if not obj_.Y_check: return

    model_res = path.join(obj_.odir, "%s_total_perf.csv" % model)
    if not path.isfile(model_res) or os.stat(model_res).st_size == 0:
        return  # no CV perf.
    if not path.isfile(obj_.full[model]['combi.txt']): return  # no/empty model

    ofile = path.join(odir, "%s_%s_%s.csv" % (obname, obj_.Y_name_str, model))
    perf_list = [
        'ERR', 'ACC', 'B_ACC', 'SENS', 'SPEC', 'PREC', 'NPV', 'FPR', 'FNR',
        'Fmeasure', 'gm_RS', 'gm_RP', 'AUC_ROC', 'AUC_PR'
    ]
    perf_dict = dict.fromkeys(perf_list, None)
    # read perf
    with open(model_res) as ifile:
        header = True
        for line in ifile:
            if not line: continue
            line = line.rstrip('\n')
            line = line.split('\t')
            if header:
                header = False
                for i in range(0, len(line)):
                    if line[i] in perf_dict:
                        if perf_dict[line[i]] is None: perf_dict[line[i]] = {}
                        perf_dict[line[i]]['id'] = i
                continue
            if line[0] == 'mean':
                for k in perf_dict.keys():
                    perf_dict[k]['value'] = line[perf_dict[k]['id']]
    # read features
    features = set(read_id_list(obj_.full[model]['combi.txt']))
    # write file
    with open(ofile, 'w') as of:
        of.write("Feature\t%s\t%s\n" %
                 ('\t'.join(fields), '\t'.join(perf_list)))
        for feature in features:
            of.write("%s\t%s\t%s\n" %
                     (feature, '\t'.join([annot[feature][f] for f in fields]),
                      '\t'.join([perf_dict[k]['value'] for k in perf_list])))
Example 7
def cv_create_folds(cv_obj):
    if cv_obj.done['cv_folds']:
        return "SKIP: Folds already created"
    info = ""
    # did not pass the phenotype check
    if not cv_obj.Y_check:
        return "CV folds: %s did not pass pheno. check" % cv_obj.Y_name
    # Samples to use
    samples = set(read_id_list(cv_obj.samples))
    # Y as array and sample IDs
    y, y_samples = pheno_as_list(pheno_file=cv_obj.Y_file,
                                 pheno_name=cv_obj.Y_name,
                                 ignore_miss=True,
                                 samples=samples)
    y = numpy.array(y)
    y_samples = numpy.array(y_samples)  # needed for fancy indexing
    for rep in cv_obj.cv.keys():
        # Create folds
        y_folds = cross_validation.StratifiedKFold(y=y,
                                                   n_folds=cv_obj.folds,
                                                   shuffle=True,
                                                   random_state=None)
        # Save folds as lists of samples in train/test
        fold = 1
        for train_index, test_index in y_folds:
            with open(cv_obj.cv[rep][fold]['samples_train'], 'w') as o_file:
                o_file.write("\n".join(y_samples[train_index].tolist()) + "\n")
            with open(cv_obj.cv[rep][fold]['samples_test'], 'w') as o_file:
                o_file.write("\n".join(y_samples[test_index].tolist()) + "\n")
            # Stats
            y_fold_stats = pheno_class_stats(
                pheno_file=cv_obj.Y_file,
                pheno_name=cv_obj.Y_name,
                samples=set(y_samples[train_index].tolist()))
            info += "Rep %d, fold %d, train: %s\n" % (rep, fold, ' ; '.join(
                [' - '.join([k, "%d" % v]) for k, v in y_fold_stats.items()]))
            y_fold_stats = pheno_class_stats(
                pheno_file=cv_obj.Y_file,
                pheno_name=cv_obj.Y_name,
                samples=set(y_samples[test_index].tolist()))
            info += "Rep %d, fold %d, test: %s\n" % (rep, fold, ' ; '.join(
                [' - '.join([k, "%d" % v]) for k, v in y_fold_stats.items()]))
            fold += 1
    cv_obj.done['cv_folds'] = True
    return info
Example 8
def bin_to_plink_pheno(pheno_file,
                       pheno_name,
                       pheno_ofile,
                       family="Fam",
                       sample_file=None,
                       miss='NA',
                       verbose=True):
    if verbose: sys.stdout.write("Bin. pheno mat. to PLINK alt. pheno")
    samples = None
    if sample_file is not None:
        samples = set(read_id_list(sample_file))
        if verbose:
            sys.stdout.write("Sample list contains %d unique IDs\n" %
                             len(samples))
    # Pheno
    header = True
    pheno_col = 0
    with open(pheno_file, "r") as pheno, open(pheno_ofile, "w") as pheno_o:
        sys.stdout.write("\tOutput: %s in %s\n" % (pheno_name, pheno_ofile))
        for line in pheno:
            line = line.rstrip('\n')
            line = line.split('\t')
            if header:
                header = False
                for i in range(0, len(line)):
                    if line[i] == pheno_name:
                        pheno_col = i + 1  # +1 because 1st field is sampleID
                        break
                continue
            assert pheno_col > 0  # the phenotype column must be found in the header
            sampleID = line[0]
            if samples is not None and sampleID not in samples:
                continue
            samplePh = line[pheno_col]
            if samplePh == miss:
                samplePh = '-9'
            pheno_o.write("\t".join([family, sampleID, samplePh]) + "\n")
Example 9
def bin_to_eig_geno_snp(geno_file,
                        geno_ofile,
                        snp_ofile,
                        sample_file=None,
                        feature_file=None,
                        miss='NA'):
    samples = features = None
    if sample_file is not None:
        samples = set(read_id_list(sample_file))
    if feature_file is not None:
        features = set(read_id_list(feature_file))
    # SNP
    header = True
    count = 1
    found_features = 0
    with open(geno_file, "r") as geno, open(snp_ofile, "w") as snp_o:
        for line in geno:
            if header:
                header = False
                continue
            line = line.rstrip('\n')
            snpID = line.split('\t')[0]
            if features is not None and snpID not in features:
                continue
            found_features += 1
            snp_o.write(
                "\t".join(["rs" + snpID, "1", "0.0",
                           str(count), "", ""]) +
                "\n")  # EIGENSTRAT expects "rs<snpID>"
            count += 1
    assert (features is None) or (
        found_features == len(features)
    ), "%s: expected %d, found %d" % (geno_file, len(features), found_features)
    # Geno
    header = True
    sampleIDs = None
    found_features = 0
    with open(geno_file, "r") as geno, open(geno_ofile, "w") as geno_o:
        for line in geno:
            line = line.rstrip('\n')
            line = line.split('\t')
            if header:
                header = False
                sampleIDs = line
                continue
            snpID = line[0]
            if features is not None and snpID not in features: continue
            found_features += 1
            found_samples = 0
            for i in range(1, len(line)):
                sampleID = sampleIDs[i - 1]
                if samples is not None and sampleID not in samples:
                    continue
                found_samples += 1
                sampleAl = line[i]
                if sampleAl == miss: continue  #sampleAl = "9"
                else:
                    geno_o.write("\t".join(["rs" +
                                            snpID, sampleID, sampleAl]) + "\n")
            assert (samples is None) or (
                found_samples == len(samples)), "%s: expected %d, found %d" % (
                    geno_file, len(samples), found_samples)
    assert (features is None) or (
        found_features == len(features)
    ), "%s: expected %d, found %d" % (geno_file, len(features), found_features)
Example 10
def bin_to_plink_ped_map(geno_file,
                         map_ofile,
                         ped_ofile,
                         family="Fam",
                         sample_file=None,
                         feature_file=None,
                         feature_cl_file=None,
                         miss='NA',
                         verbose=True):
    if verbose: sys.stdout.write("Bin. feature mat. to PLINK PED/MAP")
    samples = None
    if sample_file is not None:
        samples = set(read_id_list(sample_file))
        if verbose:
            sys.stdout.write("Sample list contains %d unique IDs\n" %
                             len(samples))
    features = None
    if feature_file is not None:
        features = set(read_id_list(feature_file))
        if verbose:
            sys.stdout.write("Feature list contains %d unique IDs\n" %
                             len(features))
    feature_cl = None
    if feature_cl_file is not None:
        feature_cl = read_feature_cl(feature_cl_file)
    # MAP
    header = True
    count = 1
    with open(geno_file, "r") as geno, open(map_ofile, "w") as map_o:
        if verbose: sys.stdout.write("\tOutput: %s\n" % map_ofile)
        for line in geno:
            if header:
                header = False
                continue
            line = line.rstrip('\n')
            snpID = line.split('\t')[0]
            snpChr = "0"
            snpDis = "0"
            snpPos = str(count)
            if features is not None and snpID not in features:
                continue
            if feature_cl is not None:
                snpChr = feature_cl[snpID]
            map_o.write("\t".join([snpChr, snpID, snpDis, snpPos]) + "\n")
            #map_o.flush()
            count += 1
    # PED
    header = True
    sample_dict = None
    sample_IDs = None
    with open(geno_file, "r") as geno:
        if verbose: sys.stdout.write("\tOutput: %s\n" % ped_ofile)
        for line in geno:
            line = line.rstrip('\n')
            line = line.split('\t')
            if header:
                header = False
                sample_IDs = line
                sample_dict = dict.fromkeys(sample_IDs, "")
                continue
            snpID = line[0]
            for i in range(1, len(line)):
                sampleID = sample_IDs[i - 1]
                if samples is not None and sampleID not in samples:
                    if sampleID in sample_dict: del sample_dict[sampleID]
                    continue
                if features is not None and snpID not in features:
                    continue
                sampleAl = line[i]
                if sampleAl == "1":
                    sampleAl = "A A"
                elif sampleAl == "0":
                    sampleAl = "G G"
                elif sampleAl == miss:
                    sampleAl = "0 0"
                if sample_dict[sampleID] == "":
                    sample_dict[sampleID] = sampleAl
                else:
                    sample_dict[sampleID] += ("\t" + sampleAl)
    with open(ped_ofile, "w") as ped_o:
        for sampleID, sampleAl in sample_dict.items():
            ped_o.write(
                "\t".join([family, sampleID, "0", "0", "0", "0", sampleAl]) +
                "\n")
Example 11
# NOTE All pheno. CV objects
objs = glob.glob("%s/obj/*.pkl" % args.idir)
#--------------------------------------------------#
# NOTE Collect information of each object
sep_str = "#%s#" % ('-' * 50)
with open(path.join(odir, "%s_stats.txt" % args.obname), 'w') as ofile:
    for obj in objs:
        info = pheno_general_stats(obj, args.model)
        ofile.write(info)
#--------------------------------------------------#
# NOTE Feature annotation
annot = None
# All genes
obj_ = load_cv_obj(
    objs[0])  # load only first object (all should have same feature list)
features = set(read_id_list(obj_.features))
sys.stdout.write("There are %d features\n" % len(features))
# Annotation from table
if args.f_type == 'gff':
    annot = read_annot_tab(annot_tab=args.f_source)
elif args.f_type == 'vcf':
    annot = read_annot_tab(annot_tab=args.f_source, id_prefix='var_')
assert len(annot) == len(features)
sys.stdout.write('Annotations were collected\n')
#--------------------------------------------------#
# NOTE Add annotation to results
assert annot is not None
pool = Pool(args.cores)
pool_iter = itertools.product(objs, [annot], [odir], [args.obname],
                              [args.model], [args.f_type])
annots = pool.starmap(add_model_annot, pool_iter)