def load_data(ch, dataset, snpsubset, snpruns, testpat, trainpat):
    """
    Loading data from files into X and y matrices.
    Selection of patients not being in test set and SNPs present in subset-file.

    :param ch: chromosome number (formatted with %d in file names)
    :param dataset: mapping of data set name -> directory prefix of that set
    :param snpsubset: name of the SNP subset, or None to use every SNP
    :param snpruns: mapping of data set name -> run number of the subset;
        only read when snpsubset is not None
    :param testpat: passed unchanged to read_Xs
    :param trainpat: passed unchanged to read_Xs
    :return: whatever read_Xs returns for the selected SNPs
    :raises exceptions.NoParameterError: more than one data set was given
        without a SNP subset
    :raises exceptions.OtherError: genome_stats.txt has no row for chromosome ch
    """

    snplist = {name: [] for name in dataset}

    if snpsubset is not None:
        for name in dataset:
            subset_path = '%s%s/%s_snps_chr%d_%d.txt' % (
                dataset[name], snpsubset, snpsubset, ch, snpruns[name])
            # First whitespace-separated column of every line is a SNP index.
            with open(subset_path, 'r') as cc:
                for line in cc:
                    snplist[name].append(int(line.split()[0]))
            # The count of the last processed data set is what gets passed on.
            snp = len(snplist[name])
    else:
        if len(dataset) > 1:
            raise exceptions.NoParameterError('subset',
                                              'There is more than one given data set, but subset of SNPs ' +
                                              'is not given.')

        directory = list(dataset.values())[0]
        snp = None
        with open('%smatrices/genome_stats.txt' % directory, 'r') as cc:
            for line in cc:
                if line.startswith('%d\t' % ch):
                    snp = int(line.split()[1])
                    break

        # Compare the variable itself; the former check tested the string
        # literal 'snp' against None and therefore could never fire.
        if snp is None:
            raise exceptions.OtherError('There is no information about chromosome %d in %sgenome_stats.txt file'
                                        % (ch, directory))

        snplist[next(iter(dataset.keys()))] = list(range(snp))

    return read_Xs(ch, dataset, snp, snplist, testpat, trainpat)
def check_borutarun(directory, run, perc):
    """
    Look up a Boruta run in '<directory>boruta/boruta_runs.txt'.

    The first line starting with '<run>' followed by a tab is used. After
    whitespace splitting, column 5 is the SNP subset name ('None' meaning no
    subset), column 6 holds the subset run(s) joined with '+', and column 8 is
    a comma-separated list of perc values.

    :param directory: directory prefix of the data set (used as-is, so it
        has to end with a path separator)
    :param run: number of the Boruta run to look for
    :param perc: perc value chosen by the caller, or None to take it from
        the file
    :return: tuple (perc, snpsubset, snprun); snpsubset and snprun are None
        when the run was made without a SNP subset
    :raises exceptions.NoParameterError: perc is None and the run lists more
        than one perc value
    :raises exceptions.WrongValueError: no line for the given run was found
    """

    prefix = str(run) + '\t'
    found = False
    snpsubset = None
    snprun = None

    with open('%sboruta/boruta_runs.txt' % directory, 'r') as runs_file:
        for line in runs_file:
            if not line.startswith(prefix):
                continue

            fields = line.split()
            found = True

            if fields[5] != 'None':
                snpsubset = fields[5]
                # Only the first of the '+'-joined run numbers is used.
                snprun = int(fields[6].split('+')[0])

            perc_list = fields[8].split(',')
            if perc is None:
                if len(perc_list) > 1:
                    raise exceptions.NoParameterError(
                        'perc',
                        'There is more than one perc value for given boruta run.'
                    )
                # Take the single value recorded in the file (the previous
                # code indexed ``perc`` itself, which is None here).
                perc = int(perc_list[0])
            break

    if not found:
        raise exceptions.WrongValueError(
            'run', run, 'Run number %d was not conducted' % run)

    return perc, snpsubset, snprun
'%d\t%s\t%s\t%s\t%d\n' % (runs[setname], setname, ', '.join([ k for k in dataset.keys() if k != setname ]), funcs.make_chrstr(chrlist), shared_snps)) return shared_snps dataset = OrderedDict() chrlist = [i for i in range(1, 24)] fixed = False run = None for q in range(len(sys.argv)): if sys.argv[q] == '-dataset': if sys.argv[q + 2][0] in ['.', '~', '/']: dataset[sys.argv[q + 1]] = sys.argv[q + 2] else: raise exceptions.NoParameterError( 'directory', 'After name of data set should appear a directory to folder with it.' ) if sys.argv[q] == '-chr': chrlist = funcs.read_chrstr(sys.argv[q + 1]) if sys.argv[q] == '-run': run = int(sys.argv[q + 1]) if sys.argv[q] == '-fixed': fixed = True found = find_shared(dataset, chrlist, fixed, run) print('%d shared SNPs found!' % found)
# Command-line parsing: every recognised flag takes the argument that follows
# it as its value. Only the base directory has a default before the loop.
dir = './'
for q in range(len(sys.argv)):
    _arg = sys.argv[q]
    if _arg == '-dir':
        dir = sys.argv[q + 1]
    elif _arg == '-indir':
        indir = sys.argv[q + 1]
    elif _arg == '-outdir':
        outdir = sys.argv[q + 1]
    elif _arg == '-dataset':
        dataset = sys.argv[q + 1]
    elif _arg == '-diagdir':
        diagdir = sys.argv[q + 1]

# The data set name is mandatory.
if 'dataset' not in globals():
    raise exceptions.NoParameterError('dataset', 'e.g. adni or rosmap')

# Directories that were not given fall back to sub-folders of the base one.
if 'diagdir' not in globals():
    diagdir = '%sfiles/' % dir
if 'indir' not in globals():
    indir = '%smatrices/' % dir
if 'outdir' not in globals():
    outdir = '%smatrices/' % dir

# Pick the diagnosis files belonging to the chosen data set.
if dataset == 'test':
    dfiles = ['test_diagnoses.tsv']
    dd = test_mapping(dfiles, diagdir)
if dataset == 'adni':
    dfiles = ['dxsum.csv']
dataset = OrderedDict() chrlist = [i for i in range(1, 24)] fixed = False run = None borutaruns = None perc = 90 thresh = 0.1 for q in range(len(sys.argv)): if sys.argv[q] == '-dataset': if sys.argv[q + 2][0] in ['.', '~', '/']: dataset[sys.argv[q + 1]] = sys.argv[q + 2] else: raise exceptions.NoParameterError( 'directory', 'After name of data set should appear a directory to folder with it.' ) if sys.argv[q] == '-chr': chrlist = funcs.read_chrstr(sys.argv[q + 1]) if sys.argv[q] == '-run': run = int(sys.argv[q + 1]) if sys.argv[q] == '-fixed': fixed = True if sys.argv[q] == '-perc': perc = int(sys.argv[q + 1]) if sys.argv[q] == '-borutarun': if sys.argv[q + 1] in dataset.keys(): if borutaruns is None: borutaruns = OrderedDict() try: borutaruns[sys.argv[q + 1]] = int(sys.argv[q + 2])
chs.sort() for ch in chs: stats.write('%d\t%d\t%d\n' % (ch, snps[str(ch)], pat)) stats.close() indir = './' for q in range(len(sys.argv)): if sys.argv[q] == '-dbsnp': dbsnp = sys.argv[q + 1] if sys.argv[q] == '-plink': plink = sys.argv[q + 1] if sys.argv[q] == '-indir': indir = sys.argv[q + 1] if sys.argv[q] == '-outdir': outdir = sys.argv[q + 1] if 'outdir' not in globals(): outdir = indir if 'plink' not in globals(): raise exceptions.NoParameterError('plink', 'name of plink files') if 'dbsnp' not in globals(): raise exceptions.NoParameterError( 'dbsnp', 'name of file with list of SNPs assigned to their reference values') pat = make_pid(plink, indir, outdir) snps = make_ref(dbsnp, plink, indir, outdir) genome_stats(pat, snps, outdir)
# Default values of the run options; the flags parsed below may override them.
continuation = makey = newforest = frombed = False
cv = num_cores = None
method = 'rforest'

for q in range(len(sys.argv)):
    _arg = sys.argv[q]

    # '-dataset NAME DIR' and '-testset NAME DIR': DIR has to look like a path.
    if _arg == '-dataset':
        _path = sys.argv[q + 2]
        if _path[0] not in ('.', '~', '/'):
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )
        dataset[sys.argv[q + 1]] = _path
        continue

    if _arg == '-testset':
        _path = sys.argv[q + 2]
        if _path[0] not in ('.', '~', '/'):
            raise exceptions.NoParameterError(
                'directory',
                'After name of test set should appear a directory to folder with it.'
            )
        testset[sys.argv[q + 1]] = _path
        continue

    if _arg == '-test':
        testsize = float(sys.argv[q + 1])
# Defaults: no data sets yet, one process, chromosomes 1..23, no SNP subset.
dataset = OrderedDict()
procs = 1
chrlist = list(range(1, 24))
snpsubset = None
snpruns = None

for q in range(len(sys.argv)):
    _arg = sys.argv[q]

    # '-dataset NAME DIR': DIR has to look like a path.
    if _arg == '-dataset':
        _path = sys.argv[q + 2]
        if _path[0] not in ('.', '~', '/'):
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )
        dataset[sys.argv[q + 1]] = _path
        continue

    # '-outdir DIR': DIR has to look like a path as well.
    if _arg == '-outdir':
        _path = sys.argv[q + 1]
        if _path[0] not in ('.', '~', '/'):
            raise exceptions.NoParameterError(
                'outdir',
                'After -outdir should appear a directory to output folder.')
        outdir = _path
        continue

    if _arg == '-procs':
        procs = int(sys.argv[q + 1])
        continue
# Data sets given on the command line, as [name, directory] pairs.
dataset = []

# Words recognised as labels for the control group, the case group and the
# whole population.
control_str = ['healthy', 'control', 'normal', 'NL', 'CN', '0']
case_str = ['ill', 'case', 'AD', 'alzheimer', '1']
all_str = ['all', 'whole', 'every']

# Plot switches, all off by default.
kdeplot = False
clustermap = False
dendrogram = False

for q in range(len(sys.argv)):

    # Any flag beginning with '-dataset' adds a data set: NAME then DIR,
    # where DIR has to look like a path.
    if sys.argv[q].startswith('-dataset'):
        if sys.argv[q + 2][0] in ['.', '~', '/']:
            dataset.append([sys.argv[q + 1], sys.argv[q + 2]])
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )

    # '-matrix PATH' loads a similarity matrix stored as a .npy file.
    if sys.argv[q] == '-matrix':
        if sys.argv[q + 1][0] in ['.', '~', '/']:
            sims = np.load(sys.argv[q + 1])
        else:
            # The two message parts are joined with a space so the text
            # reads "similarity matrix" rather than "similaritymatrix".
            raise exceptions.NoParameterError(
                'directory',
                'After -matrix should appear a directory to similarity ' +
                'matrix written in .npy file.')

    if sys.argv[q].startswith('-seta') or sys.argv[q].startswith('-setA'):
        seta = add_to_set(sys.argv, seta, q)
except ValueError: v = -1 X[j, i + 1] = v np.savetxt('%sX_chr%s.csv' % (outdir, ch), X, fmt='%d', delimiter=',', header=',' + ','.join(list(map(str, range(snp)))), comments='') o.close() s.close() return "%s\t%d\t%d" % (ch, snp, pat) ch = '1' outdir = './' for q in range(len(sys.argv)): if sys.argv[q] == '-chr': ch = sys.argv[q + 1] if sys.argv[q] == '-input': inp = sys.argv[q + 1] if sys.argv[q] == '-outdir': outdir = sys.argv[q + 1] if 'inp' not in globals(): raise exceptions.NoParameterError('inp', 'name of input file') print(vcf_to_matrix(ch, inp, outdir))
action='store', type=str, metavar='NAME', default=None, help='name of SNP subset') parser.add_argument('-snprun', action=SnprunAction, nargs='*', metavar='') args = parser.parse_args() for q in range(len(sys.argv)): if sys.argv[q] == '-dataset': if sys.argv[q + 2][0] in ['.', '~', '/']: dataset[sys.argv[q + 1]] = sys.argv[q + 2] else: raise exceptions.NoParameterError( 'directory', 'After name of data set should appear a directory to folder with it.' ) continue if sys.argv[q] == '-test': testsize = float(sys.argv[q + 1]) testsize_given = True continue if sys.argv[q] == '-perc': perc = list(map(int, sys.argv[q + 1].split(','))) perc_given = True continue if sys.argv[q] == '-snpsubset': snpsubset = sys.argv[q + 1]
s += 1 n = nn = False v1 = v2 = -2 np.save('%smatrix_chr%s.npy' % (outdir, ch), matrix) done += 2 * snp stats.close() pedfile.close() indir = './' overwrite = False for q in range(len(sys.argv)): if sys.argv[q] == '-plink': plink = sys.argv[q + 1] if sys.argv[q] == '-indir': indir = sys.argv[q + 1] if sys.argv[q] == '-outdir': outdir = sys.argv[q + 1] if sys.argv[q] == '-overwrite': overwrite = True if 'outdir' not in globals(): outdir = indir if 'plink' not in globals(): raise exceptions.NoParameterError('plink', 'name of plink files') snps_val = write_snps_list(plink, indir, outdir, overwrite) write_matrix(plink, indir, outdir, snps_val)
print('Ill: %d' % diagnoses['1']) return 0 run = None fixed = False for q in range(len(sys.argv)): if sys.argv[q] == '-dataset': if sys.argv[q + 2][0] in ['.', '~', '/']: name = sys.argv[q + 1] dir = sys.argv[q + 2] else: raise exceptions.NoParameterError( 'directory', 'After name of data set should appear a directory to folder with it.' ) if sys.argv[q] == '-outdir': if sys.argv[q + 1][0] in ['.', '~', '/']: outdir = sys.argv[q + 1] else: raise exceptions.NoParameterError( 'outdir', 'After -outdir should appear a directory to output folder.') if sys.argv[q] == '-lower': lower = float(sys.argv[q + 1]) if sys.argv[q] == '-upper': upper = float(sys.argv[q + 1])