def load_data(ch, dataset, snpsubset, snpruns, testpat, trainpat):
    """
    Load data from files into X and y matrices.

    Builds, for every data set, the list of SNP indices to use and hands the
    actual reading over to read_Xs.

    :param ch: chromosome number (formatted with %d in file names).
    :param dataset: mapping of data set name to its directory prefix.
    :param snpsubset: name of the SNP subset, or None to use every SNP that
        matrices/genome_stats.txt reports for the chromosome.
    :param snpruns: mapping of data set name to the run number of the subset;
        only read when snpsubset is given.
    :param testpat: passed unchanged to read_Xs.
    :param trainpat: passed unchanged to read_Xs.
    :return: the result of read_Xs.
    :raises exceptions.NoParameterError: if more than one data set is given
        without a SNP subset.
    :raises exceptions.OtherError: if genome_stats.txt has no line for the
        requested chromosome.
    """

    snplist = {name: [] for name in dataset}
    if snpsubset is not None:
        for name in dataset:
            path = '%s%s/%s_snps_chr%d_%d.txt' % (dataset[name], snpsubset, snpsubset, ch, snpruns[name])
            with open(path, 'r') as cc:
                for line in cc:
                    # First whitespace-separated column holds the SNP index.
                    snplist[name].append(int(line.split()[0]))
        # The SNP count is taken from the data set iterated last.
        snp = len(snplist[name])
    else:
        if len(dataset) > 1:
            raise exceptions.NoParameterError('subset', 'There is more than one given data set, but subset of SNPs ' +
                                                        'is not given.')

        directory = list(dataset.values())[0]
        snp = None
        with open('%smatrices/genome_stats.txt' % directory, 'r') as cc:
            for line in cc:
                if line.startswith('%d\t' % ch):
                    snp = int(line.split()[1])
                    break
        # Compare the variable itself: the string literal 'snp' is never
        # None, so the missing-chromosome case used to slip through.
        if snp is None:
            raise exceptions.OtherError('There is no information about chromosome %d in %sgenome_stats.txt file'
                                        % (ch, directory))
        snplist[next(iter(dataset))] = list(range(snp))

    return read_Xs(ch, dataset, snp, snplist, testpat, trainpat)
Esempio n. 2
0
def check_borutarun(directory, run, perc):
    """
    Look up a boruta run in <directory>boruta/boruta_runs.txt.

    The first line starting with "<run>\\t" is used. After whitespace
    splitting, column 5 is the SNP subset name (the text 'None' means no
    subset), column 6 the SNP run (only the part before '+' is used) and
    column 8 a comma-separated list of perc values.

    :param directory: directory prefix containing the boruta/ folder.
    :param run: number of the boruta run to look up.
    :param perc: perc value chosen by the caller, or None to take the single
        value recorded for the run.
    :return: tuple (perc, snpsubset, snprun); snpsubset and snprun are None
        when the run was made without a SNP subset.
    :raises exceptions.NoParameterError: if perc is None and the run has more
        than one perc value.
    :raises exceptions.WrongValueError: if the run is not listed in the file.
    """
    found = False
    snpsubset = None
    snprun = None
    with open('%sboruta/boruta_runs.txt' % directory, 'r') as runs_file:
        for line in runs_file:
            if not line.startswith(str(run) + '\t'):
                continue
            fields = line.split()
            found = True
            if fields[5] != 'None':
                snpsubset = fields[5]
                snprun = int(fields[6].split('+')[0])
            perc_list = fields[8].split(',')
            if perc is None:
                if len(perc_list) > 1:
                    raise exceptions.NoParameterError(
                        'perc',
                        'There is more than one perc value for given boruta run.'
                    )
                # Take the value from the file; perc itself is None here.
                perc = int(perc_list[0])
            break
    if not found:
        raise exceptions.WrongValueError(
            'run', run, 'Run number %d was not conducted' % run)
    return perc, snpsubset, snprun
            '%d\t%s\t%s\t%s\t%d\n' % (runs[setname], setname, ', '.join([
                k for k in dataset.keys() if k != setname
            ]), funcs.make_chrstr(chrlist), shared_snps))

    return shared_snps


# Command-line driver: collect the options from sys.argv, then run
# find_shared and report how many shared SNPs it returned.
dataset = OrderedDict()
chrlist = list(range(1, 24))
fixed = False
run = None

for q, arg in enumerate(sys.argv):
    if arg == '-dataset':
        # Expected form: -dataset NAME DIRECTORY. The bounds check and
        # startswith() make a missing or empty directory raise the intended
        # NoParameterError instead of an IndexError.
        if q + 2 < len(sys.argv) and sys.argv[q + 2].startswith(('.', '~', '/')):
            dataset[sys.argv[q + 1]] = sys.argv[q + 2]
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )
    if arg == '-chr':
        chrlist = funcs.read_chrstr(sys.argv[q + 1])
    if arg == '-run':
        run = int(sys.argv[q + 1])
    if arg == '-fixed':
        fixed = True

found = find_shared(dataset, chrlist, fixed, run)
print('%d shared SNPs found!' % found)
Esempio n. 4
0
# Command-line handling: every recognised flag takes the argument that
# follows it as its value.
dir = './'
for q, arg in enumerate(sys.argv):
    if arg == '-dir':
        dir = sys.argv[q + 1]
    elif arg == '-indir':
        indir = sys.argv[q + 1]
    elif arg == '-outdir':
        outdir = sys.argv[q + 1]
    elif arg == '-dataset':
        dataset = sys.argv[q + 1]
    elif arg == '-diagdir':
        diagdir = sys.argv[q + 1]

# Abort when no `dataset` global exists after parsing.
if 'dataset' not in globals():
    raise exceptions.NoParameterError('dataset', 'e.g. adni or rosmap')

# Directories that were not set fall back to sub-folders of `dir`.
if 'diagdir' not in globals():
    diagdir = '%sfiles/' % dir

if 'indir' not in globals():
    indir = '%smatrices/' % dir

if 'outdir' not in globals():
    outdir = '%smatrices/' % dir

# Pick the diagnosis files for the chosen data set.
if dataset == 'test':
    dfiles = ['test_diagnoses.tsv']
    dd = test_mapping(dfiles, diagdir)
if dataset == 'adni':
    dfiles = ['dxsum.csv']
Esempio n. 5
0
# Default values of the options; the argument loop below overrides them.
dataset = OrderedDict()
chrlist = list(range(1, 24))
fixed, run, borutaruns = False, None, None
perc, thresh = 90, 0.1

for q in range(len(sys.argv)):
    if sys.argv[q] == '-dataset':
        if sys.argv[q + 2][0] in ['.', '~', '/']:
            dataset[sys.argv[q + 1]] = sys.argv[q + 2]
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )
    if sys.argv[q] == '-chr':
        chrlist = funcs.read_chrstr(sys.argv[q + 1])
    if sys.argv[q] == '-run':
        run = int(sys.argv[q + 1])
    if sys.argv[q] == '-fixed':
        fixed = True
    if sys.argv[q] == '-perc':
        perc = int(sys.argv[q + 1])
    if sys.argv[q] == '-borutarun':
        if sys.argv[q + 1] in dataset.keys():
            if borutaruns is None:
                borutaruns = OrderedDict()
            try:
                borutaruns[sys.argv[q + 1]] = int(sys.argv[q + 2])
    chs.sort()
    for ch in chs:
        stats.write('%d\t%d\t%d\n' % (ch, snps[str(ch)], pat))
    stats.close()


# Command-line handling: each flag takes the following argument as value.
indir = './'
for q, arg in enumerate(sys.argv):
    if arg == '-dbsnp':
        dbsnp = sys.argv[q + 1]
    elif arg == '-plink':
        plink = sys.argv[q + 1]
    elif arg == '-indir':
        indir = sys.argv[q + 1]
    elif arg == '-outdir':
        outdir = sys.argv[q + 1]

# Output goes next to the input unless -outdir was set.
if 'outdir' not in globals():
    outdir = indir

# Both file names are mandatory.
if 'plink' not in globals():
    raise exceptions.NoParameterError('plink', 'name of plink files')
if 'dbsnp' not in globals():
    raise exceptions.NoParameterError(
        'dbsnp',
        'name of file with list of SNPs assigned to their reference values')

# Run the three steps; genome_stats consumes the results of the first two.
pat = make_pid(plink, indir, outdir)
snps = make_ref(dbsnp, plink, indir, outdir)
genome_stats(pat, snps, outdir)
Esempio n. 7
0
# Default values of the options handled by the argument loop.
continuation = False
makey = False
cv = None
newforest = False
frombed = False
num_cores = None
method = 'rforest'

# NOTE(review): `dataset` and `testset` are filled here but not created in
# this part of the file - presumably they are defined earlier; confirm.
for q, arg in enumerate(sys.argv):

    if arg == '-dataset':
        # Expected form: -dataset NAME DIRECTORY. The bounds check and
        # startswith() make a missing or empty directory raise the intended
        # NoParameterError instead of an IndexError.
        if q + 2 < len(sys.argv) and sys.argv[q + 2].startswith(('.', '~', '/')):
            dataset[sys.argv[q + 1]] = sys.argv[q + 2]
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )
        continue

    if arg == '-testset':
        # Same form and same validation as -dataset.
        if q + 2 < len(sys.argv) and sys.argv[q + 2].startswith(('.', '~', '/')):
            testset[sys.argv[q + 1]] = sys.argv[q + 2]
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of test set should appear a directory to folder with it.'
            )
        continue

    if arg == '-test':
        testsize = float(sys.argv[q + 1])
# Default values of the options handled by the argument loop.
dataset = OrderedDict()
procs = 1
chrlist = list(range(1, 24))
snpsubset = None
snpruns = None

for q, arg in enumerate(sys.argv):

    if arg == '-dataset':
        # Expected form: -dataset NAME DIRECTORY. The bounds check and
        # startswith() make a missing or empty directory raise the intended
        # NoParameterError instead of an IndexError.
        if q + 2 < len(sys.argv) and sys.argv[q + 2].startswith(('.', '~', '/')):
            dataset[sys.argv[q + 1]] = sys.argv[q + 2]
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )
        continue

    if arg == '-outdir':
        # Same validation for the single value of -outdir.
        if q + 1 < len(sys.argv) and sys.argv[q + 1].startswith(('.', '~', '/')):
            outdir = sys.argv[q + 1]
        else:
            raise exceptions.NoParameterError(
                'outdir',
                'After -outdir should appear a directory to output folder.')
        continue

    if arg == '-procs':
        procs = int(sys.argv[q + 1])
        continue
# Default values of the options handled by the argument loop.
dataset = []
control_str = ['healthy', 'control', 'normal', 'NL', 'CN', '0']
case_str = ['ill', 'case', 'AD', 'alzheimer', '1']
all_str = ['all', 'whole', 'every']
kdeplot = False
clustermap = False
dendrogram = False

for q, arg in enumerate(sys.argv):

    if arg.startswith('-dataset'):
        # Expected form: -dataset NAME DIRECTORY. The bounds check and
        # startswith() make a missing or empty directory raise the intended
        # NoParameterError instead of an IndexError.
        if q + 2 < len(sys.argv) and sys.argv[q + 2].startswith(('.', '~', '/')):
            dataset.append([sys.argv[q + 1], sys.argv[q + 2]])
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )

    if arg == '-matrix':
        # The value is the path of a .npy file, loaded right away.
        if q + 1 < len(sys.argv) and sys.argv[q + 1].startswith(('.', '~', '/')):
            sims = np.load(sys.argv[q + 1])
        else:
            # A space was missing between "similarity" and "matrix".
            raise exceptions.NoParameterError(
                'directory',
                'After -matrix should appear a directory to similarity ' +
                'matrix written in .npy file.')

    if arg.startswith(('-seta', '-setA')):
        # NOTE(review): `seta` is read here but not created in this part of
        # the file - presumably it is initialised earlier; confirm.
        seta = add_to_set(sys.argv, seta, q)
Esempio n. 10
0
            except ValueError:
                v = -1
            X[j, i + 1] = v

    np.savetxt('%sX_chr%s.csv' % (outdir, ch),
               X,
               fmt='%d',
               delimiter=',',
               header=',' + ','.join(list(map(str, range(snp)))),
               comments='')
    o.close()
    s.close()

    return "%s\t%d\t%d" % (ch, snp, pat)


# Command-line handling: each flag takes the following argument as value.
ch = '1'
outdir = './'
for q, arg in enumerate(sys.argv):
    if arg == '-chr':
        ch = sys.argv[q + 1]
    elif arg == '-input':
        inp = sys.argv[q + 1]
    elif arg == '-outdir':
        outdir = sys.argv[q + 1]

# The input file is mandatory.
if 'inp' not in globals():
    raise exceptions.NoParameterError('inp', 'name of input file')

# Convert and print the summary string returned by vcf_to_matrix.
print(vcf_to_matrix(ch, inp, outdir))
                    action='store',
                    type=str,
                    metavar='NAME',
                    default=None,
                    help='name of SNP subset')
parser.add_argument('-snprun', action=SnprunAction, nargs='*', metavar='')
args = parser.parse_args()

# Besides the argparse pass above, sys.argv is scanned by hand; a matched
# flag ends the handling of the current position.
for q, arg in enumerate(sys.argv):
    if arg == '-dataset':
        # Expected form: -dataset NAME DIRECTORY.
        if sys.argv[q + 2][0] in ('.', '~', '/'):
            dataset[sys.argv[q + 1]] = sys.argv[q + 2]
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )
    elif arg == '-test':
        testsize = float(sys.argv[q + 1])
        testsize_given = True
    elif arg == '-perc':
        # Comma-separated list of integers.
        perc = [int(value) for value in sys.argv[q + 1].split(',')]
        perc_given = True
    elif arg == '-snpsubset':
        snpsubset = sys.argv[q + 1]
                s += 1
                n = nn = False
                v1 = v2 = -2
        np.save('%smatrix_chr%s.npy' % (outdir, ch), matrix)
        done += 2 * snp

    stats.close()
    pedfile.close()


# Command-line handling: value flags take the following argument,
# -overwrite is a plain switch.
indir = './'
overwrite = False
for q, arg in enumerate(sys.argv):
    if arg == '-plink':
        plink = sys.argv[q + 1]
    elif arg == '-indir':
        indir = sys.argv[q + 1]
    elif arg == '-outdir':
        outdir = sys.argv[q + 1]
    elif arg == '-overwrite':
        overwrite = True

# Output goes next to the input unless -outdir was set.
if 'outdir' not in globals():
    outdir = indir

# The plink file name is mandatory.
if 'plink' not in globals():
    raise exceptions.NoParameterError('plink', 'name of plink files')

# write_matrix consumes the value returned by write_snps_list.
snps_val = write_snps_list(plink, indir, outdir, overwrite)
write_matrix(plink, indir, outdir, snps_val)
Esempio n. 13
0
    print('Ill: %d' % diagnoses['1'])
    return 0


# Default values of the options handled by the argument loop.
run = None
fixed = False

for q, arg in enumerate(sys.argv):

    if arg == '-dataset':
        # Expected form: -dataset NAME DIRECTORY. The bounds check and
        # startswith() make a missing or empty directory raise the intended
        # NoParameterError instead of an IndexError.
        if q + 2 < len(sys.argv) and sys.argv[q + 2].startswith(('.', '~', '/')):
            name = sys.argv[q + 1]
            dir = sys.argv[q + 2]
        else:
            raise exceptions.NoParameterError(
                'directory',
                'After name of data set should appear a directory to folder with it.'
            )

    if arg == '-outdir':
        # Same validation for the single value of -outdir.
        if q + 1 < len(sys.argv) and sys.argv[q + 1].startswith(('.', '~', '/')):
            outdir = sys.argv[q + 1]
        else:
            raise exceptions.NoParameterError(
                'outdir',
                'After -outdir should appear a directory to output folder.')

    if arg == '-lower':
        lower = float(sys.argv[q + 1])

    if arg == '-upper':
        upper = float(sys.argv[q + 1])