def __init__(self, properties):
    if not os.path.exists(DEV_PATH):
        preprocessing.run()

    logging.info('Preparing development set...')
    self.devset = json.load(open(DEV_PATH))
    self.devdata, self.voc2id, self.id2voc, self.vocabulary = utils.prepare_traindata(self.devset)

    logging.info('Preparing trainset...')
    self.trainset = json.load(open(TRAIN_PATH))
    self.traindata, self.voc2id, self.id2voc, self.vocabulary = utils.prepare_traindata(self.trainset)
    info = 'TRAIN DATA SIZE: ' + str(len(self.traindata))
    logging.info(info)

    self.EPOCH = properties['EPOCH']
    self.BATCH = properties['BATCH']
    self.EMB_DIM = properties['EMB_DIM']
    self.HIDDEN_DIM = properties['HIDDEN_DIM']
    self.DROPOUT = properties['DROPOUT']
    self.EARLY_STOP = properties['EARLY_STOP']
    self.pretrained = properties['pretrained_input']

    print('\nInitializing model...')
    print(self.fname())
    self.init()
def __init__(self, stop=True, vector=''):
    if not os.path.exists(DEV_PATH):
        preprocessing.run()

    self.stop = stop
    self.vector = vector
    self.alignments = []

    print('Preparing test set...')
    self.testset = json.load(open(TEST_PATH))
    print('Preparing development set...')
    self.devset = json.load(open(DEV_PATH))
    print('Preparing trainset...')
    self.trainset = json.load(open(TRAIN_PATH))

    self.word2vec = None
    if 'word2vec' in self.vector:
        self.word2vec = word2vec.init_word2vec(WORD2VEC_PATH)

    self.trainidx = self.trainelmo = self.devidx = self.develmo = self.testidx = self.testelmo = None
    self.fulltrainidx = self.fulltrainelmo = self.fulldevidx = self.fulldevelmo = self.fulltestidx = self.fulltestelmo = None
    if 'elmo' in self.vector:
        self.trainidx, self.trainelmo, self.devidx, self.develmo, self.testidx, self.testelmo = elmo.init_elmo(True, ELMO_PATH)
def __init__(self):
    if not os.path.exists(DEV_PATH):
        preprocessing.run()

    logging.info('Preparing development set...', extra=d)
    self.devset = json.load(open(DEV_PATH))
    self.devdata, self.voc2id, self.id2voc, self.vocabulary = utils.prepare_traindata(self.devset)

    logging.info('Preparing trainset...', extra=d)
    self.trainset = json.load(open(TRAIN_PATH))
    self.traindata, self.voc2id, self.id2voc, self.vocabulary = utils.prepare_traindata(self.trainset)
    info = 'TRAIN DATA SIZE: ' + str(len(self.traindata))
    logging.info(info, extra=d)

    self.translation = features.init_translation(traindata=self.trainset, alpha=0.7, sigma=0.3)

    logging.info('Preparing SimBOW...', extra=d)
    self.simbow = SemevalQuestionCosine()
    self.simbow.train()

    self.trainidx, self.trainelmo, self.devidx, self.develmo = features.init_elmo()
    self.fulltrainidx, self.fulltrainelmo, self.fulldevidx, self.fulldevelmo = features.init_elmo(stop=False)
    self.word2vec = features.init_word2vec()
def main():
    """Entry point if called as an executable"""
    if USE_SBATCH:
        mode = "sbatch"
    else:
        mode = "direct"

    if not os.path.exists(PHE_DIR):
        os.makedirs(PHE_DIR)

    # estimate hsq with GCTA
    for hsq in HSQ:
        for n_snp in N_SNP:
            for n_ind in N_IND:
                phe_dir = os.path.join(PHE_DIR,
                                       "hsq_" + str(hsq) + "-snp_" + str(n_snp) + "-ind_" + str(n_ind))
                phe_file = os.path.join(phe_dir, "phe_\\i")
                out_dir_hsq = os.path.join(HSQ_DIR, os.path.basename(phe_dir))
                if os.path.exists(out_dir_hsq):
                    shutil.rmtree(out_dir_hsq)
                os.makedirs(out_dir_hsq)

                print("estimating hsq...")
                sys.stdout.flush()
                preprocessing.run([MYGCTA, GCTA,
                                   "--grm-bin", GRM,
                                   "--pheno", phe_file + ".phen",
                                   "--qcovar", os.path.join(PHE_DIR, "age.txt"),
                                   "--qcovar", PCS,
                                   "--covar", os.path.join(PHE_DIR, "centre.txt"),
                                   "--covar", os.path.join(PHE_DIR, "sex.txt"),
                                   "--out", os.path.join(out_dir_hsq, os.path.basename(phe_file)),
                                   "--reml-no-constrain"],
                                  mode=mode,
                                  slurm_par=["-J", "simu_hsq", "--mem", "2G", "-D", out_dir_hsq, "-W"],
                                  array=range(1, N_ITER + 1),
                                  check=False)

                print("creating zip file...")
                sys.stdout.flush()
                shutil.make_archive(out_dir_hsq, "zip",
                                    os.path.dirname(out_dir_hsq),
                                    os.path.basename(out_dir_hsq))
def knn():
    test_X, test_y, training_X, training_y = preprocessing.run()

    # Hyperparameter tuning (run once; results are cached in the arrays below)
    """
    k_num_values = 20
    val_error = np.zeros(k_num_values)
    train_error = np.zeros(k_num_values)
    for k in range(k_num_values):
        # k values are (1 to 20), but are stored as (0 to 19)
        ztrain, zval = kfoldcv.knn5F(training_X, training_y, k + 1)
        val_error[k] = zval.mean()
        train_error[k] = ztrain.mean()
    """

    # for random seed 30 when shuffling data
    train_error = np.array([
        0.910490152897502, 0.912516202101554, 0.9122242496696659, 0.9104851192348832,
        0.9084590700308312, 0.9145460265525702, 0.9116474758279326, 0.9104880555380775,
        0.9148295895467605, 0.9130950733027119, 0.9130950733027119, 0.9130950733027119,
        0.9130950733027119, 0.9130950733027119, 0.9130950733027119, 0.9119356530128568,
        0.9165683005096582, 0.9165683005096582, 0.9165683005096582, 0.9165683005096582
    ])
    val_error = np.array([
        0.9235448312945289, 0.9282094367522516, 0.9258502486893398, 0.9269995967199891,
        0.9223820405968544, 0.9154187390778329, 0.9096115069229735, 0.9211990858986423,
        0.920002688533405, 0.9188600618362683, 0.9235045032934532, 0.9188869471703185,
        0.9200430165344805, 0.9130864363489716, 0.9200699018685305, 0.9247143433257158,
        0.9235179459604786, 0.9339696195725231, 0.9281892727517139, 0.9328001075413361
    ])
    BEST_K = 7

    # Training
    clf = KNeighborsClassifier(n_neighbors=BEST_K)
    pca = PCA(n_components=100)
    pca_training_X = pca.fit_transform(training_X)
    pca_test_X = pca.transform(test_X)
    clf.fit(pca_training_X, training_y)

    # Testing
    predictions = clf.predict(pca_test_X)
    confusion_matrix = np.zeros((12, 12))
    for prediction, label in zip(predictions, test_y):
        confusion_matrix[prediction][label] += 1
    print('KNN Accuracy: ', accuracy_score(predictions, test_y))

    # Graphing
    visualize_KNN_k(train_error, val_error, 'KNN_K_Value_Plot')
    visualize_precision_recall(confusion_matrix, 'KNN_Precision_Recall_Plot',
                               'KNN Precision and Recall Values by Category')
def __init__(self):
    if not os.path.exists(DEV_PATH):
        preprocessing.run()

    logging.info('Preparing development set...', extra=d)
    self.devset = json.load(open(DEV_PATH))
    logging.info('Preparing trainset...', extra=d)
    self.trainset = json.load(open(TRAIN_PATH))

    logging.info('Preparing word2vec...', extra=d)
    self.word2vec = features.init_word2vec()
    # self.glove, self.voc2id, self.id2voc = features.init_glove()

    logging.info('Preparing elmo...', extra=d)
    self.trainidx, self.trainelmo, self.devidx, self.develmo = features.init_elmo()

    self.tfidf = {}
    self.dict = Dictionary()
def main(config_file):
    """Entry point if called as an executable"""
    config = config_dataset.config_dataset(config_file)

    in_grm = os.path.join(config.grm_dir, 'grm-all', 'all')
    out_file_pca = os.path.join(config.grm_dir, 'grm-all', 'all.pca')
    in_grm_filtered = os.path.join(config.grm_dir,
                                   'grm-all-' + str(config.grm_cutoff),
                                   'all-' + str(config.grm_cutoff))
    out_file_pca_filtered = os.path.join(config.grm_dir,
                                         'grm-all-' + str(config.grm_cutoff),
                                         'all-' + str(config.grm_cutoff) + '.pca')
    nbpcs = 10

    if config.use_sbatch:
        rmode = "srun"
    else:
        rmode = "direct"

    # compute PCA for all individuals
    preprocessing.run([config.mygcta, config.gcta,
                       "--grm-bin", in_grm,
                       "--pca", str(nbpcs),
                       "--out", out_file_pca,
                       "--thread-num", str(config.nbproc)],
                      mode=rmode,
                      slurm_par=["-J", "gcta_pca",
                                 "-p", "common,dedicated",
                                 "--qos", "fast",
                                 "-c", str(config.nbproc)])
    plot_pca(out_file_pca)

    # compute PCA for unrelated individuals
    preprocessing.run([config.mygcta, config.gcta,
                       "--grm-bin", in_grm_filtered,
                       "--pca", str(nbpcs),
                       "--out", out_file_pca_filtered,
                       "--thread-num", str(config.nbproc)],
                      mode=rmode,
                      slurm_par=["-J", "gcta_pca",
                                 "-p", "common,dedicated",
                                 "--qos", "fast",
                                 "-c", str(config.nbproc)])
    plot_pca(out_file_pca_filtered)
def main(config_file):
    """Entry point if called as an executable"""
    config = config_dataset.config_dataset(config_file)

    # ========= 1. All SNPs =============
    in_dir_gwas_allsnps = os.path.join(config.gwa_dir, 'gwas-all')
    out_dir = os.path.join(config.hsq_dir, 'genesis')
    out_dir_log = os.path.join(out_dir, 'log')
    os.makedirs(out_dir_log, exist_ok=True)

    # slurm configuration
    if config.use_sbatch:
        mode = "sbatch"
    else:
        mode = "direct"

    for pheno in config.phe_list:
        assoc_file = os.path.join(in_dir_gwas_allsnps, "all." + pheno + ".assoc.linear")
        res_file = os.path.join(out_dir, "all." + pheno + ".RData")
        if not os.path.exists(assoc_file):
            print("Warning: {} not found.".format(assoc_file))
            continue
        cmd = ["Rscript",
               os.path.join(os.path.dirname(os.path.abspath(__file__)), 'genesis.R'),
               assoc_file, res_file, str(config.nbproc)]
        slurm_par = ["-J", "genesis",
                     "--qos", "ghfc",
                     "-p", "ghfc",
                     # "-p", "common",
                     "-D", out_dir_log,
                     "-o", "all." + pheno + "-%j.out",
                     "-e", "all." + pheno + "-%j.out",
                     "-c", str(config.nbproc),
                     "--mem", "4G"]
        preprocessing.run(cmd, mode=mode, slurm_par=slurm_par)
def main():
    """Entry point if called as an executable"""
    in_grm_filtered = os.path.join(GRM_DIR,
                                   'grm-all-' + str(GRM_CUTOFF),
                                   'all-' + str(GRM_CUTOFF))
    out_file_pca = os.path.join(GRM_DIR,
                                'grm-all-' + str(GRM_CUTOFF),
                                'all-' + str(GRM_CUTOFF))
    nbpcs = 10

    if USE_SBATCH:
        rmode = "srun"
    else:
        rmode = "direct"

    preprocessing.run([MYGCTA, GCTA,
                       "--grm-bin", in_grm_filtered,
                       "--pca", str(nbpcs),
                       "--out", out_file_pca,
                       "--thread-num", str(NBPROC)],
                      mode=rmode,
                      slurm_par=["-c", str(NBPROC)])
def __init__(self, stop=True, lowercase=True, punctuation=True, w2v_dim=300):
    if not os.path.exists(DEV_PATH):
        preprocessing.run()

    self.w2v_dim = w2v_dim
    self.lowercase = lowercase
    self.stop = stop
    self.punctuation = punctuation

    logging.info('Preparing test set 2016...')
    self.testset2016 = json.load(open(TEST2016_PATH))
    self.test2016data = self.format_data(self.testset2016)

    logging.info('Preparing test set 2017...')
    self.testset2017 = json.load(open(TEST2017_PATH))
    self.test2017data = self.format_data(self.testset2017)

    logging.info('Preparing development set...')
    self.devset = json.load(open(DEV_PATH))
    self.devdata = self.format_data(self.devset)

    logging.info('Preparing trainset...')
    self.trainset = json.load(open(TRAIN_PATH))
    self.traindata = self.format_data(self.trainset)
    info = 'TRAIN DATA SIZE: ' + str(len(self.traindata))
    logging.info(info)

    self.word2vec = word2vec.init_word2vec(lowercase=self.lowercase,
                                           punctuation=self.punctuation,
                                           stop=self.stop,
                                           dim=self.w2v_dim)

    # additional data
    self.init_additional()
def svm():
    test_X, test_y, training_X, training_y = preprocessing.run()

    # Training
    clf = SVC(gamma='scale', decision_function_shape='ovo')
    pca = PCA(n_components=100)
    pca_training_X = pca.fit_transform(training_X)
    pca_test_X = pca.transform(test_X)
    clf.fit(pca_training_X, training_y)

    # Testing
    predictions = clf.predict(pca_test_X)
    confusion_matrix = np.zeros((12, 12))
    for prediction, label in zip(predictions, test_y):
        confusion_matrix[prediction][label] += 1
    print('SVM Accuracy: ', accuracy_score(predictions, test_y))

    # Graphing
    visualize_precision_recall(confusion_matrix, 'SVM_Precision_Recall_Plot',
                               'SVM Precision and Recall Values by Category')
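Both knn() and svm() above fill the confusion matrix with rows indexed by prediction and columns by true label, so per-class precision and recall fall out of row and column sums. A minimal sketch, assuming that same orientation:

import numpy as np

def precision_recall(confusion_matrix):
    # rows = predicted class, columns = true class, as built in knn()/svm()
    tp = np.diag(confusion_matrix)
    precision = tp / confusion_matrix.sum(axis=1)  # true positives over each predicted row
    recall = tp / confusion_matrix.sum(axis=0)     # true positives over each true-label column
    return precision, recall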
def extract(filename, inverse=False, resize=False):
    preprocessing.run(filename, inverse, resize)
    processing.run(filename)
    postprocessing.run(filename)
    return extraction.run(filename)
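A hypothetical invocation for reference; the filename and flag values are illustrative only, and the return type depends on what extraction.run yields:

result = extract('sample_image.png', inverse=True, resize=True)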
def main():
    """Entry point if called as an executable"""
    # initiate seed for pandas (but not for GCTA)
    np.random.seed(2017)

    gen_file = os.path.join(PRU_DIR, 'all')

    if USE_SBATCH:
        mode = "sbatch"
    else:
        mode = "direct"

    if not os.path.exists(PHE_DIR):
        os.makedirs(PHE_DIR)

    # load SNP table
    snp_table = pd.read_table(gen_file + ".bim", na_values=".", header=None).iloc[:, 1]
    snp_table.dropna(inplace=True)

    # load individual table
    ind_table = pd.read_table(PCS, sep=' ', dtype={'FID': str, 'IID': str}).loc[:, ['FID', 'IID']]

    # simulate heritable phenotypes with GCTA
    for hsq in HSQ:
        for n_snp in N_SNP:
            for n_ind in N_IND:
                out_dir = os.path.join(PHE_DIR,
                                       "hsq_" + str(hsq) + "-snp_" + str(n_snp) + "-ind_" + str(n_ind))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                os.makedirs(out_dir)

                phe_file = os.path.join(out_dir, "phe_\\i")
                snp_file = os.path.join(out_dir, "snp_\\i.txt")
                ind_file = os.path.join(out_dir, "ind_\\i.txt")

                print("Generating SNP and individual lists...")
                for i in range(1, N_ITER + 1):
                    # extract SNP subset
                    snp_file_i = snp_file.replace("\\i", str(i))
                    snp_list = snp_table.sample(n_snp)
                    snp_list.to_csv(snp_file_i, index=False)
                    # extract individual subset
                    ind_file_i = ind_file.replace("\\i", str(i))
                    ind_list = ind_table.sample(n_ind).sort_values(by=["FID", "IID"])
                    ind_list.to_csv(ind_file_i, index=False, sep='\t')

                print("Simulating phenotypes...")
                sys.stdout.flush()
                # simulate phenotypes with GCTA
                preprocessing.run([GCTA,
                                   "--bfile", gen_file,
                                   "--keep", ind_file,
                                   "--simu-qt",
                                   "--simu-hsq", str(hsq),
                                   "--simu-causal-loci", snp_file,
                                   "--out", phe_file],
                                  mode=mode,
                                  slurm_par=["-J", "simu_pheno", "--mem", "4G", "-D", out_dir, "-W"],
                                  array=range(1, N_ITER + 1))
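The "\\i" token in the file names above is a per-array-task placeholder: the Python loop substitutes it locally with str.replace, and the literal \i left in the GCTA arguments is presumably expanded to the task index by the preprocessing.run sbatch wrapper (an assumption; the wrapper's code is not shown here). The local substitution in isolation:

phe_file = "phe_\\i"                    # string containing the two characters \i
print(phe_file.replace("\\i", str(3)))  # -> phe_3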
def __init__(self, stop=True, vector='', lowercase=True, punctuation=True,
             proctrain=True, elmo_layer='top', w2vdim=300):
    if not os.path.exists(DEV_PATH):
        preprocessing.run()

    self.lowercase = lowercase
    self.stop = stop
    self.punctuation = punctuation
    self.w2vdim = w2vdim
    self.proctrain = proctrain
    self.vector = vector
    self.elmo_layer = elmo_layer

    logging.info('Preparing test set 2016...')
    self.testset2016 = json.load(open(TEST2016_PATH))
    self.test2016data, _, _, _ = self.format_data(self.testset2016)

    logging.info('Preparing test set 2017...')
    self.testset2017 = json.load(open(TEST2017_PATH))
    self.test2017data, _, _, _ = self.format_data(self.testset2017)

    logging.info('Preparing development set...')
    self.devset = json.load(open(DEV_PATH))
    self.devdata, _, _, _ = self.format_data(self.devset)

    logging.info('Preparing trainset...')
    self.trainset = json.load(open(TRAIN_PATH))
    self.traindata, self.voc2id, self.id2voc, self.vocabulary = self.format_data(self.trainset)
    info = 'TRAIN DATA SIZE: ' + str(len(self.traindata))
    logging.info(info)

    self.word2vec = None
    if 'word2vec' in self.vector:
        self.word2vec = word2vec.init_word2vec(lowercase=self.lowercase,
                                               punctuation=self.punctuation,
                                               stop=self.stop,
                                               dim=self.w2vdim)

    self.fasttext = None
    if 'fasttext' in self.vector:
        self.fasttext = fasttext.init_fasttext(lowercase=self.lowercase,
                                               punctuation=self.punctuation,
                                               stop=self.stop,
                                               dim=self.w2vdim)

    self.trainidx = self.trainelmo = self.devidx = self.develmo = None
    self.test2016idx = self.test2016elmo = self.test2017idx = self.test2017elmo = None
    if 'elmo' in self.vector:
        (self.trainidx, self.trainelmo, self.devidx, self.develmo,
         self.test2016idx, self.test2016elmo,
         self.test2017idx, self.test2017elmo) = elmo.init_elmo(lowercase=self.lowercase,
                                                               stop=self.stop,
                                                               punctuation=self.punctuation,
                                                               path=ELMO_PATH)

    self.alignments = self.init_alignments(ALIGNMENTS_PATH)

    # additional data
    self.init_additional()
def main():
    """Entry point if called as an executable"""
    if USE_SBATCH:
        mode = "sbatch"
    else:
        mode = "direct"

    gen_file = os.path.join(PRU_DIR, 'all')

    # estimate hsq with GCTA
    for hsq in HSQ:
        for n_snp in N_SNP:
            for n_ind in N_IND:
                phe_dir = os.path.join(PHE_DIR,
                                       "hsq_" + str(hsq) + "-snp_" + str(n_snp) + "-ind_" + str(n_ind))
                phe_file = os.path.join(phe_dir, "phe_\\i")
                ind_file = os.path.join(phe_dir, "ind_\\i.txt")

                print("Remove header of individual lists...")
                for i in range(1, N_ITER + 1):
                    ind_file_i = ind_file.replace("\\i", str(i))
                    phe_file_i = phe_file.replace("\\i", str(i))
                    source_file = open(ind_file_i, 'r')
                    source_file.readline()
                    target_file = open(phe_file_i + ".ind", 'w')
                    shutil.copyfileobj(source_file, target_file)
                    source_file.close()
                    target_file.close()

                out_dir_hsq = os.path.join(HSQ_DIR + "_gctb", os.path.basename(phe_dir))
                if os.path.exists(out_dir_hsq):
                    shutil.rmtree(out_dir_hsq)
                os.makedirs(out_dir_hsq)

                print("estimating hsq...")
                sys.stdout.flush()

                # estimate required memory
                nsnp = sum(1 for _ in open(gen_file + ".bim"))
                mem = 4 * nsnp * n_ind + nsnp * 500
                mem_str = str(math.ceil(mem / 1e6)) + "M"

                preprocessing.run([MYPLINK,
                                   'mpirun', '-np', str(NBPROC), '--oversubscribe',
                                   GCTB,
                                   "--bfile", gen_file,
                                   "--pheno", phe_file + ".phen",
                                   "--keep", phe_file + ".ind",
                                   "--qcovar", os.path.join(PHE_DIR, "age.txt"),
                                   "--qcovar", PCS,
                                   "--covar", os.path.join(PHE_DIR, "centre.txt"),
                                   "--covar", os.path.join(PHE_DIR, "sex.txt"),
                                   "--bayes", "S",
                                   "--out", os.path.join(out_dir_hsq, os.path.basename(phe_file))],
                                  mode=mode,
                                  slurm_par=["-J", "simu_hsq", "--mem", mem_str,
                                             "-c", str(NBPROC), "-D", out_dir_hsq, "-W"],
                                  array=range(1, N_ITER + 1),
                                  check=False)

                print("creating zip file...")
                sys.stdout.flush()
                shutil.make_archive(out_dir_hsq, "zip",
                                    os.path.dirname(out_dir_hsq),
                                    os.path.basename(out_dir_hsq))
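For a sense of scale of the memory heuristic above, with illustrative values nsnp = 500000 and n_ind = 10000 it gives mem = 4 * 500000 * 10000 + 500000 * 500 = 2.025e10, so mem_str = "20250M", roughly 20 GB per array task.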
def main():
    """Entry point if called as an executable"""
    in_prefix = 'all'
    in_file_allsnps = os.path.join(FIL_DIR, in_prefix)
    out_dir_gwas_allsnps = os.path.join(GWA_DIR, 'gwas-all')
    log_dir_gwas_allsnps = os.path.join(out_dir_gwas_allsnps, 'log')
    in_file_prunedsnps = os.path.join(PRU_DIR, in_prefix)
    out_dir_gwas_prunedsnps = os.path.join(GWA_DIR, 'gwas-pruned')
    log_dir_gwas_prunedsnps = os.path.join(out_dir_gwas_prunedsnps, 'log')

    # use sex, centre and age as covariates
    # filter individuals in centre.txt, keeping only those in the genotype file
    fam_table = pd.read_table(in_file_allsnps + ".fam", delim_whitespace=True,
                              names=['FID', 'IID', 'PID', 'MID', 'Gender', 'Phenotype'])
    centre_table = pd.read_table(os.path.join(PHE_DIR, "centre.txt"),
                                 delim_whitespace=True, index_col=False)
    centre_table = centre_table[centre_table.IID.isin(fam_table.IID)]
    centre_table.to_csv(os.path.join(PHE_DIR, "centre.cov"), sep='\t', index=False)

    # slurm configuration
    if USE_SBATCH:
        smode = "sbatch"
    else:
        smode = "direct"

    # All SNPs
    os.makedirs(log_dir_gwas_allsnps, exist_ok=True)
    for pheno in PHE_LIST:
        out_prefix = 'all.' + pheno
        preprocessing.run([MYPLINK, PLINK,
                           "--bfile", in_file_allsnps,
                           "--allow-no-sex",
                           "--linear", "hide-covar",
                           "--pheno", os.path.join(PHE_DIR, pheno + ".txt"),
                           "--qcovar", os.path.join(PHE_DIR, "age.txt"),
                           "--qcovar", PCS,
                           "--covar", os.path.join(PHE_DIR, "centre.cov"),
                           "--qcovar", os.path.join(PHE_DIR, "sex.txt"),
                           "--out", os.path.join(out_dir_gwas_allsnps, out_prefix)],
                          mode=smode,
                          slurm_par=["-J", "gwas", "-D", log_dir_gwas_allsnps])

    # Pruned SNPs
    os.makedirs(log_dir_gwas_prunedsnps, exist_ok=True)
    for pheno in PHE_LIST:
        out_prefix = 'all.' + pheno
        preprocessing.run([MYPLINK, PLINK,
                           "--bfile", in_file_prunedsnps,
                           "--allow-no-sex",
                           "--linear", "hide-covar",
                           "--pheno", os.path.join(PHE_DIR, pheno + ".txt"),
                           "--qcovar", os.path.join(PHE_DIR, "age.txt"),
                           "--qcovar", PCS,
                           "--covar", os.path.join(PHE_DIR, "centre.cov"),
                           "--qcovar", os.path.join(PHE_DIR, "sex.txt"),
                           "--out", os.path.join(out_dir_gwas_prunedsnps, out_prefix)],
                          mode=smode,
                          slurm_par=["-J", "gwas", "-D", log_dir_gwas_prunedsnps])
import preprocessing
from warehouse import Warehouse
import numpy as np
import mdptoolbox
import pandas as pd

# Step 1
# run only one time to create the transition probability matrix
preprocessing.run()
warehouse = Warehouse()
warehouse.save_tpm()

# Step 2
warehouse = Warehouse()
tpm = warehouse.get_tpm()
rewards_matrix = warehouse.rewards_matrix()

mdp_p = mdptoolbox.mdp.PolicyIteration(tpm, rewards_matrix, 0.9, max_iter=100)
mdp_v = mdptoolbox.mdp.ValueIteration(tpm, rewards_matrix, 0.9, max_iter=100)
mdp_p.run()
mdp_v.run()

result_p = warehouse.test_rl_policy(mdp_p.policy)
result_v = warehouse.test_rl_policy(mdp_v.policy)
print("ValueIteration Robot traveled: ", result_v[0])
value_iter_states = result_v[1]
print("PolicyIteration Robot traveled: ", result_p[0])
policy_iter_states = result_p[1]
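For readers unfamiliar with pymdptoolbox, the same PolicyIteration/ValueIteration calls can be tried on the library's bundled forest-management toy MDP (mdptoolbox.example.forest is part of the library; only the 0.9 discount and max_iter mirror the script above):

import mdptoolbox
import mdptoolbox.example

P, R = mdptoolbox.example.forest()  # small built-in MDP: 3 states, 2 actions (wait / cut)
pi = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, max_iter=100)
vi = mdptoolbox.mdp.ValueIteration(P, R, 0.9, max_iter=100)
pi.run()
vi.run()
print(pi.policy, vi.policy)  # the two solvers should agree on the optimal policy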
""" Atividade para trabalhar o pré-processamento dos dados. Criação de modelo preditivo para diabetes e envio para verificação de peformance no servidor. @author: Aydano Machado <*****@*****.**> """ import pandas as pd from sklearn.neighbors import KNeighborsClassifier import requests import preprocessing print('\n - Fazendo pre-processamento') feature_cols = preprocessing.run() print('\n - Lendo o arquivo com o dataset sobre diabetes') data = pd.read_csv('diabetes_dataset.csv') # Criando X and y par ao algorítmo de aprendizagem de máquina.\ print(' - Criando X e y para o algoritmo de aprendizagem a partir do arquivo diabetes_dataset') # Caso queira modificar as colunas consideradas basta algera o array a seguir. # feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', # 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'] X = data[feature_cols] y = data.Outcome # Ciando o modelo preditivo para a base trabalhada print(' - Criando modelo preditivo')
def run():
    X, y = preprocessing.run()
    clf = svm.SVC(gamma='scale', decision_function_shape='ovo')
    # for now, just run the function. eventually, we'll be iterating through
    # these, passing in different tuning parameters each time
    z = kfoldcv.svm5F(X, y)
    return clf.fit(X, y)
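The comment above sketches a future hyperparameter sweep. A minimal version using scikit-learn's own GridSearchCV, rather than the project's kfoldcv helper (whose signature is not shown here), could look like this; the grid values are illustrative:

from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Hypothetical tuning grid; the actual parameters to sweep are not specified above.
param_grid = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']}
search = GridSearchCV(svm.SVC(gamma='scale', decision_function_shape='ovo'),
                      param_grid, cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)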
'''
vae_sess = tf.Session()
rnn_sess = tf.Session()
actor_sess = tf.Session()

env = EnvWrap(FLAGS.init_frame_skip, FLAGS.frame_skip, FLAGS.env, FLAGS.renderGame)
vaegan = VAEGAN.VAEGAN(vae_sess)
rnn = RNN.RNN(rnn_sess)
actor = ACTOR.ACTOR(actor_sess)
mcts = MCTS.Tree(rnn, actor)
trainer = Trainer()

# If called, train the VAEGAN and RNN before the actor
if FLAGS.preprocessing:
    preprocessing.run(env, vaegan, trainer, rnn)

if FLAGS.playing:
    # Make the actor play and train VAEGAN, RNN and actor
    playing.run(env, vaegan, rnn, actor, trainer, mcts)
'''

def main():
    # Train alphazero using MCTS
    trainer.trainActor(mcts, vae, rnn, env, actor)

# Once the VAE and RNN have been trained, AlphaZero takes over: the current
# state is handed to the MCTS, which starts its search and, at every
# iteration, asks AlphaZero to evaluate the node.
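A minimal sketch of the loop those notes describe, using hypothetical Tree/Actor method names (make_root, select, evaluate, expand, backup, and best_action are placeholders; the real MCTS.Tree and ACTOR.ACTOR interfaces are not shown here):

def mcts_search(tree, actor, root_state, n_iter=100):
    # All method names below are hypothetical placeholders.
    root = tree.make_root(root_state)
    for _ in range(n_iter):
        node = tree.select(root)                    # walk down to a promising leaf
        value, priors = actor.evaluate(node.state)  # AlphaZero-style network evaluation
        tree.expand(node, priors)                   # add children weighted by the priors
        tree.backup(node, value)                    # propagate the value back to the root
    return tree.best_action(root)                   # e.g. the most-visited child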
def main(config_file):
    config = config_dataset.config_dataset(config_file)
    nbit = config.nbit
    nbproc = config.nbproc

    out_dir = config.permu_dir
    out_dir_log = os.path.join(out_dir, 'log')
    if not os.path.exists(out_dir_log):
        os.makedirs(out_dir_log)

    if config.use_sbatch:
        mode = "sbatch"
    else:
        mode = "direct"

    # Run permutations (reads snplist_files and permu_path from the enclosing scope)
    def run_permutations():
        in_file_pruned = os.path.join(config.pru_dir, 'all')

        # Estimate required memory in kilobytes
        nind = preprocessing.linecount(config.keep_ind)
        nsnp_tot = preprocessing.linecount(in_file_pruned + '.bim')
        nsnps = [preprocessing.linecount(f) for f in snplist_files]
        if nsnp_tot == sum(nsnps):
            ncomp = 1 + len(snplist_files)
        else:
            ncomp = 2 + len(snplist_files)
        mem = math.ceil(500000 + (0.025 + 0.009 * ncomp) * nind**2)
        sbatch_par_mem = str(mem) + 'K'

        array_lim = config.array_lim
        job_id = preprocessing.run_array(
            ['python3',
             os.path.join(os.path.dirname(os.path.abspath(__file__)), 'permutations.py'),
             '\\j' + '_' + '\\i',
             in_file_pruned,
             ",".join(snplist_files),
             permu_path,
             os.path.abspath(config_file)],
            mode=mode,
            slurm_par=["-J", "permu_pheno",
                       # "--qos", "fast",
                       "-p", "common",
                       "--mem", sbatch_par_mem,
                       "--cpus-per-task", str(nbproc),
                       "-D", out_dir_log],
            array=range(1, nbit + 1),
            array_limit=array_lim)
        return job_id

    part2jid = {}

    # margins = [0, 10, 20, 30, 40, 50]
    margins = [0, 20, 50]

    # genic non-genic
    for margin in margins:
        snplist_files = [os.path.join(config.grm_dir, 'grm-genic',
                                      'genic-margin' + str(margin) + '.snplist')]
        permu_path = os.path.join(out_dir, 'genic-margin' + str(margin))
        hsq_prefix = os.path.join("hsq-genic", "genic-margin" + str(margin))
        part2jid[hsq_prefix] = run_permutations()

        if margin > 0:
            snplist_files = [os.path.join(config.grm_dir, 'grm-genic', 'genic-margin0.snplist'),
                             os.path.join(config.grm_dir, 'grm-genic',
                                          'updown-margin' + str(margin) + '.snplist')]
            permu_path = os.path.join(out_dir, 'updown-margin' + str(margin))
            hsq_prefix = os.path.join("hsq-genic", "updown-margin" + str(margin))
            part2jid[hsq_prefix] = run_permutations()

    # cnsexpression non-cnsexpression non-genic
    snplist_files = [os.path.join(config.grm_dir, 'grm-cnsexpression', 'cnsexpression-margin50.snplist'),
                     os.path.join(config.grm_dir, 'grm-cnsexpression', 'noncnsexpression-margin50.snplist')]
    permu_path = os.path.join(out_dir, "cnsexpression-margin50")
    hsq_prefix = os.path.join("hsq-cnsexpression", "cnsexpression-margin50")
    part2jid[hsq_prefix] = run_permutations()

    # neurodev non-neurodev non-genic
    snplist_files = [os.path.join(config.grm_dir, 'grm-neurodev', 'neurodev-margin50.snplist'),
                     os.path.join(config.grm_dir, 'grm-neurodev', 'nonneurodev-margin50.snplist')]
    permu_path = os.path.join(out_dir, "neurodev-margin50")
    hsq_prefix = os.path.join("hsq-neurodev", "neurodev-margin50")
    part2jid[hsq_prefix] = run_permutations()

    # maf
    maf_intervals = config.maf_intervals
    snplist_files = [os.path.join(config.grm_dir, 'grm-maf', 'maf{}-{}.snplist'.format(*maf_int))
                     for maf_int in maf_intervals]
    permu_path = os.path.join(out_dir, "maf")
    hsq_prefix = os.path.join("hsq-maf", "maf")
    part2jid[hsq_prefix] = run_permutations()

    # == Once permutations are done, compute z-scores and p-values for the different partitions ==
    # compute the p-values with sbatch
    for partition, jid in part2jid.items():
        cmd = ["python3",
               os.path.join(os.path.dirname(os.path.abspath(__file__)), 'permutations_zscores.py'),
               os.path.abspath(config_file), partition, out_dir]
        slurm_par = ["-J", "zscores_" + partition,
                     "--qos", "fast",
                     "-p", "dedicated",
                     # "-p", "common",
                     "-D", out_dir_log,
                     "--mem", "2G",
                     "--dependency", "afterany:" + jid]
        preprocessing.run(cmd, mode=mode, slurm_par=slurm_par)
def main(config_file):
    """Entry point if called as an executable"""
    config = config_dataset.config_dataset(config_file)

    # numit = 10000
    # burnin = 5000
    seed = 333
    ndist = 4
    gpin = [0., 0.00001, 0.0001, 0.001]

    # slurm configuration
    if config.use_sbatch:
        mode = "sbatch"
    else:
        mode = "direct"
    ncpus = config.nbproc

    out_dir = os.path.join(config.hsq_dir, 'bayesR')
    log_dir = os.path.join(out_dir, 'log')
    tmp_dir = os.path.join(out_dir, 'plink')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    in_files = {"all": os.path.join(config.fil_dir, 'all'),
                "pruned": os.path.join(config.pru_dir, 'all')}

    for key in in_files:
        in_file = in_files[key]
        for pheno in config.phe_list:
            pheno_file = os.path.join(config.phe_dir, pheno + '.txt')
            tmp_in_file = os.path.join(tmp_dir, key + "." + pheno)
            out_file = os.path.join(out_dir, key + "." + pheno)

            data = pd.read_table(pheno_file)
            data.rename(columns={data.columns[2]: "pheno"}, inplace=True)
            for qcov in config.quant_covar:
                data = data.merge(pd.read_table(qcov, sep=r'\s+'))
            for cov in config.qual_covar:
                data = data.merge(pd.read_table(cov, sep=r'\s+'))
            data.set_index(["FID", "IID"], inplace=True)

            model = smf.ols(formula='pheno~' + "+".join(data.columns.difference(["pheno"])),
                            data=data).fit()
            resid_file = tmp_in_file + ".resid.txt"
            model.resid.to_csv(resid_file, sep=" ")

            preprocessing.run([config.plink,
                               "--bfile", in_file,
                               "--keep", config.keep_ind,
                               "--pheno", resid_file,
                               "--make-bed",
                               "--out", tmp_in_file])

            # Estimate required memory in megabytes
            nind = preprocessing.linecount(tmp_in_file + '.fam')
            nsnp = preprocessing.linecount(tmp_in_file + '.bim')
            mem = math.ceil((1 + 2e-6 * nind * nsnp) * 1.1)
            sbatch_par_mem = str(mem) + 'M'

            cmd = [config.bayesrv2,
                   "-bfile", tmp_in_file,
                   "-nthreads", str(ncpus),
                   "-ndist", str(ndist),
                   "-gpin", ",".join(map(str, gpin)),
                   "-out", out_file,
                   # "-numit", str(numit),
                   # "-burnin", str(burnin),
                   "-seed", str(seed)]
            slurm_par = ["-J", "bayesR_" + key,
                         "--qos", "ghfc",
                         "-p", "ghfc",
                         "-c", str(ncpus),
                         "-D", log_dir,
                         "-o", key + "." + pheno + "-%j.out",
                         "-e", key + "." + pheno + "-%j.out",
                         "--mem", sbatch_par_mem]
            jid = preprocessing.run(cmd, mode=mode, slurm_par=slurm_par)

            if config.clean_permu:
                cmd = ["rm", tmp_in_file + ".*"]
                slurm_par = ["-J", "clean_bayesR",
                             "-p", "common,dedicated",
                             "--qos", "fast",
                             "-D", log_dir,
                             "-o", "clean." + key + "." + pheno + "-%j.out",
                             "-e", "clean." + key + "." + pheno + "-%j.out",
                             "--mem", "500M",
                             "--dependency", "afterany:" + jid]
                preprocessing.run(cmd, mode=mode, slurm_par=slurm_par)
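The covariate-adjustment step above fits pheno ~ covariates with statsmodels and keeps the residuals as the new phenotype for BayesR. A minimal standalone illustration (toy data and column names are hypothetical):

import pandas as pd
import statsmodels.formula.api as smf

df = pd.DataFrame({"pheno": [1.0, 2.1, 2.9, 4.2],
                   "age":   [30, 40, 50, 60]})
model = smf.ols(formula="pheno ~ age", data=df).fit()
adjusted = model.resid  # covariate-free phenotype, analogous to resid_file above
print(adjusted)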
def main(config_file):
    """Entry point if called as an executable"""
    config = config_dataset.config_dataset(config_file)

    in_prefix = 'all'
    in_file_allsnps = os.path.join(config.fil_dir, in_prefix)
    out_dir_gwas_allsnps = os.path.join(config.gwa_dir, 'gwas-all')
    log_dir_gwas_allsnps = os.path.join(out_dir_gwas_allsnps, 'log')
    in_file_prunedsnps = os.path.join(config.pru_dir, in_prefix)
    out_dir_gwas_prunedsnps = os.path.join(config.gwa_dir, 'gwas-pruned')
    log_dir_gwas_prunedsnps = os.path.join(out_dir_gwas_prunedsnps, 'log')

    # use sex, centre and age as covariates
    # Create dummy-coded centre table
    if not os.path.isfile(os.path.join(config.phe_dir, "centre.cov")):
        # filter individuals in centre.txt, keeping only those in the genotype file
        fam_table = pd.read_table(in_file_allsnps + ".fam", delim_whitespace=True,
                                  names=['FID', 'IID', 'PID', 'MID', 'Gender', 'Phenotype'])
        centre_table = pd.read_table(os.path.join(config.phe_dir, "centre.txt"),
                                     delim_whitespace=True, index_col=False)
        centre_table = centre_table[centre_table.IID.isin(fam_table.IID)]
        centre_table.to_csv(os.path.join(config.phe_dir, "centre.cov"), sep='\t', index=False)

    # slurm configuration
    if config.use_sbatch:
        smode = "sbatch"
    else:
        smode = "direct"

    # All SNPs
    os.makedirs(log_dir_gwas_allsnps, exist_ok=True)
    for pheno in config.phe_list:
        out_prefix = 'all.' + pheno
        preprocessing.run([config.myplink, config.plink,
                           "--bfile", in_file_allsnps,
                           "--allow-no-sex",
                           "--linear", "hide-covar",
                           "--pheno", os.path.join(config.phe_dir, pheno + ".txt"),
                           "--qcovar", os.path.join(config.phe_dir, "age.txt"),
                           "--qcovar", config.pcs,
                           "--covar", os.path.join(config.phe_dir, "centre.cov"),
                           "--qcovar", os.path.join(config.phe_dir, "sex.txt"),
                           "--ci", str(0.95),
                           "--out", os.path.join(out_dir_gwas_allsnps, out_prefix)],
                          mode=smode,
                          slurm_par=["-J", "gwas", "-D", log_dir_gwas_allsnps])

    # Pruned SNPs
    os.makedirs(log_dir_gwas_prunedsnps, exist_ok=True)
    for pheno in config.phe_list:
        out_prefix = 'all.' + pheno
        preprocessing.run([config.myplink, config.plink,
                           "--bfile", in_file_prunedsnps,
                           "--allow-no-sex",
                           "--linear", "hide-covar",
                           "--pheno", os.path.join(config.phe_dir, pheno + ".txt"),
                           "--qcovar", os.path.join(config.phe_dir, "age.txt"),
                           "--qcovar", config.pcs,
                           "--covar", os.path.join(config.phe_dir, "centre.cov"),
                           "--qcovar", os.path.join(config.phe_dir, "sex.txt"),
                           "--ci", str(0.95),
                           "--out", os.path.join(out_dir_gwas_prunedsnps, out_prefix)],
                          mode=smode,
                          slurm_par=["-J", "gwas", "-D", log_dir_gwas_prunedsnps])
def run():
    X, y = preprocessing.run()
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
    distances, indices = nbrs.kneighbors(X)
    return distances, indices
import preprocessing
import learning
from utils import load_submission, load_train_df
from configuration import CONFIG

if __name__ == "__main__":
    # Create train and meteo preprocessed files
    preprocessing.run()

    # Define a model
    from sklearn.neighbors import KNeighborsRegressor
    _df = load_train_df(CONFIG.preprocessed_train_path)
    _submission_df = load_submission()
    _estimator = KNeighborsRegressor(n_neighbors=4, weights='distance')
    # estimator = LogisticRegression()
    _scoring = 'mean_squared_error'
    _k_fold = 3
    _n_jobs = 3
    _verbose = 0
    _fit_params = None
    _cols = ["YEAR", "WEEK_NUMBER", "WEEK_DAY", "TIME"]
    _weights = [1, 1, 1, 0.1]

    # Test the model
    print(learning.cross_val_score(_estimator, _cols, _k_fold, _weights, _scoring,
                                   _n_jobs, _verbose, _fit_params, chunksize=100000))

    # Create the corresponding submission file
    learning.create_submission_file(_estimator, _cols, weights=_weights)
import preprocessing
import sys
import socket

if __name__ == "__main__":
    if len(sys.argv) == 3:
        preprocessing.run(sys.argv[1], sys.argv[2])
    elif len(sys.argv) == 2:
        ip_addr = socket.gethostbyname(socket.getfqdn())
        preprocessing.run(ip_addr, 8000, sys.argv[1])
    else:
        raise Exception("Correct argument form not supplied")
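The dispatch above implies two invocation forms; the argument meanings are inferred from the calls, so treat them as assumptions:

# python <script>.py <host> <port>   -> preprocessing.run(host, port)
# python <script>.py <arg>           -> preprocessing.run(local_ip, 8000, arg)

Note that the two branches call preprocessing.run with different numbers of arguments, so the function presumably accepts an optional third parameter.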