# ---- Beispiel #1 (example 1) ----
	def __init__(self, config_file, include_vcf_features=False):
		"""Initialise training: resolve output dirs and make sure the
		preprocessed JARVIS data pickle exists, building it when missing.

		Args:
			config_file: path to the run configuration consumed by
				custom_utils.get_config_params.
			include_vcf_features: stored flag; not otherwise used in this
				constructor.

		NOTE(review): `predict_on_test_set` and `test_indexes` are read from
		enclosing/module scope, not parameters — confirm they are defined
		before this class is instantiated.
		"""
		self.config_file = config_file
		self.include_vcf_features = include_vcf_features		

		config_params = custom_utils.get_config_params(config_file)
		self.win_len = config_params['win_len']
		#self.win_len = int(config_params['win_len'] / 2)

		# Sets self.out_dir / self.ml_data_dir (method defined elsewhere).
		self.init_ouput_dirs()
		print(self.out_dir)


		# Default pickle path; when predicting on a held-out slice, the
		# pickle name embeds the test index range instead.
		jarvis_pkl_file = self.ml_data_dir + '/jarvis_data.pkl'
		if predict_on_test_set:
			jarvis_pkl_file = self.ml_data_dir + '/jarvis_data.' + str(test_indexes[0]) + '_' + str(test_indexes[1]) + '.pkl'
		self.data_dict_file = jarvis_pkl_file
		
		
		# @anchor -- REDUNDANT check: the prepare_data module is already called prior to training
		if not os.path.exists(self.data_dict_file): 
			print("\nPreparing training data - calling JarvisDataPreprocessing object...")
			data_preprocessor = JarvisDataPreprocessing(config_file, predict_on_test_set=predict_on_test_set, test_indexes=test_indexes)
			
			# Extract raw sequences from input variant windows and combine with original feature set         
			additional_features_df, filtered_onehot_seqs = data_preprocessor.compile_feature_table_incl_raw_seqs()        
			# Merge data, transform into form appropriate for DNN training and save into file
			self.data_dict_file = data_preprocessor.transform_and_save_data(additional_features_df, filtered_onehot_seqs)         
		print(self.data_dict_file)
	def __init__(self, config_file, genomic_class):
		"""Initialise benchmark/plotting state for one genomic class.

		Parses the run configuration, prepares an empty per-metric table
		registry, and builds the colour palette used for plots.
		"""
		# Parse run parameters (validates config_file; value kept locally only).
		params = custom_utils.get_config_params(config_file)

		self.genomic_class = genomic_class
		self.tables_per_metric = {}

		# Default seaborn palette extended with the "Paired" palette,
		# also materialised as hex strings for libraries that need them.
		palette = sns.color_palette() + sns.color_palette("Paired")
		self.current_palette = palette
		self.hex_colors = [matplotlib.colors.to_hex(color) for color in palette]
# ---- Beispiel #3 (example 3) ----
    def __init__(self, config_file, input_features, chrom, NTHREADS=20):
        """Initialise preprocessing for one chromosome.

        Reads run parameters from the config file, creates the output
        directory tree under the run's ml_data directory, and records the
        path of the 2bit reference genome matching the configured assembly.

        Args:
            config_file: path to the run configuration (YAML).
            input_features: identifier of the input feature set.
            chrom: chromosome to process (used for per-chromosome dirs).
            NTHREADS: worker-thread count for downstream steps (default 20).
        """
        print("Initialising new JarvisDataPreprocessing object...")

        self.input_features = input_features
        self.chrom = chrom
        self.NTHREADS = NTHREADS

        # ==== Read config parameters ====
        config_params = custom_utils.get_config_params(config_file)
        self.hg_version = config_params['hg_version']
        print('\n\nhg_version:', self.hg_version)
        # Map UCSC assembly names to GRC build numbers.
        self.grch = {'hg19': '37', 'hg38': '38'}

        pathogenic_set = config_params['pathogenic_set']
        benign_set = config_params['benign_set']

        self.patho_benign_sets = pathogenic_set + '_' + benign_set
        self.win_len = config_params['win_len']
        #self.win_len = int(config_params['win_len'] / 2)
        self.Y_label = config_params['Y_label']

        # ==== Define dir structure ====
        out_dir = custom_utils.create_out_dir(config_file)

        self.ml_data_dir = out_dir + '/ml_data'
        self.seq_out_dir = self.ml_data_dir + '/raw_seq'
        self.feature_tables_dir = self.ml_data_dir + '/clinvar_feature_tables'
        self.jarvis_predictions_dir = self.ml_data_dir + '/jarvis_predictions'
        self.jarvis_predictions_per_chr_dir = (
            self.jarvis_predictions_dir + '/chr' + str(self.chrom))

        # exist_ok=True is idempotent and avoids the check-then-create race
        # of the os.path.exists() + os.makedirs() pattern.
        for dir_path in (self.ml_data_dir,
                         self.seq_out_dir,
                         self.feature_tables_dir,
                         self.jarvis_predictions_dir,
                         self.jarvis_predictions_per_chr_dir):
            os.makedirs(dir_path, exist_ok=True)

        # Specify input (static) files
        grch_build = self.grch[self.hg_version]
        self.human_ref_genome_2bit = ('../' + self.hg_version
                                      + '/homo_sapiens_GRCh' + grch_build
                                      + '_FASTA/hsa' + grch_build + '.2bit')


if __name__ == '__main__':

	startTime = datetime.now()

	# CLI: <chrom> <config_file> <single_nt_offset>
	args = sys.argv
	chrom = args[1]
	config_file = args[2] #'config.yaml'
	single_nt_offset = int(args[3])   # 1 to (win_len-1)



	# Read run parameters from config file and store into a dictionary
	config_params = get_config_params(config_file)
	print(config_params)
	hg_version = config_params['hg_version']
	# Map UCSC assembly names to GRC build numbers.
	grch = {'hg19': '37', 'hg38': '38'}

	genomic_classes_files = {}
	print('cwd:', os.getcwd())

	# Tab-separated index file: <genomic_class>\t<path>\t<ignored>;
	# build a genomic_class -> file-path lookup.
	with open(config_params['genomic_classes']) as fh:
		for line in fh:
			line = line.rstrip()
			genomic_class, cur_path, _ = line.split('\t')
			genomic_classes_files[genomic_class] = cur_path


	# ==================== Initialisation ====================
# ---- Beispiel #5 (example 5) ----
    # CLI: <config_file> <input_features> <genomic_classes(csv)>
    #      <use_fixed_cv_batches(0/1)> <cv_repeats>
    config_file = sys.argv[1]
    input_features = sys.argv[2]
    genomic_classes = sys.argv[3]  # comma-separated
    genomic_classes = genomic_classes.split(',')
    use_fixed_cv_batches = bool(int(sys.argv[4]))
    cv_repeats = int(sys.argv[5])
    include_vcf_features = False
    test_indexes = []

    # ----------------------
    train_with_cv = False  # get generalised performance with cross-validation

    # Second dot-separated token of the config file's basename,
    # e.g. "config.foo.yaml" -> "foo".
    config_suffix = re.split("\.", config_file.split('/')[-1])[1]
    print('config_suffix:', config_suffix)

    run_params = custom_utils.get_config_params(config_file)

    # [Note]: predict_on_test_set is __redundant__ and __deprecated__
    #predict_on_test_set = bool(run_params['predict_on_test_set'])
    predict_on_test_set = False

    # -- Compatible only with: train_with_cv = True
    # *************************************
    use_pathogenicity_trained_model = True
    use_conservation_trained_model = False
    # *************************************

    # sanity check: pathogenicity-trained models require cross-validation
    if use_pathogenicity_trained_model:
        train_with_cv = True