# Contact file type taken from the second command-line argument:
# "contacts.gz" or "oe.gz".
# Example values for manual runs: chr_num="12,13,14", conttype="contacts.gz"
conttype = sys.argv[2]

# Configure the root logger exactly once, with the intended format.
# logging.basicConfig() does nothing when the root logger already has
# handlers, so the previous bare basicConfig(level=DEBUG) at this point made
# the formatted call inside the loop below a silent no-op and the timestamp
# format was never applied.
logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                    datefmt='%I:%M:%S',
                    level=logging.DEBUG)

if __name__ == '__main__':  # Required for parallelization, at least on Windows
    for conttype in [conttype]:
        input_folder = "input/K562/"
        output_folder = "output/K562/"
        cell_type = "K562"

        params = Parameters()
        params.window_size = 25000  # region around a contact that is binned for predictors
        params.mindist = 50001  # minimum distance between contacting regions
        params.maxdist = 1500000  # maximum distance between contacting regions
        params.sample_size = 100  # number of contacts to write to the file
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False  # set True to use only CTCF sites with orientation
        # Changes the proportion of contacts with nearest CTCF sites in the
        # training datasets; alternative value: "cont_with_CTCF".
        params.use_only_contacts_with_CTCF = "no"
        # Set True when training on several chromosomes: needed to write
        # different chromosomes into the same file.
        write_all_chrms_in_file = True
        fill_empty_contacts = False
# Contact file type comes from the second command-line argument
# (for a manual run: chr_num="12", conttype="contacts.gz").
conttype = sys.argv[2]

if __name__ == "__main__":  # Required for parallelization, at least on Windows
    for conttype in (conttype,):
        print("hello")
        logging.basicConfig(
            format='%(asctime)s %(name)s: %(message)s',
            datefmt='%I:%M:%S',
            level=logging.DEBUG,
        )

        # Data locations and cell type for this run.
        input_folder = "/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/input/K562/"
        output_folder = "/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/out/K562/5KB/all_predictors/"
        cell_type = "K562"
        # NOTE(review): presumably per-chromosome sizes/counts - confirm units with the consumer.
        lengths_dict = {
            'chr1': 1494930,
            'chr3': 609806,
            'chr5': 518646,
            'chr7': 682860,
            'chr11': 726290,
            'chr13': 115324,
        }

        params = Parameters()
        # Sequence resolution of the contacts data; used to find the
        # normalization coefficient file.
        params.binsize = 5000
        # Region around a contact that is binned for predictors; usually
        # equal to binsize.
        params.window_size = params.binsize
        # Minimum distance between contacting regions.
        params.mindist = 2 * params.binsize + 1
        params.maxdist = 1500000
        # Number of contacts to write to the file.
        params.sample_size = 250000
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False
        # Either "all_cont" or "cont_with_CTCF".
        params.use_only_contacts_with_CTCF = "all_cont"

        # Set True to write one training file covering several chromosomes.
        write_all_chrms_in_file = False
        # Set True to use all contacts in the region, without empty contacts.
        fill_empty_contacts = False

        logging.getLogger(__name__).debug("Using input folder " + input_folder)
import pandas as pd
import os

if __name__ == "__main__":  # Required for parallelization, at least on Windows
    # Run the whole pipeline once per contact file type.
    for conttype in ("contacts.gz", "oe.gz"):
        logging.basicConfig(
            format='%(asctime)s %(name)s: %(message)s',
            datefmt='%I:%M:%S',
            level=logging.DEBUG,
        )

        input_folder = "input/Hepat/"
        output_folder = "out/Hepat/validating_chrms/"

        params = Parameters()
        # Region around a contact that is binned for predictors.
        params.window_size = 25000
        # Minimum distance between contacting regions.
        params.mindist = 50001
        # Maximum distance between contacting regions.
        params.maxdist = 1500000
        # Number of contacts to write to the file.
        params.sample_size = 25000
        params.conttype = conttype
        params.max_cpus = 12

        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
        params.contacts_reader = ContactsReader()
if __name__ == "__main__":
    # Parse the command line and unpack the options used below.
    parser = createParser()
    namespace = parser.parse_args(sys.argv[1:])

    RNA_seq_file = namespace.RNA_seq_file
    CTCF_file = namespace.CTCF_file
    CTCF_orient_file = namespace.CTCF_orient_file
    chr = namespace.chr
    interval_start = namespace.interval_start
    interval_end = namespace.interval_end
    resolution = int(namespace.resolution)
    model_path = namespace.model_path
    out_file = namespace.out_file

    params = Parameters()
    # Region around a contact that is binned for predictors
    # (resolution is already an int here).
    params.window_size = resolution
    # Minimum distance between contacting regions.
    params.mindist = 2 * resolution + 1
    # Maximum distance between contacting regions.
    params.maxdist = 1500000
    params.max_cpus = 1
    # params.keep_only_orient = False  # set True to use only CTCF sites with orientation
    params.multiprocessing = False
    # params.write_to_file = False

    # Read CTCF data.
    #   CTCF_file format: ENCODE narrow peak
    #   CTCF_orient_file format: chr--start--end--name--score--strand
    logging.info('create CTCF_PG')
    # Path to the CTCF ChIP-seq file comes from the command line.
    params.ctcf_reader = ChiPSeqReader(CTCF_file, name="CTCF")
# Contact file type comes from the second command-line argument
# (for a manual run: chr_num="12", conttype="contacts.gz").
conttype = sys.argv[2]

if __name__ == "__main__":  # Required for parallelization, at least on Windows
    for conttype in (conttype,):
        print("hello")
        logging.basicConfig(
            format='%(asctime)s %(name)s: %(message)s',
            datefmt='%I:%M:%S',
            level=logging.DEBUG,
        )

        # Data locations and cell type for this run.
        input_folder = "/mnt/scratch/ws/psbelokopytova/201907031108polinaB/3DPredictor/input/NPC/"
        output_folder = "/mnt/scratch/ws/psbelokopytova/201907031108polinaB/3DPredictor/out/NPC/5KB/"
        cell_type = "NPC"

        params = Parameters()
        # Region around a contact that is binned for predictors.
        params.window_size = 5000
        # Minimum distance between contacting regions.
        params.mindist = 10001
        params.maxdist = 1500000
        # Number of contacts to write to the file.
        params.sample_size = 1
        params.conttype = conttype
        params.max_cpus = 11
        params.keep_only_orient = False
        # Either "cont_with_CTCF" or "all_cont".
        params.use_only_contacts_with_CTCF = "cont_with_CTCF"

        write_all_chrms_in_file = True
        fill_empty_contacts = False

        logging.getLogger(__name__).debug("Using input folder " + input_folder)
from DataGenerator import generate_data
from PredictorGenerators import (
    E1PredictorGenerator,
    ChipSeqPredictorGenerator,
    SmallChipSeqPredictorGenerator,
    SmallE1PredictorGenerator,
    SitesOrientPredictorGenerator,
    OrientBlocksPredictorGenerator,
    SitesOnlyOrientPredictorGenerator,
)

# Remember when the run started.
start_time = time.time()

logging.basicConfig(
    format='%(asctime)s %(name)s: %(message)s',
    datefmt='%I:%M:%S',
    level=logging.DEBUG,
)

input_folder = "/home/evgeniy/asp/3Dpredictor/input/"
output_folder = "/home/evgeniy/asp/3Dpredictor/out/"

params = Parameters()
# Region around a contact that is binned for predictors.
params.window_size = 25000
# Minimum distance between contacting regions.
params.mindist = 50001
# Maximum distance between contacting regions.
params.maxdist = 1000000
# Number of contacts to write to the file.
params.sample_size = 500
params.conttype = "oe.gz"

# Output file names embed the string form of the parameter set.
training_file_name = "".join(
    ("2018-09-23-trainingOrient.RandOnChr1.", str(params), ".txt")
)
validation_file_name = "".join(("validatingOrient.", str(params), ".txt"))

logging.getLogger(__name__).debug("Using input folder " + input_folder)

# Read contacts data
# Unpack run settings from the `args` mapping.
input_folder = args['input_folder']
output_folder = args['output_folder']
cell_type = args['cell_type']
start = int(args['start'])
end = int(args['end'])
chromosome = 'chr' + args['chr_num']
hic_name = args['hic_name']
CTCF_file_name = args['CTCF_file_name']
# Currently unused options: args['RNA_file_name'], args['validate_chrs'].

params = Parameters()
# Sequence resolution of the contacts data; used to find the normalization
# coefficient file.
params.binsize = int(args['binsize'])
# Region around a contact that is binned for predictors; usually equal to
# binsize.
params.window_size = params.binsize
# Minimum distance between contacting regions.
params.mindist = 2 * params.binsize + 1
params.maxdist = 1500000
# Number of contacts to write to the file (alternative: end - start).
params.sample_size = 2
params.max_cpus = int(args['max_cpus'])
params.keep_only_orient = False
# Either "all_cont" or "cont_with_CTCF".
params.use_only_contacts_with_CTCF = "all_cont"

rearrangement = False
# deletion = Interval("chr" + chr_num, start, end)
logging.basicConfig(
    format='%(asctime)s %(name)s: %(message)s',
    datefmt='%I:%M:%S',
    level=logging.DEBUG,
)

# Data locations and cell type for this run.
input_folder = "/mnt/scratch/ws/psbelokopytova/202002281332polina_data_2019/3DPredictor/input/"
output_folder = "/mnt/scratch/ws/psbelokopytova/202002281332polina_data_2019/3DPredictor/out/mast_cells/"
cell_type = "mast_cells"
# NOTE(review): presumably per-chromosome sizes/counts - confirm units with the consumer.
lengths_dict = {
    'chr1': 1494930,
    'chr3': 609806,
    'chr5': 518646,
    'chr7': 682860,
    'chr11': 726290,
    'chr13': 115324,
}

params = Parameters()
# Sequence resolution of the contacts data; used to find the normalization
# coefficient file.
params.binsize = 1000
# Region around a contact that is binned for predictors; usually equal to
# binsize.
params.window_size = params.binsize
# Minimum distance between contacting regions.
params.mindist = 2 * params.binsize + 1
params.maxdist = 1500000
# Number of contacts to write to the file.
params.sample_size = 500000
params.conttype = conttype
params.max_cpus = 11
params.keep_only_orient = False
# Either "all_cont" or "cont_with_CTCF".
params.use_only_contacts_with_CTCF = "all_cont"

# Set True to write one training file covering several chromosomes.
write_all_chrms_in_file = False
# Set True to use all contacts in the region, without empty contacts.
fill_empty_contacts = True

logging.getLogger(__name__).debug("Using input folder " + input_folder)
logging.basicConfig(
    format='%(asctime)s %(name)s: %(message)s',
    datefmt='%I:%M:%S',
    level=logging.DEBUG,
)

# Data locations and cell type for this run.
# NOTE(review): output goes to ".../out/H1/" while cell_type is "K562" -
# confirm that this combination is intended.
input_folder = "/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/input/"
output_folder = "/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/out/H1/"
cell_type = "K562"
# NOTE(review): presumably per-chromosome sizes/counts - confirm units with the consumer.
lengths_dict = {
    'chr1': 1494930,
    'chr3': 609806,
    'chr5': 518646,
    'chr7': 682860,
    'chr11': 726290,
    'chr13': 115324,
}

params = Parameters()
# Sequence resolution of the contacts data; used to find the normalization
# coefficient file.
params.binsize = 1000
# Region around a contact that is binned for predictors; usually equal to
# binsize.
params.window_size = params.binsize
# Minimum distance between contacting regions.
params.mindist = 2 * params.binsize + 1
params.maxdist = 1500000
# Number of contacts to write to the file.
params.sample_size = 500000
params.conttype = conttype
params.max_cpus = 11
params.keep_only_orient = False
# Either "all_cont" or "cont_with_CTCF".
params.use_only_contacts_with_CTCF = "all_cont"

# Set True to write one training file covering several chromosomes.
write_all_chrms_in_file = False
# Set True to use all contacts in the region, without empty contacts.
fill_empty_contacts = False

logging.getLogger(__name__).debug("Using input folder " + input_folder)
1] #comma separated number of chromosomes for predictor generation chr_nums = chr_num.split(",") conttype = sys.argv[2] #contacts.gz or oe.gz # chr_num="12,13,14" # conttype = "contacts.gz" if __name__ == '__main__': #Requiered for parallelization, at least on Windows for conttype in [conttype]: logging.basicConfig(format='%(asctime)s %(name)s: %(message)s', datefmt='%I:%M:%S', level=logging.DEBUG) input_folder = "/input/K562/" output_folder = "/output/K562/" cell_type = "K562" params = Parameters() params.window_size = 5000 #region around contact to be binned for predictors params.mindist = 10001 #minimum distance between contacting regions params.maxdist = 1500000 #maximum distance between contacting regions params.sample_size = 30000 #how many contacts write to file params.conttype = conttype params.max_cpus = 11 params.keep_only_orient = False #set True if you want use only CTCF with orient params.use_only_contacts_with_CTCF = "all_cont" #"cont_with_CTCF" #this option use for training to change proportion #of contacts with nearest ctcf sites write_all_chrms_in_file = True #set True if you have train with few chromosomes. Need for writing different chromosomes in the same file fill_empty_contacts = False logging.getLogger(__name__).debug("Using input folder " + input_folder) #Read contacts data
# Contact file type: "contacts.gz" or "oe.gz"
# (for a manual run: chr_num="12,13,14", conttype="contacts.gz").
conttype = "contacts.gz"

logging.basicConfig(
    format='%(asctime)s %(name)s: %(message)s',
    datefmt='%I:%M:%S',
    level=logging.DEBUG,
)

if __name__ == "__main__":  # Required for parallelization, at least on Windows
    for conttype in (conttype,):
        # Same arguments as the module-level call above, so repeating it
        # here does not change the logging configuration.
        logging.basicConfig(
            format='%(asctime)s %(name)s: %(message)s',
            datefmt='%I:%M:%S',
            level=logging.DEBUG,
        )

        input_folder = path1 + "/input/chr19_mm10/"
        output_folder = path1 + "/output/chr19_mm10/"
        cell_type = "NPC"

        params = Parameters()
        # Region around a contact that is binned for predictors.
        params.window_size = 25000
        # Minimum distance between contacting regions.
        params.mindist = 50001
        # Maximum distance between contacting regions.
        params.maxdist = 1500000
        # Number of contacts to write to the file.
        params.sample_size = 1
        params.conttype = conttype
        params.max_cpus = 11
        # Set True to use only CTCF sites with orientation.
        params.keep_only_orient = False
        # Used in training to change the proportion of contacts with nearest
        # CTCF sites; alternative value: "cont_with_CTCF".
        params.use_only_contacts_with_CTCF = "all_cont"

        # Set True when training on several chromosomes: needed to write
        # different chromosomes into the same file.
        write_all_chrms_in_file = False
        fill_empty_contacts = False

        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
import pandas as pd
import os

if __name__ == "__main__":  # Required for parallelization, at least on Windows
    # Run the whole pipeline once per contact file type.
    for conttype in ("contacts.gz", "oe.gz"):
        logging.basicConfig(
            format='%(asctime)s %(name)s: %(message)s',
            datefmt='%I:%M:%S',
            level=logging.DEBUG,
        )

        input_folder = "input/GM12878/"
        output_folder = "out/GM12878/"

        params = Parameters()
        # Region around a contact that is binned for predictors.
        params.window_size = 25000
        # Minimum distance between contacting regions.
        params.mindist = 50001
        # Maximum distance between contacting regions.
        params.maxdist = 1500000
        # Number of contacts to write to the file.
        params.sample_size = 250000
        params.conttype = conttype
        params.max_cpus = 12

        logging.getLogger(__name__).debug("Using input folder " + input_folder)

        # Read contacts data
        params.contacts_reader = ContactsReader()