Esempio n. 1
0
# chr_num="12"
# conttype = "contacts.gz"

# Script entry point: configures logging and builds a hard-coded parameter set
# (cell type K562, 5000 bp bins) for each contact type in the loop below.
if __name__ == '__main__': #Required for parallelization, at least on Windows
    # Leftover tail of a previously commented-out list of values: ,"chr10", "chr1"]:
    # NOTE(review): the loop iterates over a one-element list holding the
    # current value of `conttype`, so that name must already be bound before
    # this line runs. The only assignment visible above is commented out --
    # confirm where `conttype` is defined.
    for conttype in [conttype]:
        print("hello")
        # Log everything from DEBUG up, prefixed with a 12-hour timestamp and the logger name.
        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s', datefmt='%I:%M:%S', level=logging.DEBUG)

        # Hard-coded, machine-specific locations of the input data and of the output directory.
        input_folder ="/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/input/K562/"
        output_folder = "/mnt/scratch/ws/psbelokopytova/202001051010polina_data/3DPredictor/out/K562/5KB/all_predictors/"
        cell_type="K562"
        # Per-chromosome integer values; not used anywhere in the lines shown here.
        # NOTE(review): the meaning of these numbers is not visible in this file section -- confirm.
        lengths_dict = {'chr1': 1494930, 'chr3': 609806, 'chr5': 518646, 'chr7': 682860, 'chr11': 726290, 'chr13': 115324}
        params = Parameters()
        params.binsize = 5000 #sequence resolution of the contacts data. Used to find the normalization coefficient file
        params.window_size = params.binsize #region around a contact to be binned for predictors. Usually equal to binsize
        params.mindist = params.binsize*2+1 #minimum distance between contacting regions (two bins plus one)
        params.maxdist = 1500000 #presumably the maximum distance between contacting regions -- confirm against Parameters
        params.sample_size = 250000 #how many contacts to write to the file
        params.conttype = conttype
        params.max_cpus = 11 #presumably an upper bound on parallel workers -- confirm against Parameters
        params.keep_only_orient=False
        params.use_only_contacts_with_CTCF = "all_cont"#the other value mentioned by the author is "cont_with_CTCF"

        write_all_chrms_in_file=False #set True to write a training file consisting of several chromosomes
        fill_empty_contacts = False #set True to use all contacts in the region, without empty contacts

        logging.getLogger(__name__).debug("Using input folder "+input_folder)

        #Read contacts data
        # NOTE(review): no reading code follows inside this loop body.
    # Configuration driven by `args`. These statements sit at the `if` body
    # level, so they run after the `for` loop above has finished and rebind
    # output_folder, cell_type, params, write_all_chrms_in_file and
    # fill_empty_contacts that were set inside that loop.
    # NOTE(review): `args` is not defined in the visible lines; it is indexed
    # by string keys and 'chr_num' is concatenated to a str, so it is
    # presumably a mapping of string option values -- confirm its origin.
    output_folder = args['output_folder']
    cell_type = args['cell_type']
    # start/end are converted to int; presumably the coordinates of the region of interest -- confirm.
    start = int(args['start'])
    end = int(args['end'])
    chromosome = 'chr' + args['chr_num']  # e.g. chr_num "12" gives "chr12"
    hic_name = args['hic_name']
    CTCF_file_name = args['CTCF_file_name']
    #RNA_file_name = args['RNA_file_name']

    # Disabled parsing of a comma-separated list of validation chromosomes:
    # validate_chrs = args['validate_chrs'].split(",")
    # for chr in validate_chrs:
    #     chr = int(chr)

    params = Parameters()
    params.binsize = int(
        args['binsize']
    )  #sequence resolution of the contacts data. Used to find the normalization coefficient file
    params.window_size = params.binsize  #region around a contact to be binned for predictors. Usually equal to binsize
    params.mindist = params.binsize * 2 + 1  #minimum distance between contacting regions (two bins plus one)
    params.maxdist = 1500000  #presumably the maximum distance between contacting regions -- confirm against Parameters
    # Disabled alternative: derive the sample size from the region length.
    # params.sample_size = end - start
    params.sample_size = 2  #how many contacts to write to the file
    #params.conttype = conttype
    params.max_cpus = int(args['max_cpus'])
    params.keep_only_orient = False
    params.use_only_contacts_with_CTCF = "all_cont"  #"all_cont" or "cont_with_CTCF"
    rearrangement = False

    # Disabled construction of the deleted interval:
    # deletion = Interval("chr" + chr_num, start, end)
    write_all_chrms_in_file = False  #set True to write a training file consisting of several chromosomes
    fill_empty_contacts = False  #set True to use all contacts in the region, without empty contacts