def main():
    """
    Builds a shell script that greps a list of patterns in several background chunks.

    Patterns are read from --input (one per line, blank lines are skipped),
    split into --n_parts consecutive chunks and joined into one grep
    alternation per chunk. Every chunk becomes one line of --output of the form
    "cat DATABASE | PREFIX grep '...' SUFFIX >> INPUT.grep &";
    the script is finally made executable with chmod +x.
    Chunking is applied only when every chunk gets at least two patterns,
    otherwise a single command with all patterns is written.

    :return: None, the script is written to the file given by --output
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="Name of input list to grep")
    parser.add_argument("-n", "--n_parts", type=int,
                        default=10,
                        help="How many times to grep")
    parser.add_argument("-db", "--database", type=str,
                        help="Where to grep")
    parser.add_argument("-o", "--output", type=str,
                        default='to_grep.sh',
                        help="Name of script with grep commands")
    parser.add_argument("-p", "--prefix", type=str,
                        default='',
                        help="Common prefix, command after cat $input")
    parser.add_argument("-s", "--suffix", type=str,
                        default='',
                        help="Common suffix, command before >> $output &")
    args = parser.parse_args()
    if args.n_parts < 1:
        # 0 would divide by zero below, negative values make no sense
        parser.error('--n_parts must be a positive integer')

    # Main
    with open(args.input, 'r') as inp_list:
        # An empty alternative in the grep expression matches every line
        # of the database, so blank lines must not become patterns
        patterns = [line.strip('\n') for line in inp_list if line.strip('\n')]
    if not patterns:
        parser.error('Input list %s contains no patterns' % args.input)

    n_patterns = len(patterns) // args.n_parts  # how many patterns in one command
    common_prefix = "cat %s | " % args.database + args.prefix
    common_suffix = args.suffix + " >> %s.grep &" % args.input

    if n_patterns > 1:
        # The first n_parts - 1 chunks hold exactly n_patterns patterns,
        # the last chunk takes everything that is left.
        # The start of the last chunk is computed explicitly, so that
        # --n_parts 1 works (no loop variable is needed after the loop).
        last_start = (args.n_parts - 1) * n_patterns
        chunks = [patterns[start:start + n_patterns]
                  for start in range(0, last_start, n_patterns)]
        chunks.append(patterns[last_start:])
    else:
        chunks = [patterns]

    # r"\|" is the alternation operator of basic grep regular expressions
    out_list = [common_prefix + "grep '" + r"\|".join(chunk) + "'" + common_suffix
                for chunk in chunks]

    with open(args.output, 'w') as out_file:
        out_file.write('\n'.join(out_list))
    shell_call('chmod +x %s' % args.output)
def to_bigbed(bed_name, folder_name, genome_version='hg19'):
    """
    Runs UCSC bedToBigBed script to convert bed to bigBed,
    bedToBigBed must be in $PATH.
    The BED file is first sorted by chromosome and start into
    <bed_name>.sorted, then the sorted file is converted; chromosome sizes
    are taken from the UCSC download server for the given genome version.

    :param bed_name: Name of BED file to be converted
    :param folder_name: Folder of BED file
    :param genome_version: Name of genome version, hg19 by default
    :return: None; sorted bed and bigBed files are created in the same folder
    """
    bed_path = os.path.join(folder_name, bed_name)

    bigbed_name = bed_name.replace('.bed', '.bb')
    if bigbed_name == bed_name:
        # Name without '.bed': the replacement did nothing and bedToBigBed
        # would be told to write its output over the input BED file
        bigbed_name = bed_name + '.bb'
    bigbed_path = os.path.join(folder_name, bigbed_name)

    sort_cmd = 'sort -k1,1 -k2,2n %s > %s.sorted' % (bed_path, bed_path)
    convert_cmd = ('bedToBigBed '
                   '%s.sorted '
                   'http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes '
                   '%s' % (bed_path, genome_version, genome_version, bigbed_path))
    # Order matters: conversion reads the file produced by sorting
    for cmd in [sort_cmd, convert_cmd]:
        shell_call(cmd)
def main():
    """
    Compares real feature intersections with intersections of shuffled features.

    The real counts are parsed from --input (bedtools intersect -wo output).
    For every shuffling method and every iteration n the file
    <output>/random/<method>_<n>.bed is intersected with --features by
    bedtools and parsed the same way. For each method a PNG with three
    histograms (random counts, real count as red line) and a text file with
    the underlying numbers are written to the output folder.

    :return: None, results are written to the --output folder
    """
    ### Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="BED file, bedtools intersect -a A_FEATURE -b B_BEATURE -s -wo output")
    parser.add_argument("-f", "--features", type=str,
                        help="Path to .BED6 file with features (small intervals)")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder with random subfolder for results")
    parser.add_argument("-a", "--afeature", type=str,
                        help="Name of A_FEATURE, i.e. circles")
    parser.add_argument("-b", "--bfeature", type=str,
                        help="Name of B_BEATURE, i.e. panhandles")
    parser.add_argument("-iter", "--iterations", type=int,
                        default=1000,
                        help="Number of iterations for randomizing results")
    parser.add_argument("-m", "--method", type=str,
                        nargs='+',
                        default=['inside', 'outside', 'bedtools', ],
                        help="Shuffling method(s): inside, outside, bedtools")
    args = parser.parse_args()

    path_to_file = args.output.rstrip('/')
    random_folder = path_to_file + '/random'

    PTES_logger.info('Reading input file... ')
    PTES_logger.info('Input file: %s ' % args.input)
    real_dict = parse_bedtools_wo(wo_outfile=args.input)  # category - number of intersections
    PTES_logger.info('Reading input file... done')

    PTES_logger.info('Running intersections with random files... ')
    categories = ['a_in_b', 'b_in_a', 'overlap']
    # Human-readable label of every category, used in plots and data files
    titles = {
        'a_in_b': '%s_in_%s' % (args.afeature, args.bfeature),
        'b_in_a': '%s_in_%s' % (args.bfeature, args.afeature),
        'overlap': 'overlap',
    }
    random_dicts = {}  # method - category - list of values
    for method in args.method:
        # a separate empty list for every category
        random_dicts[method] = dict((cat, []) for cat in categories)
        for n in range(args.iterations):
            random_input = '%s/%s_%i.bed' % (random_folder, method, n)
            random_output = '%s/%s_%i.bed.intersect' % (random_folder, method, n)
            cmd = 'bedtools intersect -a %s -b %s -s -wo > %s' % (args.features,
                                                                  random_input,
                                                                  random_output)
            shell_call(cmd)
            random_dict = parse_bedtools_wo(wo_outfile=random_output)
            for cat in categories:
                random_dicts[method][cat].append(random_dict[cat])
    PTES_logger.info('Running intersections with random files... done')

    PTES_logger.info('Creating output files...')
    # plotting: one figure per method, one subplot per category
    for method in args.method:
        fig = plt.figure(figsize=(12, 6))
        plt.suptitle("Shuffling method %s" % method)
        for position, cat in enumerate(categories, 1):
            ax = fig.add_subplot(1, len(categories), position)
            sns.distplot(random_dicts[method][cat], kde=False, ax=ax)
            ax.axvline(real_dict[cat], color='r')  # real value over random distribution
            ax.set(title=titles[cat])
        plt.savefig('%s/histograms_%s.png' % (path_to_file, method))
        plt.close(fig)  # figures are not garbage-collected by pyplot until closed

    # saving data for histograms
    for method in args.method:
        hist_name = '%s/data_hist_%s' % (path_to_file, method)
        with open(hist_name, 'w') as hist_file:
            for cat in categories:
                hist_file.write(titles[cat] + '\n')
                hist_file.write('real: %i' % real_dict[cat] + '\n')
                hist_file.write('random: ' + ','.join(map(str, random_dicts[method][cat])) + '\n')

    PTES_logger.info('Creating output files... done')
    PTES_logger.info('Remember to delete random subfolder')
# Writes one executable star_<i>.sh per sample name found in file_str;
# each script runs segemehl_encode.py on the sample folder.
encode_dir = '/uge_mnt/home/sunnymouse/projects/PTES/ENCODE/'
for i, name in enumerate(file_str.split()):
    # str.split() without arguments already removes all surrounding
    # whitespace, so the name is used as is; the variable is not called
    # "id" to keep the builtin id() usable
    sample_id = name
    folder_name = encode_dir + sample_id  # without end
    sh_filename = 'star_%i.sh' % i
    init_file(sh_filename)

    # Disabled upstream steps (BAM -> FASTQ -> segemehl mapping), kept for reference:
    # cmd_list.append('samtools collate -uOn 256 %s%s.bam %s/tmp-prefix \
    #                 | samtools fastq - > %s/%s.fq' % (bam_folder, sample_id, folder_name, folder_name, sample_id))
    # cmd_list.append('gzip %s/%s.fq' % (folder_name, sample_id))
    # cmd_list.append('%s/segemehl.x -S \
    #                 -Z 10 \
    #                 -t 10 \
    #                 -s \
    #                 -i /uge_mnt/home/sunnymouse/Human_ref/GRCh37.p13.genome.idx \
    #                 -d /uge_mnt/home/sunnymouse/Human_ref/GRCh37.p13.genome.fa \
    #                 -q %s/%s.fq \
    #                 -o %s/segemehl.sam \
    #                 -u %s/segemehl_unmapped' % (segemehl_bin, folder_name, sample_id, folder_name, folder_name))
    # cmd_list.append('samtools view %s/segemehl.sam > %s/segemehl.sam.nohead' % (folder_name, folder_name))
    # cmd_list.append('samtools view -b %s/segemehl.sam > %s/segemehl.bam' % (folder_name, folder_name))
    # cmd_list.append('%s/segemehl.sam' % folder_name)
    cmd_list = [
        '#!/bin/bash -il \ncd %s \n' % encode_dir,
        'source /uge_mnt/home/sunnymouse/tools/miniconda2/bin/activate',
        'python segemehl_encode.py -i %s/segemehl.sam.nohead -o %s -t %s' % (folder_name,
                                                                             folder_name,
                                                                             sample_id),
    ]
    writeln_to_file('\n'.join(cmd_list), sh_filename)
    shell_call('chmod +x ./%s' % sh_filename)
def main():
    """
    Creates BED tracks and helper tables for chimeric junctions.

    Reads the table of reads (--table), optionally keeps only junctions
    listed in --filter and rows matching --query, and takes the intervals of
    every read from the JSON file (--json, junction -> read name -> list of
    parts). Written to the --output folder:
    <prefix>.bed (all reads), <prefix>.unique.bed (first read of every
    junction), <prefix>.single.bed and <prefix>.single.unique.bed (one row
    per read), <prefix>.codes.csv, <prefix>.coords.csv, <prefix>.track and
    df_filter.csv. With --sort the BED files are sorted, with --bigbed they
    are also converted to bigBed.

    :return: None, exits with status 1 if required input is missing
    """
    import sys  # local import: used only for the early exits below

    # Arguments
    # Example arguments for a manual test run:
    #   -t ../tests/test_data/ptes/chim_reads_test.csv
    #   -j ../tests/test_data/ptes/junc_dict.json.gz
    #   -f ../tests/test_data/ptes/chim_junctions_test.csv
    #   -q letters_ss=="."
    #   -o ../tests/test_results/bed
    #   -p test
    #   -gz 1
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--table", type=str,
                        help="DataFrame with data to create BED, "
                             "required 4 columns are: chrom, chain, donor, acceptor")
    parser.add_argument("-sep", "--separator", type=str,
                        default='\t',
                        help="DataFrame separator, tab by default")
    parser.add_argument("-j", "--json", type=str,
                        help="JSON file with all read intervals as OrderedDicts")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Write anything to enable reading .json.gz")
    parser.add_argument("-q", "--query", type=str,
                        help="Conditions to filter junctions table, string as in pandas.DataFrame.query()")
    parser.add_argument("-f", "--filter", type=str,
                        help="DataFrame with (chrom, chain, donor, acceptor) to filter input table")
    parser.add_argument("-n", "--names", type=str,
                        nargs='+',
                        default=['chim1', 'chim2', 'mate2', ],
                        help="List of names for chim parts in BED name: [chim1, chim2, mate2]. "
                             "Important: same order as parts in json values")
    parser.add_argument("-c", "--colors", type=str,
                        nargs='+',
                        default=['r', 'r', 'b', ],
                        help="List of colors for chim parts in BED name: [chim1, chim2, mate2]. "
                             "Important: same order as parts in json values. "
                             "Colors: 'r', 'g', 'b' or in RGB code like '0,255,0'")
    parser.add_argument("-o", "--output", type=str,
                        default='bed',
                        help="Output folder for results, default is bed/")
    parser.add_argument("-p", "--prefix", type=str,
                        default='Output',
                        help="Prefix for all output files")
    parser.add_argument("-sort", "--sort", type=str,
                        help="Write anything to enable sorting BED files")
    parser.add_argument("-bb", "--bigbed", type=str,
                        help="Write anything to enable creating .bigBed files")
    args = parser.parse_args()

    PTES_logger.info('Reading input files...')
    make_dir(args.output)

    index_list = ['chrom', 'chain', 'donor', 'acceptor']
    input_df = pd.read_csv(args.table, sep=args.separator)
    for col in index_list:
        if col not in input_df.columns:
            PTES_logger.error('Input table does not contain required column %s ' % col)
            # sys.exit (unlike os._exit) lets buffered output and log handlers flush
            sys.exit(1)

    if args.filter:  # filter by junctions
        filter_df = pd.read_csv(args.filter, sep=args.separator)
        for col in index_list:
            if col not in filter_df.columns:
                PTES_logger.error('Filter table does not contain required column %s ' % col)
                sys.exit(1)
        cols_to_use = index_list + list(input_df.columns.difference(filter_df.columns))  # avoid repeating columns
        df_new = pd.merge(filter_df, input_df[cols_to_use], on=index_list, how='inner',)
    else:
        df_new = input_df

    if args.query:  # filter reads by conditions
        df_new = df_new.query(args.query)

    df_new.to_csv(os.path.join(args.output, 'df_filter.csv'), sep='\t')

    # Reading .json or .json.gz file
    if args.gzip:
        with gzip.GzipFile(args.json, 'r') as fin:
            junc_dict = json.loads(fin.read().decode('utf-8'), object_pairs_hook=OrderedDict)
    else:
        with open(args.json) as fin:
            junc_dict = json.load(fin, object_pairs_hook=OrderedDict)

    # junc_dict is junction -> read_name -> list of parts (see the loop below).
    # Names and colors are indexed by part, so the number to compare with is
    # the largest number of parts in one read, not the number of reads
    # of the first junction.
    part_counts = [len(parts) for reads in junc_dict.values() for parts in reads.values()]
    if not part_counts:
        PTES_logger.error('JSON file %s contains no reads' % args.json)
        sys.exit(1)
    len_read_dicts = max(part_counts)  # must be 3 for mate_inside/outside and 2 for circles

    if len(args.names) < len_read_dicts:
        PTES_logger.warning('List of names has less items than list of features in read_dicts!')
        part_names = ['part_%i' % n for n in range(1, len_read_dicts + 1)]
    else:
        part_names = args.names

    if len(args.colors) < len_read_dicts:
        PTES_logger.warning('List of colors has less items than list of features in read_dicts!')
        part_colors = ['r'] * len_read_dicts
    else:
        part_colors = args.colors

    PTES_logger.info('Reading input files... done')
    PTES_logger.info('Creating BED files...')
    bed_name = '%s.bed' % args.prefix  # only track lines
    unique_bed_name = '%s.unique.bed' % args.prefix  # one representative read for unique junctions
    single_bed_name = '%s.single.bed' % args.prefix  # single line for one chimeric junction
    single_unique_bed_name = '%s.single.unique.bed' % args.prefix  # for unique junctions, single line for one junction
    code_name = '%s.codes.csv' % args.prefix  # table with codes for each read and descriptions
    coord_name = '%s.coords.csv' % args.prefix  # table with windows to paste into GB and descriptions
    info_name = '%s.track' % args.prefix  # file to submit to GB

    bed_list = []  # for outputting BED lines
    unique_dict = {}  # for outputting BED lines, unique chimeric junctions
    single_list = []  # for outputting BED lines, one row per one chimeric junction
    single_unique_list = []  # for outputting BED lines, one row per one unique chimeric junction
    coord_list = []  # for outputting coord lines
    code_list = []  # for outputting coord lines, one row per one read

    track_line = ('track type=bigBed '
                  'name="%s" '
                  'description="bigBed" '
                  'visibility=2 '
                  'itemRgb="On" '
                  'bigDataUrl=https://github.com/sunnymouse25/ptes/blob/dev/research/bed/%s?raw=true'
                  % (args.prefix, bed_name.replace('.bed', '.bb')))
    with open(os.path.join(args.output, info_name), 'w') as info_file:
        info_file.write('\n'.join(
            ['browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac rmsk',
             'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
             'browser pack gtexGene',
             track_line,
             ]
        ))

    num = 0
    junctions = df_new.groupby(index_list)['read_name'].apply(list)  # unique chimeric junctions
    for index, read_list in junctions.items():  # value is list of read_names
        chrom, chain, donor_ss, acceptor_ss = index  # index is (chrom, chain, donor_ss, acceptor_ss)
        windows_min = []
        windows_max = []
        codes = []
        for read_name in read_list:  # for each read w. this junction
            num += 1
            code = digit_code(number=num)  # every unique number will be 6-digit
            codes.append(code)
            track_lists = []
            if not unique_dict.get(index, None):  # for each unique junction write the 1st read line(s)
                unique_dict[index] = []
                add_unique = True
            else:
                add_unique = False

            read_dict_list = junc_dict[str(index)][read_name]  # list of dicts: each dict is one track (i.e. chim_part)
            # Iterating over tracks
            for i, read_dict in enumerate(read_dict_list):
                # JSON stores intervals as nested lists, convert them back in place
                for k, v in read_dict.items():
                    read_dict[k] = interval[v[0][0], v[0][1]]
                track_list = get_track_list(chrom=chrom,
                                            chain=chain,
                                            read_dict=read_dict,
                                            name='_'.join(map(str, [donor_ss, acceptor_ss, code, part_names[i]])),
                                            color=part_colors[i])
                track_lists.append(track_list)

            # Writing BED lines, collecting extremas for window size
            for track_list in track_lists:
                windows_min.append(int(track_list[1]))  # track_list[1] is chromStart, track_list[2] is chromEnd
                windows_max.append(int(track_list[2]))
                bed_line = '\t'.join(track_list)
                bed_list.append(bed_line)
                if add_unique:
                    unique_dict[index].append(bed_line)

            # Writing code line
            code_list.append({
                'chrom': chrom,
                'chain': chain,
                'donor': donor_ss,
                'acceptor': acceptor_ss,
                'read_name': read_name,
                'code': code
            })

            # Making BED file with one row for the pair of mates
            single_track = get_single_track(read_dict_list=read_dict_list,
                                            kwargs={'chrom': chrom,
                                                    'chain': chain,
                                                    'name': '_'.join(map(str, [donor_ss, acceptor_ss, code])),
                                                    'color': '255,0,255'})  # for checking in GB that intervals are same
            single_list.append('\t'.join(single_track))
            if add_unique:
                single_unique_list.append('\t'.join(single_track))

        # Description for the junction into coords.csv
        window = (chrom,  # one window for junction
                  min(windows_min) - 200,
                  max(windows_max) + 200)
        coord_list.append({
            'chrom': chrom,
            'chain': chain,
            'donor': donor_ss,
            'acceptor': acceptor_ss,
            'window': '%s:%i-%i' % window,
            'codes': '-'.join(map(str, [codes[0], codes[-1]])),
        })

    PTES_logger.info('Creating BED files... done')

    PTES_logger.info('Writing BED files...')
    # coords and codes tables are written by pandas below, no need to open them here
    with open(os.path.join(args.output, bed_name), 'w') as bed_file, \
            open(os.path.join(args.output, unique_bed_name), 'w') as unique_bed_file, \
            open(os.path.join(args.output, single_bed_name), 'w') as single_bed_file, \
            open(os.path.join(args.output, single_unique_bed_name), 'w') as single_unique_bed_file:
        bed_file.write('\n'.join(bed_list))
        single_bed_file.write('\n'.join(single_list))
        single_unique_bed_file.write('\n'.join(single_unique_list))
        for unique_value in unique_dict.values():
            unique_bed_file.write('\n'.join(list(unique_value)) + '\n')
    PTES_logger.info('Writing BED files... done')

    PTES_logger.info('Creating junctions dataframes...')
    coord_df = pd.DataFrame(coord_list)
    code_df = pd.DataFrame(code_list)
    coord_df.to_csv(os.path.join(args.output, coord_name), sep='\t')
    code_df.to_csv(os.path.join(args.output, code_name), sep='\t')
    PTES_logger.info('Creating junctions dataframes... done')

    bed_files = [bed_name, unique_bed_name, single_bed_name, single_unique_bed_name]
    if args.sort:
        PTES_logger.info('Sorting BED files...')
        for filename in bed_files:
            shell_call('cat %s | sort -k1,1 -k2,2n > %s.sorted' % (os.path.join(args.output, filename),
                                                                   os.path.join(args.output, filename),
                                                                   )
                       )
        PTES_logger.info('Sorting BED files... done')

    if args.bigbed:  # will also sort files
        PTES_logger.info('Making bigBed...')
        for filename in bed_files:
            to_bigbed(bed_name=filename, folder_name=args.output)
        PTES_logger.info('Making bigBed... done')
path_to_file = '.' junctions_name = args.input.rstrip('/') + '/' + 'junc_of_interest.csv' junc_of_interest = [] with open(junctions_name, 'r') as junctions_file: for line in junctions_file: junc_of_interest.append(line.strip().split('\t')) print 'Reading genome file...' genome_file = args.genome genome = SeqIO.index(genome_file, "fasta") print 'done' folder_name = '%s/bed/' % path_to_file cmd1 = 'if [ ! -d %s ]; then mkdir %s; fi' % (folder_name, folder_name) shell_call(cmd1) bed_name = args.output coord_name = bed_name + '.coords.csv' init_file(bed_name, folder=folder_name) # one BED file for all tracks init_file(coord_name, folder=folder_name) # read_name - window in genome browser writeln_to_file('browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac', bed_name, folder=folder_name) writeln_to_file('browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3', bed_name, folder=folder_name) writeln_to_file('browser pack gtexGene', bed_name, folder=folder_name) for line_list in junc_of_interest: key = line_list[0] # key is mapped read_name, xi - number of current read alignment xi = line_list[1] n_junctions = line_list[2] annot_donors = line_list[3] annot_acceptors = line_list[4] chrom = read_infos[key][xi][0]
args = parser.parse_args() # Functions # Main make_dir(args.output) PTES_logger.info('Creating intersection file... ') intersection_name = os.path.join( args.output, os.path.basename(args.features) + '.intersect') cmd = 'bedtools intersect -a %s -b %s -s -wo > %s' % ( args.features, args.genes, intersection_name, ) shell_call(cmd) PTES_logger.info('Creating intersection file... done') PTES_logger.info('Reading intersection file... ') p_dict = {} gene_p_dict = defaultdict(list) feature_len_list = [] gene_len_list = [] with open(intersection_name, 'r') as intersect_file: for i, line in enumerate(intersect_file): line_list = line.strip().split() b_start = get_b_start(line) if not b_start: continue chrom1 = line_list[0]
'chrom' : chrom, 'chain' : chain, 'donor' : str(donor_ss), 'annot_donor' : annot_donor, 'acceptor' : str(acceptor_ss), 'annot_acceptor' : annot_acceptor, 'chimeric' : chimeric}) mapped_junc_df = pd.DataFrame(junc_list) mapped_junc_df = mapped_junc_df[['read_name', 'aln', 'n_junctions', 'chrom', 'chain', 'donor', 'annot_donor', 'acceptor', 'annot_acceptor', 'chimeric']].sort_values(by=['read_name','aln']).reset_index(drop=True) gr = mapped_junc_df.groupby(['read_name','aln']).apply(lambda x: x.chimeric.any()).reset_index(name='chim_read') del gr['aln'] mapped_junc_df = pd.merge(mapped_junc_df, gr, on='read_name').reset_index(drop=True) mapped_junc_df.to_csv('%s/mapped_junc_df_segemehl.csv' % path_to_file, sep = '\t') shell_call('gzip -f %s/mapped_junc_df_segemehl.csv' % path_to_file) PTES_logger.info('Creating junctions table... done') x = mapped_junc_df.groupby(['n_junctions','chim_read']).apply(lambda x: x.read_name.nunique()).reset_index(name='counts') y = pd.pivot_table(x, index=['n_junctions'], columns=['chim_read'],values=['counts'], fill_value=0, aggfunc=sum, margins=True) html_file = 'segemehl_pivot_table.html' init_file(html_file, folder = path_to_file) writeln_to_file(y.to_html(), html_file, folder = path_to_file) junc_of_interest = mapped_junc_df.query('n_junctions >= 2 & chim_read == True').sort_values(by=['annot_donor','annot_acceptor'], ascending=False).reset_index(drop=True).groupby(['read_name','aln']) junc_csv_name = 'junc_of_interest.csv' PTES_logger.info('Reading genome file...') genome_file = args.genome genome = SeqIO.index(genome_file, "fasta")
def _record_strand(strand_dict, gene_interval, line_list):
    """Stores strand (6th column of line_list) for gene_interval, '.' if the column is absent."""
    try:
        strand_dict[gene_interval] = line_list[5]
    except IndexError:
        strand_dict[gene_interval] = '.'
        PTES_logger.error('No strand found')
        PTES_logger.error('BED6 format is required for choosing strand-specific position')


def main():
    """
    Creates shuffled copies of a feature BED file, --iterations files per method.

    Methods (any subset of --method):
    inside - every feature gets a random position inside its own container;
    outside - every feature is moved to one of the containers that are close
    to its own container by length or by coverage (--closest);
    bedtools - bedtools shuffle restricted to the containers.
    Files are written as <output>/random/<method>_<n>.bed.

    :return: None, results are written to the --output folder
    """
    ### Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--method", type=str,
                        nargs='+',
                        default=['inside', 'outside', 'bedtools', ],
                        help="Shuffling method(s): inside, outside, bedtools")
    parser.add_argument("-c", "--closest", type=str,
                        default='coverage',
                        choices=['coverage', 'length'],  # anything else leaves no candidate containers
                        help="Choose close elements for outside method by coverage or length")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder for results")
    parser.add_argument("-iter", "--iterations", type=int,
                        default=1000,
                        help="Number of iterations, default 1000")
    parser.add_argument("-f", "--features", type=str,
                        help="Path to .BED6 file with features (small intervals)")
    parser.add_argument("-g", "--genes", type=str,
                        help="Path to .BED6 file with genes (containers)")
    parser.add_argument("-s", "--chrom_sizes", type=str,
                        default='/home/sunnymouse/Human_ref/hg19.chrom.sizes',
                        help="The chrom_sizes file should be tab delimited and structured as follows: "
                             "<chromName><TAB><chromSize>, use bedtools shuffle -h for details")
    args = parser.parse_args()

    make_dir(args.output)
    path_to_file = args.output.rstrip('/')
    random_folder = path_to_file + '/random'
    make_dir(random_folder)

    use_inside = 'inside' in args.method
    use_outside = 'outside' in args.method

    # Shuffling methods inside and outside:
    if use_inside or use_outside:
        if use_outside:
            PTES_logger.info('Reading containers... ')
            PTES_logger.info('containers file: %s ' % args.genes)
            strand_dict = {}  # container interval - strand
            interval_dict = {}  # container interval - containers to choose from
            if args.closest == 'length':
                gene_dict = defaultdict(list)  # chromosome - list of container intervals
                # open file with genes
                with open(args.genes, 'r') as genes_file:
                    for line in genes_file:
                        line_list = line.strip().split()
                        chrom = line_list[0]
                        gene_interval = interval[int(line_list[1]), int(line_list[2])]
                        gene_dict[chrom].append(gene_interval)
                        _record_strand(strand_dict, gene_interval, line_list)
                # sort lists by gene length
                for key in gene_dict:  # key is chromosome
                    new_list = sorted(gene_dict[key], key=get_interval_length)
                    interval_dict.update(choose_close(sorted_list=new_list))

            if args.closest == 'coverage':
                PTES_logger.info('Creating coverage file... ')
                cover_name = '%s/%s' % (random_folder, os.path.basename(args.features) + '.cov')
                cmd = 'bedtools coverage -a %s -b %s -s > %s' % (args.genes,
                                                                 args.features,
                                                                 cover_name,
                                                                 )
                shell_call(cmd)
                PTES_logger.info('Creating coverage file... done')
                cover_dict = {}  # container interval - coverage (last column of the file)
                with open(cover_name, 'r') as cover_file:
                    for line in cover_file:
                        line_list = line.strip().split()
                        gene_interval = interval[int(line_list[1]), int(line_list[2])]
                        cover_dict[gene_interval] = float(line_list[-1])
                        _record_strand(strand_dict, gene_interval, line_list)
                new_list = sorted(cover_dict.items(), key=lambda x: x[1])
                interval_dict.update(choose_close(sorted_list=new_list, items='items'))

            PTES_logger.info('Reading containers... done')

        PTES_logger.info('Creating intersection file... ')
        intersection_name = '%s/%s' % (random_folder, os.path.basename(args.features) + '.intersect')
        cmd = 'bedtools intersect -a %s -b %s -wo -s > %s' % (args.features,
                                                              args.genes,
                                                              intersection_name,
                                                              )
        shell_call(cmd)
        PTES_logger.info('Creating intersection file... done')

        PTES_logger.info('Reading intersection file and shuffling... ')
        PTES_logger.info('intersection file: %s' % intersection_name)
        # one list of BED lines per iteration
        n_list_inside = [[] for _ in range(args.iterations)]
        n_list_outside = [[] for _ in range(args.iterations)]

        with open(intersection_name, 'r') as intersect_file:
            for line in intersect_file:
                line_list = line.strip().split()
                b_start = get_b_start(line)
                if not b_start:
                    continue
                chrom1 = line_list[0]
                feature_interval = interval[int(line_list[1]), int(line_list[2])]
                gene_interval = interval[int(line_list[b_start + 1]), int(line_list[b_start + 2])]

                # Everything below depends only on the line, not on the
                # iteration, so it is computed (and logged) once per line.
                # NOTE(review): assumes count_relative_position is
                # deterministic - confirm against its definition.
                keep_position = False
                strand_known = False
                relative_position = None
                if use_outside:
                    feature_len = get_interval_length(feature_interval)
                    gene_len = get_interval_length(gene_interval)
                    keep_position = feature_len <= gene_len
                    if keep_position:
                        # only the column lookup may legitimately fail here
                        try:
                            container_strand = line_list[b_start + 5]
                        except IndexError:
                            PTES_logger.error('No strand found')
                            PTES_logger.error('BED6 format is required for choosing strand-specific position')
                            relative_position = count_relative_position(feature=feature_interval,
                                                                        container=gene_interval)
                        else:
                            strand_known = True
                            relative_position = count_relative_position(feature=feature_interval,
                                                                        container=gene_interval,
                                                                        container_strand=container_strand)

                for n in range(args.iterations):
                    if use_inside:
                        random_interval_inside = randomize_interval(small_i=feature_interval,
                                                                    large_i=gene_interval)
                        n_list_inside[n].append(interval_to_bed_line(chrom=chrom1,
                                                                     single_interval=random_interval_inside,
                                                                     name=line_list[3],
                                                                     strand=line_list[5]))
                    if use_outside:
                        new_large_interval = random.choice(interval_dict[gene_interval])  # choose one of closest genes
                        new_strand = strand_dict[new_large_interval]
                        if not keep_position:
                            # feature is longer than its container: no relative position to keep
                            random_interval_outside = randomize_interval(small_i=feature_interval,
                                                                         large_i=new_large_interval,
                                                                         same_position=False,
                                                                         )
                        elif strand_known:
                            random_interval_outside = randomize_interval(small_i=feature_interval,
                                                                         large_i=new_large_interval,
                                                                         large_i_strand=new_strand,
                                                                         same_position=True,
                                                                         p=relative_position)
                        else:
                            random_interval_outside = randomize_interval(small_i=feature_interval,
                                                                         large_i=new_large_interval,
                                                                         same_position=True,
                                                                         p=relative_position)
                        n_list_outside[n].append(interval_to_bed_line(chrom=chrom1,
                                                                      single_interval=random_interval_outside,
                                                                      name=line_list[3],
                                                                      strand=line_list[5]))
        PTES_logger.info('Reading intersection file and shuffling... done')

        PTES_logger.info('Creating output files... ')
        for n in range(args.iterations):
            if use_inside:
                out_name = random_folder + '/%s_%i.bed' % ('inside', n)
                with open(out_name, 'w') as out_file:
                    out_file.write('\n'.join(n_list_inside[n]))
            if use_outside:
                out_name = random_folder + '/%s_%i.bed' % ('outside', n)
                with open(out_name, 'w') as out_file:
                    out_file.write('\n'.join(n_list_outside[n]))
        PTES_logger.info('Creating output files... done')

    # Shuffling method 3
    if 'bedtools' in args.method:
        PTES_logger.info('Running bedtools shuffle... ')
        for n in range(args.iterations):
            random_file = 'bedtools_%i.bed' % n
            cmd = 'bedtools shuffle -incl %s -i %s -g %s -chrom > %s/%s' % (args.genes,
                                                                            args.features,
                                                                            args.chrom_sizes,
                                                                            random_folder,
                                                                            random_file)
            shell_call(cmd)
        PTES_logger.info('Running bedtools shuffle... done')