def main():
    """Compare real A/B feature intersections against randomized ones.

    Reads a precomputed ``bedtools intersect -a A -b B -s -wo`` output for the
    real data, then intersects the features with pre-generated random BED
    files (one per shuffling method and iteration, expected in
    ``<output>/random``), and plots histograms of the randomized intersection
    counts with the real value marked as a red line.

    Side effects: runs ``bedtools`` via the shell; writes
    ``histograms_<method>.png`` and ``data_hist_<method>`` into the output
    folder.
    """
    ### Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="BED file, bedtools intersect -a A_FEATURE -b B_FEATURE -s -wo output")
    parser.add_argument("-f", "--features", type=str,
                        help="Path to .BED6 file with features (small intervals)")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder with random subfolder for results")
    parser.add_argument("-a", "--afeature", type=str,
                        help="Name of A_FEATURE, i.e. circles")
    parser.add_argument("-b", "--bfeature", type=str,
                        help="Name of B_FEATURE, i.e. panhandles")
    parser.add_argument("-iter", "--iterations", type=int, default=1000,  # was string '1000'
                        help="Number of iterations for randomizing results")
    parser.add_argument("-m", "--method", type=str, nargs='+',
                        default=['inside', 'outside', 'bedtools', ],
                        help="Shuffling method(s): inside, outside, bedtools")
    args = parser.parse_args()

    path_to_file = args.output.rstrip('/')
    random_folder = path_to_file + '/random'  # pre-generated random BEDs expected here

    PTES_logger.info('Reading input file... ')
    PTES_logger.info('Input file: %s ' % args.input)
    real_dict = parse_bedtools_wo(wo_outfile=args.input)  # category - number of intersections
    PTES_logger.info('Reading input file... done')

    PTES_logger.info('Running intersections with random files... ')
    categories = ['a_in_b', 'b_in_a', 'overlap']
    # method -> category -> list of counts, one entry per iteration;
    # dict comprehension guarantees separate empty lists per key.
    random_dicts = {method: {cat: [] for cat in categories} for method in args.method}
    for method in args.method:
        for n in range(args.iterations):
            random_input = '%s/%s_%i.bed' % (random_folder, method, n)
            random_output = '%s/%s_%i.bed.intersect' % (random_folder, method, n)
            cmd = 'bedtools intersect -a %s -b %s -s -wo > %s' % (args.features,
                                                                  random_input,
                                                                  random_output)
            shell_call(cmd)
            random_dict = parse_bedtools_wo(wo_outfile=random_output)
            for cat in categories:
                random_dicts[method][cat].append(random_dict[cat])
    PTES_logger.info('Running intersections with random files... done')

    PTES_logger.info('Creating output files...')
    # Plotting: one figure per method with three histograms
    for method in args.method:
        fig = plt.figure(figsize=(12, 6))
        plt.suptitle("Shuffling method %s" % method)
        ax1 = fig.add_subplot(131)
        sns.distplot(random_dicts[method]['a_in_b'], kde=False)
        ax1.axvline(real_dict['a_in_b'], color='r')  # red line marks the real value
        ax1.set(title='%s_in_%s' % (args.afeature, args.bfeature))
        ax2 = fig.add_subplot(132)
        sns.distplot(random_dicts[method]['b_in_a'], kde=False)
        ax2.axvline(real_dict['b_in_a'], color='r')
        ax2.set(title='%s_in_%s' % (args.bfeature, args.afeature))
        ax3 = fig.add_subplot(133)
        sns.distplot(random_dicts[method]['overlap'], kde=False)
        ax3.axvline(real_dict['overlap'], color='r')
        ax3.set(title='overlap')
        plt.savefig('%s/histograms_%s.png' % (path_to_file, method))
        plt.close(fig)  # release figure memory; was leaked across methods

    # Saving raw data for histograms, one plain-text file per method
    for method in args.method:
        hist_name = '%s/data_hist_%s' % (path_to_file, method)
        with open(hist_name, 'w') as hist_file:
            hist_file.write('%s_in_%s' % (args.afeature, args.bfeature) + '\n')
            hist_file.write('real: %i' % real_dict['a_in_b'] + '\n')
            hist_file.write('random: ' + ','.join(map(str, random_dicts[method]['a_in_b'])) + '\n')
            hist_file.write('%s_in_%s' % (args.bfeature, args.afeature) + '\n')
            hist_file.write('real: %i' % real_dict['b_in_a'] + '\n')
            hist_file.write('random: ' + ','.join(map(str, random_dicts[method]['b_in_a'])) + '\n')
            hist_file.write('overlap' + '\n')
            hist_file.write('real: %i' % real_dict['overlap'] + '\n')
            hist_file.write('random: ' + ','.join(map(str, random_dicts[method]['overlap'])) + '\n')
    PTES_logger.info('Creating output files... done')
    PTES_logger.info('Remember to delete random subfolder')
def main():
    """Build chimeric-read tables from filtered STAR chimeric + SAM output.

    For each (chimeric, sam, tag) input triad, parses STAR chimeric junction
    lines together with the matching non-chimeric SAM records (mate
    inside/outside classification via ``chim_input``/``sam_input``), then
    writes chim_reads.csv, junc_dict.json[.gz] and chim_junctions.csv into
    the output folder.
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR output, Chimeric.out.junction \
                        OR list of such files")
    parser.add_argument(
        "-s", "--sam", type=str,
        help="Filtered STAR SAM output, with read_names same as in Chimeric.out.junction OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder for results")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument(
        "-l", "--list", type=str,
        help="Enables list input mode. Options: input, sam, tag - MUST be lists")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument(
        "-t", "--tag", type=str, default='ENCODE',
        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict: chrom -> set of annotated donor/acceptor coords
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name,
                                                feature_name='exon')
    PTES_logger.info('Reading GTF... done')

    # non-iterative
    make_dir(args.output)
    junc_dict = defaultdict(dict)  # (chrom, chain, donor, acceptor) -> {read_name: intervals}
    all_reads_df = None  # stays None if no input produced any reads
    if args.list:
        # List mode: -i/-s/-t are files listing one path/tag per line, zipped into triads
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [
                x.strip('\n') for x in chim_names_file.readlines()
            ]
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [
                x.strip('\n') for x in sam_names_file.readlines()
            ]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [
                x.strip('\n') for x in tag_names_file.readlines()
            ]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []  # one per-file dataframe per triad, concatenated below
    else:
        triads = [(args.input, args.sam, args.tag)]

    for chim_name, sam_name, tag in triads:
        PTES_logger.info('Input file %s' % chim_name)
        # Reading filtered STAR non-chim output
        PTES_logger.info('Reading STAR .sam output...')
        sam_dict = sam_input(sam_name=sam_name, chim_name=chim_name)
        PTES_logger.info('Reading STAR .sam output... done')
        # Reading filtered STAR output; junc_dict is filled in place
        PTES_logger.info('Reading STAR chimeric output...')
        read_names_list = chim_input(chim_name=chim_name,
                                     gtf_donors=gtf_donors,
                                     gtf_acceptors=gtf_acceptors,
                                     sam_dict=sam_dict,
                                     junc_dict=junc_dict)
        PTES_logger.info('Reading STAR chimeric output... done')
        PTES_logger.info('Creating reads dataframes...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            reads_df = reads_df[[
                'read_name', 'chrom', 'chain', 'donor', 'acceptor',
                'annot_donor', 'annot_acceptor', 'letters_ss',
                'chim_dist', 'mate_dist', 'type',
            ]].sort_values(
                by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(
                    drop=True)  # reorder columns
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            # Empty read_names_list: column selection on an empty frame raises KeyError
            PTES_logger.warning('Creating reads dataframe... empty dataframe')

    if args.list:
        # NOTE(review): if every input was empty, chim_reads_df_list is [] and
        # pd.concat raises ValueError — confirm inputs are never all empty.
        all_reads_df = pd.concat(chim_reads_df_list,
                                 sort=True).reset_index(drop=True)
    if all_reads_df is not None:
        # Writing reads dataframe
        PTES_logger.info('Writing reads dataframe...')
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'),
                            sep='\t')
        PTES_logger.info('Writing reads dataframe... done')
        # Writing junc_dict; tuple keys are stringified since JSON keys must be strings
        PTES_logger.info('Writing intervals to json file...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'),
                               'w') as junc_json:
                junc_json.write(
                    json.dumps({str(k): v for k, v in junc_dict.items()
                                }).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'),
                      'w') as junc_json:
                json.dump({str(k): v for k, v in junc_dict.items()},
                          junc_json, indent=2)
        PTES_logger.info('Writing intervals to json file... done')
        # Writing junctions dataframe (aggregated per junction)
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'),
                            sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
def chim_input(chim_name, gtf_donors, gtf_acceptors, sam_dict, junc_dict):
    """
    Reads STAR chimeric output
    :param chim_name: name of file Chimeric.out.junction.filtered
    :param gtf_donors: dict: chrom - set of donors coordinates (integers)
    :param gtf_acceptors: dict: chrom - set of acceptors coordinates (integers)
    :param sam_dict: output of sam_input function
    :param junc_dict: defaultdict, global, with interval dicts for BED files;
        mutated in place
    :return: List of dicts ready for making a DataFrame, each dict is one read,
        mapped as mate-inside or mate-outside
    """
    annot_donors = 0
    annot_acceptors = 0
    read_names_list = []
    skipped = {
        'non-filtered': 0,  # different chromosomes and/or chains
        'chrM': 0,  # mapping to chrM
        'PE': 0,  # junction between the mates, -1 in STAR output
        'non-chim': 0
    }  # STAR counts very long (>1Mb) junctions as chimeric
    i = 0  # row counter; stays 0 for an empty input file (was unbound before)
    with open(chim_name, 'r') as input_file:
        # enumerate from 1 so the final 'Processed' log reports the true row
        # count (was off by one with the default 0-based enumerate)
        for i, line in enumerate(input_file, 1):
            line_dict = star_line_dict(line=line)
            if not line_dict:
                continue
            # A properly filtered file has both parts on the same chromosome
            # AND the same chain (see the awk filter below).
            # BUGFIX: this condition used 'or', silently accepting lines where
            # only one of chrom/chain matched and mislabeling the junction.
            if line_dict['chrom1'] == line_dict['chrom2'] \
                    and line_dict['chain1'] == line_dict['chain2']:
                chrom = line_dict['chrom1']
                chain = line_dict['chain1']
            else:
                PTES_logger.error('Non-filtered STAR output')
                PTES_logger.error('Use awk "$1 ==$4 && $3 ==$6" to filter')
                skipped['non-filtered'] += 1
                continue
            if chrom == 'chrM':
                skipped['chrM'] += 1
                continue
            if line_dict['junction_letters'] == '-':
                PTES_logger.error('PE input, junction type -1 is present!')
                PTES_logger.error('Current version works only with SE output')
                skipped['PE'] += 1
                continue
            # Too long, or donor/acceptor not in chimeric (PTES) orientation
            if abs(line_dict['donor_ss'] - line_dict['acceptor_ss']) > 1000000 \
                    or chain == '+' and line_dict['donor_ss'] < line_dict['acceptor_ss'] \
                    or chain == '-' and line_dict['donor_ss'] > line_dict['acceptor_ss']:
                skipped['non-chim'] += 1
                continue
            read_name = line_dict['read_name']
            annot_donor = 0
            annot_acceptor = 0
            if line_dict['donor_ss'] in gtf_donors[chrom]:
                annot_donor = 1
                annot_donors += 1
            if line_dict['acceptor_ss'] in gtf_acceptors[chrom]:
                annot_acceptor = 1
                annot_acceptors += 1
            mate_tuple = return_mate_tuple(line_dict=line_dict,
                                           second_mates=sam_dict[read_name],
                                           chrom=chrom,
                                           chain=chain)
            if mate_tuple:
                junc_dict[(
                    chrom,
                    chain,
                    line_dict['donor_ss'],
                    line_dict['acceptor_ss'],
                )].update({read_name: mate_tuple[:3]})  # read_intervals only
                interval_intersection = mate_tuple[3]
                if interval_intersection == 'outside':
                    mate_dist = count_mate_outside_dist(mate_tuple=mate_tuple,
                                                        chain=chain)
                else:
                    mate_dist = 0
                read_attrs = {
                    'read_name': read_name,
                    'chain': chain,  # chain of chimeric junction
                    'chrom': chrom,
                    'donor': line_dict['donor_ss'],
                    'acceptor': line_dict['acceptor_ss'],
                    'annot_donor': annot_donor,
                    'annot_acceptor': annot_acceptor,
                    'letters_ss': line_dict['junction_letters'],
                    'chim_dist': abs(line_dict['donor_ss'] - line_dict['acceptor_ss']),
                    'mate_dist': mate_dist,
                    'type': interval_intersection,
                }
                read_names_list.append(read_attrs)
    PTES_logger.info('Processed: %i rows' % i)
    for key in skipped:
        PTES_logger.info('Skipped %s: %i rows' % (key, skipped[key]))
    PTES_logger.info('Converted successfully: %i rows' % len(read_names_list))
    PTES_logger.info('Annot donors: %i' % annot_donors)
    PTES_logger.info('Annot acceptors: %i' % annot_acceptors)
    return read_names_list
def main():
    """Convert filtered STAR Chimeric.out.junction files into PTES tables.

    For each input file (single file, or a list of files in list mode),
    parses the chimeric junction lines, filters them (same chrom/chain,
    not chrM, SE only, <1Mb, chimeric orientation), annotates donor/acceptor
    sites against the GTF, and writes into the output folder:
      * chim_reads.csv       - one row per chimeric read
      * junc_dict.json[.gz]  - read intervals per junction
      * chim_junctions.csv   - aggregated junction table
    """
    ### Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR Chimeric.out.junction output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Path for subfolder with results")
    parser.add_argument("-l", "--list", type=str,
                        help="Enables list input mode. Options: input, tag - MUST be lists")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument("-t", "--tag", type=str, default='ENCODE',
                        help="Tag name for grouping results (prefix), i.e. ENCODE id OR list")
    args = parser.parse_args()

    # Main
    make_dir(args.output)
    skipped = {'non-filtered': 0,  # different chromosomes and/or chains
               'chrM': 0,  # mapping to chrM
               'PE': 0,  # junction between the mates, -1 in STAR output
               'non-chim': 0}  # STAR counts very long (>1Mb) junctions as chimeric
    junc_dict = defaultdict(dict)  # (chrom, chain, donor, acceptor) -> {read_name: parts}
    # BUGFIX: initialize, otherwise non-list mode with an empty input raised
    # NameError at the 'if all_reads_df is not None' check below.
    all_reads_df = None

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    if args.list:
        # List mode: -i/-t name files listing one path/tag per line
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        pairs = zip(chim_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []
    else:
        pairs = [(args.input, args.tag)]

    for chim_name, tag in pairs:
        annot_donors = 0
        annot_acceptors = 0
        read_names_list = []
        PTES_logger.info('Input file: %s ' % chim_name)
        PTES_logger.info('Reading STAR output...')
        i = 0  # stays 0 for an empty input file (was unbound before)
        with open(chim_name, 'r') as input_file:
            # 1-based so the 'Processed' log reports the true row count
            for i, line in enumerate(input_file, 1):
                line_dict = star_line_dict(line=line)
                if not line_dict:
                    continue
                if line_dict['chrom1'] == line_dict['chrom2'] \
                        and line_dict['chain1'] == line_dict['chain2']:
                    chrom = line_dict['chrom1']
                    chain = line_dict['chain1']
                else:
                    skipped['non-filtered'] += 1
                    continue
                if chrom == 'chrM':
                    skipped['chrM'] += 1
                    continue
                if line_dict['junction_letters'] == '-':
                    PTES_logger.error('PE input, junction type -1 is present!')
                    PTES_logger.error('Current version works only with SE output')
                    skipped['PE'] += 1
                    continue
                # Too long, or donor/acceptor not in chimeric orientation
                if abs(line_dict['donor_ss'] - line_dict['acceptor_ss']) > 1000000 \
                        or chain == '+' and line_dict['donor_ss'] < line_dict['acceptor_ss'] \
                        or chain == '-' and line_dict['donor_ss'] > line_dict['acceptor_ss']:
                    skipped['non-chim'] += 1
                    continue
                read_name = line_dict['read_name']
                chim_part1 = get_read_interval(cigar=line_dict['cigar1'],
                                               leftpos=line_dict['coord1'])
                chim_part2 = get_read_interval(cigar=line_dict['cigar2'],
                                               leftpos=line_dict['coord2'])
                junc_dict[(chrom, chain, line_dict['donor_ss'], line_dict['acceptor_ss'])
                          ].update({read_name: (chim_part1, chim_part2)})
                annot_donor = 0
                annot_acceptor = 0
                if line_dict['donor_ss'] in gtf_donors[chrom]:
                    annot_donor = 1
                    annot_donors += 1
                if line_dict['acceptor_ss'] in gtf_acceptors[chrom]:
                    annot_acceptor = 1
                    annot_acceptors += 1
                read_attrs = {
                    'read_name': read_name,
                    'chain': chain,  # chain of chimeric junction
                    'chrom': chrom,
                    'donor': line_dict['donor_ss'],
                    'acceptor': line_dict['acceptor_ss'],
                    'annot_donor': annot_donor,
                    'annot_acceptor': annot_acceptor,
                    'letters_ss': line_dict['junction_letters'],
                    'chim_dist': abs(line_dict['donor_ss'] - line_dict['acceptor_ss']),
                }
                read_names_list.append(read_attrs)
        PTES_logger.info('Reading STAR output... done')
        PTES_logger.info('Processed: %i rows' % i)
        for key in skipped:
            PTES_logger.info('Skipped %s: %i rows' % (key, skipped[key]))
        PTES_logger.info('Converted successfully: %i rows' % len(read_names_list))
        PTES_logger.info('Annot donors: %i' % annot_donors)
        PTES_logger.info('Annot acceptors: %i' % annot_acceptors)
        PTES_logger.info('Creating reads dataframe...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            reads_df = reads_df[['read_name', 'chrom', 'chain', 'donor', 'acceptor',
                                 'annot_donor', 'annot_acceptor', 'letters_ss', 'chim_dist']
                                ].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']
                                              ).reset_index(drop=True)  # reorder columns
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            # Empty read_names_list: column selection on empty frame raises KeyError
            PTES_logger.warning('Creating reads dataframe... empty dataframe')

    # BUGFIX: guard against an empty list — pd.concat([]) raises ValueError
    if args.list and chim_reads_df_list:
        all_reads_df = pd.concat(chim_reads_df_list, sort=True).reset_index(drop=True)

    # Writing reads dataframe
    if all_reads_df is not None:
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'), sep='\t')
        PTES_logger.info('Creating reads dataframe... done')
        # Writing junc_dict; tuple keys become strings (JSON keys must be str)
        PTES_logger.info('Writing intervals to json...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'),
                               'w') as junc_json:
                junc_json.write(json.dumps(
                    {str(k): v for k, v in junc_dict.items()}).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'), 'w') as junc_json:
                json.dump({str(k): v for k, v in junc_dict.items()},
                          junc_json, indent=2)
        PTES_logger.info('Writing intervals to json... done')
        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
    else:
        PTES_logger.warning('Empty dataframe')
"--gtf_annot", type=str, default='/home/sunnymouse/Human_ref/hg19_genes.gtf', help="Absolute path to genome file") parser.add_argument("-t", "--tag", type=str, default='ENCODE', help="Tag name for grouping results, i.e. ENCODE id") args = parser.parse_args() # Functions # Main PTES_logger.info('Input files: A %s ' % args.a_input) PTES_logger.info('Input files: B %s ' % args.b_input) circles = pybedtools.BedTool(args.a_input) panhandles = pybedtools.BedTool(args.b_input).set_chromsizes('hg19') genes = pybedtools.BedTool(args.gtf_annot) #union = panhandles.intersect(circles) #intersection_c = panhandles.intersect(circles, c=True) PTES_logger.info('Intersecting A with B...') ab_results_dict = panhandles.randomstats(circles, iterations=10000, include_distribution=False, shuffle_kwargs={ 'chrom': True, 'genome': "hg19", 'incl': args.gtf_annot
def main():
    """Create genome-browser BED/bigBed tracks from a chimeric-reads table.

    Filters the input reads table (optionally by a junctions table and/or a
    pandas query), loads the per-read intervals from junc_dict.json[.gz],
    and writes BED tracks (all reads, unique junctions, single-line variants),
    a UCSC track file, plus codes.csv / coords.csv lookup tables.
    """
    # Arguments
    '''
    args_s = ('-t ../tests/test_data/ptes/chim_reads_test.csv '
              '-j ../tests/test_data/ptes/junc_dict.json.gz '
              '-f ../tests/test_data/ptes/chim_junctions_test.csv '
              '-q letters_ss=="." '
              '-o ../tests/test_results/bed '
              '-p test '
              '-gz 1')
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--table", type=str,
                        help="DataFrame with data to create BED, required 4 columns are: chrom, strand, donor, acceptor")
    parser.add_argument("-sep", "--separator", type=str, default='\t',
                        help="DataFrame separator, tab by default")
    parser.add_argument("-j", "--json", type=str,
                        help="JSON file with all read intervals as OrderedDicts")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Write anything to enable reading .json.gz")
    parser.add_argument("-q", "--query", type=str,
                        help="Conditions to filter junctions table, string as in pandas.DataFrame.query()")
    parser.add_argument("-f", "--filter", type=str,
                        help="DataFrame with (chrom, strand, donor, acceptor) to filter input table")
    parser.add_argument("-n", "--names", type=str, nargs='+',
                        default=['chim1', 'chim2', 'mate2', ],
                        help="List of names for chim parts in BED name: [chim1, chim2, mate2]. \
                        Important: same order as parts in json values")
    parser.add_argument("-c", "--colors", type=str, nargs='+',
                        default=['r', 'r', 'b', ],
                        help="List of colors for chim parts in BED name: [chim1, chim2, mate2]. \
                        Important: same order as parts in json values\
                        Colors: 'r', 'g', 'b' or in RGB code like '0,255,0'")
    parser.add_argument("-o", "--output", type=str, default='bed',
                        help="Output folder for results, default is bed/")
    parser.add_argument("-p", "--prefix", type=str, default='Output',
                        help="Prefix for all output files")
    parser.add_argument("-sort", "--sort", type=str,
                        help="Write anything to enable sorting BED files")
    parser.add_argument("-bb", "--bigbed", type=str,
                        help="Write anything to enable creating .bigBed files")
    args = parser.parse_args()
    # args = parser.parse_args(args_s.split(' '))

    PTES_logger.info('Reading input files...')
    make_dir(args.output)
    index_list = ['chrom', 'chain', 'donor', 'acceptor']  # junction identity columns
    input_df = pd.read_csv(args.table, sep=args.separator)
    for col in index_list:
        if col not in input_df.columns:
            PTES_logger.error('Input table does not contain required column %s ' % col)
            os._exit(1)
    if args.filter:   # filter by junctions
        filter_df = pd.read_csv(args.filter, sep=args.separator)
        for col in index_list:
            if col not in filter_df.columns:
                PTES_logger.error('Filter table does not contain required column %s ' % col)
                os._exit(1)
        cols_to_use = index_list + list(
            input_df.columns.difference(filter_df.columns))   # avoid repeating columns
        df_new = pd.merge(filter_df, input_df[cols_to_use],
                          on=index_list, how='inner',)
    else:
        df_new = input_df
    if args.query:   # filter reads by conditions
        df_new = df_new.query(args.query)
    df_new.to_csv(os.path.join(args.output, 'df_filter.csv'), sep='\t')

    # Reading .json.gz file; OrderedDict preserves track (chim part) order
    if args.gzip:
        with gzip.GzipFile(args.json, 'r') as fin:
            junc_dict = json.loads(fin.read().decode('utf-8'),
                                   object_pairs_hook=OrderedDict)
    else:
        junc_dict = json.load(open(args.json), object_pairs_hook=OrderedDict)

    # NOTE(review): dict.values()[0] is Python 2 only — under Python 3 this
    # raises TypeError (dict_values is not subscriptable); confirm interpreter.
    len_read_dicts = len(junc_dict.values()[0].values())   # must be 3 for mate_inside/outside and 2 for circles
    if len(args.names) < len_read_dicts:
        PTES_logger.warning('List of names has less items than list of features in read_dicts!')
        # Fall back to generated names: part_1, part_2, ...
        part_names = [x[0] + str(x[1]) for x in list(
            zip(['part_'] * len_read_dicts, range(1, len_read_dicts + 1)))]
    else:
        part_names = args.names
    if len(args.colors) < len_read_dicts:
        PTES_logger.warning('List of colors has less items than list of features in read_dicts!')
        part_colors = ['r'] * len_read_dicts  # fall back: everything red
    else:
        part_colors = args.colors
    PTES_logger.info('Reading input files... done')

    PTES_logger.info('Creating BED files...')
    bed_name = '%s.bed' % args.prefix   # only track lines
    unique_bed_name = '%s.unique.bed' % args.prefix   # one representative read for unique junctions
    single_bed_name = '%s.single.bed' % args.prefix   # single line for one chimeric junction
    single_unique_bed_name = '%s.single.unique.bed' % args.prefix   # for unique junctions, single line for one junction
    code_name = '%s.codes.csv' % args.prefix   # table with codes for each read and descriptions
    coord_name = '%s.coords.csv' % args.prefix   # table with windows to paste into GB and descriptions
    info_name = '%s.track' % args.prefix   # file to submit to GB
    bed_list = []   # for outputting BED lines
    unique_dict = {}   # for outputting BED lines, unique chimeric junctions
    single_list = []   # for outputting BED lines, one row per one chimeric junction
    single_unique_list = []   # for outputting BED lines, one row per one unique chimeric junction
    coord_list = []   # for outputting coord lines
    code_list = []   # for outputting coord lines, one row per one read

    # UCSC Genome Browser track header pointing at the bigBed artifact
    with open(os.path.join(args.output, info_name), 'w') as info_file:
        info_file.write('\n'.join(
            ['browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac rmsk',
             'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
             'browser pack gtexGene',
             'track type=bigBed \
             name="%s" \
             description="bigBed" \
             visibility=2 \
             itemRgb="On" \
             bigDataUrl=https://github.com/sunnymouse25/ptes/blob/dev/research/bed/%s?raw=true' % (
                 args.prefix, bed_name.replace('.bed', '.bb')
             )
             ]
        )
        )
    num = 0  # global read counter, the source of the 6-digit per-read codes
    junctions = df_new.groupby(index_list)['read_name'].apply(list)   # unique chimeric junctions
    for index, read_list in junctions.items():   # value is list of read_names
        chrom = index[0]   # index is (chrom, chain, donor_ss, acceptor_ss)
        chain = index[1]
        donor_ss = index[2]
        acceptor_ss = index[3]
        windows_min = []  # leftmost/rightmost coords over all reads, for the GB window
        windows_max = []
        codes = []
        for read_name in read_list:   # for each read w. this junction
            num += 1
            code = digit_code(number=num)   # every unique number will be 6-digit
            codes.append(code)
            track_lists = []
            if not unique_dict.get(index, None):   # for each unique junction write the 1st read line(s)
                unique_dict[index] = []
                add_unique = True
            else:
                add_unique = False
            read_dict_list = junc_dict[str(index)][read_name]   # list of dicts: each dict is one track (i.e. chim_part)
            # Iterating over tracks; JSON lists are converted back to interval
            # objects in place before building the track line
            for i, read_dict in enumerate(read_dict_list):
                for k, v in read_dict.items():
                    read_dict[k] = interval[v[0][0], v[0][1]]
                track_list = get_track_list(
                    chrom=chrom,
                    chain=chain,
                    read_dict=read_dict,
                    name='_'.join(map(str, [donor_ss, acceptor_ss, code, part_names[i]])),
                    color=part_colors[i])
                track_lists.append(track_list)
            # Writing BED lines, collecting extremas for window size
            for track_list in track_lists:
                windows_min.append(int(track_list[1]))   # track_list[1] is chromStart, track_list[2] is chromEnd
                windows_max.append(int(track_list[2]))
                bed_line = '\t'.join(track_list)
                bed_list.append(bed_line)
                if add_unique:
                    unique_dict[index].append(bed_line)
            # Writing code line
            code_list.append({
                'chrom': chrom,
                'chain': chain,
                'donor': donor_ss,
                'acceptor': acceptor_ss,
                'read_name': read_name,
                'code': code
            })
            # Making BED file with one row for the pair of mates
            single_track = get_single_track(
                read_dict_list=read_dict_list,
                kwargs={'chrom': chrom,
                        'chain': chain,
                        'name': '_'.join(map(str, [donor_ss, acceptor_ss, code])),
                        'color': '255,0,255'})   # for checking in GB that intervals are same
            single_list.append('\t'.join(single_track))
            if add_unique:
                single_unique_list.append('\t'.join(single_track))
        # Description for the junction into coords.csv; +/-200 bp padding
        window = (chrom,   # one window for junction
                  min(windows_min) - 200,
                  max(windows_max) + 200)
        coord_list.append({
            'chrom': chrom,
            'chain': chain,
            'donor': donor_ss,
            'acceptor': acceptor_ss,
            'window': '%s:%i-%i' % window,
            'codes': '-'.join(map(str, [codes[0], codes[-1]])),  # first-last code range
        })
    PTES_logger.info('Creating BED files... done')

    PTES_logger.info('Writing BED files...')
    # coord_file/code_file are opened here but written later via to_csv below
    with open(os.path.join(args.output, bed_name), 'w') as bed_file, \
            open(os.path.join(args.output, unique_bed_name), 'w') as unique_bed_file, \
            open(os.path.join(args.output, single_bed_name), 'w') as single_bed_file, \
            open(os.path.join(args.output, single_unique_bed_name), 'w') as single_unique_bed_file, \
            open(os.path.join(args.output, coord_name), 'w') as coord_file, \
            open(os.path.join(args.output, code_name), 'w') as code_file:
        bed_file.write('\n'.join(bed_list))
        single_bed_file.write('\n'.join(single_list))
        single_unique_bed_file.write('\n'.join(single_unique_list))
        for unique_value in unique_dict.values():
            unique_bed_file.write('\n'.join(list(unique_value)) + '\n')
    PTES_logger.info('Writing BED files... done')
    PTES_logger.info('Creating junctions dataframes...')
    coord_df = pd.DataFrame(coord_list)
    code_df = pd.DataFrame(code_list)
    coord_df.to_csv(os.path.join(args.output, coord_name), sep='\t')
    code_df.to_csv(os.path.join(args.output, code_name), sep='\t')
    PTES_logger.info('Creating junctions dataframes... done')
    if args.sort:
        PTES_logger.info('Sorting BED files...')
        # BED sort order: by chromosome, then numerically by start
        for filename in [bed_name, unique_bed_name, single_bed_name, single_unique_bed_name]:
            shell_call('cat %s | sort -k1,1 -k2,2n > %s.sorted' % (
                os.path.join(args.output, filename),
                os.path.join(args.output, filename),))
        PTES_logger.info('Sorting BED files... done')
    if args.bigbed:   # will also sort files
        PTES_logger.info('Making bigBed...')
        for filename in [bed_name, unique_bed_name, single_bed_name, single_unique_bed_name]:
            to_bigbed(bed_name=filename, folder_name=args.output)
        PTES_logger.info('Making bigBed... done')
parser.add_argument("-gtf", "--gtf_annot", type=str, default='/home/sunnymouse/Human_ref/hg19_exons.gtf', help="Absolute path to genome file") parser.add_argument("-t", "--tag", type=str, help="Tag name for grouping results, i.e. ENCODE id") args = parser.parse_args() # Functions # Exons GTF to junctions dict PTES_logger.info('Reading GTF...') gtf_exons_name = args.gtf_annot gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name) PTES_logger.info('Reading GTF... done') # Reading filtered STAR output PTES_logger.info('Reading STAR output...') path_to_file = args.output.rstrip('/') outside_name = 'mate_outside.junction' outside_list = [] init_file(outside_name, folder=path_to_file) mates_gtag = {'inside': 0, 'outside': 0, 'non-chim': 0} mates_nc = {'inside': 0, 'outside': 0, 'non-chim': 0}
def main():
    """Collect normal (non-chimeric) split reads with introns from SAM files.

    Reads one SAM file (or a list of them), keeps reads whose CIGAR contains
    an intron ('N'), optionally restricted to read names that also appear in
    a STAR Chimeric.out.junction file, and writes into the output folder:
      * norm_split_reads.csv - one row per split read
      * norm_dict.json[.gz]  - read intervals per junction
      * norm_junctions.csv   - aggregated junction table
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR output, Chimeric.out.junction.filtered OR list of such files")
    parser.add_argument("-s", "--sam", type=str,
                        help="Filtered STAR SAM output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder for results")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument("-l", "--list", type=str,
                        help="Enables list input mode. Options: sam, tag - MUST be lists")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument("-t", "--tag", type=str, default='ENCODE',
                        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    make_dir(args.output)
    norm_junc_dict = defaultdict(dict)  # (chrom, chain, donor, acceptor) -> {read_name: intervals}
    norm_read_names_list = []
    # BUGFIX: initialize; previously unbound below (NameError) on empty input
    norm_read_df = None
    if args.list:
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [x.strip('\n') for x in sam_names_file.readlines()]
        if args.input:
            with open(args.input, 'r') as chim_names_file:
                chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        else:
            # No chimeric files: take all split reads from every SAM file
            chim_names_list = [None] * len(sam_names_list)
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
    else:
        triads = [(args.input, args.sam, args.tag)]

    for chim_name, sam_name, tag in triads:
        if chim_name:
            # Restrict to read names present in the chimeric output (column 10)
            with open(chim_name, 'r') as chim_file:
                names_list = [x.strip('\n').split('\t')[9] for x in chim_file.readlines()]
            names_set = set(names_list)  # only names with chimeric output
        with open(sam_name, 'r') as sam_file:
            PTES_logger.info('Input file %s' % sam_name)
            for line in sam_file:
                if line.startswith('@'):  # SAM header line
                    continue
                row = line.strip().split('\t')
                sam_attrs = None
                if len(row) > 1:
                    read_name = row[0]
                    if chim_name:
                        if read_name in names_set:
                            sam_attrs = parse_sam_row(row)
                    else:
                        sam_attrs = parse_sam_row(row)
                if sam_attrs:
                    if 'N' in sam_attrs['cigar']:  # read mapped with intron
                        read_dict = get_read_interval(cigar=sam_attrs['cigar'],
                                                      leftpos=sam_attrs['leftpos'],
                                                      output='dict')
                        if sam_attrs['chain'] == '+':
                            donor_ss = int(read_dict['N1'][0].inf - 1)  # counts first N as intron
                            acceptor_ss = int(read_dict['N1'][0].sup + 1)
                        elif sam_attrs['chain'] == '-':
                            donor_ss = int(read_dict['N1'][0].sup + 1)
                            acceptor_ss = int(read_dict['N1'][0].inf - 1)
                        else:
                            # BUGFIX: unknown chain previously left donor_ss/
                            # acceptor_ss unbound (NameError or stale values)
                            continue
                        norm_junc_dict[(sam_attrs['chrom'],
                                        sam_attrs['chain'],
                                        donor_ss,
                                        acceptor_ss)].update({read_name: tuple([read_dict])})
                        norm_read_names_list.append({'read_name': read_name,
                                                     'chrom': sam_attrs['chrom'],
                                                     'chain': sam_attrs['chain'],
                                                     'donor': donor_ss,
                                                     'acceptor': acceptor_ss,
                                                     'id': tag})
    try:
        norm_read_df = pd.DataFrame(norm_read_names_list)
        norm_read_df = norm_read_df[['read_name', 'chrom', 'chain', 'donor', 'acceptor', 'id',
                                     ]].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']
                                                    ).reset_index(drop=True)
        PTES_logger.info('Writing reads dataframe...')
        norm_read_df.to_csv(os.path.join(args.output, 'norm_split_reads.csv'), sep='\t')
        PTES_logger.info('Writing reads dataframe... done')
    except KeyError:
        # Empty norm_read_names_list: column selection on empty frame raises
        # KeyError; reset to None so the junctions step below is skipped
        norm_read_df = None
        PTES_logger.warning('Creating norm split reads dataframe... empty dataframe')

    # Writing junc_dict; tuple keys become strings (JSON keys must be str)
    PTES_logger.info('Writing intervals to json files...')
    if args.gzip:
        PTES_logger.info('Output will be archived')
        with gzip.GzipFile(os.path.join(args.output, 'norm_dict.json.gz'), 'w') as norm_json:
            norm_json.write(json.dumps(
                {str(k1): v1 for k1, v1 in norm_junc_dict.items()}).encode('utf-8'))
    else:
        with open(os.path.join(args.output, 'norm_dict.json'), 'w') as norm_json:
            json.dump({str(k1): v1 for k1, v1 in norm_junc_dict.items()},
                      norm_json, indent=2)
    PTES_logger.info('Writing intervals to json files... done')

    # Writing junctions dataframe
    # BUGFIX: guard — previously ran unconditionally and crashed with
    # NameError when the reads dataframe could not be built
    if norm_read_df is not None:
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=norm_read_df,
                                          gtf_donors=gtf_donors,
                                          gtf_acceptors=gtf_acceptors)
        junctions_df.to_csv(os.path.join(args.output, 'norm_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
os.makedirs(args.output) except OSError as exc: if exc.errno != errno.EEXIST: raise pass path_to_file = args.output.rstrip('/') col_names = [ 'read_name', 'strand', 'chrom', 'leftpos', 'cigar', 'NH', 'XI', 'XX', 'XY', 'XQ' ] # col_nums = [0,1,2,3,5,14,19] tag_list = ['NH', 'XI', 'XX', 'XY', 'XQ'] # Reading SAM input PTES_logger.info('Reading SAM input...') read_intervals, read_infos = segemehl_to_intervals(segemehl_outfile=args.input) PTES_logger.info('Reading SAM input... done') PTES_logger.info('Reading GTF...') gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=args.gtf_annot) PTES_logger.info('Reading GTF... done') PTES_logger.info('Creating junctions table...') junc_list = intervals_to_junctions( read_intervals=read_intervals, read_infos=read_infos, gtf_donors=gtf_donors, gtf_acceptors=gtf_acceptors
help="Path to .BED file with genes (containers)") parser.add_argument( "-s", "--strand", type=str, help= "Enable strand-specific position mode, BED files should contain 6 or more fields" ) args = parser.parse_args() # Functions # Main make_dir(args.output) PTES_logger.info('Creating intersection file... ') intersection_name = os.path.join( args.output, os.path.basename(args.features) + '.intersect') cmd = 'bedtools intersect -a %s -b %s -s -wo > %s' % ( args.features, args.genes, intersection_name, ) shell_call(cmd) PTES_logger.info('Creating intersection file... done') PTES_logger.info('Reading intersection file... ') p_dict = {} gene_p_dict = defaultdict(list) feature_len_list = []
### Main segemehl_outfile = args.input # SAM file without header dirname = os.path.dirname(os.path.realpath(segemehl_outfile)) if dirname == '': dirname = '.' path_to_file = args.output.rstrip('/') col_names = ['read_name','flag','chrom','leftpos','cigar','xi','xq'] # col_nums = [0,1,2,3,5,14,19] tag_list = ['XI','XQ'] read_intervals = defaultdict(lambda: defaultdict(list)) # mapped intervals read_infos = defaultdict(lambda: defaultdict(list)) # mapped chrom(s) and chain(s) # Reading SAM input PTES_logger.info('Reading SAM input...') with open(segemehl_outfile, 'r') as df_segemehl: for line in df_segemehl: row = line.strip().split('\t') read_name = row[0] flag = int(row[1]) chrom = row[2] leftpos = row[3] cigar = row[5] sam_attrs = {'read_name' : read_name, 'flag': flag, 'chrom' : chrom, 'leftpos' : leftpos, 'cigar' : cigar} tags = dict.fromkeys(tag_list, None) for elm in row[14:]:
def main():
    """Build randomized (shuffled) BED files for features inside gene containers.

    Reads a features BED6 file and a genes ("containers") BED6 file, then for
    each requested method writes ``args.iterations`` shuffled BED files into
    ``<output>/random/``:

    * ``inside``   -- each feature is re-placed at a random position inside its
      own container gene.
    * ``outside``  -- each feature is moved into one of the genes "closest" to
      its container (closeness ranked by gene length or by feature coverage,
      per ``--closest``), optionally preserving its relative position.
    * ``bedtools`` -- delegates shuffling to ``bedtools shuffle -incl``.

    Side effects: creates output folders, runs bedtools via the shell, and
    writes ``<method>_<n>.bed`` files. No return value.
    """
    ### Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--method", type=str, nargs='+',
                        default=[
                            'inside',
                            'outside',
                            'bedtools',
                        ],
                        help="Shuffling method(s): inside, outside, bedtools")
    parser.add_argument(
        "-c", "--closest", type=str, default='coverage',
        help="Choose close elements for outside method by coverage or length")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder for results")
    # NOTE: default was the string '1000'; argparse re-parsed it through
    # type=int so behavior is unchanged, but a plain int is the correct idiom.
    parser.add_argument("-iter", "--iterations", type=int, default=1000,
                        help="Number of iterations, default 1000")
    parser.add_argument(
        "-f", "--features", type=str,
        help="Path to .BED6 file with features (small intervals)")
    parser.add_argument("-g", "--genes", type=str,
                        help="Path to .BED6 file with genes (containers)")
    parser.add_argument(
        "-s", "--chrom_sizes", type=str,
        default='/home/sunnymouse/Human_ref/hg19.chrom.sizes',
        help=
        "The chrom_sizes file should be tab delimited and structured as follows: \
        <chromName><TAB><chromSize>, use bedtools shuffle -h for details"
    )
    args = parser.parse_args()

    make_dir(args.output)
    path_to_file = args.output.rstrip('/')
    random_folder = path_to_file + '/random'
    make_dir(random_folder)

    # Shuffling methods inside and outside:
    if 'inside' in args.method or 'outside' in args.method:
        if 'outside' in args.method:
            # The "outside" method needs, for every container gene, a ranked
            # pool of nearby genes to relocate features into.
            PTES_logger.info('Reading containers... ')
            PTES_logger.info('containers file: %s ' % args.genes)
            strand_dict = {}    # gene interval -> strand ('.' if absent)
            interval_dict = {}  # gene interval -> list of "close" gene intervals
            if args.closest == 'length':
                gene_dict = defaultdict(list)  # chromosome -> gene intervals
                # open file with genes
                with open(args.genes, 'r') as genes_file:
                    for line in genes_file:
                        line_list = line.strip().split()
                        chrom = line_list[0]
                        gene_interval = interval[int(line_list[1]),
                                                 int(line_list[2])]
                        gene_dict[chrom].append(gene_interval)
                        try:
                            strand = line_list[5]
                            strand_dict[gene_interval] = strand
                        except IndexError:
                            # BED has fewer than 6 fields: record unknown
                            # strand but keep going (best-effort).
                            strand_dict[gene_interval] = '.'
                            PTES_logger.error('No strand found')
                            PTES_logger.error(
                                'BED6 format is required for choosing strand-specific position'
                            )
                # sort lists by gene length; "close" genes are neighbors in
                # this length-sorted order (see choose_close)
                for key in gene_dict:  # key is chromosome
                    new_list = sorted(gene_dict[key], key=get_interval_length)
                    interval_dict.update(choose_close(sorted_list=new_list))
            if args.closest == 'coverage':
                # Rank genes by feature coverage instead of by length.
                PTES_logger.info('Creating coverage file... ')
                cover_name = '%s/%s' % (
                    random_folder, os.path.basename(args.features) + '.cov')
                cmd = 'bedtools coverage -a %s -b %s -s > %s' % (
                    args.genes,
                    args.features,
                    cover_name,
                )
                shell_call(cmd)
                PTES_logger.info('Creating coverage file... done')
                cover_dict = {}  # gene interval -> coverage fraction
                with open(cover_name, 'r') as cover_file:
                    for line in cover_file:
                        line_list = line.strip().split()
                        gene_interval = interval[int(line_list[1]),
                                                 int(line_list[2])]
                        # last column of bedtools coverage = fraction of A covered
                        cov = float(line_list[-1])
                        cover_dict[gene_interval] = cov
                        try:
                            strand = line_list[5]
                            strand_dict[gene_interval] = strand
                        except IndexError:
                            strand_dict[gene_interval] = '.'
                            PTES_logger.error('No strand found')
                            PTES_logger.error(
                                'BED6 format is required for choosing strand-specific position'
                            )
                new_list = sorted(cover_dict.items(), key=lambda x: x[1])
                interval_dict.update(
                    choose_close(sorted_list=new_list, items='items'))
            PTES_logger.info('Reading containers... done')

        # Map every feature to its container gene via bedtools intersect -wo.
        PTES_logger.info('Creating intersection file... ')
        intersection_name = '%s/%s' % (
            random_folder, os.path.basename(args.features) + '.intersect')
        cmd = 'bedtools intersect -a %s -b %s -wo -s > %s' % (
            args.features,
            args.genes,
            intersection_name,
        )
        shell_call(cmd)
        PTES_logger.info('Creating intersection file... done')
        PTES_logger.info('Reading intersection file and shuffling... ')
        PTES_logger.info('intersection file: %s' % intersection_name)
        # One bucket of shuffled BED lines per iteration
        # (was np.empty((iterations, 0)).tolist() -- same result, clearer).
        if 'inside' in args.method:
            n_list_inside = [[] for _ in range(args.iterations)]
        if 'outside' in args.method:
            n_list_outside = [[] for _ in range(args.iterations)]
        with open(intersection_name, 'r') as intersect_file:
            for line in intersect_file:
                line_list = line.strip().split()
                # b_start: index of the first B-file (gene) column in the -wo
                # line; falsy return means the line is unusable and is skipped.
                # NOTE(review): b_start == 0 would also be skipped here --
                # presumably impossible since A contributes >= 6 leading
                # columns; confirm against get_b_start.
                b_start = get_b_start(line)
                if not b_start:
                    continue
                chrom1 = line_list[0]
                feature_interval = interval[int(line_list[1]),
                                            int(line_list[2])]
                gene_interval = interval[int(line_list[b_start + 1]),
                                         int(line_list[b_start + 2])]
                for n in range(args.iterations):
                    if 'inside' in args.method:
                        # Random placement within the same container gene.
                        random_interval_inside = randomize_interval(
                            small_i=feature_interval, large_i=gene_interval)
                        n_list_inside[n].append(
                            interval_to_bed_line(
                                chrom=chrom1,
                                single_interval=random_interval_inside,
                                name=line_list[3],
                                strand=line_list[5]))
                    if 'outside' in args.method:
                        new_large_interval = random.choice(
                            interval_dict[gene_interval]
                        )  # choose one of closest genes
                        new_strand = strand_dict[new_large_interval]
                        feature_len = get_interval_length(feature_interval)
                        gene_len = get_interval_length(gene_interval)
                        if feature_len <= gene_len:
                            # Feature fits: keep its relative position inside
                            # the new container, strand-aware when possible.
                            try:
                                container_strand = line_list[b_start + 5]
                                relative_position = count_relative_position(
                                    feature=feature_interval,
                                    container=gene_interval,
                                    container_strand=container_strand)
                                random_interval_outside = randomize_interval(
                                    small_i=feature_interval,
                                    large_i=new_large_interval,
                                    large_i_strand=new_strand,
                                    same_position=True,
                                    p=relative_position)
                            except IndexError:
                                # No strand column: fall back to a
                                # strand-agnostic relative position.
                                PTES_logger.error('No strand found')
                                PTES_logger.error(
                                    'BED6 format is required for choosing strand-specific position'
                                )
                                relative_position = count_relative_position(
                                    feature=feature_interval,
                                    container=gene_interval)
                                random_interval_outside = randomize_interval(
                                    small_i=feature_interval,
                                    large_i=new_large_interval,
                                    same_position=True,
                                    p=relative_position)
                        else:
                            # Feature is longer than its container; relative
                            # position is meaningless, place freely.
                            random_interval_outside = randomize_interval(
                                small_i=feature_interval,
                                large_i=new_large_interval,
                                same_position=False,
                            )
                        n_list_outside[n].append(
                            interval_to_bed_line(
                                chrom=chrom1,
                                single_interval=random_interval_outside,
                                name=line_list[3],
                                strand=line_list[5]))
        PTES_logger.info('Reading intersection file and shuffling... done')

        # Dump one BED file per iteration per method.
        PTES_logger.info('Creating output files... ')
        for n in range(args.iterations):
            if 'inside' in args.method:
                out_name = random_folder + '/%s_%i.bed' % ('inside', n)
                with open(out_name, 'w') as out_file:
                    out_file.write('\n'.join(n_list_inside[n]))
            if 'outside' in args.method:
                out_name = random_folder + '/%s_%i.bed' % ('outside', n)
                with open(out_name, 'w') as out_file:
                    out_file.write('\n'.join(n_list_outside[n]))
        PTES_logger.info('Creating output files... done')

    # Shuffling method 3
    if 'bedtools' in args.method:
        PTES_logger.info('Running bedtools shuffle... ')
        for n in range(args.iterations):
            random_file = 'bedtools_%i.bed' % n
            # -incl restricts shuffled placement to the gene intervals;
            # -chrom keeps each feature on its original chromosome.
            cmd = 'bedtools shuffle -incl %s -i %s -g %s -chrom > %s/%s' % (
                args.genes, args.features, args.chrom_sizes, random_folder,
                random_file)
            shell_call(cmd)
        PTES_logger.info('Running bedtools shuffle... done')