def test_annot_junctions(self):
    """Donor/acceptor dicts built from the GTF must contain known chr16 sites.

    The coordinates were picked manually from the shuffled protein-coding
    GTF fixture: one junction on the '+' chain and one on the '-' chain.
    """
    filename = './test_data/ptes/hg19_exons_prot_coding.gtf.shuf'
    donors, acceptors = ptes.annot_junctions(gtf_exons_name=filename)
    chrom = 'chr16'
    real_donors = [2138327, 30020798]  # chain +, chain -
    real_acceptors = [2138227, 30020879]  # also present in SJ.out.tab
    # FIX: removed unused locals star_donors / star_acceptors (assigned, never read).
    for real_donor in real_donors:
        self.assertIn(real_donor, donors[chrom])
    for real_acceptor in real_acceptors:
        self.assertIn(real_acceptor, acceptors[chrom])
class TestStar(unittest.TestCase):
    """Tests for STAR chimeric-output processing (ptes / star_SE_chimeric).

    The class body parses the shuffled GTF and Chimeric.out.junction fixtures
    once, at class-creation time, so every test reuses the same annotation
    dicts and parsed chimeric lines.
    """
    # Shared fixtures — evaluated when the class body executes (import time).
    gtf_exons_name = os.path.join(INPUT_DIR,'hg19_exons_prot_coding.gtf.shuf')
    gtf_donors, gtf_acceptors = ptes.annot_junctions(gtf_exons_name=gtf_exons_name)
    chim_list = []
    with open(os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'), 'r') as chim_file:
        for line in chim_file:
            res_dict = ptes.star_line_dict(line=line)
            chim_list.append(res_dict)

    def test_parse_sam_row(self):
        """
        5 random lines from ENCFF636QII/mate1_Aligned.out.sam
        :return: dicts with attributes
        """
        # NOTE(review): the body below is a bare string literal — this test is
        # effectively disabled and asserts nothing.
        '''
        with open(os.path.join(INPUT_DIR,'Aligned.out.sam.shuf')) as sam_file:
            for line in sam_file:
                row = line.strip().split('\t')
                if len(row) > 1:
                    res_dict = ptes.parse_sam_row(row=row)
        '''

    def test_sam_input(self, dump=False):
        """sam_input result must equal the stored sam_dict.json snapshot.

        With dump=True (manual run only) the snapshot is regenerated in
        OUTPUT_DIR instead of being checked.
        """
        res_dict = star_SE_chimeric.sam_input(
            sam_name=os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf'),
            chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'))
        if dump:
            with open(os.path.join(OUTPUT_DIR, 'sam_dict.json'), 'w') as sam_json_file:
                json.dump(res_dict, sam_json_file, indent=2)
        else:
            with open(os.path.join(INPUT_DIR, 'sam_dict.json'), 'r') as sam_json_file:
                res_dict_exp = json.load(sam_json_file)
            self.assertEqual(res_dict, res_dict_exp)

    def test_chim_input(self, dump=False):
        """chim_input result must equal the stored chim_dict.json snapshot."""
        junc_dict = defaultdict(dict)
        sam_dict = star_SE_chimeric.sam_input(
            sam_name=os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf'),
            chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'), )
        chim_res_list = star_SE_chimeric.chim_input(
            chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
            gtf_donors=self.gtf_donors,
            gtf_acceptors=self.gtf_acceptors,
            sam_dict=sam_dict,
            junc_dict=junc_dict)
        if dump:
            with open(os.path.join(OUTPUT_DIR, 'chim_dict.json'), 'w') as chim_json_file:
                json.dump(chim_res_list, chim_json_file, indent=2)
        else:
            with open(os.path.join(INPUT_DIR, 'chim_dict.json'), 'r') as chim_json_file:
                res_list_exp = json.load(chim_json_file)
            self.assertEqual(chim_res_list, res_list_exp)

    def test_reads_to_junctions(self, dump=False):
        """Pipeline sam_input -> chim_input -> reads_to_junctions vs junctions.csv."""
        res_dict = star_SE_chimeric.sam_input(
            sam_name=os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf'),
            chim_name=os.path.join(INPUT_DIR,'Chimeric.out.junction.shuf'), )
        reads_list = star_SE_chimeric.chim_input(
            chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
            gtf_donors=self.gtf_donors,
            gtf_acceptors=self.gtf_acceptors,
            sam_dict=res_dict)
        reads_df = pd.DataFrame(reads_list)
        reads_df['id'] = 'tag'  # constant grouping id for the fixture
        junc_df = star_SE_chimeric.reads_to_junctions(reads_df)
        # Debug output: preview the first rows of the aggregated table.
        for index, row in junc_df.head(n=3).iterrows():
            print(list(row.index))
            print(row.values)
        if dump:
            junc_df.to_csv(os.path.join(OUTPUT_DIR, 'junctions.csv'), sep='\t')
        else:
            exp_junc_df = pd.read_csv(os.path.join(INPUT_DIR, 'junctions.csv'), sep='\t')
            # Compare as records so the on-disk CSV index matches the reset index.
            self.assertEqual(exp_junc_df.to_dict('records'),
                             junc_df.reset_index(drop=False).to_dict('records'))

    def test_junc_dict(self):
        """junc_dict filled by chim_input must round-trip through JSON."""
        # NOTE(review): defaultdict(list) here vs defaultdict(dict) in
        # test_chim_input — confirm which container chim_input expects.
        junc_dict_test = defaultdict(list)
        res_dict = star_SE_chimeric.sam_input(
            sam_name=os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf'),
            chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'), )
        reads_list = star_SE_chimeric.chim_input(
            chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
            gtf_donors=self.gtf_donors,
            gtf_acceptors=self.gtf_acceptors,
            sam_dict=res_dict,
            junc_dict=junc_dict_test)
        # Keys are tuples — stringify them before JSON serialization.
        with open(os.path.join(OUTPUT_DIR, 'junc_dict.json'), 'w') as junc_json:
            json.dump({str(k): v for k, v in junc_dict_test.items()}, junc_json, indent=2)
        # NOTE(review): file handle from open() is never closed here.
        data = json.load(open(os.path.join(OUTPUT_DIR, 'junc_dict.json')),
                         object_pairs_hook=OrderedDict)
        with open(os.path.join(OUTPUT_DIR, 'junc_dict_loaded.json'), 'w') as junc_json:
            json.dump(data, junc_json, indent=2)
def main():
    """CLI entry point: collect chimeric reads from STAR output into result tables.

    Takes (Chimeric.out.junction, SAM) input pairs — single files, or files
    listing many inputs in --list mode — and writes chim_reads.csv,
    junc_dict.json(.gz) and chim_junctions.csv into the output folder.
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str, help="STAR output, Chimeric.out.junction \
                        OR list of such files")
    parser.add_argument(
        "-s", "--sam",
        type=str,
        help="Filtered STAR SAM output, with read_names same as in Chimeric.out.junction OR list")
    parser.add_argument("-o", "--output", type=str, help="Output folder for results")
    parser.add_argument("-gz", "--gzip", type=str, help="Option to create .json.gz")
    parser.add_argument(
        "-l", "--list",
        type=str,
        help="Enables list input mode. Options: input, sam, tag - MUST be lists")
    parser.add_argument("-gtf", "--gtf_annot",
                        type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument(
        "-t", "--tag",
        type=str,
        default='ENCODE',
        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name,
                                                feature_name='exon')
    PTES_logger.info('Reading GTF... done')
    # non-iterative
    make_dir(args.output)
    # Filled in-place by chim_input; keys are stringified before the JSON dump below.
    junc_dict = defaultdict(dict)
    all_reads_df = None  # stays None if every input produced an empty dataframe
    if args.list:
        # List mode: each argument names a file that lists one path/tag per line.
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [x.strip('\n') for x in sam_names_file.readlines()]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []
    else:
        triads = [(args.input, args.sam, args.tag)]
    for chim_name, sam_name, tag in triads:
        PTES_logger.info('Input file %s' % chim_name)
        # Reading filtered STAR non-chim output
        PTES_logger.info('Reading STAR .sam output...')
        sam_dict = sam_input(sam_name=sam_name, chim_name=chim_name)
        PTES_logger.info('Reading STAR .sam output... done')
        # Reading filtered STAR output
        PTES_logger.info('Reading STAR chimeric output...')
        read_names_list = chim_input(chim_name=chim_name,
                                     gtf_donors=gtf_donors,
                                     gtf_acceptors=gtf_acceptors,
                                     sam_dict=sam_dict,
                                     junc_dict=junc_dict)
        PTES_logger.info('Reading STAR chimeric output... done')
        PTES_logger.info('Creating reads dataframes...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            reads_df = reads_df[['read_name', 'chrom', 'chain',
                                 'donor', 'acceptor', 'annot_donor',
                                 'annot_acceptor', 'letters_ss',
                                 'chim_dist', 'mate_dist', 'type',
                                 ]].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']
                                                ).reset_index(drop=True)  # reorder columns
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            # Empty read_names_list -> DataFrame without the expected columns.
            PTES_logger.warning('Creating reads dataframe... empty dataframe')
    if args.list:
        # NOTE(review): pd.concat raises ValueError if chim_reads_df_list is
        # empty (every input file produced an empty dataframe) — confirm inputs
        # are always non-empty or guard this call.
        all_reads_df = pd.concat(chim_reads_df_list, sort=True).reset_index(drop=True)
    if all_reads_df is not None:
        # Writing reads dataframe
        PTES_logger.info('Writing reads dataframe...')
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'), sep='\t')
        PTES_logger.info('Writing reads dataframe... done')
        # Writing junc_dict
        PTES_logger.info('Writing intervals to json file...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'), 'w') as junc_json:
                junc_json.write(
                    json.dumps({str(k): v for k, v in junc_dict.items()}).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'), 'w') as junc_json:
                json.dump({str(k): v for k, v in junc_dict.items()}, junc_json, indent=2)
        PTES_logger.info('Writing intervals to json file... done')
        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
def main():
    """CLI entry point: parse STAR Chimeric.out.junction rows into junction tables.

    For each input file (one file, or many in --list mode) rows are filtered
    (same chromosome and chain, not chrM, SE junction type, chimeric distance
    <= 1 Mb with donor/acceptor order matching the chain), annotated against
    GTF donor/acceptor sites and written to chim_reads.csv,
    junc_dict.json(.gz) and chim_junctions.csv in the output folder.
    """
    ### Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR Chimeric.out.junction output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Path for subfolder with results")
    parser.add_argument("-l", "--list", type=str,
                        help="Enables list input mode. Options: input, tag - MUST be lists")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument("-t", "--tag", type=str,
                        default='ENCODE',
                        help="Tag name for grouping results (prefix), i.e. ENCODE id OR list")
    args = parser.parse_args()

    # Main
    make_dir(args.output)
    # Counters accumulate across all input files and are re-logged per file.
    skipped = {'non-filtered': 0,  # different chromosomes and/or chains
               'chrM': 0,  # mapping to chrM
               'PE': 0,  # junction between the mates, -1 in STAR output
               'non-chim': 0}  # STAR counts very long (>1Mb) junctions as chimeric
    junc_dict = defaultdict(dict)
    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')
    if args.list:
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        pairs = zip(chim_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []
    else:
        pairs = [(args.input, args.tag)]
    # BUGFIX: all_reads_df was never initialized; in non-list mode an empty
    # input (KeyError path below) left it undefined -> NameError at the
    # `is not None` check.
    all_reads_df = None
    for chim_name, tag in pairs:
        annot_donors = 0
        annot_acceptors = 0
        read_names_list = []
        PTES_logger.info('Input file: %s ' % chim_name)
        PTES_logger.info('Reading STAR output...')
        # BUGFIX: row counter defined before the loop (was undefined for an
        # empty file) and 1-based so the log reports the true row count
        # (enumerate from 0 under-reported by one).
        n_rows = 0
        with open(chim_name, 'r') as input_file:
            for n_rows, line in enumerate(input_file, 1):
                line_dict = star_line_dict(line=line)
                if not line_dict:
                    continue
                if line_dict['chrom1'] == line_dict['chrom2'] \
                        and line_dict['chain1'] == line_dict['chain2']:
                    chrom = line_dict['chrom1']
                    chain = line_dict['chain1']
                else:
                    skipped['non-filtered'] += 1
                    continue
                if chrom == 'chrM':
                    skipped['chrM'] += 1
                    continue
                if line_dict['junction_letters'] == '-':
                    PTES_logger.error('PE input, junction type -1 is present!')
                    PTES_logger.error('Current version works only with SE output')
                    skipped['PE'] += 1
                    continue
                if abs(line_dict['donor_ss'] - line_dict['acceptor_ss']) > 1000000 \
                        or chain == '+' and line_dict['donor_ss'] < line_dict['acceptor_ss'] \
                        or chain == '-' and line_dict['donor_ss'] > line_dict['acceptor_ss']:
                    skipped['non-chim'] += 1
                    continue
                read_name = line_dict['read_name']
                chim_part1 = get_read_interval(cigar=line_dict['cigar1'],
                                               leftpos=line_dict['coord1'])
                chim_part2 = get_read_interval(cigar=line_dict['cigar2'],
                                               leftpos=line_dict['coord2'])
                junc_dict[(chrom, chain,
                           line_dict['donor_ss'],
                           line_dict['acceptor_ss'])].update({read_name: (chim_part1, chim_part2)})
                annot_donor = 0
                annot_acceptor = 0
                if line_dict['donor_ss'] in gtf_donors[chrom]:
                    annot_donor = 1
                    annot_donors += 1
                if line_dict['acceptor_ss'] in gtf_acceptors[chrom]:
                    annot_acceptor = 1
                    annot_acceptors += 1
                read_attrs = {
                    'read_name': read_name,
                    'chain': chain,  # chain of chimeric junction
                    'chrom': chrom,
                    'donor': line_dict['donor_ss'],
                    'acceptor': line_dict['acceptor_ss'],
                    'annot_donor': annot_donor,
                    'annot_acceptor': annot_acceptor,
                    'letters_ss': line_dict['junction_letters'],
                    'chim_dist': abs(line_dict['donor_ss'] - line_dict['acceptor_ss']),
                }
                read_names_list.append(read_attrs)
        PTES_logger.info('Reading STAR output... done')
        PTES_logger.info('Processed: %i rows' % n_rows)
        for key in skipped:
            PTES_logger.info('Skipped %s: %i rows' % (key, skipped[key]))
        PTES_logger.info('Converted successfully: %i rows' % len(read_names_list))
        PTES_logger.info('Annot donors: %i' % annot_donors)
        PTES_logger.info('Annot acceptors: %i' % annot_acceptors)
        PTES_logger.info('Creating reads dataframe...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            reads_df = reads_df[['read_name', 'chrom', 'chain',
                                 'donor', 'acceptor', 'annot_donor',
                                 'annot_acceptor', 'letters_ss', 'chim_dist']
                                ].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']
                                              ).reset_index(drop=True)  # reorder columns
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            # Empty read_names_list -> DataFrame without the expected columns.
            PTES_logger.warning('Creating reads dataframe... empty dataframe')
    if args.list:
        # BUGFIX: pd.concat raises ValueError on an empty list; treat that
        # as "no data" so the warning branch below is taken instead.
        if chim_reads_df_list:
            all_reads_df = pd.concat(chim_reads_df_list, sort=True).reset_index(drop=True)
        else:
            all_reads_df = None
    # Writing reads dataframe
    if all_reads_df is not None:
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'), sep='\t')
        PTES_logger.info('Creating reads dataframe... done')
        # Writing junc_dict
        PTES_logger.info('Writing intervals to json...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'), 'w') as junc_json:
                junc_json.write(
                    json.dumps({str(k): v for k, v in junc_dict.items()}).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'), 'w') as junc_json:
                json.dump({str(k): v for k, v in junc_dict.items()}, junc_json, indent=2)
        PTES_logger.info('Writing intervals to json... done')
        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
    else:
        PTES_logger.warning('Empty dataframe')
# NOTE(review): fragment — the enclosing def and the opening
# parser.add_argument("-gtf", ...) call start above this chunk, and the
# function continues past it; tokens kept as found.
                        type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to genome file")
    parser.add_argument("-t", "--tag", type=str,
                        help="Tag name for grouping results, i.e. ENCODE id")
    args = parser.parse_args()

    # Functions

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    # Reading filtered STAR output
    PTES_logger.info('Reading STAR output...')
    path_to_file = args.output.rstrip('/')
    outside_name = 'mate_outside.junction'
    outside_list = []
    # NOTE(review): presumably creates/clears the output file — confirm init_file.
    init_file(outside_name, folder=path_to_file)
    # NOTE(review): counters look like GT/AG vs non-canonical mate categories —
    # confirm against the (unseen) code that updates them.
    mates_gtag = {'inside': 0, 'outside': 0, 'non-chim': 0}
    mates_nc = {'inside': 0, 'outside': 0, 'non-chim': 0}
    annot_donors = 0
    annot_acceptors = 0
def main():
    """CLI entry point: extract normal (intron-split) reads from STAR SAM output.

    Scans SAM records for reads mapped with an intron ('N' in the CIGAR),
    derives donor/acceptor splice sites from the first intron interval, and
    writes norm_split_reads.csv, norm_dict.json(.gz) and norm_junctions.csv
    into the output folder. If a Chimeric.out.junction file is given, only
    reads also present there (column 10) are considered.
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR output, Chimeric.out.junction.filtered \
                        OR list of such files")
    parser.add_argument("-s", "--sam", type=str,
                        help="Filtered STAR SAM output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder for results")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument("-l", "--list", type=str,
                        help="Enables list input mode. Options: sam, tag - MUST be lists")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument("-t", "--tag", type=str,
                        default='ENCODE',
                        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')
    make_dir(args.output)
    norm_junc_dict = defaultdict(dict)
    norm_read_names_list = []
    if args.list:
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [x.strip('\n') for x in sam_names_file.readlines()]
        if args.input:
            with open(args.input, 'r') as chim_names_file:
                chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        else:
            # No chimeric files given: pair every SAM file with None (no name filter).
            chim_names_list = [None] * len(sam_names_list)
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
    else:
        triads = [(args.input, args.sam, args.tag)]
    for chim_name, sam_name, tag in triads:
        if chim_name:
            with open(chim_name, 'r') as chim_file:
                names_list = [x.strip('\n').split('\t')[9] for x in chim_file.readlines()]
                names_set = set(names_list)  # only names with chimeric output
        with open(sam_name, 'r') as sam_file:
            PTES_logger.info('Input file %s' % sam_name)
            for line in sam_file:
                if line.startswith('@'):  # SAM header
                    continue
                row = line.strip().split('\t')
                sam_attrs = None
                if len(row) > 1:
                    read_name = row[0]
                    if chim_name:
                        if read_name in names_set:
                            sam_attrs = parse_sam_row(row)
                    else:
                        sam_attrs = parse_sam_row(row)
                if sam_attrs:
                    if 'N' in sam_attrs['cigar']:  # read mapped with intron
                        read_dict = get_read_interval(cigar=sam_attrs['cigar'],
                                                      leftpos=sam_attrs['leftpos'],
                                                      output='dict')
                        if sam_attrs['chain'] == '+':
                            donor_ss = int(read_dict['N1'][0].inf - 1)  # counts first N as intron
                            acceptor_ss = int(read_dict['N1'][0].sup + 1)
                        elif sam_attrs['chain'] == '-':
                            donor_ss = int(read_dict['N1'][0].sup + 1)
                            acceptor_ss = int(read_dict['N1'][0].inf - 1)
                        else:
                            # BUGFIX: an unexpected chain value previously fell
                            # through with donor_ss/acceptor_ss undefined
                            # (NameError below); skip the record instead.
                            continue
                        norm_junc_dict[(sam_attrs['chrom'],
                                        sam_attrs['chain'],
                                        donor_ss,
                                        acceptor_ss)].update({read_name: tuple([read_dict])})
                        norm_read_names_list.append({'read_name': read_name,
                                                     'chrom': sam_attrs['chrom'],
                                                     'chain': sam_attrs['chain'],
                                                     'donor': donor_ss,
                                                     'acceptor': acceptor_ss,
                                                     'id': tag})
    # BUGFIX: stays None when no split reads were collected, so the junctions
    # step below can be skipped instead of receiving a column-less dataframe.
    norm_read_df = None
    try:
        reads_df = pd.DataFrame(norm_read_names_list)
        norm_read_df = reads_df[['read_name', 'chrom', 'chain',
                                 'donor', 'acceptor', 'id',
                                 ]].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']
                                                ).reset_index(drop=True)
        PTES_logger.info('Writing reads dataframe...')
        norm_read_df.to_csv(os.path.join(args.output, 'norm_split_reads.csv'), sep='\t')
        PTES_logger.info('Writing reads dataframe... done')
    except KeyError:
        # Empty norm_read_names_list -> DataFrame without the expected columns.
        PTES_logger.warning('Creating norm split reads dataframe... empty dataframe')
    # Writing junc_dict
    PTES_logger.info('Writing intervals to json files...')
    if args.gzip:
        PTES_logger.info('Output will be archived')
        with gzip.GzipFile(os.path.join(args.output, 'norm_dict.json.gz'), 'w') as norm_json:
            norm_json.write(
                json.dumps({str(k1): v1 for k1, v1 in norm_junc_dict.items()}).encode('utf-8'))
    else:
        with open(os.path.join(args.output, 'norm_dict.json'), 'w') as norm_json:
            json.dump({str(k1): v1 for k1, v1 in norm_junc_dict.items()}, norm_json, indent=2)
    PTES_logger.info('Writing intervals to json files... done')
    # Writing junctions dataframe
    if norm_read_df is not None:
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=norm_read_df,
                                          gtf_donors=gtf_donors,
                                          gtf_acceptors=gtf_acceptors)
        junctions_df.to_csv(os.path.join(args.output, 'norm_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
    else:
        # BUGFIX: previously an empty dataframe was passed to reads_to_junctions.
        PTES_logger.warning('Empty dataframe')