Example #1
0
 def test_annot_junctions(self):
     """Donor/acceptor sets parsed from the GTF must contain known real sites.

     Checks two known donor and two known acceptor coordinates on chr16,
     one per chain; the acceptors are also present in SJ.out.tab.
     """
     filename = './test_data/ptes/hg19_exons_prot_coding.gtf.shuf'
     donors, acceptors = ptes.annot_junctions(gtf_exons_name=filename)
     chrom = 'chr16'
     real_donors = [2138327, 30020798]  # chain +, chain -
     real_acceptors = [2138227, 30020879]  # also present in SJ.out.tab
     # Fix: removed unused placeholder lists star_donors / star_acceptors.
     for real_donor in real_donors:
         self.assertIn(real_donor, donors[chrom])
     for real_acceptor in real_acceptors:
         self.assertIn(real_acceptor, acceptors[chrom])
Example #2
0
class TestStar(unittest.TestCase):
    """Tests for star_SE_chimeric built on shuffled STAR output fixtures.

    Class-level attributes are shared fixtures, built once at class
    creation: GTF donor/acceptor sets and the parsed chimeric lines.
    """
    gtf_exons_name = os.path.join(INPUT_DIR, 'hg19_exons_prot_coding.gtf.shuf')
    gtf_donors, gtf_acceptors = ptes.annot_junctions(gtf_exons_name=gtf_exons_name)

    chim_list = []
    with open(os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'), 'r') as chim_file:
        for line in chim_file:
            res_dict = ptes.star_line_dict(line=line)
            chim_list.append(res_dict)

    def test_parse_sam_row(self):
        """
        5 random lines from ENCFF636QII/mate1_Aligned.out.sam
        :return: dicts with attributes
        """
        # TODO(review): the body was disabled (kept as a no-op string
        # literal) and asserted nothing; converted to real comments so the
        # dead code is explicit. Re-enable with expected values when known.
        # with open(os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf')) as sam_file:
        #     for line in sam_file:
        #         row = line.strip().split('\t')
        #         if len(row) > 1:
        #             res_dict = ptes.parse_sam_row(row=row)

    def test_sam_input(self, dump=False):
        """sam_input() result must equal the stored expected JSON.

        With dump=True (manual run) the result is written to OUTPUT_DIR
        instead of being compared.
        """
        res_dict = star_SE_chimeric.sam_input(sam_name=os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf'),
                                              chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'))
        if dump:
            with open(os.path.join(OUTPUT_DIR, 'sam_dict.json'), 'w') as sam_json_file:
                json.dump(res_dict, sam_json_file, indent=2)
        else:
            with open(os.path.join(INPUT_DIR, 'sam_dict.json'), 'r') as sam_json_file:
                res_dict_exp = json.load(sam_json_file)
            self.assertEqual(res_dict, res_dict_exp)

    def test_chim_input(self, dump=False):
        """chim_input() result must equal the stored expected JSON.

        With dump=True (manual run) the result is written to OUTPUT_DIR
        instead of being compared.
        """
        junc_dict = defaultdict(dict)
        sam_dict = star_SE_chimeric.sam_input(sam_name=os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf'),
                                              chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
                                              )

        chim_res_list = star_SE_chimeric.chim_input(chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
                                                    gtf_donors=self.gtf_donors,
                                                    gtf_acceptors=self.gtf_acceptors,
                                                    sam_dict=sam_dict,
                                                    junc_dict=junc_dict)
        if dump:
            with open(os.path.join(OUTPUT_DIR, 'chim_dict.json'), 'w') as chim_json_file:
                json.dump(chim_res_list, chim_json_file, indent=2)
        else:
            with open(os.path.join(INPUT_DIR, 'chim_dict.json'), 'r') as chim_json_file:
                res_list_exp = json.load(chim_json_file)
            self.assertEqual(chim_res_list, res_list_exp)

    def test_reads_to_junctions(self, dump=False):
        """Grouping reads into junctions must reproduce the stored CSV.

        With dump=True (manual run) the junctions table is written to
        OUTPUT_DIR instead of being compared.
        """
        res_dict = star_SE_chimeric.sam_input(sam_name=os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf'),
                                              chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
                                              )

        reads_list = star_SE_chimeric.chim_input(chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
                                                 gtf_donors=self.gtf_donors,
                                                 gtf_acceptors=self.gtf_acceptors,
                                                 sam_dict=res_dict)

        reads_df = pd.DataFrame(reads_list)
        reads_df['id'] = 'tag'
        junc_df = star_SE_chimeric.reads_to_junctions(reads_df)
        for index, row in junc_df.head(n=3).iterrows():
            print(list(row.index))
            print(row.values)
        if dump:
            junc_df.to_csv(os.path.join(OUTPUT_DIR, 'junctions.csv'), sep='\t')
        else:
            exp_junc_df = pd.read_csv(os.path.join(INPUT_DIR, 'junctions.csv'), sep='\t')
            self.assertEqual(exp_junc_df.to_dict('records'),
                             junc_df.reset_index(drop=False).to_dict('records'))

    def test_junc_dict(self):
        """junc_dict must survive a JSON dump/load round trip."""
        junc_dict_test = defaultdict(list)

        res_dict = star_SE_chimeric.sam_input(sam_name=os.path.join(INPUT_DIR, 'Aligned.out.sam.shuf'),
                                              chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
                                              )
        reads_list = star_SE_chimeric.chim_input(chim_name=os.path.join(INPUT_DIR, 'Chimeric.out.junction.shuf'),
                                                 gtf_donors=self.gtf_donors,
                                                 gtf_acceptors=self.gtf_acceptors,
                                                 sam_dict=res_dict,
                                                 junc_dict=junc_dict_test)

        with open(os.path.join(OUTPUT_DIR, 'junc_dict.json'), 'w') as junc_json:
            json.dump({str(k): v for k, v in junc_dict_test.items()}, junc_json, indent=2)

        # Fix: read back inside a context manager; the original
        # json.load(open(...)) leaked the file handle.
        with open(os.path.join(OUTPUT_DIR, 'junc_dict.json')) as junc_json:
            data = json.load(junc_json, object_pairs_hook=OrderedDict)

        with open(os.path.join(OUTPUT_DIR, 'junc_dict_loaded.json'), 'w') as junc_json:
            json.dump(data, junc_json, indent=2)
Example #3
0
def main():
    """Build chimeric reads/junctions tables from STAR SAM + chimeric output.

    CLI entry point: parses arguments, loads exon donor/acceptor sets from
    the GTF, and for every (chimeric, sam, tag) input triad collects
    read-level records. Writes chim_reads.csv, junc_dict.json(.gz) with
    read intervals, and chim_junctions.csv into the output folder.
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        help="STAR output, Chimeric.out.junction \
                            OR list of such files")
    parser.add_argument(
        "-s",
        "--sam",
        type=str,
        help=
        "Filtered STAR SAM output, with read_names same as in Chimeric.out.junction OR list"
    )
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        help="Output folder for results")
    parser.add_argument("-gz",
                        "--gzip",
                        type=str,
                        help="Option to create .json.gz")
    parser.add_argument(
        "-l",
        "--list",
        type=str,
        help="Enables list input mode. Options: input, sam, tag - MUST be lists"
    )
    parser.add_argument("-gtf",
                        "--gtf_annot",
                        type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default='ENCODE',
        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name,
                                                feature_name='exon')
    PTES_logger.info('Reading GTF... done')

    # non-iterative
    make_dir(args.output)
    junc_dict = defaultdict(dict)  # junction key -> {read_name: intervals}
    all_reads_df = None

    if args.list:
        # In list mode each argument names a file with one entry per line.
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [
                x.strip('\n') for x in chim_names_file.readlines()
            ]
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [
                x.strip('\n') for x in sam_names_file.readlines()
            ]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [
                x.strip('\n') for x in tag_names_file.readlines()
            ]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []
    else:
        triads = [(args.input, args.sam, args.tag)]

    for chim_name, sam_name, tag in triads:
        PTES_logger.info('Input file %s' % chim_name)

        # Reading filtered STAR non-chim output
        PTES_logger.info('Reading STAR .sam output...')
        sam_dict = sam_input(sam_name=sam_name, chim_name=chim_name)
        PTES_logger.info('Reading STAR .sam output... done')

        # Reading filtered STAR output
        PTES_logger.info('Reading STAR chimeric output...')
        read_names_list = chim_input(chim_name=chim_name,
                                     gtf_donors=gtf_donors,
                                     gtf_acceptors=gtf_acceptors,
                                     sam_dict=sam_dict,
                                     junc_dict=junc_dict)
        PTES_logger.info('Reading STAR chimeric output... done')
        PTES_logger.info('Creating reads dataframes...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            # Column selection raises KeyError when read_names_list is empty.
            reads_df = reads_df[[
                'read_name',
                'chrom',
                'chain',
                'donor',
                'acceptor',
                'annot_donor',
                'annot_acceptor',
                'letters_ss',
                'chim_dist',
                'mate_dist',
                'type',
            ]].sort_values(
                by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(
                    drop=True)  # reorder columns
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            PTES_logger.warning('Creating reads dataframe... empty dataframe')

    if args.list:
        # Fix: pd.concat raises ValueError on an empty list; keep
        # all_reads_df as None instead so the empty case is handled below.
        if chim_reads_df_list:
            all_reads_df = pd.concat(chim_reads_df_list,
                                     sort=True).reset_index(drop=True)

    if all_reads_df is not None:
        # Writing reads dataframe
        PTES_logger.info('Writing reads dataframe...')
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'),
                            sep='\t')
        PTES_logger.info('Writing reads dataframe... done')

        # Writing junc_dict
        PTES_logger.info('Writing intervals to json file...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'),
                               'w') as junc_json:
                junc_json.write(
                    json.dumps({str(k): v
                                for k, v in junc_dict.items()
                                }).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'),
                      'w') as junc_json:
                json.dump({str(k): v
                           for k, v in junc_dict.items()},
                          junc_json,
                          indent=2)

        PTES_logger.info('Writing intervals to json file... done')

        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'),
                            sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
Example #4
0
def main():
    """Filter STAR Chimeric.out.junction lines into reads/junctions tables.

    CLI entry point: loads exon donor/acceptor sets from the GTF, then for
    every (chimeric file, tag) pair filters junction lines (same chrom and
    chain, not chrM, SE only, chimeric geometry), annotates splice sites,
    and writes chim_reads.csv, junc_dict.json(.gz) and chim_junctions.csv.
    """
    ### Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR Chimeric.out.junction output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Path for subfolder with results")
    parser.add_argument("-l", "--list", type=str,
                        help="Enables list input mode. Options: input, tag - MUST be lists")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument("-t", "--tag", type=str,
                        default='ENCODE',
                        help="Tag name for grouping results (prefix), i.e. ENCODE id OR list")
    args = parser.parse_args()

    # Main
    make_dir(args.output)

    # NOTE(review): counts accumulate over all input files in list mode.
    skipped = {'non-filtered': 0,    # different chromosomes and/or chains
               'chrM': 0,      # mapping to chrM
               'PE': 0,   # junction between the mates, -1 in STAR output
               'non-chim': 0}   # STAR counts very long (>1Mb) junctions as chimeric

    junc_dict = defaultdict(dict)
    # Fix: all_reads_df was never initialized; single-input mode with an
    # empty dataframe raised NameError at the final 'is not None' check.
    all_reads_df = None

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    if args.list:
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        pairs = zip(chim_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []
    else:
        pairs = [(args.input, args.tag)]

    for chim_name, tag in pairs:
        annot_donors = 0
        annot_acceptors = 0
        read_names_list = []
        PTES_logger.info('Input file: %s ' % chim_name)

        PTES_logger.info('Reading STAR output...')
        # Fix: counter initialized before the loop (NameError on an empty
        # file) and enumerate starts at 1 so the log reports the true
        # number of processed rows (was off by one).
        row_count = 0
        with open(chim_name, 'r') as input_file:
            for row_count, line in enumerate(input_file, start=1):
                line_dict = star_line_dict(line=line)
                if not line_dict:
                    continue
                if line_dict['chrom1'] == line_dict['chrom2'] \
                        and line_dict['chain1'] == line_dict['chain2']:
                    chrom = line_dict['chrom1']
                    chain = line_dict['chain1']
                else:
                    skipped['non-filtered'] += 1
                    continue
                if chrom == 'chrM':
                    skipped['chrM'] += 1
                    continue
                if line_dict['junction_letters'] == '-':
                    PTES_logger.error('PE input, junction type -1 is present!')
                    PTES_logger.error('Current version works only with SE output')
                    skipped['PE'] += 1
                    continue
                # Keep only chimeric geometry: close enough (<1Mb) and with
                # donor downstream of acceptor w.r.t. the chain.
                if abs(line_dict['donor_ss'] - line_dict['acceptor_ss']) > 1000000 \
                        or chain == '+' and line_dict['donor_ss'] < line_dict['acceptor_ss'] \
                        or chain == '-' and line_dict['donor_ss'] > line_dict['acceptor_ss']:
                    skipped['non-chim'] += 1
                    continue
                read_name = line_dict['read_name']
                chim_part1 = get_read_interval(cigar=line_dict['cigar1'], leftpos=line_dict['coord1'])
                chim_part2 = get_read_interval(cigar=line_dict['cigar2'], leftpos=line_dict['coord2'])
                junc_key = (chrom, chain, line_dict['donor_ss'], line_dict['acceptor_ss'])
                junc_dict[junc_key].update({read_name: (chim_part1, chim_part2)})

                annot_donor = 0
                annot_acceptor = 0
                if line_dict['donor_ss'] in gtf_donors[chrom]:
                    annot_donor = 1
                    annot_donors += 1
                if line_dict['acceptor_ss'] in gtf_acceptors[chrom]:
                    annot_acceptor = 1
                    annot_acceptors += 1

                read_attrs = {
                    'read_name': read_name,
                    'chain': chain,  # chain of chimeric junction
                    'chrom': chrom,
                    'donor': line_dict['donor_ss'],
                    'acceptor': line_dict['acceptor_ss'],
                    'annot_donor': annot_donor,
                    'annot_acceptor': annot_acceptor,
                    'letters_ss': line_dict['junction_letters'],
                    'chim_dist': abs(line_dict['donor_ss'] - line_dict['acceptor_ss']),
                }
                read_names_list.append(read_attrs)

        PTES_logger.info('Reading STAR output... done')
        PTES_logger.info('Processed: %i rows' % row_count)
        for key in skipped:
            PTES_logger.info('Skipped %s: %i rows' % (key, skipped[key]))
        PTES_logger.info('Converted successfully: %i rows' % len(read_names_list))
        PTES_logger.info('Annot donors: %i' % annot_donors)
        PTES_logger.info('Annot acceptors: %i' % annot_acceptors)
        PTES_logger.info('Creating reads dataframe...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            # Column selection raises KeyError when read_names_list is empty.
            reads_df = reads_df[
                ['read_name', 'chrom', 'chain',
                 'donor', 'acceptor', 'annot_donor',
                 'annot_acceptor', 'letters_ss',
                 'chim_dist']
            ].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(drop=True)  # reorder columns
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            PTES_logger.warning('Creating reads dataframe... empty dataframe')

    if args.list:
        # Fix: pd.concat raises ValueError on an empty list; keep
        # all_reads_df as None so the empty case falls through below.
        if chim_reads_df_list:
            all_reads_df = pd.concat(chim_reads_df_list, sort=True).reset_index(drop=True)
    # Writing reads dataframe
    if all_reads_df is not None:
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'), sep='\t')
        PTES_logger.info('Creating reads dataframe... done')

        # Writing junc_dict
        PTES_logger.info('Writing intervals to json...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'), 'w') as junc_json:
                junc_json.write(json.dumps({str(k): v for k, v in junc_dict.items()}).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'), 'w') as junc_json:
                json.dump({str(k): v for k, v in junc_dict.items()}, junc_json, indent=2)
        PTES_logger.info('Writing intervals to json... done')

        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
    else:
        PTES_logger.warning('Empty dataframe')
Example #5
0
                    type=str,
                    default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                    help="Absolute path to genome file")
# Final CLI option and module-level setup; `parser` is defined above
# (outside this view).
parser.add_argument("-t",
                    "--tag",
                    type=str,
                    help="Tag name for grouping results, i.e. ENCODE id")
args = parser.parse_args()

# Functions

# Exons GTF to junctions dict

PTES_logger.info('Reading GTF...')
gtf_exons_name = args.gtf_annot
gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)

PTES_logger.info('Reading GTF... done')

# Reading filtered STAR output
PTES_logger.info('Reading STAR output...')
path_to_file = args.output.rstrip('/')
outside_name = 'mate_outside.junction'
outside_list = []
# NOTE(review): init_file presumably truncates/creates the output file in
# path_to_file — confirm against its definition.
init_file(outside_name, folder=path_to_file)

# Counters for mate classification, keyed by junction category.
mates_gtag = {'inside': 0, 'outside': 0, 'non-chim': 0}
mates_nc = {'inside': 0, 'outside': 0, 'non-chim': 0}

# Counters for splice sites found in the GTF annotation.
annot_donors = 0
annot_acceptors = 0
def main():
    """Extract normal (intron-spanning) split reads from filtered SAM files.

    CLI entry point: loads GTF donor/acceptor sets, then for every
    (chimeric, sam, tag) triad scans SAM lines; reads mapped with an intron
    ('N' in CIGAR) become normal split-read records. Writes
    norm_split_reads.csv, norm_dict.json(.gz) and norm_junctions.csv.
    """
    # Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        help="STAR output, Chimeric.out.junction.filtered \
                                OR list of such files")
    parser.add_argument("-s",
                        "--sam",
                        type=str,
                        help="Filtered STAR SAM output OR list")
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        help="Output folder for results")
    parser.add_argument("-gz",
                        "--gzip",
                        type=str,
                        help="Option to create .json.gz")
    parser.add_argument(
        "-l",
        "--list",
        type=str,
        help="Enables list input mode. Options: sam, tag - MUST be lists")
    parser.add_argument("-gtf",
                        "--gtf_annot",
                        type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default='ENCODE',
        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    make_dir(args.output)
    norm_junc_dict = defaultdict(dict)
    norm_read_names_list = []

    if args.list:
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [
                x.strip('\n') for x in sam_names_file.readlines()
            ]
        if args.input:
            with open(args.input, 'r') as chim_names_file:
                chim_names_list = [
                    x.strip('\n') for x in chim_names_file.readlines()
                ]
        else:
            # No chimeric lists: pair every SAM file with None.
            chim_names_list = [None] * len(sam_names_list)
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [
                x.strip('\n') for x in tag_names_file.readlines()
            ]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
    else:
        triads = [(args.input, args.sam, args.tag)]

    for chim_name, sam_name, tag in triads:
        if chim_name:
            with open(chim_name, 'r') as chim_file:
                names_list = [
                    x.strip('\n').split('\t')[9]
                    for x in chim_file.readlines()
                ]
                names_set = set(names_list)  # only names with chimeric output

        with open(sam_name, 'r') as sam_file:
            PTES_logger.info('Input file %s' % sam_name)
            for line in sam_file:
                if line.startswith('@'):
                    continue  # skip SAM header
                row = line.strip().split('\t')
                sam_attrs = None
                if len(row) > 1:
                    read_name = row[0]
                    if chim_name:
                        # Restrict to reads that also have chimeric output.
                        if read_name in names_set:
                            sam_attrs = parse_sam_row(row)
                    else:
                        sam_attrs = parse_sam_row(row)
                if sam_attrs:
                    if 'N' in sam_attrs['cigar']:  # read mapped with intron
                        read_dict = get_read_interval(
                            cigar=sam_attrs['cigar'],
                            leftpos=sam_attrs['leftpos'],
                            output='dict')
                        if sam_attrs['chain'] == '+':
                            donor_ss = int(read_dict['N1'][0].inf -
                                           1)  # counts first N as intron
                            acceptor_ss = int(read_dict['N1'][0].sup + 1)
                        elif sam_attrs['chain'] == '-':
                            donor_ss = int(read_dict['N1'][0].sup + 1)
                            acceptor_ss = int(read_dict['N1'][0].inf - 1)
                        else:
                            # Fix: any other chain value left donor_ss /
                            # acceptor_ss undefined -> NameError below.
                            continue
                        norm_junc_dict[(sam_attrs['chrom'], sam_attrs['chain'],
                                        donor_ss, acceptor_ss)].update(
                                            {read_name: tuple([read_dict])})
                        norm_read_names_list.append({
                            'read_name': read_name,
                            'chrom': sam_attrs['chrom'],
                            'chain': sam_attrs['chain'],
                            'donor': donor_ss,
                            'acceptor': acceptor_ss,
                            'id': tag
                        })
    # Fix: norm_read_df was referenced after the try/except even when the
    # KeyError branch left it undefined (empty input -> NameError).
    norm_read_df = None
    try:
        reads_df = pd.DataFrame(norm_read_names_list)
        # Column selection raises KeyError when norm_read_names_list is empty.
        norm_read_df = reads_df[[
            'read_name',
            'chrom',
            'chain',
            'donor',
            'acceptor',
            'id',
        ]].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(
            drop=True)
        PTES_logger.info('Writing reads dataframe...')
        norm_read_df.to_csv(os.path.join(args.output, 'norm_split_reads.csv'),
                            sep='\t')
        PTES_logger.info('Writing reads dataframe... done')
    except KeyError:
        PTES_logger.warning(
            'Creating norm split reads dataframe... empty dataframe')

    # Writing junc_dict
    PTES_logger.info('Writing intervals to json files...')
    if args.gzip:
        PTES_logger.info('Output will be archived')
        with gzip.GzipFile(os.path.join(args.output, 'norm_dict.json.gz'),
                           'w') as norm_json:
            norm_json.write(
                json.dumps({str(k1): v1
                            for k1, v1 in norm_junc_dict.items()
                            }).encode('utf-8'))
    else:
        with open(os.path.join(args.output, 'norm_dict.json'),
                  'w') as norm_json:
            json.dump({str(k1): v1
                       for k1, v1 in norm_junc_dict.items()},
                      norm_json,
                      indent=2)

    PTES_logger.info('Writing intervals to json files... done')

    # Writing junctions dataframe (only when reads were collected).
    if norm_read_df is not None:
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=norm_read_df,
                                          gtf_donors=gtf_donors,
                                          gtf_acceptors=gtf_acceptors)
        junctions_df.to_csv(os.path.join(args.output, 'norm_junctions.csv'),
                            sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')