Code Example #1
File: ucsc.py Project: sunnymouse25/ptes
def make_bed_folder(prefix, path_to_file):
    """
    DEPRECATED
    Initiates 3 files essential for Genome Browser:
    :param path_to_file: folder where ./bed subfolder will be
    :param prefix: prefix for names of files, i.e. sample tag
    :return: ./bed subfolder,
    BED file for track lines (.bed),
    table with windows to copy-paste (.coords.csv),
    track file for GB (.track)
    """
    bed_name = '%s.bed' % prefix  # only track lines
    coord_name = '%s.coords.csv' % prefix  # table with windows to paste into GB and with descriptions
    info_name = '%s.track' % prefix  # file to submit to GB
    folder_name = '%s/bed/' % path_to_file
    make_dir(folder_name)

    init_file(bed_name, folder=folder_name)
    init_file(coord_name, folder=folder_name)
    init_file(info_name, folder=folder_name)

    writeln_to_file('\n'.join([
        'browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac rmsk',
        'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
        'browser pack gtexGene',
        'track type=bigBed \
         name="%s" \
         description="bigBed" \
         visibility=2 \
         itemRgb="On" \
         bigDataUrl=https://github.com/sunnymouse25/ptes/blob/dev/research/bed/%s?raw=true'
        % (prefix, bed_name.replace('.bed', '.bb'))
    ]),
                    info_name,
                    folder=folder_name)
    return folder_name, bed_name, coord_name
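This function leans on three small file helpers from the ptes package (make_dir, init_file, writeln_to_file), whose implementations are not shown here. A minimal sketch of the contract the code above assumes, not the package's actual code:

import os

def make_dir(folder_name):
    # Create the folder if it does not exist yet.
    if not os.path.isdir(folder_name):
        os.makedirs(folder_name)

def init_file(file_name, folder=''):
    # Create the file, or truncate it if it already exists.
    with open(os.path.join(folder, file_name), 'w'):
        pass

def writeln_to_file(string, file_name, folder=''):
    # Append a string to the file, terminated with a newline.
    with open(os.path.join(folder, file_name), 'a') as out:
        out.write(string + '\n')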
Code Example #2
def main():
    # Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        help="STAR output, Chimeric.out.junction \
                            OR list of such files")
    parser.add_argument(
        "-s",
        "--sam",
        type=str,
        help=
        "Filtered STAR SAM output, with read_names same as in Chimeric.out.junction OR list"
    )
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        help="Output folder for results")
    parser.add_argument("-gz",
                        "--gzip",
                        type=str,
                        help="Option to create .json.gz")
    parser.add_argument(
        "-l",
        "--list",
        type=str,
        help="Enables list input mode. Options: input, sam, tag - MUST be lists"
    )
    parser.add_argument("-gtf",
                        "--gtf_annot",
                        type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default='ENCODE',
        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name,
                                                feature_name='exon')
    PTES_logger.info('Reading GTF... done')

    # Done once, before iterating over inputs
    make_dir(args.output)
    junc_dict = defaultdict(dict)
    all_reads_df = None

    if args.list:
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [
                x.strip('\n') for x in chim_names_file.readlines()
            ]
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [
                x.strip('\n') for x in sam_names_file.readlines()
            ]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [
                x.strip('\n') for x in tag_names_file.readlines()
            ]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []
    else:
        triads = [(args.input, args.sam, args.tag)]

    for chim_name, sam_name, tag in triads:
        PTES_logger.info('Input file %s' % chim_name)

        # Reading filtered STAR non-chim output
        PTES_logger.info('Reading STAR .sam output...')
        sam_dict = sam_input(sam_name=sam_name, chim_name=chim_name)
        PTES_logger.info('Reading STAR .sam output... done')

        # Reading filtered STAR output
        PTES_logger.info('Reading STAR chimeric output...')
        read_names_list = chim_input(chim_name=chim_name,
                                     gtf_donors=gtf_donors,
                                     gtf_acceptors=gtf_acceptors,
                                     sam_dict=sam_dict,
                                     junc_dict=junc_dict)
        PTES_logger.info('Reading STAR chimeric output... done')
        PTES_logger.info('Creating reads dataframes...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            reads_df = reads_df[[
                'read_name',
                'chrom',
                'chain',
                'donor',
                'acceptor',
                'annot_donor',
                'annot_acceptor',
                'letters_ss',
                'chim_dist',
                'mate_dist',
                'type',
            ]].sort_values(
                by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(
                    drop=True)  # reorder columns
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            PTES_logger.warning('Creating reads dataframe... empty dataframe')

    if args.list:
        all_reads_df = pd.concat(chim_reads_df_list,
                                 sort=True).reset_index(drop=True)

    if all_reads_df is not None:
        # Writing reads dataframe
        PTES_logger.info('Writing reads dataframe...')
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'),
                            sep='\t')
        PTES_logger.info('Writing reads dataframe... done')

        # Writing junc_dict
        PTES_logger.info('Writing intervals to json file...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'),
                               'w') as junc_json:
                junc_json.write(
                    json.dumps({str(k): v
                                for k, v in junc_dict.items()
                                }).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'),
                      'w') as junc_json:
                json.dump({str(k): v
                           for k, v in junc_dict.items()},
                          junc_json,
                          indent=2)

        PTES_logger.info('Writing intervals to json file... done')

        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'),
                            sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
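Note that the junction keys are tuples (chrom, chain, donor, acceptor), which JSON cannot use as object keys, hence the str(k) conversion before dumping. A minimal sketch of reading them back into tuples on the consumer side, assuming the keys were written exactly as above:

import ast
import gzip
import json

def load_junc_dict(path):
    # Parse junc_dict.json.gz and convert the stringified tuple keys,
    # e.g. "('chr1', '+', 100, 200)", back into Python tuples.
    with gzip.open(path, 'rt') as fin:
        raw = json.load(fin)
    return {ast.literal_eval(k): v for k, v in raw.items()}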
Code Example #3
def main():
    ### Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="STAR Chimeric.out.junction output OR list")
    parser.add_argument("-o", "--output", type=str,
                        help="Path for subfolder with results")
    parser.add_argument("-l", "--list", type=str,
                        help="Enables list input mode. Options: input, tag - MUST be lists")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Option to create .json.gz")
    parser.add_argument("-gtf", "--gtf_annot", type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument("-t", "--tag", type=str,
                        default='ENCODE',
                        help="Tag name for grouping results (prefix), i.e. ENCODE id OR list")
    args = parser.parse_args()

    # Main
    make_dir(args.output)

    skipped = {'non-filtered': 0,    # different chromosomes and/or chains
               'chrM': 0,      # mapping to chrM
               'PE': 0,   # junction between the mates, -1 in STAR output
               'non-chim': 0}   # STAR counts very long (>1Mb) junctions as chimeric

    junc_dict = defaultdict(dict)
    all_reads_df = None  # stays None if no input yields a dataframe

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    if args.list:
        with open(args.input, 'r') as chim_names_file:
            chim_names_list = [x.strip('\n') for x in chim_names_file.readlines()]
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [x.strip('\n') for x in tag_names_file.readlines()]
        pairs = zip(chim_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
        chim_reads_df_list = []
    else:
        pairs = [(args.input, args.tag)]

    for chim_name, tag in pairs:
        annot_donors = 0
        annot_acceptors = 0
        read_names_list = []
        PTES_logger.info('Input file: %s ' % chim_name)

        PTES_logger.info('Reading STAR output...')
        with open(chim_name, 'r') as input_file:
            i = -1  # guards the 'Processed' log line against empty input files
            for i, line in enumerate(input_file):
                line_dict = star_line_dict(line=line)
                if not line_dict:
                    continue
                if line_dict['chrom1'] == line_dict['chrom2'] \
                        and line_dict['chain1'] == line_dict['chain2']:
                    chrom = line_dict['chrom1']
                    chain = line_dict['chain1']
                else:
                    skipped['non-filtered'] += 1
                    continue
                if chrom == 'chrM':
                    skipped['chrM'] += 1
                    continue
                if line_dict['junction_letters'] == '-':
                    PTES_logger.error('PE input, junction type -1 is present!')
                    PTES_logger.error('Current version works only with SE output')
                    skipped['PE'] += 1
                    continue
                if abs(line_dict['donor_ss'] - line_dict['acceptor_ss']) > 1000000 \
                        or (chain == '+' and line_dict['donor_ss'] < line_dict['acceptor_ss']) \
                        or (chain == '-' and line_dict['donor_ss'] > line_dict['acceptor_ss']):
                    skipped['non-chim'] += 1
                    continue
                read_name = line_dict['read_name']
                chim_part1 = get_read_interval(cigar=line_dict['cigar1'], leftpos=line_dict['coord1'])
                chim_part2 = get_read_interval(cigar=line_dict['cigar2'], leftpos=line_dict['coord2'])
                junc_dict[(chrom, chain, line_dict['donor_ss'], line_dict['acceptor_ss'])].update(
                    {read_name: (chim_part1, chim_part2)})

                annot_donor = 0
                annot_acceptor = 0
                if line_dict['donor_ss'] in gtf_donors[chrom]:
                    annot_donor = 1
                    annot_donors += 1
                if line_dict['acceptor_ss'] in gtf_acceptors[chrom]:
                    annot_acceptor = 1
                    annot_acceptors += 1

                read_attrs = {
                    'read_name': read_name,
                    'chain': chain,  # chain of chimeric junction
                    'chrom': chrom,
                    'donor': line_dict['donor_ss'],
                    'acceptor': line_dict['acceptor_ss'],
                    'annot_donor': annot_donor,
                    'annot_acceptor': annot_acceptor,
                    'letters_ss': line_dict['junction_letters'],
                    'chim_dist': abs(line_dict['donor_ss'] - line_dict['acceptor_ss']),
                }
                read_names_list.append(read_attrs)

        PTES_logger.info('Reading STAR output... done')
        PTES_logger.info('Processed: %i rows' % (i + 1))
        for key in skipped:
            PTES_logger.info('Skipped %s: %i rows' % (key, skipped[key]))
        PTES_logger.info('Converted successfully: %i rows' % len(read_names_list))
        PTES_logger.info('Annot donors: %i' % annot_donors)
        PTES_logger.info('Annot acceptors: %i' % annot_acceptors)
        PTES_logger.info('Creating reads dataframe...')
        try:
            reads_df = pd.DataFrame(read_names_list)
            reads_df = reads_df[
                ['read_name', 'chrom', 'chain',
                 'donor', 'acceptor', 'annot_donor',
                 'annot_acceptor', 'letters_ss',
                 'chim_dist']
            ].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(drop=True)  # reorder columns
            reads_df['id'] = tag
            if args.list:
                chim_reads_df_list.append(reads_df)
            else:
                all_reads_df = reads_df
        except KeyError:
            PTES_logger.warning('Creating reads dataframe... empty dataframe')

    if args.list:
        all_reads_df = pd.concat(chim_reads_df_list, sort=True).reset_index(drop=True)
    # Writing reads dataframe
    if all_reads_df is not None:
        all_reads_df.to_csv(os.path.join(args.output, 'chim_reads.csv'), sep='\t')
        PTES_logger.info('Creating reads dataframe... done')

        # Writing junc_dict
        PTES_logger.info('Writing intervals to json...')
        if args.gzip:
            PTES_logger.info('Output will be archived')
            with gzip.GzipFile(os.path.join(args.output, 'junc_dict.json.gz'), 'w') as junc_json:
                junc_json.write(json.dumps({str(k): v for k, v in junc_dict.items()}).encode('utf-8'))
        else:
            with open(os.path.join(args.output, 'junc_dict.json'), 'w') as junc_json:
                json.dump({str(k): v for k, v in junc_dict.items()}, junc_json, indent=2)
        PTES_logger.info('Writing intervals to json... done')

        # Writing junctions dataframe
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=all_reads_df)
        junctions_df.to_csv(os.path.join(args.output, 'chim_junctions.csv'), sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
    else:
        PTES_logger.warning('Empty dataframe')
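star_line_dict (imported from the ptes package) parses a single line of STAR's Chimeric.out.junction into the keys used above. A rough sketch of what it plausibly does, assuming STAR's documented 14-column layout; the key names mirror the usage above, but the real helper is the authority:

def star_line_dict(line):
    # STAR Chimeric.out.junction columns: donor chrom/coord/strand,
    # acceptor chrom/coord/strand, junction type (-1 between the mates,
    # 1 GT/AG, 2 CT/AC, 0 other), repeat lengths, read name, then the
    # leftmost coordinate and CIGAR of each chimeric segment.
    row = line.strip().split('\t')
    if len(row) < 14:
        return None
    junction_letters = {'-1': '-', '0': '.', '1': 'GT/AG', '2': 'CT/AC'}
    return {
        'chrom1': row[0], 'donor_ss': int(row[1]), 'chain1': row[2],
        'chrom2': row[3], 'acceptor_ss': int(row[4]), 'chain2': row[5],
        'junction_letters': junction_letters.get(row[6], '.'),
        'read_name': row[9],
        'coord1': int(row[10]), 'cigar1': row[11],
        'coord2': int(row[12]), 'cigar2': row[13],
    }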
Code Example #4
def main():
    # Arguments
    '''
    args_s = ('-t ../tests/test_data/ptes/chim_reads_test.csv '
    '-j ../tests/test_data/ptes/junc_dict.json.gz '
    '-f ../tests/test_data/ptes/chim_junctions_test.csv '
    '-q letters_ss=="." '
    '-o ../tests/test_results/bed '
    '-p test '
    '-gz 1')
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--table", type=str,
                        help="DataFrame with data to create BED, required 4 columns are: chrom, strand, donor, acceptor")
    parser.add_argument("-sep", "--separator", type=str,
                        default='\t',
                        help="DataFrame separator, tab by default")
    parser.add_argument("-j", "--json", type=str,
                        help="JSON file with all read intervals as OrderedDicts")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Write anything to enable reading .json.gz")
    parser.add_argument("-q", "--query", type=str,
                        help="Conditions to filter junctions table, string as in pandas.DataFrame.query()")
    parser.add_argument("-f", "--filter", type=str,
                        help="DataFrame with (chrom, strand, donor, acceptor) to filter input table")
    parser.add_argument("-n", "--names", type=str,
                        nargs='+',
                        default=['chim1', 'chim2', 'mate2', ],
                        help="List of names for chim parts in BED name: [chim1, chim2, mate2]. \
                        Important: same order as parts in json values")
    parser.add_argument("-c", "--colors", type=str,
                        nargs='+',
                        default=['r', 'r', 'b', ],
                        help="List of colors for chim parts in BED name: [chim1, chim2, mate2]. \
                        Important: same order as parts in json values\
                        Colors: 'r', 'g', 'b' or in RGB code like '0,255,0'")
    parser.add_argument("-o", "--output", type=str,
                        default='bed',
                        help="Output folder for results, default is bed/")
    parser.add_argument("-p", "--prefix", type=str,
                        default='Output',
                        help="Prefix for all output files")
    parser.add_argument("-sort", "--sort", type=str,
                        help="Write anything to enable sorting BED files")
    parser.add_argument("-bb", "--bigbed", type=str,
                        help="Write anything to enable creating .bigBed files")
    args = parser.parse_args()
#    args = parser.parse_args(args_s.split(' '))

    PTES_logger.info('Reading input files...')
    make_dir(args.output)

    index_list = ['chrom', 'chain', 'donor', 'acceptor']
    input_df = pd.read_csv(args.table, sep=args.separator)

    for col in index_list:
        if col not in input_df.columns:
            PTES_logger.error('Input table does not contain required column %s ' % col)
            os._exit(1)

    if args.filter:   # filter by junctions
        filter_df = pd.read_csv(args.filter, sep=args.separator)
        for col in index_list:
            if col not in filter_df.columns:
                PTES_logger.error('Filter table does not contain required column %s ' % col)
                os._exit(1)
        cols_to_use = index_list + list(input_df.columns.difference(filter_df.columns))  # avoid repeating columns
        df_new = pd.merge(filter_df, input_df[cols_to_use], on=index_list, how='inner',)
    else:
        df_new = input_df

    if args.query:   # filter reads by conditions
        df_new = df_new.query(args.query)

    df_new.to_csv(os.path.join(args.output, 'df_filter.csv'), sep='\t')

    # Reading .json.gz file
    if args.gzip:
        with gzip.GzipFile(args.json, 'r') as fin:
            junc_dict = json.loads(fin.read().decode('utf-8'), object_pairs_hook=OrderedDict)
    else:
        with open(args.json, 'r') as fin:
            junc_dict = json.load(fin, object_pairs_hook=OrderedDict)

    # Number of tracks (chim parts) per read: must be 3 for mate_inside/outside and 2 for circles
    first_junction = next(iter(junc_dict.values()))
    len_read_dicts = len(next(iter(first_junction.values())))
    if len(args.names) < len_read_dicts:
        PTES_logger.warning('List of names has fewer items than list of features in read_dicts!')
        part_names = ['part_%i' % i for i in range(1, len_read_dicts + 1)]
    else:
        part_names = args.names

    if len(args.colors) < len_read_dicts:
        PTES_logger.warning('List of colors has fewer items than list of features in read_dicts!')
        part_colors = ['r']*len_read_dicts
    else:
        part_colors = args.colors

    PTES_logger.info('Reading input files... done')
    PTES_logger.info('Creating BED files...')
    bed_name = '%s.bed' % args.prefix  # only track lines
    unique_bed_name = '%s.unique.bed' % args.prefix  # one representative read for unique junctions
    single_bed_name = '%s.single.bed' % args.prefix  # single line for one chimeric junction
    single_unique_bed_name = '%s.single.unique.bed' % args.prefix  # for unique junctions, single line for one junction
    code_name = '%s.codes.csv' % args.prefix   # table with codes for each read and descriptions
    coord_name = '%s.coords.csv' % args.prefix  # table with windows to paste into GB and descriptions
    info_name = '%s.track' % args.prefix  # file to submit to GB

    bed_list = []  # for outputting BED lines
    unique_dict = {}  # for outputting BED lines, unique chimeric junctions
    single_list = []  # for outputting BED lines, one row per chimeric junction
    single_unique_list = []  # for outputting BED lines, one row per unique chimeric junction
    coord_list = []  # for outputting coord lines
    code_list = []  # for outputting code lines, one row per read

    with open(os.path.join(args.output, info_name), 'w') as info_file:
        info_file.write('\n'.join(
        ['browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac rmsk',
         'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
         'browser pack gtexGene',
         'track type=bigBed \
         name="%s" \
         description="bigBed" \
         visibility=2 \
         itemRgb="On" \
         bigDataUrl=https://github.com/sunnymouse25/ptes/blob/dev/research/bed/%s?raw=true' % (
             args.prefix,
             bed_name.replace('.bed', '.bb')
             )
            ]
           )
        )

    num = 0
    junctions = df_new.groupby(index_list)['read_name'].apply(list) # unique chimeric junctions
    for index, read_list in junctions.items():  # value is list of read_names
        chrom = index[0]  # index is (chrom, chain, donor_ss, acceptor_ss)
        chain = index[1]
        donor_ss = index[2]
        acceptor_ss = index[3]
        windows_min = []
        windows_max = []
        codes = []
        for read_name in read_list:  # for each read w. this junction
            num += 1
            code = digit_code(number=num)  # every unique number will be 6-digit
            codes.append(code)
            track_lists = []
            if not unique_dict.get(index, None):  # for each unique junction write the 1st read line(s)
                unique_dict[index] = []
                add_unique = True
            else:
                add_unique = False
            read_dict_list = junc_dict[str(index)][read_name]  # list of dicts: each dict is one track (i.e. chim_part)
            # Iterating over tracks
            for i, read_dict in enumerate(read_dict_list):
                for k, v in read_dict.items():
                    read_dict[k] = interval[v[0][0], v[0][1]]
                track_list = get_track_list(chrom=chrom,
                                            chain=chain,
                                            read_dict=read_dict,
                                            name='_'.join(map(str, [donor_ss, acceptor_ss, code, part_names[i]])),
                                            color=part_colors[i])
                track_lists.append(track_list)
            # Writing BED lines, collecting extremas for window size
            for track_list in track_lists:
                windows_min.append(int(track_list[1]))  # track_list[1] is chromStart, track_list[2] is chromEnd
                windows_max.append(int(track_list[2]))
                bed_line = '\t'.join(track_list)
                bed_list.append(bed_line)
                if add_unique:
                    unique_dict[index].append(bed_line)
            # Writing code line
            code_list.append({
                'chrom': chrom,
                'chain': chain,
                'donor': donor_ss,
                'acceptor': acceptor_ss,
                'read_name': read_name,
                'code': code
            })
            # Making BED file with one row for the pair of mates
            single_track = get_single_track(read_dict_list=read_dict_list,
                                            kwargs={'chrom': chrom,
                                                    'chain': chain,
                                                    'name': '_'.join(
                                                        map(str, [donor_ss, acceptor_ss, code])),
                                                    'color': '255,0,255'})  # to check in GB that the intervals match
            single_list.append('\t'.join(single_track))
            if add_unique:
                single_unique_list.append('\t'.join(single_track))
        # Description for the junction into coords.csv
        window = (chrom,  # one window for junction
                  min(windows_min) - 200,
                  max(windows_max) + 200)
        coord_list.append({
                    'chrom': chrom,
                    'chain': chain,
                    'donor': donor_ss,
                    'acceptor': acceptor_ss,
                    'window': '%s:%i-%i' % window,
                    'codes': '-'.join(map(str, [codes[0], codes[-1]])),
        })

    PTES_logger.info('Creating BED files... done')

    PTES_logger.info('Writing BED files...')
    with open(os.path.join(args.output, bed_name), 'w') as bed_file, \
            open(os.path.join(args.output, unique_bed_name), 'w') as unique_bed_file, \
            open(os.path.join(args.output, single_bed_name), 'w') as single_bed_file, \
            open(os.path.join(args.output, single_unique_bed_name), 'w') as single_unique_bed_file:
        # coord_name and code_name are written below via DataFrame.to_csv
        bed_file.write('\n'.join(bed_list))
        single_bed_file.write('\n'.join(single_list))
        single_unique_bed_file.write('\n'.join(single_unique_list))

        for unique_value in unique_dict.values():
            unique_bed_file.write('\n'.join(list(unique_value)) + '\n')

    PTES_logger.info('Writing BED files... done')

    PTES_logger.info('Creating junctions dataframes...')
    coord_df = pd.DataFrame(coord_list)
    code_df = pd.DataFrame(code_list)
    coord_df.to_csv(os.path.join(args.output, coord_name), sep='\t')
    code_df.to_csv(os.path.join(args.output, code_name), sep='\t')

    PTES_logger.info('Creating junctions dataframes... done')

    if args.sort:
        PTES_logger.info('Sorting BED files...')
        for filename in [bed_name, unique_bed_name, single_bed_name, single_unique_bed_name]:
            shell_call('cat %s | sort -k1,1 -k2,2n > %s.sorted' % (os.path.join(args.output, filename),
                                                                   os.path.join(args.output, filename)))

        PTES_logger.info('Sorting BED files... done')

    if args.bigbed:   # will also sort files
        PTES_logger.info('Making bigBed...')
        for filename in [bed_name, unique_bed_name, single_bed_name, single_unique_bed_name]:
            to_bigbed(bed_name=filename, folder_name=args.output)
        PTES_logger.info('Making bigBed... done')
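digit_code, used above to build fixed-width read codes, is presumably just zero-padding; a one-line sketch consistent with the "every unique number will be 6-digit" comment:

def digit_code(number, digits=6):
    # 1 -> '000001': fixed-width codes keep BED names aligned and sortable.
    return str(number).zfill(digits)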
Code Example #5
def main():
    # Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        help="STAR output, Chimeric.out.junction.filtered \
                                OR list of such files")
    parser.add_argument("-s",
                        "--sam",
                        type=str,
                        help="Filtered STAR SAM output OR list")
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        help="Output folder for results")
    parser.add_argument("-gz",
                        "--gzip",
                        type=str,
                        help="Option to create .json.gz")
    parser.add_argument(
        "-l",
        "--list",
        type=str,
        help="Enables list input mode. Options: sam, tag - MUST be lists")
    parser.add_argument("-gtf",
                        "--gtf_annot",
                        type=str,
                        default='/home/sunnymouse/Human_ref/hg19_exons.gtf',
                        help="Absolute path to annotation file")
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default='ENCODE',
        help="Tag name for grouping results, i.e. ENCODE id OR list of tags")
    args = parser.parse_args()

    # Exons GTF to junctions dict
    PTES_logger.info('Reading GTF...')
    gtf_exons_name = args.gtf_annot
    gtf_donors, gtf_acceptors = annot_junctions(gtf_exons_name=gtf_exons_name)
    PTES_logger.info('Reading GTF... done')

    make_dir(args.output)
    norm_junc_dict = defaultdict(dict)
    norm_read_names_list = []

    if args.list:
        with open(args.sam, 'r') as sam_names_file:
            sam_names_list = [
                x.strip('\n') for x in sam_names_file.readlines()
            ]
        if args.input:
            with open(args.input, 'r') as chim_names_file:
                chim_names_list = [
                    x.strip('\n') for x in chim_names_file.readlines()
                ]
        else:
            chim_names_list = [None] * len(sam_names_list)
        with open(args.tag, 'r') as tag_names_file:
            tag_names_list = [
                x.strip('\n') for x in tag_names_file.readlines()
            ]
        triads = zip(chim_names_list, sam_names_list, tag_names_list)
        PTES_logger.info('Enabled list mode')
    else:
        triads = [(args.input, args.sam, args.tag)]

    for chim_name, sam_name, tag in triads:
        if chim_name:
            with open(chim_name, 'r') as chim_file:
                names_list = [
                    x.strip('\n').split('\t')[9]
                    for x in chim_file.readlines()
                ]
                names_set = set(names_list)  # only names with chimeric output

        with open(sam_name, 'r') as sam_file:
            PTES_logger.info('Input file %s' % sam_name)
            for line in sam_file:
                if line.startswith('@'):
                    continue
                row = line.strip().split('\t')
                sam_attrs = None
                if len(row) > 1:
                    read_name = row[0]
                    if chim_name:
                        if read_name in names_set:
                            sam_attrs = parse_sam_row(row)
                    else:
                        sam_attrs = parse_sam_row(row)
                if sam_attrs:
                    if 'N' in sam_attrs['cigar']:  # read mapped with intron
                        read_dict = get_read_interval(
                            cigar=sam_attrs['cigar'],
                            leftpos=sam_attrs['leftpos'],
                            output='dict')
                        if sam_attrs['chain'] == '+':
                            donor_ss = int(read_dict['N1'][0].inf - 1)  # first N counts as the intron
                            acceptor_ss = int(read_dict['N1'][0].sup + 1)
                        elif sam_attrs['chain'] == '-':
                            donor_ss = int(read_dict['N1'][0].sup + 1)
                            acceptor_ss = int(read_dict['N1'][0].inf - 1)
                        else:
                            continue  # unknown chain, cannot orient the junction
                        norm_junc_dict[(sam_attrs['chrom'], sam_attrs['chain'],
                                        donor_ss, acceptor_ss)].update(
                                            {read_name: (read_dict,)})
                        norm_read_names_list.append({
                            'read_name': read_name,
                            'chrom': sam_attrs['chrom'],
                            'chain': sam_attrs['chain'],
                            'donor': donor_ss,
                            'acceptor': acceptor_ss,
                            'id': tag
                        })
    norm_read_df = None  # stays None if no reads were collected
    try:
        norm_read_df = pd.DataFrame(norm_read_names_list)
        norm_read_df = norm_read_df[[
            'read_name',
            'chrom',
            'chain',
            'donor',
            'acceptor',
            'id',
        ]].sort_values(by=['chrom', 'chain', 'donor', 'acceptor']).reset_index(
            drop=True)
        PTES_logger.info('Writing reads dataframe...')
        norm_read_df.to_csv(os.path.join(args.output, 'norm_split_reads.csv'),
                            sep='\t')
        PTES_logger.info('Writing reads dataframe... done')
    except KeyError:
        PTES_logger.warning(
            'Creating norm split reads dataframe... empty dataframe')

    # Writing junc_dict
    PTES_logger.info('Writing intervals to json files...')
    if args.gzip:
        PTES_logger.info('Output will be archived')
        with gzip.GzipFile(os.path.join(args.output, 'norm_dict.json.gz'),
                           'w') as norm_json:
            norm_json.write(
                json.dumps({str(k1): v1
                            for k1, v1 in norm_junc_dict.items()
                            }).encode('utf-8'))
    else:
        with open(os.path.join(args.output, 'norm_dict.json'),
                  'w') as norm_json:
            json.dump({str(k1): v1
                       for k1, v1 in norm_junc_dict.items()},
                      norm_json,
                      indent=2)

    PTES_logger.info('Writing intervals to json files... done')

    # Writing junctions dataframe
    if norm_read_df is not None:
        PTES_logger.info('Creating junctions dataframe...')
        junctions_df = reads_to_junctions(reads_df=norm_read_df,
                                          gtf_donors=gtf_donors,
                                          gtf_acceptors=gtf_acceptors)
        junctions_df.to_csv(os.path.join(args.output, 'norm_junctions.csv'),
                            sep='\t')
        PTES_logger.info('Creating junctions dataframe... done')
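The donor/acceptor arithmetic above hinges on get_read_interval returning, for a CIGAR with one N operation, the genomic span of that intron under key 'N1'. A minimal sketch of extracting that span from a CIGAR string, assuming a 1-based leftpos and M/D/N/=/X consuming the reference; this simple tuple stands in for the richer interval objects the ptes helper returns:

import re

def intron_span(cigar, leftpos):
    # Return (start, end) of the first N (intron) operation, 1-based
    # inclusive, or None if the read has no intron.
    pos = leftpos
    for length, op in re.findall(r'(\d+)([MIDNSHP=X])', cigar):
        length = int(length)
        if op == 'N':
            return pos, pos + length - 1
        if op in 'MDN=X':  # operations that consume the reference
            pos += length
    return None

# With chain '+', donor = start - 1 and acceptor = end + 1,
# mirroring the logic in the example above.
print(intron_span('50M1000N50M', 101))  # (151, 1150)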
Code Example #6
                    type=str,
                    help="Path to .BED file with genes (containers)")
parser.add_argument(
    "-s",
    "--strand",
    type=str,
    help=
    "Enable strand-specific position mode, BED files should contain 6 or more fields"
)

args = parser.parse_args()

# Functions

# Main
make_dir(args.output)
PTES_logger.info('Creating intersection file... ')
intersection_name = os.path.join(
    args.output,
    os.path.basename(args.features) + '.intersect')
cmd = 'bedtools intersect -a %s -b %s -s -wo > %s' % (
    args.features,
    args.genes,
    intersection_name,
)
shell_call(cmd)
PTES_logger.info('Creating intersection file... done')
PTES_logger.info('Reading intersection file... ')

p_dict = {}
gene_p_dict = defaultdict(list)
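With -wo, bedtools intersect writes the full record from file A, then the full record from file B, then the overlap length, so downstream parsers need the column offset at which B's fields start (see get_b_start in the next example). A minimal sketch under the assumption that file A is BED6 with a fixed field count; the real helper may detect the offset differently:

def get_b_start(line, a_fields=6):
    # In 'bedtools intersect -wo' output, the first a_fields columns
    # come from file A, so file B's chrom sits at index a_fields.
    # Returns None for empty or malformed lines.
    line_list = line.strip().split()
    if len(line_list) <= a_fields:
        return None
    return a_fields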
Code Example #7
def main():
    ### Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-m",
                        "--method",
                        type=str,
                        nargs='+',
                        default=[
                            'inside',
                            'outside',
                            'bedtools',
                        ],
                        help="Shuffling method(s): inside, outside, bedtools")
    parser.add_argument(
        "-c",
        "--closest",
        type=str,
        default='coverage',
        help="Choose close elements for outside method by coverage or length")
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        help="Output folder for results")
    parser.add_argument("-iter",
                        "--iterations",
                        type=int,
                        default='1000',
                        help="Number of iterations, default 1000")
    parser.add_argument(
        "-f",
        "--features",
        type=str,
        help="Path to .BED6 file with features (small intervals)")
    parser.add_argument("-g",
                        "--genes",
                        type=str,
                        help="Path to .BED6 file with genes (containers)")
    parser.add_argument(
        "-s",
        "--chrom_sizes",
        type=str,
        default='/home/sunnymouse/Human_ref/hg19.chrom.sizes',
        help=
        "The chrom_sizes file should be tab delimited and structured as follows: \
                        <chromName><TAB><chromSize>, use bedtools shuffle -h for details"
    )

    args = parser.parse_args()
    make_dir(args.output)
    path_to_file = args.output.rstrip('/')
    random_folder = path_to_file + '/random'
    make_dir(random_folder)

    # Shuffling methods inside and outside:

    if 'inside' in args.method or 'outside' in args.method:
        if 'outside' in args.method:
            PTES_logger.info('Reading containers... ')
            PTES_logger.info('containers file: %s ' % args.genes)

            strand_dict = {}
            interval_dict = {}
            if args.closest == 'length':
                gene_dict = defaultdict(list)
                # open file with genes
                with open(args.genes, 'r') as genes_file:
                    for line in genes_file:
                        line_list = line.strip().split()
                        chrom = line_list[0]
                        gene_interval = interval[int(line_list[1]),
                                                 int(line_list[2])]
                        gene_dict[chrom].append(gene_interval)
                        try:
                            strand = line_list[5]
                            strand_dict[gene_interval] = strand
                        except IndexError:
                            strand_dict[gene_interval] = '.'
                            PTES_logger.error('No strand found')
                            PTES_logger.error(
                                'BED6 format is required for choosing strand-specific position'
                            )

                # sort lists by gene length
                for key in gene_dict:  # key is chromosome
                    new_list = sorted(gene_dict[key],
                                      key=lambda x: get_interval_length(x))
                    interval_dict.update(choose_close(sorted_list=new_list))

            if args.closest == 'coverage':
                PTES_logger.info('Creating coverage file... ')

                cover_name = '%s/%s' % (
                    random_folder, os.path.basename(args.features) + '.cov')
                cmd = 'bedtools coverage -a %s -b %s -s > %s' % (
                    args.genes,
                    args.features,
                    cover_name,
                )
                shell_call(cmd)
                PTES_logger.info('Creating coverage file... done')

                cover_dict = {}
                with open(cover_name, 'r') as cover_file:
                    for line in cover_file:
                        line_list = line.strip().split()
                        gene_interval = interval[int(line_list[1]),
                                                 int(line_list[2])]
                        cov = float(line_list[-1])
                        cover_dict[gene_interval] = cov
                        try:
                            strand = line_list[5]
                            strand_dict[gene_interval] = strand
                        except IndexError:
                            strand_dict[gene_interval] = '.'
                            PTES_logger.error('No strand found')
                            PTES_logger.error(
                                'BED6 format is required for choosing strand-specific position'
                            )

                new_list = sorted(cover_dict.items(), key=lambda x: x[1])
                interval_dict.update(
                    choose_close(sorted_list=new_list, items='items'))

            PTES_logger.info('Reading containers... done')

        PTES_logger.info('Creating intersection file... ')

        intersection_name = '%s/%s' % (
            random_folder, os.path.basename(args.features) + '.intersect')
        cmd = 'bedtools intersect -a %s -b %s -wo -s > %s' % (
            args.features,
            args.genes,
            intersection_name,
        )
        shell_call(cmd)
        PTES_logger.info('Creating intersection file... done')

        PTES_logger.info('Reading intersection file and shuffling... ')
        PTES_logger.info('intersection file: %s' % intersection_name)

        if 'inside' in args.method:
            n_list_inside = [[] for _ in range(args.iterations)]  # one list per iteration

        if 'outside' in args.method:
            n_list_outside = [[] for _ in range(args.iterations)]  # one list per iteration

        with open(intersection_name, 'r') as intersect_file:
            for line in intersect_file:
                line_list = line.strip().split()
                b_start = get_b_start(line)
                if not b_start:
                    continue
                chrom1 = line_list[0]
                feature_interval = interval[int(line_list[1]),
                                            int(line_list[2])]
                gene_interval = interval[int(line_list[b_start + 1]),
                                         int(line_list[b_start + 2])]

                for n in range(args.iterations):
                    if 'inside' in args.method:
                        random_interval_inside = randomize_interval(
                            small_i=feature_interval, large_i=gene_interval)
                        n_list_inside[n].append(
                            interval_to_bed_line(
                                chrom=chrom1,
                                single_interval=random_interval_inside,
                                name=line_list[3],
                                strand=line_list[5]))

                    if 'outside' in args.method:
                        new_large_interval = random.choice(
                            interval_dict[gene_interval]
                        )  # choose one of closest genes
                        new_strand = strand_dict[new_large_interval]
                        feature_len = get_interval_length(feature_interval)
                        gene_len = get_interval_length(gene_interval)
                        if feature_len <= gene_len:
                            try:
                                container_strand = line_list[b_start + 5]
                                relative_position = count_relative_position(
                                    feature=feature_interval,
                                    container=gene_interval,
                                    container_strand=container_strand)
                                random_interval_outside = randomize_interval(
                                    small_i=feature_interval,
                                    large_i=new_large_interval,
                                    large_i_strand=new_strand,
                                    same_position=True,
                                    p=relative_position)
                            except IndexError:
                                PTES_logger.error('No strand found')
                                PTES_logger.error(
                                    'BED6 format is required for choosing strand-specific position'
                                )
                                relative_position = count_relative_position(
                                    feature=feature_interval,
                                    container=gene_interval)
                                random_interval_outside = randomize_interval(
                                    small_i=feature_interval,
                                    large_i=new_large_interval,
                                    same_position=True,
                                    p=relative_position)
                        else:
                            random_interval_outside = randomize_interval(
                                small_i=feature_interval,
                                large_i=new_large_interval,
                                same_position=False,
                            )
                        n_list_outside[n].append(
                            interval_to_bed_line(
                                chrom=chrom1,
                                single_interval=random_interval_outside,
                                name=line_list[3],
                                strand=line_list[5]))

        PTES_logger.info('Reading intersection file and shuffling... done')
        PTES_logger.info('Creating output files... ')
        for n in range(args.iterations):
            if 'inside' in args.method:
                out_name = random_folder + '/%s_%i.bed' % ('inside', n)
                with open(out_name, 'w') as out_file:
                    out_file.write('\n'.join(n_list_inside[n]))

            if 'outside' in args.method:
                out_name = random_folder + '/%s_%i.bed' % ('outside', n)
                with open(out_name, 'w') as out_file:
                    out_file.write('\n'.join(n_list_outside[n]))

        PTES_logger.info('Creating output files... done')
    # Shuffling method 3
    if 'bedtools' in args.method:
        PTES_logger.info('Running bedtools shuffle... ')
        for n in range(args.iterations):
            random_file = 'bedtools_%i.bed' % n
            cmd = 'bedtools shuffle -incl %s -i %s -g %s -chrom > %s/%s' % (
                args.genes, args.features, args.chrom_sizes, random_folder,
                random_file)
            shell_call(cmd)
        PTES_logger.info('Running bedtools shuffle... done')
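randomize_interval does the core shuffling: it re-places the feature inside a container interval. The real helper also takes large_i_strand, same_position, and p (relative position) arguments, as used above; here is a minimal sketch of the unconstrained case only, assuming pyinterval-style intervals:

import random
from interval import interval  # pyinterval, matching the usage above

def randomize_interval_sketch(small_i, large_i):
    # Drop an interval of small_i's length at a uniformly random
    # position inside large_i (coordinates treated as inclusive).
    small_len = int(small_i[0].sup - small_i[0].inf)
    start_min = int(large_i[0].inf)
    start_max = max(start_min, int(large_i[0].sup) - small_len)
    new_start = random.randint(start_min, start_max)
    return interval[new_start, new_start + small_len]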