Python shell_call Exemples, ptes.lib.general.shell_call Python Exemples

Exemple #1

0

Afficher le fichier

def main():
    """
    Write a shell script that greps a list of patterns in several commands.

    Reads one pattern per line from --input, splits the patterns into
    --n_parts groups and writes one background command per group to
    --output. Every command has the form
    ``cat <database> | <prefix>grep '<p1>\\|<p2>...'<suffix> >> <input>.grep &``.
    If there are too few patterns to give every group more than one of them,
    a single command with all patterns is written instead.
    The resulting script is made executable with ``chmod +x``.
    """
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        help="Name of input list to grep")
    parser.add_argument("-n",
                        "--n_parts",
                        type=int,
                        default=10,
                        help="How many times to grep")
    parser.add_argument("-db", "--database", type=str, help="Where to grep")
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        default='to_grep.sh',
                        help="Name of script with grep commands")
    parser.add_argument("-p",
                        "--prefix",
                        type=str,
                        default='',
                        help="Common prefix, command after cat $input")
    parser.add_argument("-s",
                        "--suffix",
                        type=str,
                        default='',
                        help="Common suffix, command before >> $output &")
    args = parser.parse_args()
    if args.n_parts < 1:
        # n_parts is used as a divisor below
        parser.error("--n_parts must be a positive integer")
    # Main

    with open(args.input, 'r') as inp_list:
        patterns = [x.strip('\n') for x in inp_list]

    n_patterns = len(patterns) // args.n_parts  # how many patterns in one command
    common_prefix = "cat %s | " % args.database + args.prefix
    common_suffix = args.suffix + " >> %s.grep &" % args.input

    if n_patterns > 1:
        # n_parts - 1 groups of equal size ...
        chunks = [
            patterns[(i * n_patterns):(i * n_patterns + n_patterns)]
            for i in range(args.n_parts - 1)
        ]
        # ... and the last group takes everything that is left. Its start is
        # derived from n_parts (not from the loop variable), so that
        # n_parts == 1 yields one group with all patterns.
        chunks.append(patterns[(args.n_parts - 1) * n_patterns:])
    else:
        chunks = [patterns]

    # r"\|" is the alternation operator of basic grep regular expressions
    out_list = [
        common_prefix + "grep '" + r"\|".join(chunk) + "'" + common_suffix
        for chunk in chunks
    ]

    with open(args.output, 'w') as out_file:
        out_file.write('\n'.join(out_list))

    shell_call('chmod +x %s' % args.output)

Exemple #2

0

Afficher le fichier

Fichier : ucsc.py Projet : sunnymouse25/ptes

def to_bigbed(bed_name, folder_name, genome_version='hg19'):
    """
    Sort a BED file and convert it to bigBed with the UCSC bedToBigBed tool.

    bedToBigBed must be in $PATH; chromosome sizes are taken from the UCSC
    download server for the given genome version.
    :param bed_name: Name of BED file to be converted
    :param folder_name: Folder of BED file
    :param genome_version: Name of genome version, hg19 by default
    :return: None; the sorted BED (<bed_name>.sorted) and the bigBed file
             ('.bed' replaced by '.bb') are written to the same folder
    """
    bed_path = os.path.join(folder_name, bed_name)
    bigbed_path = os.path.join(folder_name, bed_name.replace('.bed', '.bb'))

    # Step 1: sort by chromosome, then numerically by start position
    sort_cmd = 'sort -k1,1 -k2,2n %s > %s.sorted' % (bed_path, bed_path)
    # Step 2: convert the sorted file
    convert_cmd = 'bedToBigBed \
            %s.sorted \
            http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes \
            %s' % (bed_path, genome_version, genome_version, bigbed_path)

    shell_call(sort_cmd)
    shell_call(convert_cmd)

Exemple #3

0

Afficher le fichier

Fichier : bed_to_overlaps.py Projet : sunnymouse25/ptes

def main():
    """
    Compare real feature intersections with intersections of shuffled files.

    The real counts are parsed from --input (output of
    ``bedtools intersect ... -s -wo``). For every shuffling method and every
    iteration n the file <output>/random/<method>_<n>.bed is intersected with
    --features and the counts per category (a_in_b, b_in_a, overlap) are
    collected. For every method the function writes
    <output>/histograms_<method>.png (random counts as histograms, real value
    as a red line) and <output>/data_hist_<method> (the same numbers as text).
    """
    ### Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str,
                        help="BED file, bedtools intersect -a A_FEATURE -b B_BEATURE -s -wo output")
    parser.add_argument("-f", "--features", type=str,
                        help="Path to .BED6 file with features (small intervals)")
    parser.add_argument("-o", "--output", type=str,
                        help="Output folder with random subfolder for results")
    parser.add_argument("-a", "--afeature", type=str,
                        help="Name of A_FEATURE, i.e. circles")
    parser.add_argument("-b", "--bfeature", type=str,
                        help="Name of B_BEATURE, i.e. panhandles")
    parser.add_argument("-iter", "--iterations", type=int,
                        default=1000,
                        help="Number of iterations for randomizing results")
    parser.add_argument("-m", "--method", type=str,
                        nargs='+',
                        default=['inside', 'outside', 'bedtools', ],
                        help="Shuffling method(s): inside, outside, bedtools")

    args = parser.parse_args()

    path_to_file = args.output.rstrip('/')
    random_folder = path_to_file + '/random'

    PTES_logger.info('Reading input file... ')
    PTES_logger.info('Input file: %s ' % args.input)
    real_dict = parse_bedtools_wo(wo_outfile=args.input)  # category - number of intersections

    PTES_logger.info('Reading input file... done')

    categories = ['a_in_b', 'b_in_a', 'overlap']
    # Title of every category, shared by the plots and the data files
    titles = {
        'a_in_b': '%s_in_%s' % (args.afeature, args.bfeature),
        'b_in_a': '%s_in_%s' % (args.bfeature, args.afeature),
        'overlap': 'overlap',
    }

    PTES_logger.info('Running intersections with random files... ')
    random_dicts = {}   # method - category - list of values
    for method in args.method:
        # a separate empty list for every category
        random_dicts[method] = {cat: [] for cat in categories}
        for n in range(args.iterations):
            random_input = '%s/%s_%i.bed' % (random_folder, method, n)
            random_output = '%s/%s_%i.bed.intersect' % (random_folder, method, n)
            cmd = 'bedtools intersect -a %s -b %s -s -wo > %s' % (args.features,
                                                                  random_input,
                                                                  random_output)
            shell_call(cmd)
            random_dict = parse_bedtools_wo(wo_outfile=random_output)
            for cat in categories:
                random_dicts[method][cat].append(random_dict[cat])

    PTES_logger.info('Running intersections with random files... done')

    PTES_logger.info('Creating output files...')
    # plotting: one figure per method, one subplot per category
    for method in args.method:
        fig = plt.figure(figsize=(12, 6))
        plt.suptitle("Shuffling method %s" % method)
        for position, cat in zip((131, 132, 133), categories):
            ax = fig.add_subplot(position)  # becomes the current axes for distplot
            sns.distplot(random_dicts[method][cat], kde=False)
            ax.axvline(real_dict[cat], color='r')
            ax.set(title=titles[cat])

        plt.savefig('%s/histograms_%s.png' % (path_to_file, method))
        plt.close(fig)  # release the figure, pyplot keeps it alive otherwise

    # saving data for histograms
    for method in args.method:
        hist_name = '%s/data_hist_%s' % (path_to_file, method)
        with open(hist_name, 'w') as hist_file:
            for cat in categories:
                hist_file.write(titles[cat] + '\n')
                hist_file.write('real: %i' % real_dict[cat] + '\n')
                hist_file.write('random: ' + ','.join(map(str, random_dicts[method][cat])) + '\n')

    PTES_logger.info('Creating output files... done')
    PTES_logger.info('Remember to delete random subfolder')

Exemple #4

0

Afficher le fichier

Fichier : segemehl_parallel.py Projet : sunnymouse25/ptes

# Write one executable launcher script (star_<i>.sh) per sample id in file_str
for i, name in enumerate(file_str.split()):
    id = name.strip()
    folder_name = '/uge_mnt/home/sunnymouse/projects/PTES/ENCODE/%s' % id   # without trailing slash
    sh_filename = 'star_%i.sh' % i
    init_file(sh_filename)

    # The string below is a no-op expression: pipeline steps that are
    # currently disabled (BAM -> FASTQ, segemehl mapping, SAM conversion).
    '''
    cmd_list.append('samtools collate -uOn 256 %s%s.bam %s/tmp-prefix \
                    | samtools fastq - > %s/%s.fq' % (bam_folder, id, folder_name, folder_name, id))
    cmd_list.append('gzip %s/%s.fq' % (folder_name, id))                
    cmd_list.append('%s/segemehl.x -S \
                                -Z 10  \
                                -t 10 \
                                -s \
                                -i /uge_mnt/home/sunnymouse/Human_ref/GRCh37.p13.genome.idx \
                                -d /uge_mnt/home/sunnymouse/Human_ref/GRCh37.p13.genome.fa  \
                                -q %s/%s.fq \
                                -o %s/segemehl.sam \
                                -u %s/segemehl_unmapped' % (segemehl_bin, folder_name,id, folder_name, folder_name)) 
    cmd_list.append('samtools view %s/segemehl.sam > %s/segemehl.sam.nohead' % (folder_name, folder_name))       
    cmd_list.append('samtools view -b %s/segemehl.sam > %s/segemehl.bam' % (folder_name, folder_name))    
    cmd_list.append('%s/segemehl.sam' % folder_name)    
    '''

    # Active steps: enter the project folder, activate conda, run the parser
    cmd_list = [
        '#!/bin/bash -il \ncd %s \n' % '/uge_mnt/home/sunnymouse/projects/PTES/ENCODE/',
        'source /uge_mnt/home/sunnymouse/tools/miniconda2/bin/activate',
        'python segemehl_encode.py -i %s/segemehl.sam.nohead -o %s -t %s' % (folder_name, folder_name, id),
    ]

    writeln_to_file('\n'.join(cmd_list), sh_filename)
    shell_call('chmod +x ./%s' % sh_filename)

Exemple #5

0

Afficher le fichier

def main():
    """
    Create BED tracks (and optionally bigBed files) for chimeric junctions.

    Reads the table of reads (--table), optionally keeps only the junctions
    listed in --filter and the rows matching --query, looks up the intervals
    of every remaining read in the JSON file (--json) and writes into the
    --output folder:

    * df_filter.csv - the filtered input table
    * <prefix>.bed - all track lines
    * <prefix>.unique.bed - track lines of the first read of every junction
    * <prefix>.single.bed - one line per read
    * <prefix>.single.unique.bed - one line per junction
    * <prefix>.coords.csv - one browser window per junction
    * <prefix>.codes.csv - the code assigned to every read
    * <prefix>.track - browser/track header lines

    The tables must contain the columns chrom, chain, donor and acceptor;
    the input table is additionally grouped by its read_name column.
    Exits with status 1 if a required column is missing.
    """
    import sys  # local import: only needed for the error exits below

    # Arguments
    # Example arguments for a test run:
    # args_s = ('-t ../tests/test_data/ptes/chim_reads_test.csv '
    # '-j ../tests/test_data/ptes/junc_dict.json.gz '
    # '-f ../tests/test_data/ptes/chim_junctions_test.csv '
    # '-q letters_ss=="." '
    # '-o ../tests/test_results/bed '
    # '-p test '
    # '-gz 1')
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--table", type=str,
                        help="DataFrame with data to create BED, required 4 columns are: chrom, chain, donor, acceptor")
    parser.add_argument("-sep", "--separator", type=str,
                        default='\t',
                        help="DataFrame separator, tab by default")
    parser.add_argument("-j", "--json", type=str,
                        help="JSON file with all read intervals as OrderedDicts")
    parser.add_argument("-gz", "--gzip", type=str,
                        help="Write anything to enable reading .json.gz")
    parser.add_argument("-q", "--query", type=str,
                        help="Conditions to filter junctions table, string as in pandas.DataFrame.query()")
    parser.add_argument("-f", "--filter", type=str,
                        help="DataFrame with (chrom, chain, donor, acceptor) to filter input table")
    parser.add_argument("-n", "--names", type=str,
                        nargs='+',
                        default=['chim1', 'chim2', 'mate2', ],
                        help="List of names for chim parts in BED name: [chim1, chim2, mate2]. \
                        Important: same order as parts in json values")
    parser.add_argument("-c", "--colors", type=str,
                        nargs='+',
                        default=['r', 'r', 'b', ],
                        help="List of colors for chim parts in BED name: [chim1, chim2, mate2]. \
                        Important: same order as parts in json values\
                        Colors: 'r', 'g', 'b' or in RGB code like '0,255,0'")
    parser.add_argument("-o", "--output", type=str,
                        default='bed',
                        help="Output folder for results, default is bed/")
    parser.add_argument("-p", "--prefix", type=str,
                        default='Output',
                        help="Prefix for all output files")
    parser.add_argument("-sort", "--sort", type=str,
                        help="Write anything to enable sorting BED files")
    parser.add_argument("-bb", "--bigbed", type=str,
                        help="Write anything to enable creating .bigBed files")
    args = parser.parse_args()
#    args = parser.parse_args(args_s.split(' '))

    PTES_logger.info('Reading input files...')
    make_dir(args.output)

    index_list = ['chrom', 'chain', 'donor', 'acceptor']
    input_df = pd.read_csv(args.table, sep=args.separator)

    for col in index_list:
        if col not in input_df.columns:
            PTES_logger.error('Input table does not contain required column %s ' % col)
            # sys.exit (unlike os._exit) lets buffered log/output handlers flush
            sys.exit(1)

    if args.filter:   # filter by junctions
        filter_df = pd.read_csv(args.filter, sep=args.separator)
        for col in index_list:
            if col not in filter_df.columns:
                PTES_logger.error('Filter table does not contain required column %s ' % col)
                sys.exit(1)
        cols_to_use = index_list + list(input_df.columns.difference(filter_df.columns))  # avoid repeating columns
        df_new = pd.merge(filter_df, input_df[cols_to_use], on=index_list, how='inner',)
    else:
        df_new = input_df

    if args.query:   # filter reads by conditions
        df_new = df_new.query(args.query)

    df_new.to_csv(os.path.join(args.output, 'df_filter.csv'), sep='\t')

    # Reading .json.gz file
    if args.gzip:
        with gzip.GzipFile(args.json, 'r') as fin:
            junc_dict = json.loads(fin.read().decode('utf-8'), object_pairs_hook=OrderedDict)
    else:
        with open(args.json) as fin:
            junc_dict = json.load(fin, object_pairs_hook=OrderedDict)

    # Size of the first entry of junc_dict; next(iter(...)) works on Python 2
    # and 3 and the {} default covers an empty JSON object.
    # must be 3 for mate_inside/outside and 2 for circles
    # NOTE(review): the entries of junc_dict are indexed by read name in the
    # loop below, while names/colors are indexed per track - confirm that
    # this length is the intended one.
    len_read_dicts = len(next(iter(junc_dict.values()), {}))
    if len(args.names) < len_read_dicts:
        PTES_logger.warning('List of names has less items than list of features in read_dicts!')
        part_names = ['part_%i' % part_n for part_n in range(1, len_read_dicts + 1)]
    else:
        part_names = args.names

    if len(args.colors) < len_read_dicts:
        PTES_logger.warning('List of colors has less items than list of features in read_dicts!')
        part_colors = ['r']*len_read_dicts
    else:
        part_colors = args.colors

    PTES_logger.info('Reading input files... done')
    PTES_logger.info('Creating BED files...')
    bed_name = '%s.bed' % args.prefix  # only track lines
    unique_bed_name = '%s.unique.bed' % args.prefix  # one representative read for unique junctions
    single_bed_name = '%s.single.bed' % args.prefix  # single line for one chimeric junction
    single_unique_bed_name = '%s.single.unique.bed' % args.prefix  # for unique junctions, single line for one junction
    code_name = '%s.codes.csv' % args.prefix   # table with codes for each read and descriptions
    coord_name = '%s.coords.csv' % args.prefix  # table with windows to paste into GB and descriptions
    info_name = '%s.track' % args.prefix  # file to submit to GB

    bed_list = []  # for outputting BED lines
    unique_dict = {}  # for outputting BED lines, unique chimeric junctions
    single_list = []  # for outputting BED lines, one row per one chimeric junction
    single_unique_list = []  # for outputting BED lines, one row per one unique chimeric junction
    coord_list = []  # for outputting coord lines
    code_list = []  # for outputting coord lines, one row per one read

    with open(os.path.join(args.output, info_name), 'w') as info_file:
        info_file.write('\n'.join(
        ['browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac rmsk',
         'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
         'browser pack gtexGene',
         'track type=bigBed \
         name="%s" \
         description="bigBed" \
         visibility=2 \
         itemRgb="On" \
         bigDataUrl=https://github.com/sunnymouse25/ptes/blob/dev/research/bed/%s?raw=true' % (
             args.prefix,
             bed_name.replace('.bed', '.bb')
             )
            ]
           )
        )

    num = 0
    junctions = df_new.groupby(index_list)['read_name'].apply(list) # unique chimeric junctions
    for index, read_list in junctions.items():  # value is list of read_names
        chrom = index[0]  # index is (chrom, chain, donor_ss, acceptor_ss)
        chain = index[1]
        donor_ss = index[2]
        acceptor_ss = index[3]
        windows_min = []
        windows_max = []
        codes = []
        for read_name in read_list:  # for each read w. this junction
            num += 1
            code = digit_code(number=num)  # every unique number will be 6-digit
            codes.append(code)
            track_lists = []
            if not unique_dict.get(index, None):  # for each unique junction write the 1st read line(s)
                unique_dict[index] = []
                add_unique = True
            else:
                add_unique = False
            read_dict_list = junc_dict[str(index)][read_name]  # list of dicts: each dict is one track (i.e. chim_part)
            # Iterating over tracks
            for i, read_dict in enumerate(read_dict_list):
                # replace the stored coordinate pairs by interval objects (in place)
                for k, v in read_dict.items():
                    read_dict[k] = interval[v[0][0], v[0][1]]
                track_list = get_track_list(chrom=chrom,
                                            chain=chain,
                                            read_dict=read_dict,
                                            name='_'.join(map(str, [donor_ss, acceptor_ss, code, part_names[i]])),
                                            color=part_colors[i])
                track_lists.append(track_list)
            # Writing BED lines, collecting extremas for window size
            for track_list in track_lists:
                windows_min.append(int(track_list[1]))  # track_list[1] is chromStart, track_list[2] is chromEnd
                windows_max.append(int(track_list[2]))
                bed_line = '\t'.join(track_list)
                bed_list.append(bed_line)
                if add_unique:
                    unique_dict[index].append(bed_line)
            # Writing code line
            code_list.append({
                'chrom': chrom,
                'chain': chain,
                'donor': donor_ss,
                'acceptor': acceptor_ss,
                'read_name': read_name,
                'code': code
            })
            # Making BED file with one row for the pair of mates
            single_track = get_single_track(read_dict_list=read_dict_list,
                                            kwargs={'chrom': chrom,
                                                    'chain': chain,
                                                    'name': '_'.join(
                                                        map(str, [donor_ss, acceptor_ss, code])),
                                                    'color': '255,0,255'})  # for checking in GB that intervals are same
            single_list.append('\t'.join(single_track))
            if add_unique:
                single_unique_list.append('\t'.join(single_track))
        # Description for the junction into coords.csv
        window = (chrom,  # one window for junction
                  min(windows_min) - 200,
                  max(windows_max) + 200)
        coord_list.append({
                    'chrom': chrom,
                    'chain': chain,
                    'donor': donor_ss,
                    'acceptor': acceptor_ss,
                    'window': '%s:%i-%i' % window,
                    'codes': '-'.join(map(str,[codes[0], codes[-1]])),
        })

    PTES_logger.info('Creating BED files... done')

    PTES_logger.info('Writing BED files...')
    # coords/codes tables are not opened here: they are written by to_csv below
    with open(os.path.join(args.output, bed_name), 'w') as bed_file, \
            open(os.path.join(args.output, unique_bed_name), 'w') as unique_bed_file, \
            open(os.path.join(args.output, single_bed_name), 'w') as single_bed_file, \
            open(os.path.join(args.output, single_unique_bed_name), 'w') as single_unique_bed_file:
        bed_file.write('\n'.join(bed_list))
        single_bed_file.write('\n'.join(single_list))
        single_unique_bed_file.write('\n'.join(single_unique_list))

        for unique_value in unique_dict.values():
            unique_bed_file.write('\n'.join(list(unique_value))+'\n')

    PTES_logger.info('Writing BED files... done')

    PTES_logger.info('Creating junctions dataframes...')
    coord_df = pd.DataFrame(coord_list)
    code_df = pd.DataFrame(code_list)
    coord_df.to_csv(os.path.join(args.output, coord_name), sep='\t')
    code_df.to_csv(os.path.join(args.output, code_name), sep='\t')

    PTES_logger.info('Creating junctions dataframes... done')

    if args.sort:
        PTES_logger.info('Sorting BED files...')
        for filename in [bed_name, unique_bed_name, single_bed_name, single_unique_bed_name]:
            file_path = os.path.join(args.output, filename)
            shell_call('cat %s | sort -k1,1 -k2,2n  > %s.sorted' % (file_path, file_path))

        PTES_logger.info('Sorting BED files... done')

    if args.bigbed:   # will also sort files
        PTES_logger.info('Making bigBed...')
        for filename in [bed_name, unique_bed_name, single_bed_name, single_unique_bed_name]:
            to_bigbed(bed_name=filename, folder_name=args.output)
        PTES_logger.info('Making bigBed... done')

Exemple #6

0

Afficher le fichier

Fichier : segemehl_ucsc.py Projet : sunnymouse25/ptes

    path_to_file = '.'

# Load the junctions of interest: one tab-separated record per line
junctions_name = args.input.rstrip('/') + '/' + 'junc_of_interest.csv'
with open(junctions_name, 'r') as junctions_file:
    junc_of_interest = [line.strip().split('\t') for line in junctions_file]

# Index the genome FASTA file
print('Reading genome file...')
genome_file = args.genome
genome = SeqIO.index(genome_file, "fasta")
print('done')

# Make sure the bed/ subfolder exists
folder_name = '%s/bed/' % path_to_file
cmd1 = 'if [ ! -d %s ]; then mkdir %s; fi' % (folder_name, folder_name)
shell_call(cmd1)

bed_name = args.output                  # one BED file for all tracks
coord_name = bed_name + '.coords.csv'   # read_name - window in genome browser
for new_file in (bed_name, coord_name):
    init_file(new_file, folder=folder_name)

# Browser lines at the top of the BED file
for browser_line in (
        'browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac',
        'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
        'browser pack gtexGene',
):
    writeln_to_file(browser_line, bed_name, folder=folder_name)

# Process every record of junc_of_interest; each record is a list of the
# tab-separated string fields of one line of the junctions file.
for line_list in junc_of_interest:
    key = line_list[0]  # key is mapped read_name, xi - number of current read alignment
    xi = line_list[1]
    n_junctions = line_list[2]
    annot_donors = line_list[3]
    annot_acceptors = line_list[4]
    # read_infos is defined outside this excerpt; it is indexed by read name
    # and then by alignment number (xi is still a string at this point).
    # NOTE(review): presumably element 0 of the entry is the chromosome - confirm
    chrom = read_infos[key][xi][0]

Exemple #7

0

Afficher le fichier

args = parser.parse_args()

# Functions

# Main
make_dir(args.output)

# Intersect features with genes; the report is stored in the output folder
# as <basename of the features file>.intersect
PTES_logger.info('Creating intersection file... ')
intersect_basename = os.path.basename(args.features) + '.intersect'
intersection_name = os.path.join(args.output, intersect_basename)
cmd = 'bedtools intersect -a %s -b %s -s -wo > %s' % (args.features, args.genes, intersection_name)
shell_call(cmd)
PTES_logger.info('Creating intersection file... done')
PTES_logger.info('Reading intersection file... ')

# Containers that are filled while the intersection file is read
p_dict = {}
gene_p_dict = defaultdict(list)
feature_len_list, gene_len_list = [], []

# Read the intersection report line by line
with open(intersection_name, 'r') as intersect_file:
    for i, line in enumerate(intersect_file):
        line_list = line.strip().split()  # whitespace-separated fields of the row
        b_start = get_b_start(line)
        # Rows for which get_b_start() returns a falsy value are skipped.
        # NOTE(review): a start coordinate of 0 would be falsy as well -
        # confirm that get_b_start() never returns 0 for a valid row
        if not b_start:
            continue
        chrom1 = line_list[0]  # first field of the row

Exemple #8

0

Afficher le fichier

Fichier : segemehl_encode.py Projet : sunnymouse25/ptes

                                            'chrom' : chrom,
                                            'chain' : chain,
                                            'donor' : str(donor_ss),
                                            'annot_donor' : annot_donor,
                                            'acceptor' : str(acceptor_ss),
                                            'annot_acceptor' : annot_acceptor,
                                            'chimeric' : chimeric})

  
# Build the junction table from junc_list, fix the column order and sort the
# rows by read name and alignment number
mapped_junc_df = pd.DataFrame(junc_list)
mapped_junc_df = mapped_junc_df[['read_name', 'aln', 'n_junctions', 'chrom', 'chain', 'donor', 'annot_donor', 'acceptor', 'annot_acceptor', 'chimeric']].sort_values(by=['read_name','aln']).reset_index(drop=True)
# chim_read: True if any junction of the (read_name, aln) group is chimeric
gr = mapped_junc_df.groupby(['read_name','aln']).apply(lambda x: x.chimeric.any()).reset_index(name='chim_read')
# NOTE(review): 'aln' is dropped and the merge uses 'read_name' only, so a
# read with several alignments has several rows in gr and each of its
# junction rows is paired with all of them - confirm this is intended
del gr['aln']
mapped_junc_df = pd.merge(mapped_junc_df, gr, on='read_name').reset_index(drop=True)
# Save as a tab-separated table and compress it (gzip -f overwrites an existing .gz)
mapped_junc_df.to_csv('%s/mapped_junc_df_segemehl.csv' % path_to_file, sep = '\t')
shell_call('gzip -f %s/mapped_junc_df_segemehl.csv' % path_to_file)

PTES_logger.info('Creating junctions table... done')

# x: number of distinct read names per (n_junctions, chim_read) combination
x = mapped_junc_df.groupby(['n_junctions','chim_read']).apply(lambda x: x.read_name.nunique()).reset_index(name='counts')
# y: the same counts as a pivot table (rows n_junctions, columns chim_read)
# with totals (margins), written to an HTML file
y = pd.pivot_table(x, index=['n_junctions'], columns=['chim_read'],values=['counts'], fill_value=0, aggfunc=sum, margins=True)
html_file = 'segemehl_pivot_table.html'
init_file(html_file, folder = path_to_file)
writeln_to_file(y.to_html(), html_file, folder = path_to_file)

# Rows of chimeric reads with at least two junctions, sorted in descending
# order of annot_donor/annot_acceptor and grouped by (read_name, aln)
junc_of_interest = mapped_junc_df.query('n_junctions >= 2 & chim_read == True').sort_values(by=['annot_donor','annot_acceptor'], ascending=False).reset_index(drop=True).groupby(['read_name','aln'])
junc_csv_name = 'junc_of_interest.csv'

PTES_logger.info('Reading genome file...')
genome_file = args.genome
# Index the FASTA file given by --genome
genome = SeqIO.index(genome_file, "fasta")

Exemple #9

0

Afficher le fichier

def main():
    """Create randomly shuffled copies of a feature BED file.

    Command-line entry point.  The steps are:

    1. Parse the options and create the output folder with a ``random``
       subfolder inside it.
    2. For the ``outside`` method, build ``interval_dict`` with
       ``choose_close`` from the containers (genes), ordered either by
       length (per chromosome) or by ``bedtools coverage`` output.
    3. For the ``inside`` / ``outside`` methods, intersect features with
       genes (``bedtools intersect -wo -s``) and, for every intersection
       line, draw ``--iterations`` random intervals per method.  Results
       are written to ``random/inside_<n>.bed`` / ``random/outside_<n>.bed``.
    4. For the ``bedtools`` method, run ``bedtools shuffle`` once per
       iteration, writing ``random/bedtools_<n>.bed``.

    Exits through ``parser.error`` (usage message, exit status 2) when a
    required option is missing or ``--closest`` has an unsupported value
    while the ``outside`` method is requested.
    """
    ### Arguments

    parser = argparse.ArgumentParser()
    parser.add_argument("-m",
                        "--method",
                        type=str,
                        nargs='+',
                        default=[
                            'inside',
                            'outside',
                            'bedtools',
                        ],
                        help="Shuffling method(s): inside, outside, bedtools")
    parser.add_argument(
        "-c",
        "--closest",
        type=str,
        default='coverage',
        help="Choose close elements for outside method by coverage or length")
    # --output, --features and --genes are used on every code path, so a
    # missing one is reported by argparse instead of failing later on None.
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        required=True,
                        help="Output folder for results")
    parser.add_argument("-iter",
                        "--iterations",
                        type=int,
                        default=1000,
                        help="Number of iterations, default 1000")
    parser.add_argument(
        "-f",
        "--features",
        type=str,
        required=True,
        help="Path to .BED6 file with features (small intervals)")
    parser.add_argument("-g",
                        "--genes",
                        type=str,
                        required=True,
                        help="Path to .BED6 file with genes (containers)")
    parser.add_argument(
        "-s",
        "--chrom_sizes",
        type=str,
        default='/home/sunnymouse/Human_ref/hg19.chrom.sizes',
        help=
        "The chrom_sizes file should be tab delimited and structured as follows: \
                        <chromName><TAB><chromSize>, use bedtools shuffle -h for details"
    )

    args = parser.parse_args()

    run_inside = 'inside' in args.method
    run_outside = 'outside' in args.method

    # Without this check an unsupported value leaves interval_dict empty and
    # the run dies with a KeyError deep inside the shuffling loop.
    if run_outside and args.closest not in ('coverage', 'length'):
        parser.error(
            "--closest must be 'coverage' or 'length' for the outside method")

    make_dir(args.output)
    path_to_file = args.output.rstrip('/')
    random_folder = path_to_file + '/random'
    make_dir(random_folder)

    # Shuffling methods inside and outside:

    if run_inside or run_outside:
        if run_outside:
            PTES_logger.info('Reading containers... ')
            PTES_logger.info('containers file: %s ' % args.genes)

            strand_dict = {}  # gene interval -> strand ('.' when unknown)
            interval_dict = {}  # filled from choose_close() results
            if args.closest == 'length':
                gene_dict = defaultdict(list)  # chromosome -> gene intervals
                # open file with genes
                with open(args.genes, 'r') as genes_file:
                    for line in genes_file:
                        line_list = line.strip().split()
                        chrom = line_list[0]
                        gene_interval = interval[int(line_list[1]),
                                                 int(line_list[2])]
                        gene_dict[chrom].append(gene_interval)
                        try:
                            strand = line_list[5]
                            strand_dict[gene_interval] = strand
                        except IndexError:
                            strand_dict[gene_interval] = '.'
                            PTES_logger.error('No strand found')
                            PTES_logger.error(
                                'BED6 format is required for choosing strand-specific position'
                            )

                # sort lists by gene length
                for key in gene_dict:  # key is chromosome
                    new_list = sorted(gene_dict[key],
                                      key=lambda x: get_interval_length(x))
                    interval_dict.update(choose_close(sorted_list=new_list))

            if args.closest == 'coverage':
                PTES_logger.info('Creating coverage file... ')

                cover_name = '%s/%s' % (
                    random_folder, os.path.basename(args.features) + '.cov')
                cmd = 'bedtools coverage -a %s -b %s -s > %s' % (
                    args.genes,
                    args.features,
                    cover_name,
                )
                shell_call(cmd)
                PTES_logger.info('Creating coverage file... done')

                cover_dict = {}  # gene interval -> last column of the coverage line
                with open(cover_name, 'r') as cover_file:
                    for line in cover_file:
                        line_list = line.strip().split()
                        gene_interval = interval[int(line_list[1]),
                                                 int(line_list[2])]
                        cov = float(line_list[-1])
                        cover_dict[gene_interval] = cov
                        try:
                            strand = line_list[5]
                            strand_dict[gene_interval] = strand
                        except IndexError:
                            strand_dict[gene_interval] = '.'
                            PTES_logger.error('No strand found')
                            PTES_logger.error(
                                'BED6 format is required for choosing strand-specific position'
                            )

                # order genes by coverage value before choosing close ones
                new_list = sorted(cover_dict.items(), key=lambda x: x[1])
                interval_dict.update(
                    choose_close(sorted_list=new_list, items='items'))

            PTES_logger.info('Reading containers... done')

        PTES_logger.info('Creating intersection file... ')

        intersection_name = '%s/%s' % (
            random_folder, os.path.basename(args.features) + '.intersect')
        cmd = 'bedtools intersect -a %s -b %s -wo -s > %s' % (
            args.features,
            args.genes,
            intersection_name,
        )
        shell_call(cmd)
        PTES_logger.info('Creating intersection file... done')

        PTES_logger.info('Reading intersection file and shuffling... ')
        PTES_logger.info('intersection file: %s' % intersection_name)

        # One independent list of BED lines per iteration and per method.
        if run_inside:
            n_list_inside = [[] for _ in range(args.iterations)]

        if run_outside:
            n_list_outside = [[] for _ in range(args.iterations)]

        with open(intersection_name, 'r') as intersect_file:
            for line in intersect_file:
                line_list = line.strip().split()
                b_start = get_b_start(line)
                if not b_start:
                    continue
                chrom1 = line_list[0]
                feature_interval = interval[int(line_list[1]),
                                            int(line_list[2])]
                gene_interval = interval[int(line_list[b_start + 1]),
                                         int(line_list[b_start + 2])]

                # Values below do not depend on the iteration number, so they
                # are computed once per intersection line.
                feature_name = line_list[3]
                feature_strand = line_list[5]
                if run_outside:
                    close_genes = interval_dict[gene_interval]
                    fits_in_gene = (get_interval_length(feature_interval) <=
                                    get_interval_length(gene_interval))

                for n in range(args.iterations):
                    if run_inside:
                        random_interval_inside = randomize_interval(
                            small_i=feature_interval, large_i=gene_interval)
                        n_list_inside[n].append(
                            interval_to_bed_line(
                                chrom=chrom1,
                                single_interval=random_interval_inside,
                                name=feature_name,
                                strand=feature_strand))

                    if run_outside:
                        new_large_interval = random.choice(
                            close_genes)  # choose one of closest genes
                        new_strand = strand_dict[new_large_interval]
                        if fits_in_gene:
                            try:
                                container_strand = line_list[b_start + 5]
                                relative_position = count_relative_position(
                                    feature=feature_interval,
                                    container=gene_interval,
                                    container_strand=container_strand)
                                random_interval_outside = randomize_interval(
                                    small_i=feature_interval,
                                    large_i=new_large_interval,
                                    large_i_strand=new_strand,
                                    same_position=True,
                                    p=relative_position)
                            except IndexError:
                                # No strand column for the container: fall
                                # back to the strand-agnostic variant.
                                PTES_logger.error('No strand found')
                                PTES_logger.error(
                                    'BED6 format is required for choosing strand-specific position'
                                )
                                relative_position = count_relative_position(
                                    feature=feature_interval,
                                    container=gene_interval)
                                random_interval_outside = randomize_interval(
                                    small_i=feature_interval,
                                    large_i=new_large_interval,
                                    same_position=True,
                                    p=relative_position)
                        else:
                            # Feature is longer than its gene: the relative
                            # position is not used.
                            random_interval_outside = randomize_interval(
                                small_i=feature_interval,
                                large_i=new_large_interval,
                                same_position=False,
                            )
                        n_list_outside[n].append(
                            interval_to_bed_line(
                                chrom=chrom1,
                                single_interval=random_interval_outside,
                                name=feature_name,
                                strand=feature_strand))

        PTES_logger.info('Reading intersection file and shuffling... done')
        PTES_logger.info('Creating output files... ')
        for n in range(args.iterations):
            if run_inside:
                out_name = random_folder + '/%s_%i.bed' % ('inside', n)
                with open(out_name, 'w') as out_file:
                    out_file.write('\n'.join(n_list_inside[n]))

            if run_outside:
                out_name = random_folder + '/%s_%i.bed' % ('outside', n)
                with open(out_name, 'w') as out_file:
                    out_file.write('\n'.join(n_list_outside[n]))

        # Logged only when the inside/outside files were actually written.
        PTES_logger.info('Creating output files... done')

    # Shuffling method 3
    if 'bedtools' in args.method:
        PTES_logger.info('Running bedtools shuffle... ')
        for n in range(args.iterations):
            random_file = 'bedtools_%i.bed' % n
            cmd = 'bedtools shuffle -incl %s -i %s -g %s -chrom > %s/%s' % (
                args.genes, args.features, args.chrom_sizes, random_folder,
                random_file)
            shell_call(cmd)
        PTES_logger.info('Running bedtools shuffle... done')