Example #1
0
def make_bed_folder(prefix, path_to_file):
    """
    DEPRECATED
    Creates the ./bed subfolder and initiates 3 files essential for Genome Browser:
    BED file for track lines (.bed),
    table with windows to copy-paste (.coords.csv),
    track file for GB (.track).
    Only the track file gets content here: the browser settings plus one
    bigBed track line whose bigDataUrl points at <prefix>.bb.

    :param prefix: prefix for names of files, i.e. sample tag
    :param path_to_file: folder where ./bed subfolder will be
    :return: tuple (folder_name, bed_name, coord_name);
    the name of the .track file is NOT returned
    """
    bed_name = '%s.bed' % prefix  # only track lines
    coord_name = '%s.coords.csv' % prefix  # table with windows to paste into GB and with descriptions
    info_name = '%s.track' % prefix  # file to submit to GB
    # Built from the prefix directly: replacing '.bed' inside bed_name would
    # also rewrite any '.bed' that happens to be part of the prefix itself.
    bigbed_name = '%s.bb' % prefix
    folder_name = '%s/bed/' % path_to_file
    make_dir(folder_name)

    for file_name in (bed_name, coord_name, info_name):
        init_file(file_name, folder=folder_name)

    writeln_to_file('\n'.join([
        'browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac rmsk',
        'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
        'browser pack gtexGene',
        # Backslash continuations inside the literal keep the whole track
        # definition on ONE output line (the indentation becomes plain spaces).
        'track type=bigBed \
         name="%s" \
         description="bigBed" \
         visibility=2 \
         itemRgb="On" \
         bigDataUrl=https://github.com/sunnymouse25/ptes/blob/dev/research/bed/%s?raw=true'
        % (prefix, bigbed_name)
    ]),
                    info_name,
                    folder=folder_name)
    return folder_name, bed_name, coord_name
Example #2
0
            continue
        # NOTE(review): read_name is not used in the visible part of this loop.
        read_name = line_dict['read_name']
        # Build the two mates from the CIGAR strings and coordinates of the
        # current record (return_mates is defined elsewhere; presumably it
        # returns one interval object per mate - confirm).
        mate1, mate2 = return_mates(cigar1=line_dict['cigar1'],
                                    coord1=line_dict['coord1'],
                                    cigar2=line_dict['cigar2'],
                                    coord2=line_dict['coord2'],
                                    chain=chain)
        # The result is used as a category key for the counters; the summary
        # after the loop reads the keys 'inside', 'outside' and 'non-chim'.
        interval_intersection = mate_intersection(mate1, mate2)
        # Count canonical GT/AG junctions separately from all other junctions.
        if line_dict['junction_letters'] == 'GT/AG':
            mates_gtag[interval_intersection] += 1
        else:
            mates_nc[interval_intersection] += 1
        # Keep the raw input line of every 'outside' record; the collected
        # lines are written to a separate file after the loop.
        if interval_intersection == 'outside':
            outside_list.append(line)
        # Count splice sites that are present in the per-chromosome GTF
        # collections (chrom is set outside the visible part of the loop).
        if line_dict['donor_ss'] in gtf_donors[chrom]:
            annot_donors += 1
        if line_dict['acceptor_ss'] in gtf_acceptors[chrom]:
            annot_acceptors += 1

# Report the tallies gathered while reading the STAR output, then dump the
# raw lines of all 'outside' records into their own file.
PTES_logger.info('Reading STAR output... done')

# (message, counter, category); the counter is indexed inside the loop so the
# lookups happen in the same order as the log lines are emitted.
for msg_template, tally, tally_key in (
        ('Inside GT/AG: %i', mates_gtag, 'inside'),
        ('Inside other: %i', mates_nc, 'inside'),
        ('Outside GT/AG: %i', mates_gtag, 'outside'),
        ('Outside other: %i', mates_nc, 'outside'),
        ('Intron too large GT/AG: %i', mates_gtag, 'non-chim'),
        ('Intron too large other: %i', mates_nc, 'non-chim'),
):
    PTES_logger.info(msg_template % tally[tally_key])

# Numbers of donor / acceptor sites that were found in the annotation.
for msg_template, n_annotated in (
        ('Annot donors: %i', annot_donors),
        ('Annot acceptors: %i', annot_acceptors),
):
    PTES_logger.info(msg_template % n_annotated)

# The collected lines are concatenated without a separator.
outside_text = ''.join(outside_list)
writeln_to_file(outside_text, outside_name, folder=path_to_file)
Example #3
0
            # NOTE(review): chim_part2 is built from cigar1/coord1 here, while
            # the single-read branch below builds it from cigar2/coord2 -
            # confirm this is intended.
            chim_part2 = get_read_interval(cigar1, coord1)
            # One BED track per mate plus one for the chimeric part.
            bed1 = get_track_list(chrom, chain, mate1, name='mate1', color='r')
            bed2 = get_track_list(chrom, chain, mate2, name='mate2', color='r')
            bed3 = get_track_list(chrom, chain, chim_part2, name='chim_mate1', color='r')
            track_lists = [bed1, bed2, bed3]
        else:   # single-read mode
            chim_part1 = get_read_interval(cigar1, coord1)   # not mates, chimeric parts!
            chim_part2 = get_read_interval(cigar2, coord2)
            bed1 = get_track_list(chrom, chain, chim_part1, name='chim_part1', color='r')
            bed2 = get_track_list(chrom, chain, chim_part2, name='chim_part2', color='b')
            track_lists = [bed1, bed2]
            
        # Append every track to the BED file and remember its start / end so
        # that the browser window can be sized afterwards.
        # NOTE(review): windows_min / windows_max are created outside this
        # view; if they are not reset for every junction, the window below
        # also spans the tracks of earlier junctions - confirm.
        for track_list in track_lists:
            windows_min.append(int(track_list[1]))   # track_list[1] is chromStart, track_list[2] is chromEnd
            windows_max.append(int(track_list[2]))  
            writeln_to_file('\t'.join(track_list), bed_name, folder = folder_name)
        # Browser window: from the smallest collected chromStart to the
        # largest collected chromEnd, padded by 200 on both sides.
        window = (chrom, 
                  min(windows_min)-200, 
                  max(windows_max)+200)
                                      
        description = "%s; %i read(s) found; chim junction %s" % (desc, num_reads, junction_letters)
        # '%' binds tighter than '+': the position string is formatted first,
        # then the description is appended after the tab.
        writeln_to_file('browser position %s:%i-%i\t' % window + description, coord_name, folder = folder_name) 
        # Only when i == 0 is a position line also written to the track file
        # (i is set outside this view, presumably the index of the enclosing
        # loop - confirm).
        if i == 0:
            writeln_to_file('browser position %s:%i-%i\t' % window, info_name, folder = folder_name) 

# NOTE(review): to_bigbed is defined elsewhere and receives only the bare file
# name, not folder_name - confirm it resolves the path to the BED file itself.
to_bigbed(bed_name=bed_name)
        
        
        
        
        
Example #4
0
# Write one executable shell script (star_<i>.sh) per sample listed in file_str.
# Every per-sample folder lives directly under encode_dir, and the generated
# scripts cd into encode_dir before doing anything else.
encode_dir = '/uge_mnt/home/sunnymouse/projects/PTES/ENCODE/'

# str.split() without arguments already discards all surrounding whitespace,
# so the items need no further stripping.
for i, sample_id in enumerate(file_str.split()):
    folder_name = '%s%s' % (encode_dir, sample_id)   # without trailing slash
    sh_filename = 'star_%i.sh' % i
    init_file(sh_filename)

    # Earlier pipeline steps (samtools collate | samtools fastq, gzip,
    # segemehl.x mapping, samtools view conversions) are disabled; only the
    # segemehl_encode.py step is emitted.
    cmd_list = [
        '#!/bin/bash -il \ncd %s \n' % encode_dir,
        'source /uge_mnt/home/sunnymouse/tools/miniconda2/bin/activate',
        'python segemehl_encode.py -i %s/segemehl.sam.nohead -o %s -t %s'
        % (folder_name, folder_name, sample_id),
    ]

    writeln_to_file('\n'.join(cmd_list), sh_filename)

    # Make the freshly written script executable.
    shell_call('chmod +x ./%s' % sh_filename)
   
Example #5
0
    for line in junctions_file:
        junc_of_interest.append(line.strip().split('\t'))

# Index the genome FASTA given on the command line.
# print(...) with a single string argument prints the same text on Python 2
# and Python 3, whereas the print statement is a SyntaxError on Python 3.
print('Reading genome file...')
genome_file = args.genome
genome = SeqIO.index(genome_file, "fasta")
print('done')

# Create the ./bed subfolder unless it already exists.
# NOTE(review): the path is inserted into the shell command unquoted, so a
# path_to_file containing spaces would break this command.
folder_name = '%s/bed/' % path_to_file
cmd1 = 'if [ ! -d %s ]; then mkdir %s; fi' % (folder_name, folder_name)
shell_call(cmd1)

bed_name = args.output
coord_name = bed_name + '.coords.csv'
init_file(bed_name, folder=folder_name)   # one BED file for all tracks
init_file(coord_name, folder=folder_name)   # read_name - window in genome browser

# Genome Browser display settings written at the top of the BED file.
for browser_line in (
        'browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac',
        'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
        'browser pack gtexGene',
):
    writeln_to_file(browser_line, bed_name, folder=folder_name)

# Process every junction of interest; each line_list holds the tab-separated
# fields of one input line, so all fields below are strings.
for line_list in junc_of_interest:   
    key = line_list[0]  # key is mapped read_name, xi - number of current read alignment
    xi = line_list[1]
    n_junctions = line_list[2]
    annot_donors = line_list[3]
    annot_acceptors = line_list[4]    
    # read_infos / read_intervals are filled outside this view and are indexed
    # by read name and then by xi (a string here - presumably the keys are
    # stored as strings too; confirm).
    chrom = read_infos[key][xi][0]           
    chain = read_infos[key][xi][1]        
    tuples = read_intervals[key][xi]   # list of intervals of current read alignment    
    tuples = sorted(tuples, key=lambda x:x[0])   # sort by xq        
    values =  [i[1] for i in tuples]   # get rid of xq
    # Rebuild every element as interval[start, end] from its first component
    # (interval comes from outside this view, presumably the pyinterval
    # class - confirm).
    values =  [interval[x[0][0], x[0][1]] for x in values]
Example #6
0
def _labelled_map_df(map_dict, correct):
    """Turn one mapping dict into a frame with 'real', 'mapped', 'correct' columns.

    The dict keys become the index. An empty dict yields an empty frame that
    still has the expected columns: from_dict returns a frame without any
    columns for an empty dict, and assigning two column names to it raises a
    length-mismatch ValueError.

    :param map_dict: dict whose values hold two items (real, mapped)
    :param correct: value stored in the 'correct' column of every row
    :return: pandas DataFrame
    """
    if map_dict:
        frame = pd.DataFrame.from_dict(map_dict, orient='index')
        frame.columns = ['real', 'mapped']
    else:
        frame = pd.DataFrame(columns=['real', 'mapped'])
    frame['correct'] = correct
    return frame


# Split the mapped junctions into correctly / incorrectly mapped ones and
# store them in one table flagged by the 'correct' column.
true_map_dict, false_map_dict = map_dicts(real_junc_df=junc_df,
                                          mapped_junc_df=mapped_junc_df)
false_map_df = _labelled_map_df(false_map_dict, correct=False)
true_map_df = _labelled_map_df(true_map_dict, correct=True)
all_map_df = pd.concat([false_map_df, true_map_df])
all_map_df.to_csv('%s/all_map_df.csv' % path_to_file, sep='\t')

# Attach the mapping result to every junction row; the index of all_map_df is
# matched against the read_name column.
info_df = pd.merge(junc_df, all_map_df, left_on='read_name',
                   right_index=True).reset_index(drop=True)
del info_df['real']

# Number of distinct reads per (n_junctions, mapped, correct) combination.
x = info_df.groupby([
    'n_junctions', 'mapped', 'correct'
])['read_name'].nunique().reset_index(name='counts')

# The string 'sum' selects the pandas group-wise sum explicitly; recent pandas
# versions warn when the builtin sum is passed instead.
y = pd.pivot_table(x,
                   index=['n_junctions', 'correct'],
                   columns=['mapped'],
                   values=['counts'],
                   fill_value=0,
                   aggfunc='sum',
                   margins=True)
y.to_csv('%s/pivot_table.csv' % path_to_file, sep='\t')

# The same pivot table as HTML.
pivot_html_name = 'pivot_table.html'
init_file(pivot_html_name, folder=path_to_file)
writeln_to_file(y.to_html(), pivot_html_name, folder=path_to_file)

PTES_logger.info('Comparing junctions table with real junctions... done')