def make_bed_folder(prefix, path_to_file):
    """
    DEPRECATED
    Initiates 3 files essential for Genome Browser inside <path_to_file>/bed/:
    the BED file with track lines (.bed), the table with windows to copy-paste
    (.coords.csv) and the track file to submit to GB (.track).
    The .track file is pre-filled with the browser settings and one bigBed
    track line pointing at <prefix>.bb.

    :param prefix: prefix for names of files, i.e. sample tag
    :param path_to_file: folder where ./bed subfolder will be
    :return: tuple (folder_name, bed_name, coord_name); the name of the .track
             file is not returned
    """
    bed_name = '%s.bed' % prefix  # only track lines
    coord_name = '%s.coords.csv' % prefix  # table with windows to paste into GB and with descriptions
    info_name = '%s.track' % prefix  # file to submit to GB
    folder_name = '%s/bed/' % path_to_file

    make_dir(folder_name)
    for file_name in (bed_name, coord_name, info_name):
        init_file(file_name, folder=folder_name)

    # Build the bigBed file name from the prefix directly: str.replace on
    # bed_name would also rewrite a '.bed' that happens to be part of the prefix.
    bigbed_name = '%s.bb' % prefix

    # The track definition has to be ONE line of space-separated attributes.
    # Joining the attributes explicitly keeps source-code indentation (or stray
    # backslashes from string continuations) out of the written file.
    track_line = ' '.join([
        'track type=bigBed',
        'name="%s"' % prefix,
        'description="bigBed"',
        'visibility=2',
        'itemRgb="On"',
        'bigDataUrl=https://github.com/sunnymouse25/ptes/blob/dev/research/bed/%s?raw=true' % bigbed_name,
    ])

    track_file_lines = [
        'browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac rmsk',
        'browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3',
        'browser pack gtexGene',
        track_line,
    ]
    writeln_to_file('\n'.join(track_file_lines), info_name, folder=folder_name)

    return folder_name, bed_name, coord_name
# NOTE(review): mid-loop fragment. The loop header and the condition guarding the
# leading `continue` lie outside this view, and the original line breaks and
# indentation were lost, so the code below is kept exactly as found.
# Visible per-record work: build both mate intervals with return_mates(), classify
# them with mate_intersection(), count the resulting category in mates_gtag when
# junction_letters == 'GT/AG' and in mates_nc otherwise, keep the raw line when the
# category is 'outside', and count donor/acceptor sites that are present in
# gtf_donors[chrom] / gtf_acceptors[chrom].
# Visible summary (presumably after the loop): log the 'inside', 'outside' and
# 'non-chim' counters plus the annotated donor/acceptor totals, then write the
# collected 'outside' lines to outside_name in path_to_file.
# NOTE(review): read_name is assigned but not used in the visible part - confirm
# that it is still needed.
continue read_name = line_dict['read_name'] mate1, mate2 = return_mates(cigar1=line_dict['cigar1'], coord1=line_dict['coord1'], cigar2=line_dict['cigar2'], coord2=line_dict['coord2'], chain=chain) interval_intersection = mate_intersection(mate1, mate2) if line_dict['junction_letters'] == 'GT/AG': mates_gtag[interval_intersection] += 1 else: mates_nc[interval_intersection] += 1 if interval_intersection == 'outside': outside_list.append(line) if line_dict['donor_ss'] in gtf_donors[chrom]: annot_donors += 1 if line_dict['acceptor_ss'] in gtf_acceptors[chrom]: annot_acceptors += 1 PTES_logger.info('Reading STAR output... done') PTES_logger.info('Inside GT/AG: %i' % mates_gtag['inside']) PTES_logger.info('Inside other: %i' % mates_nc['inside']) PTES_logger.info('Outside GT/AG: %i' % mates_gtag['outside']) PTES_logger.info('Outside other: %i' % mates_nc['outside']) PTES_logger.info('Intron too large GT/AG: %i' % mates_gtag['non-chim']) PTES_logger.info('Intron too large other: %i' % mates_nc['non-chim']) PTES_logger.info('Annot donors: %i' % annot_donors) PTES_logger.info('Annot acceptors: %i' % annot_acceptors) writeln_to_file(''.join(outside_list), outside_name, folder=path_to_file)
# NOTE(review): fragment that starts inside an `if` branch whose condition lies
# outside this view; the original line breaks and indentation were lost, so the
# code below is kept exactly as found.
# Visible flow: the first branch builds three BED track lists (mate1, mate2 and a
# chimeric part read from cigar1/coord1); the `else` branch (single-read mode)
# builds two track lists from the two chimeric parts. Every track list is written
# as a tab-joined line to bed_name, and its fields 1 and 2 (chromStart, chromEnd)
# are collected in windows_min / windows_max. The browser window is the overall
# span widened by 200 on each side; it is written together with the description
# to coord_name and, when i == 0, without the description to info_name. Finally
# to_bigbed(bed_name=bed_name) is called.
# NOTE(review): in the first branch the interval built from cigar1/coord1 is
# stored as chim_part2 but its track is named 'chim_mate1' - confirm the naming.
# NOTE(review): the trailing comment "bed1[2] is chromEnd" on the window line does
# not describe that line (the window uses windows_min/windows_max) - looks stale.
chim_part2 = get_read_interval(cigar1, coord1) bed1 = get_track_list(chrom, chain, mate1, name='mate1', color='r') bed2 = get_track_list(chrom, chain, mate2, name='mate2', color='r') bed3 = get_track_list(chrom, chain, chim_part2, name='chim_mate1', color='r') track_lists = [bed1, bed2, bed3] else: #single-read mode chim_part1 = get_read_interval(cigar1, coord1) # not mates, chimeric parts! chim_part2 = get_read_interval(cigar2, coord2) bed1 = get_track_list(chrom, chain, chim_part1, name='chim_part1', color='r') bed2 = get_track_list(chrom, chain, chim_part2, name='chim_part2', color='b') track_lists = [bed1, bed2] for track_list in track_lists: windows_min.append(int(track_list[1])) # track_list[1] is chromStart, track_list[2] is chromEnd windows_max.append(int(track_list[2])) writeln_to_file('\t'.join(track_list), bed_name, folder = folder_name) window = (chrom, min(windows_min)-200, max(windows_max)+200) # bed1[2] is chromEnd description = "%s; %i read(s) found; chim junction %s" % (desc, num_reads, junction_letters) writeln_to_file('browser position %s:%i-%i\t' % window + description, coord_name, folder = folder_name) if i == 0: writeln_to_file('browser position %s:%i-%i\t' % window, info_name, folder = folder_name) to_bigbed(bed_name=bed_name)
# Root of the ENCODE working tree; every sample has its own subfolder in it.
encode_dir = '/uge_mnt/home/sunnymouse/projects/PTES/ENCODE/'

# Write one executable launcher script (star_<i>.sh) per sample tag in file_str.
for i, name in enumerate(file_str.split()):
    # str.split() without arguments already removes surrounding whitespace,
    # so the tag needs no extra strip(); it is also no longer stored under a
    # name that shadows the builtin id().
    sample_id = name
    folder_name = '%s%s' % (encode_dir, sample_id)  # without trailing slash
    sh_filename = 'star_%i.sh' % i
    init_file(sh_filename)

    # Disabled upstream steps, kept for reference only (they used to live in a
    # bare string literal in the loop body and were never added to the script):
    #   samtools collate -uOn 256 <bam_folder><id>.bam <folder>/tmp-prefix | samtools fastq - > <folder>/<id>.fq
    #   gzip <folder>/<id>.fq
    #   <segemehl_bin>/segemehl.x -S -Z 10 -t 10 -s
    #       -i /uge_mnt/home/sunnymouse/Human_ref/GRCh37.p13.genome.idx
    #       -d /uge_mnt/home/sunnymouse/Human_ref/GRCh37.p13.genome.fa
    #       -q <folder>/<id>.fq -o <folder>/segemehl.sam -u <folder>/segemehl_unmapped
    #   samtools view <folder>/segemehl.sam > <folder>/segemehl.sam.nohead
    #   samtools view -b <folder>/segemehl.sam > <folder>/segemehl.bam
    # The only active step below reads <folder>/segemehl.sam.nohead, i.e. the
    # file the disabled `samtools view` step would have produced.
    cmd_list = [
        '#!/bin/bash -il \ncd %s \n' % encode_dir,
        'source /uge_mnt/home/sunnymouse/tools/miniconda2/bin/activate',
        'python segemehl_encode.py -i %s/segemehl.sam.nohead -o %s -t %s' % (folder_name, folder_name, sample_id),
    ]
    writeln_to_file('\n'.join(cmd_list), sh_filename)
    shell_call('chmod +x ./%s' % sh_filename)
# NOTE(review): the original line breaks were lost; the layout below is the most
# direct reading of the statements. The final loop appears to continue past the
# end of this view (its results are not used yet in the visible part).

# Load the junctions of interest: one tab-separated record per input line.
for line in junctions_file:
    junc_of_interest.append(line.strip().split('\t'))

# Index the genome FASTA given on the command line via SeqIO.index.
print 'Reading genome file...'
genome_file = args.genome
genome = SeqIO.index(genome_file, "fasta")
print 'done'

# Create <path_to_file>/bed/ through the shell when it does not exist yet.
# NOTE(review): the folder name is interpolated into the shell command unquoted.
folder_name = '%s/bed/' % path_to_file
cmd1 = 'if [ ! -d %s ]; then mkdir %s; fi' % (folder_name, folder_name)
shell_call(cmd1)

# Output files: the BED file named by args.output and its companion table.
bed_name = args.output
coord_name = bed_name + '.coords.csv'
init_file(bed_name, folder=folder_name)  # one BED file for all tracks
init_file(coord_name, folder=folder_name)  # read_name - window in genome browser

# The BED file starts with the genome browser display settings.
writeln_to_file('browser full knownGene ensGene cons100way wgEncodeRegMarkH3k27ac', bed_name, folder=folder_name)
writeln_to_file('browser dense refSeqComposite pubs snp150Common wgEncodeRegDnaseClustered wgEncodeRegTfbsClusteredV3', bed_name, folder=folder_name)
writeln_to_file('browser pack gtexGene', bed_name, folder=folder_name)

# Per junction record: unpack its fields and look up the alignment it refers to.
for line_list in junc_of_interest:
    key = line_list[0]  # key is mapped read_name, xi - number of current read alignment
    # NOTE(review): xi stays a string (it comes from split) and is used below as a
    # lookup key - confirm read_infos / read_intervals are keyed by strings.
    xi = line_list[1]
    n_junctions = line_list[2]
    annot_donors = line_list[3]
    annot_acceptors = line_list[4]
    chrom = read_infos[key][xi][0]
    chain = read_infos[key][xi][1]
    tuples = read_intervals[key][xi]  # list of intervals of current read alignment
    tuples = sorted(tuples, key=lambda x:x[0])  # sort by xq
    values = [i[1] for i in tuples]  # get rid of xq
    # Rebuild each entry as interval[start, end] from the first pair it holds.
    values = [interval[x[0][0], x[0][1]] for x in values]
def _map_dict_to_df(map_dict, correct):
    """
    Turn one mapping dict into a DataFrame indexed by the dict keys, with the
    columns 'real' and 'mapped' plus a constant 'correct' flag.

    :param map_dict: dict whose values have exactly two fields (real, mapped)
    :param correct: value stored in the 'correct' column (True or False)
    :return: DataFrame with columns ['real', 'mapped', 'correct']
    """
    frame = pd.DataFrame.from_dict(map_dict, orient='index')
    if frame.shape[1] == 0:
        # An empty dict gives a frame without columns; assigning two column
        # names to it raises ValueError, so build the empty frame explicitly.
        frame = pd.DataFrame(columns=['real', 'mapped'])
    else:
        frame.columns = ['real', 'mapped']
    frame['correct'] = correct
    return frame


# Split the mapped reads into correctly and incorrectly mapped ones and put
# both groups into one table, which is also saved as all_map_df.csv.
true_map_dict, false_map_dict = map_dicts(real_junc_df=junc_df, mapped_junc_df=mapped_junc_df)
false_map_df = _map_dict_to_df(false_map_dict, correct=False)
true_map_df = _map_dict_to_df(true_map_dict, correct=True)
all_map_df = pd.concat([false_map_df, true_map_df])
all_map_df.to_csv('%s/all_map_df.csv' % path_to_file, sep='\t')

# Attach the mapping result to every junction row; all_map_df is indexed by
# the values matched against junc_df's 'read_name' column.
info_df = pd.merge(junc_df, all_map_df, left_on='read_name', right_index=True).reset_index(drop=True)
del info_df['real']

# Number of distinct reads per (n_junctions, mapped, correct) combination.
# Selecting the column and calling nunique() gives the same counts as applying
# a per-group lambda, and still returns a Series when there are no rows.
x = (info_df
     .groupby(['n_junctions', 'mapped', 'correct'])['read_name']
     .nunique()
     .reset_index(name='counts'))

# Cross-tabulate the counts: rows (n_junctions, correct), columns mapped, with
# totals. 'sum' is the named form of the aggregation the builtin sum was mapped to.
y = pd.pivot_table(x,
                   index=['n_junctions', 'correct'],
                   columns=['mapped'],
                   values=['counts'],
                   fill_value=0,
                   aggfunc='sum',
                   margins=True)
y.to_csv('%s/pivot_table.csv' % path_to_file, sep='\t')

# Save the same pivot table as HTML.
pivot_html_name = 'pivot_table.html'
init_file(pivot_html_name, folder=path_to_file)
writeln_to_file(y.to_html(), pivot_html_name, folder=path_to_file)
PTES_logger.info('Comparing junctions table with real junctions... done')