def make_T_top_regions(signal_table_path, top=1000):
    '''
    Write a gff and a fasta of the top N T regions ranked by signal.

    The signal table is read as tab-delimited text with its first row
    skipped as a header. For every remaining row the signal is the mean of
    two background-subtracted values, each floored at zero:

        (max(col2 - col3, 0) + max(col4 - col5, 0)) / 2

    NOTE(review): columns 2/3 and 4/5 look like signal/background pairs for
    two replicates -- confirm against whatever produces the table.

    Region IDs (column 1) are assumed to look like 'chrom(strand):start-end';
    the chromosome is taken as the text before the first '(' and the
    coordinates as the '-'-separated text after the last ':' -- TODO confirm
    the ID format with the table producer.

    Args:
        signal_table_path: path to the tab-delimited signal table.
        top: number of highest-signal regions to keep. If the table holds
            fewer data rows than this, all rows are used instead of failing
            with an IndexError. The output file names still carry the
            requested value of `top`.

    Returns:
        Path of the fasta file that was written.

    Relies on the module-level names `utils`, `gffFolder`, `genomeDirectory`
    and `fastaFolder`.
    '''
    signal_table = utils.parseTable(signal_table_path, '\t')
    data_rows = signal_table[1:]  # row 0 is skipped as the header

    # Keyed by region ID, so if an ID occurs more than once the last
    # occurrence's signal is used for every row carrying that ID.
    signal_dict = {}
    for line in data_rows:
        first_pair = max(float(line[2]) - float(line[3]), 0)
        second_pair = max(float(line[4]) - float(line[5]), 0)
        signal_dict[line[1]] = (first_pair + second_pair) / 2

    signal_vector = [signal_dict[line[1]] for line in data_rows]
    signal_order = utils.order(signal_vector, decreasing=True)

    t_top_gff_path = '%sCH22_T_UNION_TOP_%s_-0_+0.gff' % (gffFolder, str(top))
    print(t_top_gff_path)

    # Never ask for more regions than the table actually provides.
    n_regions = min(top, len(signal_vector))
    if n_regions < top:
        print('Only %s regions available; requested top %s'
              % (len(signal_vector), top))

    t_top_gff = []
    for rank in range(n_regions):
        # signal_order indexes the data rows; +1 skips the header row.
        signal_row = signal_order[rank] + 1
        region_id = signal_table[signal_row][1]
        chrom = region_id.split('(')[0]
        coords = region_id.split(':')[-1].split('-')
        t_top_gff.append([
            chrom,
            region_id,
            '',
            coords[0],
            coords[1],
            '',
            '.',
            '',
            region_id,
        ])

    utils.unParseTable(t_top_gff, t_top_gff_path, '\t')

    t_top_fasta = utils.gffToFasta('HG19', genomeDirectory, t_top_gff)
    t_top_fasta_path = '%sHG19_CH22_T_UNION_TOP_%s_-0_+0.fasta' % (fastaFolder, top)
    utils.unParseTable(t_top_fasta, t_top_fasta_path, '')

    return t_top_fasta_path
import sys sys.path.append('/storage/cylin/bin/pipeline/') import utils import re gff_path = '/storage/cylin/grail/projects/rasmc_all/gff/rasmc_h3k27ac_0_tss_all_subpeak.gff' genome_directory='/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/' genome = 'RN6' print('gffToFasta Tool running on ' + gff_path + ' for ' + genome) fasta = utils.gffToFasta(genome,genome_directory,gff_path,UCSC=True,useID=False) print('Creating density table') table=[] header=['DENSITY','POSITIONS','POS_COUNT','SUBPEAK_LENGTH'] table.append(header) #CArG box motif seq='CC[AT]{6}GG' table_path='/storage/cylin/grail/projects/rasmc_all/motif_density/CArG_box_seq_density_from_fasta_full_length_no_slash.txt' for i in range(0,len(fasta),2): positions=[] line=fasta[i+1]