def numberGenesDetectedCufflinks(infile, outfile):
    '''Count the number of genes detected at copy number > 0 in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct c.*, gene_biotype
                   from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id=i.gene_id
                ''' % locals()

    df = DB.fetch_DataFrame(statement, DATABASE, attach)

    # snip off the cufflinks replicate field
    df.columns = [x[:-len("_0")] if x.endswith("_0") else x
                  for x in df.columns]

    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    grouped_df = melted_df.groupby(["gene_biotype", "variable"])
    agg_df = grouped_df.agg(
        {"value": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="variable",
                              values="value", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")


def numberGenesDetectedFeatureCounts(infile, outfile):
    '''Count the number of genes detected by featureCounts at counts > 0
    in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct h.*, gene_biotype
                   from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id=i.gene_id
                ''' % locals()

    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)

    grouped_df = melted_df.groupby(["gene_biotype", "track"])
    agg_df = grouped_df.agg(
        {"counts": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="track",
                              values="counts", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")


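# The helper below is a minimal, illustrative sketch (not called anywhere in
# the pipeline) of the melt -> groupby -> pivot pattern used by the two
# numberGenesDetected* tasks above, applied to a toy in-memory table. The
# sample and biotype names are made up purely for illustration.
def _exampleGeneDetectionCounts():
    '''Toy example: count genes with expression > 0 per sample and biotype.'''
    import numpy as np
    import pandas as pd

    # toy stand-in for a cufflinks/featureCounts expression table
    df = pd.DataFrame({
        "tracking_id": ["g1", "g2", "g3"],
        "gene_biotype": ["protein_coding", "protein_coding", "lincRNA"],
        "sampleA": [5.0, 0.0, 2.0],
        "sampleB": [1.0, 3.0, 0.0]})

    # one row per (gene, sample) observation
    melted = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    # count detected genes (value > 0) per biotype and sample
    agg = melted.groupby(["gene_biotype", "variable"]).agg(
        {"value": lambda x: np.sum([1 for y in x if y > 0])})
    agg.reset_index(inplace=True)

    # samples as rows, biotypes as columns
    return pd.pivot_table(agg, index="variable",
                          values="value", columns="gene_biotype")

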
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])
    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])
    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain the experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve the maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s, don't know which column"
                         " to sort on" % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()


def qcSummary(infiles, outfile):
    '''Create a summary table of relevant QC metrics'''

    # Some QC metrics are specific to paired-end data
    if PAIRED:
        exclude = []
        paired_columns = '''READ_PAIRS_EXAMINED as no_pairs,
                            PERCENT_DUPLICATION as pct_duplication,
                            ESTIMATED_LIBRARY_SIZE as library_size,
                            PCT_READS_ALIGNED_IN_PAIRS as pct_reads_aligned_in_pairs,
                            MEDIAN_INSERT_SIZE as median_insert_size,
                         '''
        pcat = "PAIR"
    else:
        exclude = ["qc_library_complexity", "qc_insert_size_metrics"]
        paired_columns = ''
        pcat = "UNPAIRED"

    tables = [P.toTable(x) for x in infiles if P.toTable(x) not in exclude]

    t1 = tables[0]

    name_fields = PARAMS["name_field_titles"].strip()

    stat_start = '''select distinct %(name_fields)s,
                           sample_information.sample_id,
                           fraction_spliced,
                           fraction_spike,
                           qc_no_genes_cufflinks.protein_coding as cufflinks_no_genes_pc,
                           qc_no_genes_cufflinks.total as cufflinks_no_genes,
                           qc_no_genes_featurecounts.protein_coding as featurecounts_no_genes_pc,
                           qc_no_genes_featurecounts.total as featurecounts_no_genes,
                           three_prime_bias as three_prime_bias,
                           nreads_uniq_map_genome,
                           nreads_uniq_map_spike,
                           %(paired_columns)s
                           PCT_MRNA_BASES as pct_mrna,
                           PCT_CODING_BASES as pct_coding,
                           PCT_PF_READS_ALIGNED as pct_reads_aligned,
                           TOTAL_READS as total_reads,
                           PCT_ADAPTER as pct_adapter,
                           PF_HQ_ALIGNED_READS*1.0/PF_READS as pct_pf_reads_aligned_hq
                    from %(t1)s
                 ''' % locals()

    join_stat = ""
    for table in tables[1:]:
        join_stat += "left join " + table + "\n"
        join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

    where_stat = '''where qc_alignment_summary_metrics.CATEGORY="%(pcat)s"
                 ''' % locals()

    statement = "\n".join([stat_start, join_stat, where_stat])

    df = DB.fetch_DataFrame(statement, PARAMS["database_name"])
    df.to_csv(outfile, sep="\t", index=False)


def defineTads(infile, outfile):
    '''
    A motif is "forward" if present on the + strand, and "reverse" if present
    on the - strand. Insulator pairs (from the intersection) can therefore
    have motifs in the following orientations:
        1) convergent (F, R)
        2) divergent (R, F)
        3) same direction, + strand (F, F)
        4) same direction, - strand (R, R)
    Intervals generated from peak intersections with motifs in convergent
    orientation will represent TADs (or subTADs...)
    '''

    db = PARAMS["database"]
    npeaks = PARAMS["tads_npeaks"]
    pwidth = PARAMS["tads_pwidth"]
    tmp_dir = "$SCRATCH_DIR"

    # fetch insulator peaks with fimo motifs
    table = "insulators_" + '_'.join([str(npeaks), str(pwidth)]) + "_fimo_table"
    statement = '''select * from %(table)s''' % locals()
    motifs = DB.fetch_DataFrame(statement, db)

    # get the most significant motif for each peak
    motifs = motifs.sort_values(["sequence_name", "q_value"], 0).drop_duplicates(
        subset="sequence_name", keep="first")
    motifs.to_csv("insulators_fimoMotifs.txt", sep="\t", header=True)

    # save peaks w/ annotated motifs as df
    upload2csvdb(motifs, "insulators_fimoMotifs", db)  # upload to csvdb

    # get peaks (bed format) corresponding to fimo motifs
    statement = '''select b.contig, b.start, b.end, a.sequence_name,
                          b.peak_score, a.strand, a.q_value
                   from insulators_fimoMotifs a
                   inner join insulators b
                   on a.sequence_name = b.peak_id'''
    motif_bed = DB.fetch_DataFrame(statement, db)
    motif_bed = motif_bed.sort_values(["sequence_name", "q_value"], 0).drop_duplicates(
        subset="sequence_name", keep="first")
    motif_bed.to_csv("motif_bed.txt", sep="\t", header=True, index=False)

    # Merge peaks:
    # - iterate over a range of distances (1mb - 1kb) within which insulator
    #   peaks are merged
    # - descending order of distances -> favours bigger TADs, and joins up the
    #   remaining intervals up to a min. size of 1kb
    # - merged insulators are selected for with awk: "," in $6 (strand col),
    #   limited to merges of two peaks with F,R orientation ($6 == "\+,-")
    # - merged insulators are written to a tmp file (tmp + str(counter))
    # - for each successive merge after n=1, peaks from the previous merge are
    #   subtracted from the results with bedtools (the -A flag removes entire
    #   intervals)
    # - a few of the later TADs are very large and are merged over previous
    #   ones; how to correct for this? Merge the final file (only overlapping
    #   TADs?)
    # n = 0
    # distances = range(0, 10100000, 10000)  # 10mb to 1kb, 10kb decreases
    # distances = distances[::-1]  # invert list -> descending order

    # for dist in distances:
    #     n = n + 1
    #     tmp = "tmp" + str(n)

    #     if n == 1:
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                        sort -k1,1 -k2,2n - > $tmp; checkpoint;
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i $tmp |
    #                        awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > %(tmp)s''' % locals()

    #     elif n > 1 and n < len(distances):
    #         merge = tmp.replace(str(n), str(n-1))
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                        sort -k1,1 -k2,2n - |
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i - |
    #                        awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > $tmp; checkpoint;
    #                        subtractBed -A -a $tmp -b %(merge)s > %(tmp)s''' % locals()

    #     elif n == len(distances):
    #         merge = tmp.replace(str(n), str(n-1))
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                        sort -k1,1 -k2,2n - |
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i - |
    #                        awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > $tmp; checkpoint;
    #                        subtractBed -A -a $tmp -b %(merge)s > %(tmp)s; checkpoint;
    #                        awk 'BEGIN {OFS="\\t"} {if ($3-$2 > 1000) print $0}' <(cat tmp*) |
    #                        sort -k1,1 -k2,2n - |
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -i - > %(outfile)s; checkpoint;
    #                        rm tmp*''' % locals()

    # Instead of merging peaks with F/R motif orientation, I could separate
    # insulator peaks into F & R files, then use bedtools closest to intersect
    # peaks up to a max distance of n & remove peaks with divergent motifs.

    # Ensure "closest" matches are on the same chromosome with awk, and also
    # remove features > 1mb wide
    statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
                   Fstrand=`mktemp -p %(tmp_dir)s`; checkpoint;
                   Rstrand=`mktemp -p %(tmp_dir)s`; checkpoint;
                   awk 'BEGIN {OFS="\\t"} {if ($6 == "+") print $0}' <(tail -n+2 motif_bed.txt | sort -k1,1 -k2,2n ) > $Fstrand; checkpoint;
                   awk 'BEGIN {OFS="\\t"} {if ($6 == "-") print $0}' <(tail -n+2 motif_bed.txt | sort -k1,1 -k2,2n ) > $Rstrand; checkpoint;
                   ~/devel/GIT/bedtools2/bin/closestBed -iu -D ref -a $Fstrand -b $Rstrand > $tmp; checkpoint;
                   awk 'BEGIN {OFS="\\t"} {if ($1 == $8 && $9-$2 < 1000000) print $1,$2,$9,$4"/"$11,($5+$12)/2,$6","$13,($7+$14)/2}' $tmp > %(outfile)s
                '''

    # This works better!

    # Need to incorporate CTCF & cohesin coverage over candidate insulators,
    # then filter out insulator pairs (candidate TADs) with large
    # discrepancies in ChIP signal. Czimmerer et al. use a cut-off of > 2-fold
    # difference between the start & end peaks of TADs in ChIP signal.
    # Add ChIP coverage code for insulator peaks & save to db, then
    # incorporate the CTCF & cohesin signal into the awk filter at the end of
    # this statement.

    print statement

    P.run()
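

# A possible sketch for the TODO above: filtering candidate TADs by the
# agreement of ChIP signal at their two boundary insulator peaks, following
# the Czimmerer et al. > 2-fold cut-off mentioned in the comments. This is an
# illustrative assumption, not pipeline code: the "start_signal" and
# "end_signal" columns are hypothetical placeholders for per-peak CTCF/cohesin
# coverage that would first need to be computed and loaded into the database.
def _filterTadsBySignalRatio(tads_df, max_fold_change=2.0):
    '''Drop candidate TADs whose boundary peaks differ by more than
    max_fold_change in ChIP signal (hypothetical "start_signal" and
    "end_signal" columns).'''
    import numpy as np

    # fold change between the two boundary peaks, expressed as >= 1
    ratio = tads_df["start_signal"] / tads_df["end_signal"]
    fold_change = np.maximum(ratio, 1.0 / ratio)

    # keep only candidate TADs with concordant boundary signal
    return tads_df[fold_change <= max_fold_change]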