Example #1
def numberGenesDetectedCufflinks(infile, outfile):
    '''Count the number of genes detected at copy number > 0 in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct c.*, gene_biotype from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id=i.gene_id
                ''' % locals()

    df = DB.fetch_DataFrame(statement, DATABASE, attach)

    # snip off the cufflinks replicate field
    df.columns = [x[:-len("_0")] if x.endswith("_0") else x
                  for x in df.columns]

    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    grouped_df = melted_df.groupby(["gene_biotype", "variable"])

    agg_df = grouped_df.agg({"value": lambda x:
                             np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="variable",
                              values="value", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
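The melt -> groupby -> pivot pattern above is worth seeing in isolation. Below is a minimal, self-contained sketch of the same counting logic on a toy DataFrame; the sample and gene names are illustrative, not taken from the pipeline's database.

import numpy as np
import pandas as pd

# toy expression table: one row per gene, one column per sample (illustrative data)
df = pd.DataFrame({
    "tracking_id": ["g1", "g2", "g3"],
    "gene_biotype": ["protein_coding", "protein_coding", "lincRNA"],
    "sampleA": [5.0, 0.0, 2.0],
    "sampleB": [0.0, 0.0, 1.0],
})

# long format: one row per (gene, sample) observation
melted = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

# count genes with value > 0 for each biotype/sample combination
counts = (melted.groupby(["gene_biotype", "variable"])
          .agg({"value": lambda x: np.sum(x > 0)})
          .reset_index())

# wide table: one row per sample, one column per biotype
table = pd.pivot_table(counts, index="variable",
                       values="value", columns="gene_biotype")
table["total"] = table.sum(axis=1)
print(table)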
Example #2
def numberGenesDetectedFeatureCounts(infile, outfile):
    '''Count the number of genes detected by featureCounts at counts > 0 in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()
    statement = '''select distinct h.*, gene_biotype from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id=i.gene_id
               ''' % locals()

    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)

    grouped_df = melted_df.groupby(["gene_biotype", "track"])

    agg_df = grouped_df.agg({"counts": lambda x:
                             np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="track",
                              values="counts", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
Example #3
def numberGenesDetectedFeatureCounts(infile, outfile):
    '''Count the number of genes detected by featureCounts at counts > 0 in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()
    statement = '''select distinct h.*, gene_biotype from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id=i.gene_id
               ''' % locals()

    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)

    grouped_df = melted_df.groupby(["gene_biotype", "track"])

    agg_df = grouped_df.agg(
        {"counts": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df,
                              index="track",
                              values="counts",
                              columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
Example #4
def numberGenesDetectedCufflinks(infile, outfile):
    '''Count the number of genes detected at copy number > 0 in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct c.*, gene_biotype from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id=i.gene_id
                ''' % locals()

    df = DB.fetch_DataFrame(statement, DATABASE, attach)

    # snip off the cufflinks replicate field
    df.columns = [
        x[:-len("_0")] if x.endswith("_0") else x for x in df.columns
    ]

    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    grouped_df = melted_df.groupby(["gene_biotype", "variable"])

    agg_df = grouped_df.agg(
        {"value": lambda x: np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df,
                              index="variable",
                              values="value",
                              columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
Example #5
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])

    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name'])

    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve the maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s don't know which column"
                         " to sort on" % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
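The if/elif block above assumes the peak files follow the ENCODE narrowPeak column order, where column 7 is signalValue and columns 8 and 9 are the -log10 p-value and q-value. As a sketch, the same mapping could be expressed with a lookup table (an alternative formulation, not the pipeline's code):

# assumes ENCODE narrowPeak columns: 7 = signalValue, 8 = p-value, 9 = q-value
SORT_KEYS = {
    "signal.value": "sort -k7nr,7nr",
    "p.value": "sort -k8nr,8nr",
    "q.value": "sort -k9nr,9nr",
}

measure = PARAMS["idr_options_ranking_measure"]
try:
    sort_statement = SORT_KEYS[measure]
except KeyError:
    raise ValueError("Unrecognised ranking_measure %s: don't know"
                     " which column to sort on" % measure)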
Example #6
def qcSummary(infiles, outfile):
    '''create a summary table of relevant QC metrics'''

    # Some QC metrics are specific to paired-end data
    if PAIRED:
        exclude = []
        paired_columns = '''READ_PAIRS_EXAMINED as no_pairs,
                              PERCENT_DUPLICATION as pct_duplication,
                              ESTIMATED_LIBRARY_SIZE as library_size,
                              PCT_READS_ALIGNED_IN_PAIRS
                                       as pct_reads_aligned_in_pairs,
                              MEDIAN_INSERT_SIZE
                                       as median_insert_size,
                           '''
        pcat = "PAIR"

    else:
        exclude = ["qc_library_complexity", "qc_insert_size_metrics"]
        paired_columns = ''
        pcat = "UNPAIRED"

    tables = [P.toTable(x) for x in infiles
              if P.toTable(x) not in exclude]

    t1 = tables[0]

    name_fields = PARAMS["name_field_titles"].strip()

    stat_start = '''select distinct %(name_fields)s,
                                    sample_information.sample_id,
                                    fraction_spliced,
                                    fraction_spike,
                                    qc_no_genes_cufflinks.protein_coding
                                       as cufflinks_no_genes_pc,
                                    qc_no_genes_cufflinks.total
                                       as cufflinks_no_genes,
                                    qc_no_genes_featurecounts.protein_coding
                                       as featurecounts_no_genes_pc,
                                    qc_no_genes_featurecounts.total
                                       as featurecounts_no_genes,
                                    three_prime_bias
                                       as three_prime_bias,
                                    nreads_uniq_map_genome,
                                    nreads_uniq_map_spike,
                                    %(paired_columns)s
                                    PCT_MRNA_BASES
                                       as pct_mrna,
                                    PCT_CODING_BASES
                                       as pct_coding,
                                    PCT_PF_READS_ALIGNED
                                       as pct_reads_aligned,
                                    TOTAL_READS
                                       as total_reads,
                                    PCT_ADAPTER
                                       as pct_adapter,
                                    PF_HQ_ALIGNED_READS*1.0/PF_READS
                                       as pct_pf_reads_aligned_hq
                   from %(t1)s
                ''' % locals()

    join_stat = ""
    for table in tables[1:]:
        join_stat += "left join " + table + "\n"
        join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

    where_stat = '''where qc_alignment_summary_metrics.CATEGORY="%(pcat)s"
                 ''' % locals()

    statement = "\n".join([stat_start, join_stat, where_stat])

    df = DB.fetch_DataFrame(statement, PARAMS["database_name"])
    df.to_csv(outfile, sep="\t", index=False)
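The loop above emits one LEFT JOIN clause per additional table, all keyed on sample_id. For a hypothetical three-table list (the table names here are illustrative), the generated join_stat looks like this:

# illustrative table names only; the real list comes from P.toTable(infiles)
tables = ["qc_alignment_summary_metrics", "qc_rna_metrics", "qc_no_genes_cufflinks"]
t1 = tables[0]

join_stat = ""
for table in tables[1:]:
    join_stat += "left join " + table + "\n"
    join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

print(join_stat)
# left join qc_rna_metrics
# on qc_alignment_summary_metrics.sample_id=qc_rna_metrics.sample_id
# left join qc_no_genes_cufflinks
# on qc_alignment_summary_metrics.sample_id=qc_no_genes_cufflinks.sample_id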
Example #7
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement)
    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement)

    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve the maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s: don't know which column"
                         " to sort on" % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
Example #8
def qcSummary(infiles, outfile):
    '''create a summary table of relevant QC metrics'''

    # Some QC metrics are specific to paired-end data
    if PAIRED:
        exclude = []
        paired_columns = '''READ_PAIRS_EXAMINED as no_pairs,
                              PERCENT_DUPLICATION as pct_duplication,
                              ESTIMATED_LIBRARY_SIZE as library_size,
                              PCT_READS_ALIGNED_IN_PAIRS
                                       as pct_reads_aligned_in_pairs,
                              MEDIAN_INSERT_SIZE
                                       as median_insert_size,
                           '''
        pcat = "PAIR"

    else:
        exclude = ["qc_library_complexity", "qc_insert_size_metrics"]
        paired_columns = ''
        pcat = "UNPAIRED"

    tables = [P.toTable(x) for x in infiles if P.toTable(x) not in exclude]

    t1 = tables[0]

    name_fields = PARAMS["name_field_titles"].strip()

    stat_start = '''select distinct %(name_fields)s,
                                    sample_information.sample_id,
                                    fraction_spliced,
                                    fraction_spike,
                                    qc_no_genes_cufflinks.protein_coding
                                       as cufflinks_no_genes_pc,
                                    qc_no_genes_cufflinks.total
                                       as cufflinks_no_genes,
                                    qc_no_genes_featurecounts.protein_coding
                                       as featurecounts_no_genes_pc,
                                    qc_no_genes_featurecounts.total
                                       as featurecounts_no_genes,
                                    three_prime_bias
                                       as three_prime_bias,
                                    nreads_uniq_map_genome,
                                    nreads_uniq_map_spike,
                                    %(paired_columns)s
                                    PCT_MRNA_BASES
                                       as pct_mrna,
                                    PCT_CODING_BASES
                                       as pct_coding,
                                    PCT_PF_READS_ALIGNED
                                       as pct_reads_aligned,
                                    TOTAL_READS
                                       as total_reads,
                                    PCT_ADAPTER
                                       as pct_adapter,
                                    PF_HQ_ALIGNED_READS*1.0/PF_READS
                                       as pct_pf_reads_aligned_hq
                   from %(t1)s
                ''' % locals()

    join_stat = ""
    for table in tables[1:]:
        join_stat += "left join " + table + "\n"
        join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

    where_stat = '''where qc_alignment_summary_metrics.CATEGORY="%(pcat)s"
                 ''' % locals()

    statement = "\n".join([stat_start, join_stat, where_stat])

    df = DB.fetch_DataFrame(statement, PARAMS["database_name"])
    df.to_csv(outfile, sep="\t", index=False)
Example #9
def defineTads(infile, outfile):
    '''
    A motif is "forward" if present on the + strand and "reverse" if present
    on the - strand. Insulator pairs (from the intersection) can therefore
    have motifs in the following orientations:
    1) convergent (F, R)
    2) divergent (R, F)
    3) same direction, + strand (F, F)
    4) same direction, - strand (R, R)

    Intervals generated from peak intersections with motifs in convergent
    orientation will represent TADs (or sub-TADs...).
    '''

    db = PARAMS["database"]
    npeaks = PARAMS["tads_npeaks"]
    pwidth = PARAMS["tads_pwidth"]
    tmp_dir = "$SCRATCH_DIR"

    # fetch insulator peaks with fimo motifs
    table = "insulators_" + '_'.join([str(npeaks), str(pwidth)
                                      ]) + "_fimo_table"
    statement = '''select * from %(table)s''' % locals()
    motifs = DB.fetch_DataFrame(statement, db)

    # get most significant motif for each peak
    motifs = (motifs.sort_values(["sequence_name", "q_value"], axis=0)
              .drop_duplicates(subset="sequence_name", keep="first"))

    motifs.to_csv("insulators_fimoMotifs.txt", sep="\t",
                  header=True)  # save peaks w/ annotated motifs as df
    upload2csvdb(motifs, "insulators_fimoMotifs", db)  # upload to csvdb

    # get peaks (bed format) corresponding to fimo motifs
    statement = '''select b.contig, b.start, b.end, a.sequence_name, b.peak_score, a.strand, a.q_value
                   from insulators_fimoMotifs a inner join insulators b on a.sequence_name = b.peak_id'''
    motif_bed = DB.fetch_DataFrame(statement, db)
    motif_bed = (motif_bed.sort_values(["sequence_name", "q_value"], axis=0)
                 .drop_duplicates(subset="sequence_name", keep="first"))
    motif_bed.to_csv("motif_bed.txt", sep="\t", header=True, index=False)

    # merge peaks
    # iterate over a range of distances (1 Mb - 1 kb) within which insulator peaks are merged
    ### descending order of distances -> favours bigger TADs, and joins up remaining intervals down to a min. size of 1 kb
    # merged insulators are selected with awk by "," in $6 (strand col) ***limited to merges of two peaks in F,R orientation with ($6=="\+,-")
    # merged insulators are written to a tmp file (tmp + str(counter))
    # for each successive merge after n=1, peaks from the previous merge are subtracted from the results with bedtools (-A flag removes entire intervals)
    ### a few of the later TADs are very large and are merged over previous ones; how to correct for this? merge the final file (only overlapping TADs?)

    # n = 0

    # distances = range(0, 10100000, 10000) # 10mb to 1kb, 10kb decreases
    # distances = distances[::-1] # invert list -> descending order

    # for dist in distances:
    #     n = n +1
    #     tmp = "tmp" + str(n)

    #     if n == 1:
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                          sort -k1,1 -k2,2n - > $tmp; checkpoint;
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i $tmp |
    #                          awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > %(tmp)s''' % locals()

    #     elif n > 1 and n < len(distances):
    #         merge = tmp.replace(str(n), str(n-1))
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                          sort -k1,1 -k2,2n - |
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i - |
    #                          awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > $tmp; checkpoint;
    #                        subtractBed -A -a $tmp -b %(merge)s > %(tmp)s''' % locals()

    #     elif n == len(distances):
    #         merge = tmp.replace(str(n), str(n-1))
    #         statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
    #                        tail -n+2 motif_bed.txt |
    #                          sort -k1,1 -k2,2n - |
    #                        mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i - |
    #                          awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > $tmp; checkpoint;
    #                        subtractBed -A -a $tmp -b %(merge)s > %(tmp)s; checkpoint;
    #                        awk 'BEGIN {OFS="\\t"} {if ($3-$2 > 1000) print $0}' <(cat tmp*) |
    #                          sort -k1,1 -k2,2n - |
    #                          mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -i - > %(outfile)s; checkpoint;
    #                        rm tmp*''' % locals()

    ### Instead of merging peaks with F/R motif orientation I could separate insulator peaks into F & R files,
    ### then use bedtools closest to intersect peaks up to a max distance of n & remove peaks with divergent motifs.

    # Ensure "closest" matches are on the same chromosome with awk, also remove features > 1mb wide

    statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint;
                   Fstrand=`mktemp -p %(tmp_dir)s`; checkpoint;
                   Rstrand=`mktemp -p %(tmp_dir)s`; checkpoint;
                   awk 'BEGIN {OFS="\\t"} {if ($6 == "+") print $0}' <(tail -n+2 motif_bed.txt | sort -k1,1 -k2,2n ) > $Fstrand; checkpoint;
                   awk 'BEGIN {OFS="\\t"} {if ($6 == "-") print $0}' <(tail -n+2 motif_bed.txt | sort -k1,1 -k2,2n ) > $Rstrand; checkpoint;
                   ~/devel/GIT/bedtools2/bin/closestBed -iu -D ref 
                         -a $Fstrand 
                         -b $Rstrand
                         > $tmp; checkpoint; 
                   awk 'BEGIN {OFS="\\t"} {if ($1 == $8 && $9-$2 < 1000000) print $1,$2,$9,$4"/"$11,($5+$12)/2,$6","$13,($7+$14)/2}' $tmp > %(outfile)s '''

    ### This works better!

    # Need to incorporate CTCF & cohesin coverage over candidate insulators, then filter out insulator pairs (candidate TADs) with large discrepancies in ChIP signal
    # Czimmerer et al. use a cut-off of > 2-fold difference between the start & end peaks of TADs in ChIP signal
    # Add ChIP coverage code for insulator peaks & save to db, then incorporate CTCF & cohesin signal into the awk filter at the end of this statement

    print(statement)
    P.run()
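The closestBed step pairs each forward-strand motif with the nearest downstream reverse-strand motif on the same contig, within 1 Mb. A rough pandas analogue of that pairing, on toy data (the positions and column names here are illustrative, not the pipeline's schema), would be:

import pandas as pd

# toy motif table (illustrative positions)
motifs = pd.DataFrame({
    "contig": ["chr1", "chr1", "chr1", "chr1"],
    "start":  [100, 500, 2000, 5000],
    "strand": ["+", "-", "+", "-"],
})

fwd = motifs[motifs.strand == "+"].sort_values("start")
rev = (motifs[motifs.strand == "-"]
       .rename(columns={"start": "start_rev"})
       .sort_values("start_rev"))

# for each forward motif, the nearest downstream reverse motif on the same
# contig within 1 Mb -- analogous to `closestBed -iu -D ref` above
pairs = pd.merge_asof(fwd, rev, left_on="start", right_on="start_rev",
                      by="contig", direction="forward",
                      tolerance=1000000, suffixes=("_fwd", "_rev"))
print(pairs)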