Code Example #1
File: server.py Project: njuwkf/hadoop-monitor
def output_seq(name, seq, out, t=False, first=False):
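    # Emit one HTML table row: writes v[0] of each (time, value) pair when
    # first=True (the header/time row) and v[1] otherwise; t=True formats
    # the chosen element with time_to_string instead of str().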
    out.write('<tr>')
    out.write('<td>%s</td>\n' % name)
    for v in seq:
        if first:
            out.write('    <td>%s</td>\n' %
                      (time_to_string(v[0]) if t else str(v[0])))
        else:
            out.write('    <td>%s</td>\n' %
                      (time_to_string(v[1]) if t else str(v[1])))
    out.write('</tr>')
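Every snippet in this listing calls a time_to_string helper that the extracts never show (each project ships its own). A minimal sketch for the hadoop-monitor usage, assuming a Unix-timestamp input and an optional flag that adds the date portion; the signature mirrors the calls above, but the body is a guess:

import time

def time_to_string(ts, with_date=False):
    # Hypothetical helper: the listing omits the real implementation. The
    # optional flag mirrors two-argument calls like time_to_string(job.time, True).
    fmt = '%Y-%m-%d %H:%M:%S' if with_date else '%H:%M:%S'
    return time.strftime(fmt, time.localtime(ts))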
Code Example #2
File: server.py Project: njuwkf/hadoop-monitor
 def GET(self, attempt_id):
     jobid = get_jobid_by_attempid(attempt_id)
     job = all.get(jobid)
     if not job:
         return web.notfound()
     attempt = job.attempts.get(attempt_id)
     if not attempt:
         return web.notfound()
     exes = attempt.exes
     fig = plt.figure(figsize=(15, 10))
     # Memory (RSS) chart
     ax = fig.add_subplot(211)
     for exe in exes.itervalues():
         pn = "%s(%s)" % (exe.exe, exe.id)
         ax.plot([time_to_string(x[0]) for x in exe.seqs['r'].vs],
                 [x[1] for x in exe.seqs['r'].vs],
                 'm.-',
                 label=pn + ' RSS',
                 linewidth=1)
     # Set the y-axis range
     plt.ylim(0, 1024)
     # Point the axis ticks inward
     plt.rcParams['xtick.direction'] = 'in'
     plt.rcParams['ytick.direction'] = 'in'
     # Label the axes (time runs along x, RSS along y)
     plt.xlabel('Time')
     plt.ylabel('RSS(MB)')
     x_major_locator = MultipleLocator(10)
     ax.xaxis.set_major_locator(x_major_locator)
     ax.legend()  # Add the legend
     # CPU chart
     ax2 = fig.add_subplot(212)
     for exe in exes.itervalues():
         pn = "%s(%s)" % (exe.exe, exe.id)
         ax2.plot([time_to_string(x[0]) for x in exe.seqs['c'].vs],
                  [x[1] for x in exe.seqs['c'].vs],
                  'm.-',
                  label=pn + ' CPU',
                  linewidth=1)
     plt.ylim(-5, 150)
     plt.rcParams['xtick.direction'] = 'in'
     plt.rcParams['ytick.direction'] = 'in'
     # Label the axes (time runs along x, CPU along y)
     plt.xlabel('Time')
     plt.ylabel('CPU')
     x_major_locator = MultipleLocator(10)
     ax2.xaxis.set_major_locator(x_major_locator)
     ax2.legend()
     web.header("Content-Type", "image/png")
     buff = StringIO.StringIO()
     fig.savefig(buff, format='png')
     plt.close(fig)  # release the figure so repeated requests don't leak memory
     return buff.getvalue()
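This handler also relies on module-level imports that the listing omits. A sketch of what server.py would plausibly need; the Agg backend line is an assumption for rendering PNGs without a display, and the project may configure matplotlib differently:

import StringIO  # Python 2, as the itervalues/iteritems calls imply

import matplotlib
matplotlib.use('Agg')  # assumption: headless PNG rendering in a web server
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator

import web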
Code Example #3
File: server.py Project: njuwkf/hadoop-monitor
 def GET(self, job_id):
     out = StringIO.StringIO()
     out.write('<html><head><title>Job %s</title></head><body>' % job_id)
     out.write('<h1>JobID: %s</h1>' % job_id)
     job = all.get(job_id)
     if not job:
         out.write('Not Found!')
     else:
         out.write('<h2>Start Time: %s</h2>' %
                   time_to_string(job.start_time, True))
         out.write('<h2>List of task attempts:</h2>')
         have = False
         for attempt, obj in job.attempts.iteritems():
             have = True
             out.write(
                 '<li><a href="/%s">%s</a> Last Update: %s on %s</li>' %
                 (attempt, attempt, time_to_string(obj.time,
                                                   True), obj.host))
         if not have:
             out.write('<h1>No Attempts Found</h1>')
     out.write('</body></html>')
     return out.getvalue()
Code Example #4
File: load.py Project: Radcliffe/ParkLife
def load():
    with open('cal.json') as f:
        events = json.load(f)
    for data in events:
        if 'RRULE' in data:  # Recurring event
            continue
        start = utc_to_central(data['DTSTART'])
        end = utc_to_central(data['DTEND'])
        date = "%d%02d%02d" % (start.year, start.month, start.day)
        start_time = time_to_string(start)
        end_time = time_to_string(end)

        event = Event(
            summary=data['SUMMARY'].strip(),
            description=data['DESCRIPTION'].strip(),
            date=date,
            start=start_time,
            end=end_time,
            location=data['LOCATION'].strip(),
            uid=data['UID'])
        event.put()
    return 'OK'
        
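load() also leans on a utc_to_central helper that is not shown. A plausible sketch using pytz, assuming DTSTART and DTEND have already been parsed into naive UTC datetimes; both the library choice and the input type are assumptions:

import pytz

CENTRAL = pytz.timezone('US/Central')

def utc_to_central(dt):
    # Hypothetical: attach UTC to a naive datetime, then convert to Central.
    return pytz.utc.localize(dt).astimezone(CENTRAL)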
Code Example #5
File: server.py Project: njuwkf/hadoop-monitor
 def GET(self):
     out = StringIO.StringIO()
     out.write('<html><head><title>Jobs</title></head><body>')
     out.write('<h2>List of all monitoring jobs:</h2>')
     have = False
     for jobid, job in all.iteritems():
         have = True
         out.write(
             '<li><a href="/job_%s">job %s</a> Last Update: %s  <a href="/job_info_%s">about job info</a></li>'
             % (jobid, jobid, time_to_string(job.time, True), jobid))
     if not have:
         out.write('<h1>No Jobs Found</h1>')
     out.write('</body></html>')
     return out.getvalue()
Code Example #6
File: server.py Project: njuwkf/hadoop-monitor
 def GET(self, job_id):
     out = StringIO.StringIO()
     out.write('<html><head><title>Job %s</title></head><body>' % job_id)
     job = all.get(job_id)
     if not job:
         out.write('Not Found!')
     else:
         out.write('<h1>Job %s last update: %s</h1>' %
                   (job_id, time_to_string(job.time)))
         out.write('<h3><a href="/text/%s">Text Detail</a></h3>' %
                   (job_id))
         out.write('<img src="/fig/%s.png" />' % (job_id))
     out.write('</body></html>\n')
     return out.getvalue()
Code Example #7
File: server.py Project: njuwkf/hadoop-monitor
 def GET(self, job_id):
     out = StringIO.StringIO()
     out.write('<html><head><title>Job %s</title></head><body>\n' % job_id)
     job = all.get(job_id)
     if not job:
         out.write('Not Found!')
     else:
         out.write('<h1>Job %s last update: %s</h1>\n' %
                   (job_id, time_to_string(job.time)))
         out.write('<table>')
         output_seq("Time:", job.seqs['c'].vs, out, True, True)
         output_seq("CPU:", job.seqs['c'].vs, out)
         output_seq("RSS(MB):", job.seqs['r'].vs, out)
         output_seq("VM(MB):", job.seqs['v'].vs, out)
         out.write('</table>')
     out.write('</body></html>\n')
     return out.getvalue()
Code Example #8
File: server.py Project: njuwkf/hadoop-monitor
 def GET(self, attempt_id):
     out = StringIO.StringIO()
     out.write('<html><head><title>Job Attempt %s</title></head><body>\n' %
               attempt_id)
     jobid = get_jobid_by_attempid(attempt_id)
     job = all.get(jobid)
     if not job:
         out.write('Not Found!')
     else:
         attempt = job.attempts.get(attempt_id)
         if not attempt:
             out.write('Not Found!')
         else:
             out.write('<h1>%s last update: %s</h1>\n' %
                       (attempt_id, time_to_string(attempt.time)))
             for exe in attempt.exes.itervalues():
                 output_exeinfo(exe, out)
     out.write('</body></html>\n')
     return out.getvalue()
Code Example #9
File: server.py Project: njuwkf/hadoop-monitor
 def GET(self, attempt_id):
     out = StringIO.StringIO()
     out.write('<html><head><title>Job Attempt %s</title></head><body>' %
               attempt_id)
     jobid = get_jobid_by_attempid(attempt_id)
     job = all.get(jobid)
     if not job:
         out.write('Not Found!')
     else:
         attempt = job.attempts.get(attempt_id)
         if not attempt:
             out.write('Not Found!')
         else:
             out.write(
                 '<h1>Attempt run on %s, %s last update: %s</h1>' %
                 (attempt.host, attempt_id, time_to_string(attempt.time)))
             out.write('<h3><a href="/text/%s">Text Detail</a></h3>' %
                       (attempt_id))
             out.write('<img src="/fig/%s.png" />' % (attempt_id))
     out.write('</body></html>\n')
     return out.getvalue()
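Several handlers above call get_jobid_by_attempid, which is also absent from the listing. A sketch based on the standard Hadoop naming scheme, where attempt_<cluster>_<job>_<type>_<task>_<try> belongs to job_<cluster>_<job>; the real helper may parse or validate differently:

def get_jobid_by_attempid(attempt_id):
    # Hypothetical: 'attempt_201101011234_0001_m_000001_0'
    # maps to        'job_201101011234_0001'
    parts = attempt_id.split('_')
    return 'job_' + '_'.join(parts[1:3])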
Code Example #10
File: pairwise.py Project: Lukeasargen/TnseqPipeline
def pairwise_comparison(args):
    """ Compare the controls and samples.
        Support replicates. Input as lists.
    """

    output_folder = "data/{}/analysis/{}".format(args.experiment, args.output)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    command = "python3 " + " ".join(sys.argv)
    command_filename = "{}/run_command.txt".format(output_folder)
    with open(command_filename, "w") as f:
        f.write(command)

    # Supports biological replicates
    print("Controls :", args.controls)
    print("Samples :", args.samples)

    # Append "_"+args.strand to match the column names in the TAmap and Genehits
    controls = [c + "_" + str(args.strand) for c in args.controls]
    samples = [c + "_" + str(args.strand) for c in args.samples]
    test_columns = controls + samples

    # Load the tamap
    tamap_filename = "data/{}/maps/{}_TAmaps.csv".format(
        args.experiment, args.index)
    print(" * Loading TAmap : {}".format(tamap_filename))
    tamap = pd.read_csv(tamap_filename, delimiter=",")

    print(" * Normalizing...")
    if args.debug:
        print("\nStats before norm:")
        column_stats(tamap, columns=test_columns)

    if args.length_norm:
        tamap = gene_length_norm(tamap, columns=test_columns, debug=args.debug)

    norms = {
        "total":
        partial(total_count_norm,
                tamap,
                columns=test_columns,
                debug=args.debug),
        "quantile":
        partial(quantile_norm,
                tamap,
                q=args.quantile,
                columns=test_columns,
                debug=args.debug),
        "ttr":
        partial(ttr_norm,
                tamap,
                trim=args.ttr,
                columns=test_columns,
                debug=args.debug),
        "nzmean":
        partial(nzmean_norm, tamap, columns=test_columns, debug=args.debug),
    }
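    # Each entry is a zero-argument callable built with functools.partial,
    # so only the normalization selected by args.norm actually runs.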
    tamap = norms[args.norm]()

    if args.debug:
        print("\nStats after norm:")
        column_stats(tamap, columns=test_columns)

    print(" * Compressing TAmap into Genehits table...")
    fasta_filename = "data/{}/references/{}.fasta".format(
        args.experiment, args.index) if args.gc else None
    if args.debug: print(f"GC content fasta filename : {fasta_filename}")

    # Removes sites that are in the first or last percentage of a gene
    tamap = exclude_sites_tamap(tamap,
                                exclude_first=args.exclude_first,
                                exclude_last=args.exclude_last)

    print(" * Merging into genehits table...")
    # NOTE : This function removes intergenic regions
    # This might mess up the normalization totals, but this is likely fine
    # since the normalization above accounts for sequencing depth of the entire
    # library and not just for the gene regions.
    genehits = tamap_to_genehits(tamap,
                                 fasta_filename=fasta_filename,
                                 pooling=args.pooling)

    # Combine the replicates, average the counts
    print(" * Combining replicates...")
    tamap["Control_Hits"] = tamap[controls].mean(axis=1)
    tamap["Sample_Hits"] = tamap[samples].mean(axis=1)
    genehits["Control_Hits"] = genehits[controls].mean(axis=1)
    genehits["Sample_Hits"] = genehits[samples].mean(axis=1)
    column_stats(genehits, columns=["Control_Hits", "Sample_Hits"])

    # NOTE : Pairwise analysis below

    # TODO : can this be moved into tamap_to_genehits
    print(" * Calculating insertion density per gene...")
    # Need remove intergenic and group by gene
    temp = tamap[tamap['Gene_ID'].notna()].copy()  # Remove intergenic
    temp["Control"] = temp[controls].mean(axis=1).astype(
        bool)  # Combine control replicates
    temp["Sample"] = temp[samples].mean(axis=1).astype(
        bool)  # Combine sample replicates
    grouped = temp.groupby("Gene_ID", as_index=False).sum()

    # Unique_Insertions : Unique TA hits / TA Sites
    # Diversity = Unique_Insertions / TA_Counts
    genehits["Control_Unique_Insertions"] = grouped["Control"]
    genehits["Sample_Unique_Insertions"] = grouped["Sample"]
    genehits["Control_Diversity"] = genehits[
        "Control_Unique_Insertions"] / genehits["TA_Count"]
    genehits["Sample_Diversity"] = genehits[
        "Sample_Unique_Insertions"] / genehits["TA_Count"]
    # Don't need these anymore
    del temp
    del grouped
    column_stats(genehits, columns=["Control_Diversity", "Sample_Diversity"])

    # TSAS Weighting
    if args.insert_weighting:
        # https://github.com/srimam/TSAS
        print(
            " * Calculating insertion weighting (Idea from : TSAS by Saheed Imam)..."
        )
        if args.debug:
            print("\nStats before insertion weighting:")
            column_stats(genehits, columns=["Control_Hits", "Sample_Hits"])
        avg_unique = (genehits["Control_Unique_Insertions"].mean() +
                      genehits["Sample_Unique_Insertions"].mean()) / 2.0
        genehits["Control_Hits"] = genehits["Control_Hits"] * (
            genehits["Control_Unique_Insertions"] / avg_unique)
        genehits["Sample_Hits"] = genehits["Sample_Hits"] * (
            genehits["Sample_Unique_Insertions"] / avg_unique)
        if args.debug:
            print("\nStats after insertion weighting:")
            column_stats(genehits, columns=["Control_Hits", "Sample_Hits"])

    # NOTE : Below starts the actual statistical analysis, everything above was just building the table

    print(" * Calculating differential statistics...")
    # Reads differences
    genehits["Ratio_Reads"] = (genehits["Sample_Hits"] + args.smoothing) / (
        genehits["Control_Hits"] + args.smoothing)
    genehits["Log2FC_Reads"] = np.log2(genehits["Ratio_Reads"])
    genehits["LinearDiff_Reads"] = genehits["Sample_Hits"] - genehits[
        "Control_Hits"]
    # Inserts differences
    genehits["Ratio_Inserts"] = (
        genehits["Sample_Unique_Insertions"] + args.smoothing) / (
            genehits["Control_Unique_Insertions"] + args.smoothing)
    genehits["Log2FC_Inserts"] = np.log2(genehits["Ratio_Inserts"])
    genehits["LinearDiff_Inserts"] = genehits[
        "Sample_Unique_Insertions"] - genehits["Control_Unique_Insertions"]

    print(" * Calculating fitness...")
    genehits["Sample_Fitness"] = calc_sample_fitness(genehits,
                                                     control="Control_Hits",
                                                     sample="Sample_Hits",
                                                     expansion=args.expansion)
    if args.debug: column_stats(genehits, columns="Sample_Fitness")

    # Survival Index
    print(" * Calculating survival index...")
    si = calc_survival_index(genehits,
                             control="Control_Hits",
                             sample="Sample_Hits")
    genehits["Survival_Index"] = si
    genehits["Log2SI"] = np.log2(si, out=np.zeros_like(si), where=(si != 0))
    if args.debug: column_stats(genehits, columns=["Survival_Index", "Log2SI"])

    # First, save the whole table, just in case you need to look something up
    genehits_filename = "{}/genehits.csv".format(output_folder)
    print(" * Saved genehits table to {}".format(genehits_filename))
    genehits.to_csv(genehits_filename, header=True, index=False)

    # Find "possibly essential" genes
    print(" * Finding possibly essential genes...")
    # Idea from: "Tn-seq; high-throughput parallel sequencing for fitness and genetic interaction studies in microorganisms"
    possibly = (genehits[["Sample_Unique_Insertions"]] < 3).any(
        axis=1) & (genehits["Gene_Length"] >= 400)
    print("{}/{}({:.2f}%) possibly essential genes.".format(
        possibly.sum(), len(genehits), 100 * possibly.sum() / len(genehits)))

    # Save the possibly essential genes
    possibly_filename = "{}/possibly_essential.csv".format(output_folder)
    print(" * Saved possibly essential genes to {}".format(possibly_filename))
    genehits[possibly].to_csv(possibly_filename, header=True, index=False)

    # Count and Insertion thresholding
    print(" * Trimming to use only genes that had hits...")
    # This bool saves whether the gene has counts in all groups
    hit_bool = ~(genehits[[
        "Control_Unique_Insertions", "Sample_Unique_Insertions"
    ]] == 0).any(axis=1)
    num_hit = hit_bool.sum()
    print("{}/{}({:.2f}%) had no insertions".format(
        len(genehits) - num_hit, len(genehits),
        100 * (len(genehits) - num_hit) / len(genehits)))
    print("{}/{}({:.2f}%) had at least one insertion.".format(
        num_hit, len(genehits), 100 * num_hit / len(genehits)))

    # Save genes that had no insertions in at least one group
    no_hit_filename = "{}/no_hits.csv".format(output_folder)
    print(" * Saved genes with no hits to {}".format(no_hit_filename))
    genehits[~hit_bool].to_csv(no_hit_filename, header=True, index=False)

    # Create boolean columns for user defined thresholds
    keep_count = (genehits[["Control_Hits", "Sample_Hits"]] >=
                  args.min_count).all(axis=1)
    keep_inserts = (genehits[[
        "Control_Unique_Insertions", "Sample_Unique_Insertions"
    ]] >= args.min_inserts).all(axis=1)
    keep_sites = (genehits["TA_Count"] >= args.min_sites)
    keep = keep_count & keep_inserts & keep_sites
    # Separate the genes that will be tested from the rest
    trimmed = genehits[keep & hit_bool].copy()
    removed = genehits[~keep & hit_bool].copy()
    print(" * Thresholds: min_count={}. min_inserts={}. min_sites={}.".format(
        args.min_count, args.min_inserts, args.min_sites))
    print("{}/{}({:.2f}%) genes removed by threshold.".format(
        len(removed), len(genehits), 100 * len(removed) / len(genehits)))
    print("{}/{}({:.2f}%) genes remaining.".format(
        len(trimmed), len(genehits), 100 * len(trimmed) / len(genehits)))
    if args.debug:
        print("Trimmed Stats")
        column_stats(trimmed,
                     columns=[
                         "Control_Hits", "Sample_Hits",
                         "Control_Unique_Insertions",
                         "Sample_Unique_Insertions", "Control_Diversity",
                         "Sample_Diversity"
                     ])

    # Save genes that are removed
    removed_filename = "{}/removed.csv".format(output_folder)
    print(" * Saved removed genes to {}".format(removed_filename))
    removed.to_csv(removed_filename, header=True, index=False)

    # Values per column for ZINB-GLM offsets
    nzmean = np.array(genehits[test_columns].replace(0, np.NaN).mean())
    diversity = np.array(tamap[test_columns].astype(bool).sum() / len(tamap))

    trimmed["P_Value"] = np.nan
    trimmed["P_Sig"] = False
    if args.stat:
        print(" * Calculating statistical significance...")
        print("Stop early by pressing Ctrl+C in the terminal.")
        t0 = time.time()  # Start time
        c = 0  # gene counter; different from the DataFrame index value
        for i in trimmed.index:
            try:
                # Helpful time estimate
                c += 1
                if c % 10 == 0:
                    duration = time.time() - t0
                    remaining = duration / c * (len(trimmed) - c)
                    print(
                        "gene {}/{}. {:.1f} genes/second. elapsed={}. remaining={}."
                        .format(c, len(trimmed), c / duration,
                                time_to_string(duration),
                                time_to_string(remaining)),
                        end="\r")
                # gene_name is used to index the full TAmap
                # size is used to get the length of the condition array
                gene_name, size = trimmed.loc[i][["Gene_ID", "TA_Count"]]
                df = tamap[tamap["Gene_ID"] == gene_name]

                if args.stat == "zinb":
                    gene_data = np.array(df[test_columns]).T.reshape(-1)
                    conditions = np.array([0] * size * len(controls) +
                                          [1] * size * len(samples))
                    pvalue = zinb_glm_llr(gene_data,
                                          conditions,
                                          nzmean,
                                          diversity,
                                          dist="nb",
                                          rescale=0,
                                          debug=args.debug)
                else:
                    data1 = np.array(df[controls].mean(
                        axis=1))  # Combine control replicates
                    data2 = np.array(
                        df[samples].mean(axis=1))  # Combine sample replicates
                    if args.stat == "mannu":
                        u_stat, pvalue = mannwhitneyu(data1, data2)
                    elif args.stat == "ttest":
                        t_stat, pvalue = ttest_ind(data1, data2)
                    elif args.stat == "wilcoxon":
                        t_stat, pvalue = wilcoxon(data1, data2)

                if pvalue == 0:
                    print(f"\n{gene_name} failed.")
                    print(trimmed.loc[i])
                trimmed.loc[i, "P_Value"] = pvalue

                if args.debug and c > 5: break
            except KeyboardInterrupt:
                break

        # ^time estimate ended with return character so this prints a newline
        duration = time.time() - t0
        remaining = duration / c * (len(trimmed) - c)
        print("gene {}/{}. {:.1f} genes/second. elapsed={}. remaining={}.".
              format(c, len(trimmed), c / duration, time_to_string(duration),
                     time_to_string(remaining)))

    # Make a boolean column for the P-value and a negative log10 for the volcano plot
    trimmed["P_Sig"] = np.logical_and(trimmed["P_Value"] < args.alpha,
                                      trimmed["P_Value"] != 0)
    pv = trimmed["P_Value"]
    trimmed["Log10P"] = -np.log10(pv, out=np.zeros_like(pv), where=(pv != 0))

    sig_genes = trimmed["P_Sig"].sum()
    print("Significant p-values : {} ({:.2f}%)".format(
        sig_genes, 100 * sig_genes / len(trimmed)))
    print("Genes not tested : {}".format(np.sum(np.isnan(trimmed["P_Value"]))))
    fails = np.sum(trimmed["P_Value"] == 0)
    print("Test failures : {} ({:.2f}%)".format(fails,
                                                100 * fails / len(trimmed)))

    # The same as above but for adjusted Q-values
    print(" * Adjusting p-values for multiple test...")
    qvalues, new_alpha = bh_procedure(np.nan_to_num(trimmed["P_Value"]))
    print("New Alpha :", new_alpha)
    qvalues[qvalues == 0] = np.nan
    trimmed["Q_Value"] = qvalues
    trimmed["Q_Sig"] = np.logical_and(trimmed["Q_Value"] < args.alpha,
                                      trimmed["Q_Value"] != 0)
    trimmed["Log10Q"] = -np.log10(
        qvalues, out=np.zeros_like(qvalues), where=(qvalues != 0))
    sig_genes = trimmed["Q_Sig"].sum()
    print("Significant q-values : {} ({:.2f}%)".format(
        sig_genes, 100 * sig_genes / len(trimmed)))

    # Save the comparison
    pairwise_filename = "{}/pairwise.csv".format(output_folder)
    print(" * Saved pairwise analysis to {}".format(pairwise_filename))
    trimmed.to_csv(pairwise_filename, header=True, index=False)

    if args.plot:
        print(" * Generating plots...")
        pairwise_plots(trimmed, output_folder, args.alpha)

        print("Plotting Boxplots")
        fig = plt.figure(figsize=[3 * len(test_columns), 6])
        ax = fig.add_subplot(111)
        np.log10(tamap[test_columns].copy().replace(0, np.NaN)).boxplot(
            column=test_columns, ax=ax, showfliers=True)
        plt.title("Hits per TA site")
        ax.set_xlabel("Condition")
        ax.set_ylabel("log10(Hits)")
        plt.savefig(f"{output_folder}/boxplots.png")
        plt.close(fig)
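pairwise.py delegates the multiple-testing correction to a bh_procedure helper that the listing omits. A minimal Benjamini-Hochberg sketch consistent with the call site above, which passes one positional argument and unpacks q-values plus an adjusted cutoff; the 0.05 default and the exact return semantics are assumptions:

import numpy as np

def bh_procedure(pvalues, alpha=0.05):
    # Hypothetical Benjamini-Hochberg adjustment; the default alpha is assumed.
    p = np.asarray(pvalues, dtype=float)
    n = len(p)
    order = np.argsort(p)
    ranked = p[order] * n / np.arange(1, n + 1)
    # Running minimum from the largest rank down keeps q-values monotone
    q_sorted = np.minimum.accumulate(ranked[::-1])[::-1]
    qvalues = np.empty(n)
    qvalues[order] = np.clip(q_sorted, 0.0, 1.0)
    # The largest p-value still under the BH line is the adjusted cutoff
    passed = p[order] <= alpha * np.arange(1, n + 1) / n
    new_alpha = p[order][passed].max() if passed.any() else 0.0
    return qvalues, new_alpha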