Exemple #1
0
def validate_cores(args):
    """Compare the old and new ProjectDB builds of recently opened projects.

    Every LIMS project whose ``open_date`` lies less than ``args.days`` days
    in the past is built once with ``oldDB.ProjectDB`` and once with
    ``newDB.ProjectDB``; the ``obj`` attributes of the two results are
    compared for equality.  Each outcome, and a final summary, is written to
    the logger returned by ``setupLog(args)``.

    :param args: object providing ``conf`` (handed to ``load_couch_server``),
        ``days`` (maximum project age in days) and whatever ``setupLog``
        reads from it.
    """
    lims = Lims(BASEURI, USERNAME, PASSWORD)
    projects = lims.get_projects()
    couch = load_couch_server(args.conf)
    # The projects database is looked up here but not used further down.
    proj_db = couch['projects']
    samples_db = couch['samples']
    tested = 0
    passed = 0
    failures = []
    log = setupLog(args)

    for project in projects:
        # Guard 1: projects without an open date cannot be aged.
        if not project.open_date:
            log.info("skipping project {0}, no open date".format(project.name))
            continue

        opened = datetime.datetime.strptime(project.open_date, '%Y-%m-%d')
        age = datetime.datetime.now() - opened

        # Guard 2: only projects younger than the requested window are tested.
        if age >= datetime.timedelta(days=args.days):
            log.info("skipping project {0}, too old".format(project.name))
            continue

        tested += 1
        log.info("comparing project {0}".format(project.name))
        old_build = oldDB.ProjectDB(lims, project.id, samples_db)
        new_build = newDB.ProjectDB(lims, project.id, samples_db)
        if old_build.obj == new_build.obj:
            log.info("passed")
            passed += 1
        else:
            log.error("PROJECT {0} FAILED COMPARISON".format(project.name))
            failures.append(project.name)

    log.info("Final stats :")
    log.info("{0}/{1} passed".format(passed, tested))
    if failures:
        log.error("Failed projects : {0}".format(", ".join(failures)))
def generate_report(config_file, proj_conf, single_end, stranded):
    """Collect the values that fill in the RNA-seq report of one project.

    Project information is read from the ``projects`` database of the couch
    server loaded from ``config_file``, tool versions from
    ``proj_conf["config"]``, and the tables/plots from analysis output files
    looked up relative to the current working directory.

    Every section is best-effort: when one fails a message is printed and the
    report entries that were not reached keep their empty-string default.

    Side effects: writes ``stat.json``, ``RSeQC_rd.json`` and ``gbc.pdf`` to
    the current working directory.

    :param config_file: handed to ``load_couch_server``.
    :param proj_conf: mapping providing ``id``, ``samples`` (sequence of
        sample names) and ``config``.
    :param single_end: forwarded unchanged to ``make_stat``.
    :param stranded: accepted for interface compatibility; not used here.
    :returns: dict of report fields.
    """
    d = {
        "project_id": proj_conf["id"],
        "samplenames": " ".join(proj_conf["samples"]),
        "latex_opt": "",
        "uppnex": "",
        "mapping": "",
        "dup_rem": "",
        "read_count": "",
        "quantifyer": "",
        "gene_body_cov": "",
        "FPKM_heatmap": "",
        "Mapping_statistics": "",
        "Read_Distribution": "",
        "rRNA_table": "",
        "GBC": "",
        "strandness_table": "",
        "complexity_plot": "",
        "species": "",
        "genombuild": "",
        "rseqc_version": "",
        "Preseq": "",
        "date": date.today(),
        "anotation_version": "",
    }
    samples = proj_conf["samples"]

    ## Latex option (no of floats per page)
    # The backslash is escaped explicitly; the resulting text is unchanged.
    d["latex_opt"] = ".. raw:: latex\n\n   \\setcounter{totalnumber}{8}"

    ## Project information fetched from StatusDB
    couch = load_couch_server(config_file)
    proj_db = couch["projects"]
    info = proj_db[find_proj_from_view(proj_db, proj_conf["id"])]

    # The uppnex id and the reference genome are fetched independently so
    # that a missing value for one does not discard the other.
    uppnex_proj = ""
    try:
        uppnex_proj = info["uppnex_id"]
    except Exception:
        print("No uppnex ID fetched")
    d["uppnex"] = uppnex_proj

    species_by_genome = {
        "hg19": "Human",
        "mm9": "Mouse",
        "rn4": "Rat",
        "Zv9": "Zebrafish",
        "Zv8": "Zebrafish",
        "sacCer2": "Saccharomyces cerevisiae",
        "dm3": "Drosophila melanogaster",
    }
    reference_genome = None
    try:
        reference_genome = info["reference_genome"]
        # Unknown genome builds are reported under their own name.
        d["species"] = species_by_genome.get(reference_genome, reference_genome)
    except Exception:
        print("No reference genome fetched")

    ## RNA-seq tools fetched from config file post_process.yaml
    try:
        tools = proj_conf["config"]["custom_algorithms"]["RNA-seq analysis"]
        d["mapping"] = os.path.join(tools["aligner"], tools["aligner_version"])
        d["dup_rem"] = os.path.join(tools["dup_remover"], tools["dup_remover_version"])
        d["read_count"] = os.path.join(tools["counts"], tools["counts_version"])
        d["quantifyer"] = os.path.join(tools["quantifyer"], tools["quantifyer_version"])
        d["genombuild"] = tools[reference_genome]["name"]
        d["rseqc_version"] = tools["rseqc_version"]
        d["Preseq"] = tools["preseq"]
        d["anotation_version"] = tools[reference_genome]["annotation_release"]
    except Exception:
        print("Could not fetched RNA-seq tools from config file post_process.yaml")

    ## Mapping Statistics
    tab = Texttable()
    tab.set_cols_dtype(["t", "t", "t", "t"])
    tab.header(["Sample", "Tot NO Reads", "UniqMapped", "UniqMapped DuplRem"])
    statistics = {}
    try:
        for sample_name in samples:
            try:
                # The read total is the 4th token on the 3rd line of the log.
                with open("tophat_out_" + sample_name + "/logs/prep_reads.log", "r") as f:
                    tot_NO_read_pairs = f.readlines()[2].split()[3]
                with open("tophat_out_" + sample_name + "/stat" + sample_name, "r") as f:
                    sample_stat = make_stat(f, tot_NO_read_pairs, single_end)
                tab.add_row(
                    [
                        sample_name,
                        tot_NO_read_pairs,
                        str(sample_stat["bef_dup_rem"]["%uniq_mapped"]) + "%",
                        str(sample_stat["aft_dup_rem"]["%uniq_mapped"]) + "%",
                    ]
                )
                statistics[sample_name] = sample_stat
            except Exception:
                print("Could not make mapping statistics for sample " + sample_name)

        d["Mapping_statistics"] = indent_texttable_for_rst(tab)
        # NOTE(review): despite the file name this stores the str() of the
        # dict, not JSON; kept as-is in case a consumer relies on it.
        with open("stat.json", "w") as stat_json:
            stat_json.write(str(statistics) + "\n")
    except Exception:
        print("Could not make Mapping Statistics table")

    ## Read Distribution
    try:
        tab = Texttable()
        tab.set_cols_dtype(["t", "t", "t", "t", "t", "t", "t", "t"])
        tab.header(["Sample", "CDS", "5'UTR", "3'UTR", "Intron", "TSS", "TES", "mRNA"])
        read_dist = {}
        for sample_name in samples:
            try:
                with open("RSeQC_rd_" + sample_name + ".out", "r") as f:
                    sample_rd = read_RSeQC_rd233(f)
                tab.add_row(
                    [
                        sample_name,
                        sample_rd["CDS_Exons"]["Tags/Kb"],
                        sample_rd["5'UTR_Exons"]["Tags/Kb"],
                        sample_rd["3'UTR_Exons"]["Tags/Kb"],
                        sample_rd["Introns"]["Tags/Kb"],
                        sample_rd["TSS_up_1kb"]["Tags/Kb"],
                        sample_rd["TES_down_1kb"]["Tags/Kb"],
                        sample_rd["mRNA_frac"],
                    ]
                )
                read_dist[sample_name] = sample_rd
            except Exception:
                print("Could not make read distribution for sample " + sample_name)
        # NOTE(review): written as str() of the dict, same as stat.json.
        with open("RSeQC_rd.json", "w") as rd_json:
            rd_json.write(str(read_dist) + "\n")
        d["Read_Distribution"] = indent_texttable_for_rst(tab)
    except Exception:
        print("Could not make Read Distribution table")

    ## Gene Body Coverage
    try:
        figure()
        x = range(0, 101)
        for sample_name in samples:
            y = zeros(101)
            with open(sample_name + ".geneBodyCoverage.txt", "r") as f:
                for line in f:
                    fields = line.split()
                    try:
                        position = int(fields[0])
                        count = int(fields[1])
                        y[position] = count
                    except (ValueError, IndexError):
                        # Lines that are not "<int> <int>" pairs, or that
                        # point outside the 0-100 range, are skipped.
                        continue
            plot(x, y)
        ylabel("read number")
        xlabel("percentile of gene body (5'->3')")
        savefig("gbc.pdf")
        d["GBC"] = image("gbc.pdf", width="100%")
    except Exception:
        print("could not make GBC plot")

    ##  FPKM_heatmap
    if os.path.exists("FPKM_heatmap.pdf"):
        d["FPKM_heatmap"] = image("FPKM_heatmap.pdf", width="100%")
    else:
        print("could not make FPKM heatmap")

    ## complexity plot
    if os.path.exists("complexity_curves.pdf"):
        d["complexity_plot"] = image("complexity_curves.pdf", width="100%")
    else:
        print("could not make complexity plot")

    ## rRNA_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(["t", "t"])
        tab.header(["Sample", "rRNA"])
        rrna_by_sample = {}
        with open("rRNA.quantification", "r") as f:
            for line in f:
                fields = line.split("\t")
                rrna_by_sample[fields[0].strip()] = fields[1].strip()
        for sample_name in samples:
            if sample_name in rrna_by_sample:
                tab.add_row([sample_name, rrna_by_sample[sample_name]])
        d["rRNA_table"] = indent_texttable_for_rst(tab)
    except Exception:
        print("could not generate rRNA table")

    ## strandness_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(["t", "t"])
        tab.header(["Sample", "strand-specific reads"])
        try:
            with open("infer_experiment.json", "rb") as f:
                strand_fraction = json.load(f)
        except Exception:
            print("can't open infer_experiment.json\n")
            # Without the data the table cannot be built; let the outer
            # handler report that as well.
            raise
        for sample_name in samples:
            if sample_name in strand_fraction:
                tab.add_row(
                    [sample_name, str(float(strand_fraction[sample_name]) * 100) + "%"]
                )
        d["strandness_table"] = indent_texttable_for_rst(tab)
    except Exception:
        print("could not generate strandness_table")

    return d
def generate_report(config_file,proj_conf,single_end,stranded,genome):
    """Collect the values that fill in the RNA-seq report of one project.

    Project information is read from the ``projects`` database of the couch
    server loaded from ``config_file``, tool versions from
    ``proj_conf['config']``, and the tables/plots from analysis output files
    looked up relative to the current working directory.

    Every section is best-effort: when one fails a message is printed and the
    report entries that were not reached keep their empty-string default.

    Side effects: writes ``stat.json``, ``RSeQC_rd.json`` and
    ``geneBodyCoverage.pdf`` to the current working directory and sets
    ``matplotlib.rcParams['mathtext.default']``.

    :param config_file: handed to ``load_couch_server``.
    :param proj_conf: mapping providing ``id``, ``samples`` (sequence of
        sample names) and ``config``.
    :param single_end: forwarded unchanged to ``make_stat``.
    :param stranded: accepted for interface compatibility; not used here.
    :param genome: reference genome to report; when falsy the project's
        ``reference_genome`` entry is used instead.
    :returns: dict of report fields.
    """
    d = {
        'project_id': proj_conf['id'],
        'samplenames': ' '.join(proj_conf['samples']),
        'latex_opt': "",
        'uppnex': "",
        'mapping': "",
        'dup_rem': "",
        'read_count': "",
        'quantifyer': "",
        'gene_body_cov': "",
        'FPKM_heatmap': "",
        'Mapping_statistics': "",
        'Read_Distribution': "",
        'rRNA_table': "",
        'GBC': "",
        'strandness_table': "",
        'complexity_plot': "",
        'species': "",
        'genombuild': "",
        'rseqc_version': '',
        'Preseq': '',
        'date': date.today(),
        'anotation_version': '',
    }
    samples = proj_conf['samples']

    ## Latex option (no of floats per page)
    # The backslash is escaped explicitly; the resulting text is unchanged.
    d['latex_opt'] = '.. raw:: latex\n\n   \\setcounter{totalnumber}{8}'

    ## Project information fetched from StatusDB
    couch = load_couch_server(config_file)
    proj_db = couch['projects']
    info = proj_db[find_proj_from_view(proj_db, proj_conf['id'])]
    species = {
        'hg19': 'Human',
        'mm9': 'Mouse',
        'rn4': 'Rat',
        'rn5': 'Rat',
        'Zv8': 'Zebrafish',
        'Zv9': 'Zebrafish',
        'Zv10': 'Zebrafish',
        'sacCer2': 'Saccharomyces cerevisiae',
        'dm3': 'Drosophila melanogaster',
    }

    # The uppnex id and the reference genome are resolved independently so
    # that a project without an uppnex id still honours ``genome``.
    uppnex_proj = ""
    try:
        uppnex_proj = info['uppnex_id']
    except Exception:
        print("No uppnex ID fetched")
    d['uppnex'] = uppnex_proj

    reference_genome = None
    try:
        reference_genome = genome if genome else info['reference_genome']
        # Unknown genome builds are reported under their own name.
        d['species'] = species.get(reference_genome, reference_genome)
    except Exception:
        print("No reference genome fetched")

    ## RNA-seq tools fetched from config file post_process.yaml
    try:
        tools = proj_conf['config']['custom_algorithms']['RNA-seq analysis']
        d['mapping'] = os.path.join(tools['aligner'], tools['aligner_version'])
        d['dup_rem'] = os.path.join(tools['dup_remover'], tools['dup_remover_version'])
        d['read_count'] = os.path.join(tools['counts'], tools['counts_version'])
        d['quantifyer'] = os.path.join(tools['quantifyer'], tools['quantifyer_version'])
        d['genombuild'] = tools[reference_genome]['name']
        d['rseqc_version'] = tools['rseqc_version']
        d['Preseq'] = os.path.join(tools['Preseq'], tools['Preseq_version'])
        d['anotation_version'] = tools[reference_genome]['annotation_release']
    except Exception:
        print("Could not fetched RNA-seq tools from config file post_process.yaml")

    ## Mapping Statistics
    tab = Texttable()
    tab.set_cols_dtype(['t', 't', 't', 't'])
    tab.header(['Sample', 'Tot NO Reads', 'UniqMapped', 'UniqMapped DuplRem'])
    statistics = {}
    try:
        for sample_name in samples:
            try:
                # The read total is the 4th token on the 3rd line of the log.
                with open('tophat_out_' + sample_name + '/logs/prep_reads.log', 'r') as f:
                    tot_NO_read_pairs = f.readlines()[2].split()[3]
                with open('tophat_out_' + sample_name + '/stat' + sample_name, 'r') as f:
                    sample_stat = make_stat(f, tot_NO_read_pairs, single_end)
                tab.add_row([
                    sample_name,
                    tot_NO_read_pairs,
                    str(sample_stat['bef_dup_rem']['%uniq_mapped']) + '%',
                    str(sample_stat['aft_dup_rem']['%uniq_mapped']) + '%',
                ])
                statistics[sample_name] = sample_stat
            except Exception:
                print('Could not make mapping statistics for sample ' + sample_name)

        d['Mapping_statistics'] = indent_texttable_for_rst(tab)
        # NOTE(review): despite the file name this stores the str() of the
        # dict, not JSON; kept as-is in case a consumer relies on it.
        with open('stat.json', 'w') as stat_json:
            stat_json.write(str(statistics) + "\n")
    except Exception:
        print("Could not make Mapping Statistics table")

    ## Read Distribution
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't', 't', 't', 't', 't', 't', 't'])
        tab.header(["Sample", "CDS", "5'UTR", "3'UTR", "Intron", "TSS", "TES", "mRNA"])
        read_dist = {}
        for sample_name in samples:
            try:
                with open('RSeQC_rd_' + sample_name + '.out', 'r') as f:
                    sample_rd = read_RSeQC_rd233(f)
                tab.add_row([
                    sample_name,
                    sample_rd['CDS_Exons']['Tags/Kb'],
                    sample_rd["5'UTR_Exons"]['Tags/Kb'],
                    sample_rd["3'UTR_Exons"]['Tags/Kb'],
                    sample_rd['Introns']['Tags/Kb'],
                    sample_rd['TSS_up_1kb']['Tags/Kb'],
                    sample_rd['TES_down_1kb']['Tags/Kb'],
                    sample_rd['mRNA_frac'],
                ])
                read_dist[sample_name] = sample_rd
            except Exception:
                print("Could not make read distribution for sample " + sample_name)
        # NOTE(review): written as str() of the dict, same as stat.json.
        with open('RSeQC_rd.json', 'w') as rd_json:
            rd_json.write(str(read_dist) + "\n")
        d['Read_Distribution'] = indent_texttable_for_rst(tab)
    except Exception:
        print("Could not make Read Distribution table")

    ## Gene Body Coverage
    fig = None
    try:
        pdf_fn = "geneBodyCoverage.pdf"
        fig = plt.figure()
        axes = fig.add_subplot(111)
        plt.subplots_adjust(right=0.7)
        sample_num = len(samples)
        # Repeat the 12-colour palette often enough to cover every sample,
        # and add one legend column per 40 samples.
        palette_repeats = int(math.ceil(float(sample_num) / 12))
        legend_cols = int(math.ceil(float(sample_num) / 40))
        colours = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
                   '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ffff99', '#b15928'] * palette_repeats
        for i, sample_name in enumerate(samples):
            fn = os.path.realpath(sample_name + '.geneBodyCoverage.txt')
            try:
                with open(fn) as coverage:
                    lines = coverage.readlines()
            except IOError:
                # Skip the sample rather than drawing another sample's curve
                # under its label.
                print("Could not load input file: {}".format(fn))
                continue
            # Fresh buffer per sample so no values carry over between samples.
            values = [1] * 101
            # Line 0 is read as the percentile header, line 1 as the counts;
            # column 0 of each is skipped.
            percentiles = lines[0].split()
            counts = lines[1].split()
            for col in range(1, 101):
                percentile, count = percentiles[col], counts[col]
                if not percentile.isdigit():
                    continue
                values[int(percentile)] = float(count) / 1000000
            axes.plot(values, label=sample_name, color=colours[i])
        # Tidy the axes
        axes.tick_params(which='both', labelsize=8, direction='out', top=False, right=False)
        # Make the x axis labels percentages
        axes.set_xticklabels(["%d%%" % p for p in axes.get_xticks()])
        # Labels
        matplotlib.rcParams['mathtext.default'] = 'regular'
        plt.xlabel(r"Gene body position ($5' \rightarrow 3'$)")
        plt.ylabel(r'Cumulative Read Count ($\times 10^6$)')
        plt.title('Gene Body Coverage')
        # Legend
        axes.legend(loc='upper left', bbox_to_anchor=(1.02, 1.02), fontsize=6, ncol=legend_cols)
        plt.savefig(pdf_fn)
        d['GBC'] = image(pdf_fn, width="100%")
    except Exception:
        print(sys.exc_info())
        print("could not make GBC plot")
    finally:
        # Release the figure even when plotting failed part-way.
        if fig is not None:
            plt.close(fig)

    ##  FPKM_heatmap
    if os.path.exists("FPKM_heatmap.pdf"):
        d['FPKM_heatmap'] = image("FPKM_heatmap.pdf", width="100%")
    else:
        print("could not make FPKM heatmap")

    ## complexity plot
    if os.path.exists("complexity_curves.pdf"):
        d['complexity_plot'] = image("complexity_curves.pdf", width="100%")
    else:
        print("could not make complexity plot")

    ## rRNA_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't'])
        tab.header(["Sample", "rRNA"])
        rrna_by_sample = {}
        with open('rRNA.quantification', 'r') as f:
            for line in f:
                fields = line.split('\t')
                rrna_by_sample[fields[0].strip()] = fields[1].strip()
        for sample_name in samples:
            if sample_name in rrna_by_sample:
                tab.add_row([sample_name, rrna_by_sample[sample_name]])
        d['rRNA_table'] = indent_texttable_for_rst(tab)
    except Exception:
        print("could not generate rRNA table")

    ## strandness_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't'])
        tab.header(["Sample", "strand-specific reads"])
        try:
            with open('infer_experiment.json', 'rb') as f:
                strand_fraction = json.load(f)
        except Exception:
            print("can't open infer_experiment.json\n")
            # Without the data the table cannot be built; let the outer
            # handler report that as well.
            raise
        for sample_name in samples:
            if sample_name in strand_fraction:
                tab.add_row([sample_name, str(float(strand_fraction[sample_name]) * 100) + '%'])
        d['strandness_table'] = indent_texttable_for_rst(tab)
    except Exception:
        print("could not generate strandness_table")

    return d