コード例 #1
0
def plotFalsePositiveRates(infile, outfile):
    '''
    Bar plot the false positive and true positive rates across
    taxonomic levels.

    For each cutoff value (0 and 1) two dodged bar plots are saved:
    fp_rate ("specificity") and tp_rate ("sensitivity").  The output
    names are P.snip(outfile, ".pdf") followed by
    ".<cutoff>.specificity.pdf" or ".<cutoff>.sensitivity.pdf";
    `outfile` itself is only touched once all plots are done.
    '''
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")'''
      % infile)

    # (label used in the file name, column drawn on the y axis); the
    # x axis is ordered by fp_rate in both plots
    panels = (("specificity", "fp_rate"), ("sensitivity", "tp_rate"))

    for cutoff in (0, 1):
        for label, rate in panels:
            target = P.snip(outfile, ".pdf") + ".%i.%s.pdf" % (cutoff, label)
            R('''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = %s, fill = track, stat = "identity"))'''
              % (cutoff, rate))
            R('''plot2 <- plot1 + geom_bar(position = "dodge", stat="identity")''')
            R('''plot2 + scale_fill_manual(values = c("cadetblue", "slategray", "lightblue"))''')
            R('''ggsave("%s")''' % target)

    P.touch(outfile)
コード例 #2
0
ファイル: pipeline_expression.py プロジェクト: santayana/cgat
def computeExpressionLevels(infiles, outfiles):
    '''Normalize data using the gcrma library.

    Two files are written: the first entry of `outfiles` receives the
    saved R objects (normalized expression set and raw data), the
    second a tab-separated, human readable table with one row per
    probeset.

    `infiles` is not read here: ReadAffy() is called without arguments.
    The normalization method is taken from PARAMS["normalization_method"].
    '''

    outfile_r, outfile_table = outfiles

    R.library("simpleaffy")
    R.library("gcrma")

    E.info("reading data")
    R('''raw.data = ReadAffy()''')

    E.info("normalization")
    R('''gcrma.eset = call.exprs( raw.data, "%(normalization_method)s" )''' %
      PARAMS)

    E.info("saving data")
    R('''save( gcrma.eset, raw.data, file = "%s") ''' % outfile_r)

    expression = R('''as.list(assayData(gcrma.eset))''')['exprs']
    probesets, headers = R('''dimnames( assayData(gcrma.eset)$exprs )''')
    # escape the dot: the unescaped pattern ".CEL" also removed any
    # character followed by "CEL" (e.g. the "XCEL" in "EXCEL")
    headers = [re.sub(r"\.CEL", "", x) for x in headers]

    # the context manager closes the table even if a write fails
    with open(outfile_table, "w") as outf:
        outf.write("probeset\t%s\n" % "\t".join(headers))
        for probeset, values in zip(probesets, expression):
            outf.write("%s\t%s\n" % (probeset, "\t".join(map(str, values))))
コード例 #3
0
def buildPCAVarianceExplained(infile, outfile):
    '''
    Output the PCA variance explained.

    Sources microarray_utils.R from the directory configured as
    PARAMS "rdir", runs its runPCA() on `infile` and hands the result
    to the R function buildPCAVarianceExplained() together with
    `outfile`.
    '''
    r_commands = (
        '''source("%s/microarray_utils.R")''' % PARAMS.get("rdir"),
        '''pc.dat <- runPCA("%s")''' % infile,
        '''buildPCAVarianceExplained(pc.dat, "%s")''' % outfile,
    )
    for command in r_commands:
        R(command)
コード例 #4
0
ファイル: RnaseqqcReport.py プロジェクト: wbyu/CGATPipelines
    def getCorrelations(self, dataframe):
        '''
        Compute the pair-wise correlation matrix across samples for a
        dataframe of expression values.  The calculation itself is
        delegated to R's cor().

        Arguments
        ---------
        dataframe: pandas.Core.DataFrame
          a dataframe with the columns "sample_name",
          "transcript_id" and "TPM"

        Returns
        -------
        corr_frame: pandas.Core.DataFrame
          the result of cor() on the transcript x sample matrix,
          converted back to a pandas object.  cor() is called with
          its default method.
        '''

        # reshape to one row per transcript and one column per sample
        by_transcript = dataframe.pivot(index="sample_name",
                                        columns="transcript_id",
                                        values="TPM").T

        # push the table into R, coerce every column to numeric and
        # correlate the columns with each other
        R.assign("p.df", py2ri.py2ri_pandasdataframe(by_transcript))
        for statement in ('''p.mat <- apply(p.df, 2, as.numeric)''',
                          '''cor.df <- cor(p.mat)'''):
            R(statement)

        return py2ri.ri2py_dataframe(R["cor.df"])
コード例 #5
0
def buildPCAScores(infile, outfile):
    '''
    Output the PCA scores - mainly for reporting.

    Sources microarray_utils.R from the PARAMS "rdir" directory, runs
    runPCA() on `infile` and passes the result plus `outfile` to the
    R function buildPCAScores().
    '''
    utils_source = '''source("%s/microarray_utils.R")''' % PARAMS.get("rdir")
    R(utils_source)

    pca_call = '''pc.dat <- runPCA("%s")''' % infile
    R(pca_call)

    scores_call = '''buildPCAScores(pc.dat, "%s")''' % outfile
    R(scores_call)
コード例 #6
0
def plot_JS_EN_scatter_by_pairs(stats, output_file=None, pair=None, **kw):
    """Scatter plot of the 'js' value of `pair` against the summed 'EN'
    values of its members.

    `stats` maps a triad to a sequence of results; only the first
    element of each result is read.  With `output_file` the plot is
    saved through ggsave, otherwise it is printed and the function
    waits for Enter.
    """
    results = [r for triad in stats for r in stats[triad]]
    x = [r[0]['js'][pair] for r in results]
    y = [sum(r[0]['EN'][t] for t in pair) for r in results]

    title = str(len(x)) + ' samples'
    if output_file:
        title = output_file + ', ' + title
    print(title)

    globalenv['df'] = qcrop([x], [y])

    pair_label = ' to '.join(pair)
    cmd = ''.join([
        'gg <- ggplot(df, aes(x,y)) + ',
        'geom_point(aes(xcrop, ycrop), alpha=0.2) + ',
        'stat_smooth(method="loess", color="white", size=1.5, alpha=0.2, se=FALSE) + ',
        'stat_smooth(method="loess", color="black") + ',
        'xlab("' + pair_label + ' JSD") + ',
        'ylab(bquote(.("' + pair_label + '") ~ d[ENS])) + coord_flip()',
    ])
    R(cmd)

    if not output_file:
        print(R['gg'])
        raw_input('Press Enter to continue...')
    else:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
コード例 #7
0
def nmiConservationFisherTest(infile, outfile):
    '''Source the proj007 R helpers from the configured scripts
    directory and call their nmi_conservation() with `infile` and
    `outfile`.'''
    R('''source("%s/R/proj007/proj007.R")''' % PARAMS["scriptsdir"])
    R('''nmi_conservation(infile="%s", outfile="%s") ''' % (infile, outfile))
コード例 #8
0
def plotMDS(infile, outfile):
    '''
    Perform classical multidimensional scaling (cmdscale) of the
    normalised counts in `infile` and save a scatter plot of the two
    resulting dimensions, coloured by condition, to `outfile`.

    The conditions are parsed from the column names inside the R code.
    '''
    # NOTE(review): this derived name is not used below; the call is
    # kept so P.snip still sees `outfile` exactly as before
    matrix_name = P.snip(outfile, ".pdf") + ".tsv"

    for package in ("gtools", "ggplot2"):
        R('''library(%s)''' % package)

    load_cmd = '''dat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t")''' % infile
    R(load_cmd)

    plot_cmd = '''rownames(dat) <- dat$taxa
         dat <- dat[,1:ncol(dat)-1]
         dat <- dat[, mixedsort(colnames(dat))]
         conds <- unlist(strsplit(colnames(dat),
                         ".R[0-9].*"))[seq(1, ncol(dat)*2, 2)]
         conds <- unlist(strsplit(conds, ".",
                         fixed = T))[seq(2, length(conds)*2, 2)]
         dat <- as.matrix(t(dat))
         dist <- dist(dat)
         ord1 <- cmdscale(dist)
         ord2 <- as.data.frame(ord1)
         ord2$cond <- conds
         plot1 <- ggplot(ord2, aes(x = V1, y = V2, colour = cond))
         plot2 <- plot1 + geom_point(size = 3)
         cols <- rainbow(length(unique(conds)))
         plot3 <- plot2 + scale_colour_manual(values = c(cols))
         ggsave("%s")''' % outfile
    R(plot_cmd)
コード例 #9
0
def plot_JS_EN_scatter_by_pairs(stats, output_file=None, pair=None, **kw):
    """Scatter plot of the paralinear distance of `pair` against its
    summed 'EN' values.

    For every result in `stats` the 'EN' values of the members of
    `pair` are summed for the first and the second element of the
    result.  Results whose paralinear distance is falsy are skipped.
    Summary statistics are printed for both sums; only the first is
    plotted.  With `output_file` the plot is saved through ggsave,
    otherwise it is printed and the function waits for Enter.
    """
    en_first = []
    para_values = []
    en_second = []
    for triad in stats:
        for r in stats[triad]:
            distances = get_paralinear_distances(r[0]['gene'], **kw)
            first_sum = sum(r[0]['EN'][t] for t in pair)
            second_sum = sum(r[1]['EN'][t] for t in pair)
            para = distances[pair]
            if not para:
                continue
            en_first.append(first_sum)
            para_values.append(para)
            en_second.append(second_sum)

    print('paralinear stats')
    print_stats(en_first, para_values)
    print('GTR stats')
    print_stats(en_first, en_second)

    globalenv['df'] = DataFrame({'x': FloatVector(en_first),
                                 'y': FloatVector(para_values)})

    pair_label = ' to '.join(pair)
    cmd = ''.join([
        'gg <- ggplot(df, aes(x, y)) + geom_point(alpha=0.2) + ',
        'geom_abline(intercept=0, slope=1, color="white") + ',
        'xlab(bquote(.("' + pair_label + '") ~ d[ENS])) + ',
        'ylab(bquote(.("' + pair_label + '") ~ d[para])) + ',
        'coord_cartesian(xlim=c(0,1), ylim=c(0,1))',
    ])
    R(cmd)

    if not output_file:
        print(R['gg'])
        raw_input('Press Enter to continue...')
    else:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
コード例 #10
0
def get_exons(mart):
    """Queries a Mart object to find all exons of its dataset attribute.

    Forms a specific getBM query that is sent to the BioMart API to
    retrieve information about the exons (and their exonic coordinates)
    of a specific Dataset. The output is then transformed via the GRanges
    Bioconductor package and seqnames converted to UCSC standard.

    Args:
        mart: an rpy2-converted biomaRt Mart object.

    Returns:
        An rpy2 DataFrame containing a table of relevant exon information.
        DataFrame column headers are:
        ["seqnames", "start", "end", "width", "strand"]
    """
    exons = R.getBM(attributes=StrVector(("chromosome_name",
                                          "exon_chrom_start",
                                          "exon_chrom_end",
                                          "strand")),
                    mart=mart)

    # Translate the strand column element by element.  Comparing the
    # whole R vector with the string '1L' was never true, so every
    # exon ended up on the '-' strand.
    # NOTE(review): assumes biomaRt reports forward strand as 1 - confirm
    strands = StrVector(['+' if code == 1 else '-'
                         for code in exons.rx2('strand')])

    exons_ranges = R.GRanges(
        seqnames=exons.rx2('chromosome_name'),
        ranges=R.IRanges(start=exons.rx2('exon_chrom_start'),
                         end=exons.rx2('exon_chrom_end')),
        strand=strands)

    # The replacement function has to be fetched by its backquoted name,
    # see https://stackoverflow.com/questions/38806898/
    set_method = R("`seqlevelsStyle<-`")
    exons_ranges = set_method(exons_ranges, "UCSC")

    as_data_frame = R("function(x) as.data.frame(x)")
    return as_data_frame(exons_ranges)
コード例 #11
0
def _filter_unique_probes(lines):
    '''Return (probe, gene) pairs for probes seen with exactly one
    non-empty gene assignment, in order of first appearance.'''
    counts = collections.defaultdict(int)
    probe2gene = {}
    order = []
    for line in lines:
        data = line.rstrip("\n").split("\t")
        if len(data) < 2:
            # blank or malformed line
            continue
        probe, gene = data[0], data[1]
        # write.table quotes strings, so an empty gene arrives as '""'
        if gene.strip('"') == '':
            continue
        if probe not in probe2gene:
            order.append(probe)
        probe2gene[probe] = gene
        counts[probe] += 1
    return [(probe, probe2gene[probe])
            for probe in order if counts[probe] == 1]


def buildProbe2GeneMap(infile,
                       outfile,
                       PARAMS,
                       platform="affy"):
    '''
    Build a file mapping probe id to gene id.

    For platform "affy" the probes are read with ReadAffy() and mapped
    through biomaRt, using PARAMS "affy_array" as attribute/filter and
    PARAMS "affy_dataset" as the ensembl dataset; `infile` is not read
    in this branch.  Probes without a gene assignment and probes with
    more than one assignment (cross-hybridising) are left out.

    For any other platform `infile` is read with limma's read.ilmn and
    the probe/TargetID table is written unfiltered.
    '''
    if platform == "affy":
        array = PARAMS.get("affy_array")
        dataset = PARAMS.get("affy_dataset")

        R('''library("biomaRt")''')
        R('''library("affy")''')
        R('''dat <- ReadAffy()''')

        E.info("getting probes")
        R('''probes <- featureNames(dat)''')

        E.info("getting mart")
        R('''mart <- useMart("ensembl", dataset = "%s")''' % dataset)

        # matches to hgnc symbol - this might not be appropriate for mouse data...
        E.info("mapping probes to gene")
        R('''probe2gene <- getBM(attributes = c("%s", "external_gene_name"), filters = "%s", values = probes, mart = mart)''' % (array, array))
        R('''colnames(probe2gene) <- c("probe", "gene")''')
        R('''probe2gene$gene <- toupper(probe2gene$gene)''')

        # the filtering is done in Python on a temporary dump of the table
        temp = P.getTempFile(".")
        E.info("writing temp file")
        R('''write.table(probe2gene, file = "%s", sep = "\t", row.names = F)''' % temp.name)
        temp.close()

        E.info("filtering probes")
        try:
            with open(temp.name) as inf:
                header = inf.readline()
                unique_probes = _filter_unique_probes(inf)
        finally:
            # remove the temporary file even if parsing fails
            os.unlink(temp.name)

        with open(outfile, "w") as outf:
            outf.write(header)
            for probe, gene in unique_probes:
                outf.write("%s\t%s\n" % (probe, gene))
    else:
        R('''
          library(limma)
          # read in data - maintain detection p-values for bg correction
          dat <- read.ilmn(files = "%s", other.columns = "Detection")
          probe2gene <- data.frame("probe" = rownames(dat), "gene" = dat$genes$TargetID)
          write.table(probe2gene, file = "%s", row.names = F, sep = "\t")
          ''' % (infile, outfile))
コード例 #12
0
ファイル: bar.py プロジェクト: HuttleyLab/geneticdistance
def plot_bar(stats, output_file=None, **kw):
    """Box plot of goodness-of-fit p-values per model, faceted by data set.

    `stats` maps a triad to a sequence of results; each result is a
    sequence of per-model records with the keys 'name', 'with_rate',
    'gs_p' and 'from_directory'.  The data set label is derived from
    the directory of the first record of each result.

    For every data set and model the fraction of p-values above 0.05
    is printed.  With `output_file` the plot is saved through ggsave,
    otherwise it is printed and the function waits for Enter.
    """
    # model labels are taken from the first result of the first triad
    first_result = list(stats.values())[0][0]
    names = [r['name'] + ('+Gamma' if r['with_rate'] else '')
             for r in first_result]

    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)

    # transpose: one tuple of p-values per model instead of per result
    for d in by_dir:
        by_dir[d] = list(zip(*[[gs_p(_r['gs_p']) for _r in r]
                               for r in by_dir[d]]))

    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, v in by_dir.items():
        path_parts = d.split('/')
        if 'exons' in path_parts:
            dataset = 'Nuclear'
        elif 'mtDNA' in path_parts:
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print(dataset)
        for j, g in enumerate(v):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            passed = sum(1 for _g in g if _g > 0.05)
            # float division: with two ints Python 2 truncated the
            # fraction to 0 or 1
            print('%s %s' % (names[j], passed / float(len(g))))
            alpha = max(alpha, get_alpha(g))
        print('Samples %d' % len(g))
    labels = 'expression(' + ','.join(names) + ')'

    globalenv['df'] = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    R('library(scales)')
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
            'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
            'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",'+str(alpha)+')) + ' + \
            'scale_x_discrete(labels=' + labels + ') + ' + \
            'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
            'facet_grid(. ~ data)'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print(R['gg'])
        raw_input('Press Enter to continue...')
コード例 #13
0
def covarFilter(infile, time_points, replicates, quantile):
    '''
    Filter gene list based on the distribution of the
    sums of the covariance of each gene.  This is highly
    recommended to reduce the total number of genes used
    in the dynamic time warping clustering to reduce the
    computational time.  The threshold is placed at the
    intersection of the expected and observed value
    for the given quantile.

    `infile` is a tab-separated table with an index column and the
    columns "replicates" and "times", which are dropped before the
    covariance is computed.  `quantile` is used as an integer index
    into the result of R's quantile().  `time_points` is not modified.

    Returns the filtered table, with the times and replicates
    appended, transposed and converted to a pandas object.
    '''

    # sort a copy: the in-place sort used to reorder the caller's list
    ordered_times = sorted(time_points)
    time_rep_comb = list(itertools.product(ordered_times, replicates))
    time_cond = ro.StrVector([x[0] for x in time_rep_comb])
    rep_cond = ro.StrVector([x[1] for x in time_rep_comb])

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    df = df.drop(['replicates', 'times'], axis=1)
    df = df.fillna(0.0)

    # convert data frame and import into R namespace
    # py2ri requires activation
    pandas2ri.activate()
    R.assign('diff_data', pandas2ri.py2ri(df))

    E.info("loading data frame")

    # need to be careful about column headers and transposing data frames
    R('''trans_data <- data.frame(diff_data)''')
    R('''times <- c(%s)''' % time_cond.r_repr())
    R('''replicates <- c(%s)''' % rep_cond.r_repr())

    # calculate the covariance matrix for all genes
    # sum each gene's covariance vector
    E.info("calculating sum of covariance of expression")

    R('''covar.mat <- abs(cov(trans_data))''')
    R('''sum.covar <- rowSums(covar.mat)''')
    R('''exp.covar <- abs(qnorm(ppoints(sum.covar),'''
      '''mean=mean(sum.covar), sd=sd(sum.covar)))''')
    R('''sum.covar.quant <- quantile(sum.covar)''')
    R('''exp.covar.quant <- quantile(exp.covar)''')

    E.info("filter on quantile")

    # keep genes above both the observed and the expected quantile
    R('''filtered_genes <- names(sum.covar[sum.covar > '''
      '''sum.covar.quant[%(quantile)i]'''
      ''' & sum.covar > exp.covar.quant[%(quantile)i]])'''
      % {"quantile": quantile})
    R('''filtered_frame <- data.frame(diff_data[, filtered_genes],'''
      '''times, replicates)''')

    # load data and convert to pandas object
    return pandas2ri.ri2py(R["filtered_frame"]).T
コード例 #14
0
def Rconnect():
    '''
    Connect to the SQLite database named in PARAMS "database_name"
    from within R and return the R connection object.

    The RSQLite and sciplot libraries are loaded as a side effect and
    the connection is also left in the R variable `con`.
    '''
    for package in ("RSQLite", "sciplot"):
        R('''library("%s")''' % package)
    R('''drv <- dbDriver("SQLite")''')

    connect_cmd = '''con <- dbConnect(drv, dbname = "%s") ''' % PARAMS[
        "database_name"]
    R(connect_cmd)
    return R('''con''')
コード例 #15
0
def plotFigure1cGCContent(infiles, outfiles):
    '''Figure 1c: density plots of GC content.

    Calls speciesPlot() from proj007.R twice on the directory of the
    first input file: once for the CAPseq plot (first output file)
    and once for the control plot (second output file).
    '''
    capseq_out, control_out = outfiles
    indir = os.path.dirname(infiles[0])
    R('''source("%s/R/proj007/proj007.R") ''' % PARAMS["scriptsdir"])

    # NOTE(review): both plots use the same "*testes-cap..." file
    # pattern - confirm that the control plot should not read the
    # control exports instead
    plot_template = '''speciesPlot(dir="%s", pattern="*testes-cap.replicated.gc.export", main="%s", xlab="GC Content", filename="%s", plotcol=%i, xlimit=c(0,1), ylimit=c(0,15))'''
    panels = (("Testes CAPseq", capseq_out, 2),
              ("Testes Control", control_out, 3))
    for title, filename, colour in panels:
        R(plot_template % (indir, title, filename, colour))
コード例 #16
0
def load_stone_in_sling(path_sling, stone_name, exts=('', '.py', '.ipynb')):
    """Load the attribute `stone_name` from the "sling" at `path_sling`.

    If `path_sling` does not exist as given, it is looked up below
    CONFIG['PATH_SLINGS'], trying each suffix in `exts` in turn.

    The loader is picked by file extension:
      * ``.py``    - the file is executed as module "sling" and the
                     attribute `stone_name` is returned.
      * ``.ipynb`` - the notebook is loaded through nbimporter and the
                     attribute `stone_name` is returned.
      * ``.R``     - the source is evaluated with rpy2; a callable
                     wrapping the evaluation result is returned
                     (`stone_name` is not used in this branch).

    Returns None, after printing a message, when an argument is empty
    or the sling cannot be found.  Also returns None when the file has
    none of the extensions above.
    """
    if not path_sling or not stone_name:
        print('!! sling or stone not specified')
        return None

    if not os.path.exists(path_sling):
        new_path = None
        for ext in exts:
            candidate = os.path.join(CONFIG['PATH_SLINGS'], path_sling + ext)
            if os.path.exists(candidate):
                new_path = candidate
                break
        if not new_path:
            print("!!", path_sling, "does not exist")
            return None
        path_sling = new_path

    if path_sling.endswith('.py'):
        # Only the availability of importlib.util is probed here.  An
        # ImportError raised while executing the sling itself must reach
        # the caller rather than trigger a second load through `imp`.
        try:
            import importlib.util as importlib_util
        except ImportError:
            importlib_util = None

        if importlib_util is not None:
            spec = importlib_util.spec_from_file_location("sling", path_sling)
            sling = importlib_util.module_from_spec(spec)
            spec.loader.exec_module(sling)
        else:
            import imp
            sling = imp.load_source('sling', path_sling)
        return getattr(sling, stone_name)

    if path_sling.endswith('.ipynb'):
        import nbimporter
        nbimporter.options['only_defs'] = CONFIG.get('NBIMPORTER_ONLY_DEFS',
                                                     False)

        ppath, pfn = os.path.split(path_sling)
        pname, _ = os.path.splitext(pfn)

        loader = nbimporter.NotebookLoader(path=[ppath])
        sling = loader.load_module(pname)
        return getattr(sling, stone_name)

    if path_sling.endswith('.R'):
        from rpy2.robjects import r as R
        # load all source
        with open(path_sling) as f:
            code = f.read()

        R('library(RJSONIO)')
        rfunc = R(code)
        return lambda _path: rconvert(rfunc(_path))

    # unsupported extension
    return None
コード例 #17
0
ファイル: PipelineKEGG.py プロジェクト: santayana/cgat
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package. Note that since KEGG is no longer publicly available,
    this is not up-to-date and may be removed from bioconductor in
    future releases.

    Entrez gene ids are translated to ensembl gene ids through
    PipelineBiomart.biomart_iterator.  One line is written to
    `outfile` per gene/pathway pair with the columns ontology,
    gene_id, kegg_ID, kegg_name and evidence.  Genes whose name is
    not an integer or that have no ensembl id are skipped.
    '''

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    entrez2ensembl = PipelineBiomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    entrez2ensembl = dict(
        (x['entrezgene'], x['ensembl_gene_id']) for x in entrez2ensembl)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # the numeric part of a pathway id is the key into pathid2name
    pathway_pattern = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally makes sure the output is flushed and closed
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):

            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathway_pattern.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write("\t".join(["kegg", ensid,
                                      str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
コード例 #18
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    ''' import the KEGG annotations from the R KEGG.db
    annotations package. Note that since KEGG is no longer
    publicly available, this is not up-to-date and may be removed
    from bioconductor in future releases.

    Entrez gene ids are translated to ensembl gene ids with a biomaRt
    getBM query.  One line is written to `outfile` per gene/pathway
    pair with the columns ontology, gene_id, kegg_ID, kegg_name and
    evidence.  Genes whose name is not an integer or that have no
    ensembl id are skipped.
    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(attributes=ro.StrVector(
        ["ensembl_gene_id", "entrezgene"]),
                             mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # the numeric part of a pathway id is the key into pathid2name
    pathway_pattern = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally makes sure the output is flushed and closed
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathway_pattern.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write("\t".join(["kegg", ensid,
                                      str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
コード例 #19
0
def runPCA(infile, outfile, rownames=1):
    '''
    Run a principal components analysis (prcomp) on the transposed
    normalised matrix in `infile`.

    The scores are written to `outfile` with the sample name as first
    column.  The second row of the importance table of summary() is
    written to P.snip(outfile, ".tsv") + ".ve.tsv".  `rownames` is
    the column of `infile` holding the row names.
    '''
    # read in and format data
    read_cmd = '''dat <- read.csv("%s",
                          header=T,
                          stringsAsFactors=F,
                          sep="\t",
                          row.names=%i)''' % (infile, rownames)
    R(read_cmd)

    # run the PCA, then move the sample names into the first column
    # of the score table
    for statement in (
            '''pc.dat <- prcomp(as.matrix(t(dat)))''',
            '''pc.dat.scores <- data.frame(pc.dat$x)''',
            '''pc.dat.scores$sample <- rownames(pc.dat.scores)''',
            '''pc.dat.scores <- pc.dat.scores[, c("sample", 
                                          colnames(pc.dat.scores)[1:ncol(pc.dat.scores)-1])]'''):
        R(statement)

    scores_cmd = '''write.table(pc.dat.scores,
                     file="%s",
                     sep="\t",
                     quote=F,
                     row.names=F)''' % outfile
    R(scores_cmd)

    # get the variance explained
    variance_file = P.snip(outfile, ".tsv") + ".ve.tsv"
    for statement in ('''ve <- data.frame(summary(pc.dat)$importance)''',
                      '''ve <- ve[2,]'''):
        R(statement)
    variance_cmd = '''write.table(ve,
                     file="%s",
                     sep="\t",
                     quote=F,
                     row.names=F)''' % variance_file
    R(variance_cmd)
コード例 #20
0
def buildLcaProportionsAcrossSamples(infile, outfile, dtype="pathway"):
    '''
    Build the proportion of reads mapped to each taxonomic level
    per sample.

    With dtype "pathway" the rows of `infile` are first summed per
    value of the "taxa" column; any other dtype uses the table as
    read.  Every column is then divided by its column sum and the
    result is written to `outfile` with the row names in a "taxa"
    column.
    '''
    R('''library(dplyr)''')
    read_cmd = '''dat <- read.csv(
                         "%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t",
                         row.names=1
                        )''' % infile
    R(read_cmd)

    if dtype != "pathway":
        R('''dat <- dat''')
    else:
        for statement in (
                '''dat <- data.frame(dat %>% group_by(taxa)
                               %>% summarise_each(funs(sum)))''',
                '''rownames(dat) <- dat$taxa''',
                '''dat <- dat[,2:ncol(dat)]'''):
            R(statement)

    # scale every column by its total
    R('''dat.t <- data.frame(sweep(as.matrix(dat),
                                   2,
                                   colSums(dat), "/"))''')
    R('''dat.t$taxa <- rownames(dat.t)''')

    write_cmd = '''write.table(dat.t, file = "%s",
                     sep = "\t",
                     quote=F,
                     row.names = F)''' % outfile
    R(write_cmd)
コード例 #21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last command line argument is taken as the input file.  It is
    imported in R with rtracklayer::import.bw and passed to
    wiggle2Hilbert() from R/wiggle2hilbert.R, which is looked up one
    directory above the directory of this file.  The image directory
    is created if it does not exist.

    Raises ValueError if --images-dir was not given and IOError if
    the R scripts directory does not exist.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--images-dir",
                      dest="images_dir",
                      type="string",
                      help="directory to save hilbert curves image files to")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]
    # NOTE(review): the name is taken from the SECOND path component,
    # i.e. the input is expected to look like "<dir>/<pref>.<x>-<spec>..."
    name_fields = infile.split("/")[1].split(".")
    pref = name_fields[0]
    spec = name_fields[1].split("-")[1]
    header = "%s-%s" % (pref, spec)

    image_dir = options.images_dir
    if image_dir is None:
        # fail with a clear message instead of a TypeError further down
        raise ValueError("--images-dir must be specified")
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

    # set path for R scripts to source
    lib_dir = os.path.dirname(__file__)
    root_dir = os.path.dirname(lib_dir)
    r_dir = os.path.join(root_dir, "R")

    # test R scripts directory - fail if not present
    # (the former `assert r_dir` could never fail: the joined path is
    # always a non-empty string)
    if not os.path.isdir(r_dir):
        raise IOError("R scripts directory not found: %s" % r_dir)

    R('''suppressPackageStartupMessages(library(rtracklayer))''')
    R('''data.rle <- rtracklayer::import.bw(con="%s", '''
      '''as="Rle")''' % infile)
    R('''source("%s/wiggle2hilbert.R")''' % r_dir)
    R('''wiggle2Hilbert(wiggleRle=data.rle, '''
      '''image.dir="%s", datName="%s")''' % (image_dir, header))

    # write footer and output benchmark information.
    E.Stop()
コード例 #22
0
ファイル: Requirements.py プロジェクト: wangdi2014/cgat
def getRPackageList():
    '''return a dictionary of installed R packages
    mapping to their version.

    Names and versions are read as two plain columns of
    installed.packages().  The names used to be taken by iterating a
    two-column matrix, which only paired up with the versions through
    column-major iteration and the truncation done by zip().
    '''
    names = R('''installed.packages()[, "Package"]''')
    versions = R('''installed.packages()[, "Version"]''')
    return dict(zip(names, versions))
コード例 #23
0
def plotCoverageHistogram(infile, outfile):
    '''
    Plot the coverage over kmers as a length-weighted histogram.

    The statistics are read from P.snip(infile, ".contigs.fa") +
    ".stats.txt" and the PDF is written next to that file, with
    ".txt" replaced by ".pdf".  `outfile` is not used.
    '''
    stats_file = P.snip(infile, ".contigs.fa") + ".stats.txt"
    pdf_file = P.snip(stats_file, ".txt") + ".pdf"

    r_commands = (
        '''library(plotrix)''',
        '''data = read.table("%s", header=TRUE)''' % stats_file,
        '''pdf("%s", height = 7, width = 7 )''' % pdf_file,
        '''weighted.hist(data$short1_cov, data$lgth, breaks=seq(0, 200, by=1))''',
    )
    for command in r_commands:
        R(command)

    # close the PDF device
    R["dev.off"]()
コード例 #24
0
ファイル: sqlgeo.py プロジェクト: UH-CI/ngeo_analytics
def get_data_table_by_id(id, cache):
    """Fetch a GEO record with GEOquery and return its data table.

    `id` is passed to getGEO() and `cache` is used as its destdir.
    The result of Table() on the record is converted with
    pandas2ri.ri2py before it is returned.
    """
    # the lower-cased three letter prefix is exported to R as well,
    # although the R code below does not read it
    r_variables = (("id", id),
                   ("id_type", id[0:3].lower()),
                   ("cache", cache))
    for name, value in r_variables:
        R.assign(name, value)

    R("""
        library(GEOquery)
        data = getGEO(id, destdir=cache)
    """)
    return pandas2ri.ri2py(R("Table(data)"))
コード例 #25
0
def plotFilteredSamples(infiles, outfiles):
    '''Create a plot of the SNP profiles for each filtered sample.

    `infiles` is a pair (error profile, OTU assignment fasta).  The
    OTU of the sample is looked up in the "_up.txt" file belonging to
    the fasta file.  The plot is written to
    14_filter_sample_error_profiles.dir/<otu>_<sample>.pdf;
    `outfiles` is not used.
    '''

    error_profile, otu_assignment = infiles

    # map sample id (first field, up to the first ';') to OTU id
    # (last field)
    otu_assignment = P.snip(otu_assignment, '.fasta') + '_up.txt'
    otu_dict = {}
    with open(otu_assignment) as handle:
        for row in handle:
            fields = row.split()
            if not fields:
                # tolerate blank lines
                continue
            otu_dict[fields[0].split(';')[0]] = fields[-1]

    def _fetch_loci(infile):
        '''Return a Locus/Frequency dataframe parsed from the first
        line of infile (tab-separated "locus,frequency" pairs).'''
        with open(infile) as profile:
            first_line = profile.readline()

        # Some samples have no snps...
        if not first_line:
            L.warn('Sample %s has no SNPs' % infile)
            idx = list(range(1, 1501))
            snp = [0] * 1500
            df = pd.DataFrame([idx, snp]).transpose()
        else:
            df = pd.DataFrame(
                [x.split(',') for x in first_line.split('\t')])

        df.columns = ['Locus', 'Frequency']
        df = df.applymap(float)

        return df

    sample_id = P.snip(error_profile, '_true_snps.tsv', strip_path=True)
    otu_id = otu_dict[sample_id]
    outfile = os.path.join('14_filter_sample_error_profiles.dir',
                           otu_id + '_' + sample_id + '.pdf')

    R('''rm(list=ls())''')
    R('''require('ggplot2')''')
    df = _fetch_loci(error_profile)
    R.assign('df', df)

    R('''require('ggplot2')
         pl <- ggplot(df, aes(x=Locus, xend=Locus, y=0, yend=Frequency)) + geom_segment()
         pl <- pl + theme_bw() + theme(panel.grid=element_blank())
         pl <- pl + xlim(0, 1500) + scale_y_continuous(expand=c(0,0), limits=c(0, 100))
         pl <- pl + xlab('Position Along 16S Gene') + ylab('Frequency (%%)')
         pl <- pl + ggtitle('%s\n%s')
         pdf('%s', height=3, width=5)
         plot(pl)
         dev.off()
      ''' % (otu_id, sample_id, outfile))

    R('''rm(list=ls())''')
コード例 #26
0
def calculatePerSampleMeasurementError(infile, outfile):
    '''Calculate the measurement error across variable loci for all
    technical replicates of one sample.

    `infile` is a tab-separated table with the loci as index and one
    column per replicate; the sequencing depth is parsed from the
    second "_"-separated field of each column name.  A single line
    with the sample id, the mean depth and the measurement error is
    written to `outfile`.

    No output is written when the table has fewer than two columns.
    '''

    R('''rm(list=ls())''')

    zeta_w = R('''
               zeta_w <- function(df, y="value", x="locus"){
               res = anova(lm(value ~ locus, data=df))
               return(sqrt(res[["Mean Sq"]][2]))
               }
    ''')

    # Hack... I forgot about samples with only a single locus
    zeta_w2 = R('''
                zeta_w2 <- function(df){
                v = apply(df, 1, var)
                m = mean(v)
                return(sqrt(m))
                }
    ''')

    # Open the dataframe and check that there is more than one measurement...
    df = pd.read_table(infile, sep='\t', index_col=0)
    if len(df.columns) < 2:
        return

    # Fetch the sequencing depths
    depths = [float(x.split('_')[1]) for x in df.columns]
    mean_depth = np.mean(depths)

    sample_id = P.snip(infile, '.tsv', strip_path=True)

    if len(df.index) == 1:
        L.warn('Sample %s has only one variable locus' %
               os.path.basename(infile))
        m_error = zeta_w2(df)[0]
    else:
        # melt the dataframe
        df['locus'] = [str(x) for x in df.index]
        df = df.melt(id_vars='locus')

        # calculate the measurement error
        m_error = zeta_w(df)[0]

    # The output is only opened once the error has been computed, so a
    # failure in R no longer leaves a header-only file and an open
    # handle behind.
    with open(outfile, 'w') as outf:
        outf.write('SampleID\tMeanDepth\tMeasurementError\n')
        outf.write('\t'.join(map(str, [sample_id, mean_depth, m_error])) +
                   '\n')
コード例 #27
0
def plotPathwayGenes(infile, outfile):
    '''
    plot the genes that are differentially expressed
    and fall into pathways

    `infile` is a tab-separated table with a header line; the R code
    reads its ``pathway``, ``gene`` and ``l2fold`` columns. For every
    pathway one bar plot of the mean l2fold per gene is saved to
    ``pathways.dir/<track>.<pathway>.genes.pdf``, where <track> is the
    basename of `infile` with ".genes" removed. `outfile` is touched as
    a completion flag in both branches.
    '''
    # R will not be able to plot anything if none of the
    # differentially expressed genes are associated
    # with a pathway. A placeholder PDF is written if this is the case

    # NOTE(review): the original comments say the colour should be
    # associated with the track; the code draws a random palette index
    # per call instead - confirm which is intended.

    # because the plots can get unwieldy with large
    # gene sets, if there are more than 10 genes
    # associated with a pathway then take the top 10
    # (by absolute fold change).
    # This should be explained in the documentation

    col = random.randrange(1, 600)
    track = os.path.basename(infile).replace(".genes", "")

    # count lines without leaking the file handle
    with open(infile) as inf:
        n_lines = sum(1 for _ in inf)

    if n_lines <= 1:
        # header only (or empty file): nothing to plot.
        # dev.off() is required to flush and close the PDF device,
        # otherwise the placeholder file is left unfinished.
        R('''pdf("%s")
             plot(c(0,1,2,3,4), c(0,1,2,3,4), cex = 0)
             text(2, y = 2, labels = "No genes were associated with pathways", cex = 1)
             dev.off()
          ''' % outfile.replace(".plots", ".pdf"))
    else:
        # NB. size of plot should be proportional to the
        # number of genes in the pathways
        R('''
          library("ggplot2")
          dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")
          pathways <- unique(dat$pathway)
          for (p in pathways){
              toPlot <- aggregate(l2fold~gene, dat[dat$pathway == p,], mean)
              # "/" in name not compatible with outfile names;
              # replace every occurrence, not just the first
              p <- gsub("/", "|", p, fixed = TRUE)
              outf <- paste(paste("pathways.dir/", paste("%s", p, sep = "."), sep = ""), "genes.pdf", sep = ".")
              cols <- col2rgb(%i)
              col <- rgb(cols[1], cols[2], cols[3], maxColorValue = 255)
              toPlot$col <- col
              if (nrow(toPlot) > 10){
                  toPlot <- toPlot[order(abs(toPlot$l2fold), decreasing = T),][1:10,]}
              plot1 <- ggplot(toPlot, aes(x = gene, y = l2fold, fill = col, stat = "identity")) + geom_bar(stat = "identity") + coord_flip() + scale_fill_manual(values = toPlot$col)
              plot1 + ggtitle(p) + theme(text = element_text(size = 40, color = "black"), axis.text = element_text(colour = "Black"))
              ggsave(file = outf, width = 11, height = nrow(toPlot), limitsize = F)
          }
        ''' % (infile, track, col))

    P.touch(outfile)
コード例 #28
0
def MAPlot(infile,
           threshold_stat,
           p_threshold,
           fc_threshold,
           outfile):
    '''
    MA plot the results

    Reads the tab-separated table `infile` into R and plots ``logFC``
    against ``AveExpr``. Points are coloured blue when they pass both
    thresholds and black otherwise, and the plot is saved to `outfile`.

    threshold_stat
        "p" selects the ``P.Value`` column; any other value
        (including "padj") selects ``adj.P.Val``.
    p_threshold
        a row is significant when the selected column is below this.
    fc_threshold
        a row is significant when ``abs(logFC)`` is above this.
    '''
    p = "P.Value" if threshold_stat == "p" else "adj.P.Val"

    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F, sep = "\t")''' % infile)
    # %r on a float keeps full precision; the previous %f rounded to six
    # decimals, turning thresholds below 1e-6 into 0.000000
    R('''dat$sig <- ifelse(dat$%s < %r & abs(dat$logFC) > %r, 1, 0)'''
      % (p, float(p_threshold), float(fc_threshold)))

    R('''a <- aes(x = AveExpr, y = logFC, colour = factor(sig))''')
    R('''plot1 <- ggplot(dat, a)''')
    R('''plot2 <- plot1 + geom_point(alpha = 0.5)''')
    R('''plot3 <- plot2 + scale_colour_manual(values = c("black", "blue"))''')
    # save the finished plot explicitly rather than relying on last_plot()
    R('''ggsave("%s", plot = plot3)''' % outfile)
コード例 #29
0
 def __init__(self,
              y,
              spec_uGarch=None,
              nums_uGarch=None,
              DCC_order=None,
              out_of_sample=0,
              DCC_distribution='mvnorm'):
     """Build the R-side specifications and fit a DCC model to ``y``.

     Parameters
     ----------
     y
         Two-dimensional data (``y.shape`` must unpack into two values).
         A pandas DataFrame is converted with ``pandas2ri``; anything
         else is passed to R after activating ``numpy2ri``.
     spec_uGarch
         List of univariate spec objects exposing an ``R_name``
         attribute. When None, a single default ``uGarch_spec()`` is
         replicated for every column of ``y`` and ``nums_uGarch`` is
         overridden accordingly.
     nums_uGarch
         Replication count for each entry of ``spec_uGarch``; must have
         the same length.
     DCC_order
         Passed to ``dccspec`` as ``dccOrder``; defaults to ``[1, 1]``.
     out_of_sample
         Passed as the third positional argument of ``dccfit``
         (presumably ``out.sample`` - confirm against rmgarch).
     DCC_distribution
         Passed to ``dccspec`` as ``distribution``.

     Side effects: increments the module-level ``id_uGarch_spec`` counter,
     which is used to give the R objects unique names, and activates the
     rpy2 pandas (and possibly numpy) converters globally.
     """
     global id_uGarch_spec
     id_uGarch_spec += 1
     # unpacking also enforces that y is two-dimensional
     _, N = y.shape
     # creates the default uGarch_spec for every column, if not provided
     # (identity test: `== None` is ambiguous for array-like arguments)
     if spec_uGarch is None:
         spec_uGarch = [uGarch_spec()]
         nums_uGarch = [N]
     assert len(spec_uGarch) == len(nums_uGarch), \
         'spec_uGarch and nums_uGarch must have the same length'
     # creates the object multispec (resembles the R one, it contains the
     # multispecification for every Garch)
     str_vec_spec = ','.join(
         f'replicate({num},{spec.R_name})'
         for spec, num in zip(spec_uGarch, nums_uGarch))
     self.R_uGarch_multispec_name = 'uGarchmulti_spec' + str(id_uGarch_spec)
     R(f'{self.R_uGarch_multispec_name}=multispec(c({str_vec_spec}))')
     self.R_uGarch_multispec = R[self.R_uGarch_multispec_name]
     # creates the global DCC specification object (dccspec, in R)
     if DCC_order is None:
         DCC_order = [1, 1]
     str_DCC_order = mat2rSyntax(DCC_order)
     self.R_DCC_spec_name = 'DCC_spec' + str(id_uGarch_spec)
     self.R_DCC_spec = R(
         f'{self.R_DCC_spec_name} = dccspec(uspec = {self.R_uGarch_multispec_name}, dccOrder = {str_DCC_order}, distribution = \'{DCC_distribution}\')'
     )
     # fits the data with the DCC_spec
     self.R_DCCfit_name = 'DCC_fit' + str(id_uGarch_spec)
     R_DCCfit_func = R('dccfit')
     pandas2ri.activate()
     if isinstance(y, pd.DataFrame):
         # NOTE(review): py2ri is the rpy2 2.x name (py2rpy in 3.x) -
         # confirm against the rpy2 version in use
         R_y = pandas2ri.py2ri(y)
         self.fit = R_DCCfit_func(self.R_DCC_spec, R_y, out_of_sample)
     else:
         rpy2.robjects.numpy2ri.activate()
         self.fit = R_DCCfit_func(self.R_DCC_spec, y, out_of_sample)
     # creates the empty fields that will be extracted from the R fit object
     self.fit_rcor = None
     self._chol_vcv = None
     self.out_of_sample = out_of_sample
     # All but the last two fitted coefficients are laid out as one row of
     # six per series; the last two are kept as the global coefficients.
     # NOTE(review): the original comment listed seven coefficients
     # (mu,ar,ma,omega,alpha,beta,gamma) but the reshape uses six - confirm.
     _coef_ = r_coef_method(self.fit)
     self.coef_ = np.array(_coef_[:-2]).reshape(N, 6)
     self.global_coef_ = np.array(_coef_[-2:])
     self.N = N
コード例 #30
0
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''
    boxplot the relative abundance of false positive
    species compared to true positives

    ``infiles[0]`` names the table of estimated abundances; the matching
    table of true taxa is the first of ``infiles[1:]`` whose basename
    (minus ".load") equals the track derived from ``infiles[0]``. Both
    tables are read from the sqlite database "csvdb". Species present in
    the true table are labelled "tp", all others "fp", and a boxplot of
    log2(abundance) per status is saved to `outfile`.

    The box colour depends on the depth tag in the track name (15M, 30M
    or 50M); tracks without one of these tags are drawn in black instead
    of failing on an unbound variable.
    '''
    tablename_estimate = P.toTable(infiles[0])

    track = P.snip(
        os.path.basename(infiles[0]).replace("metaphlan_", ""), ".load")
    tablename_true = [
        P.toTable(x) for x in infiles[1:]
        if P.snip(os.path.basename(x), ".load") == track
    ][0]

    # table names cannot be bound as SQL parameters, hence the string
    # interpolation; the names come from the pipeline's own file names
    dbh = sqlite3.connect("csvdb")
    try:
        cc = dbh.cursor()
        estimate = {}
        for taxon, rel_abundance in cc.execute(
                """SELECT taxon, rel_abundance FROM %s WHERE taxon_level == 'species'"""
                % tablename_estimate).fetchall():
            estimate[taxon] = rel_abundance
        true = set()
        for row in cc.execute(
                """SELECT taxa FROM %s WHERE level == 'species'"""
                % tablename_true).fetchall():
            true.add(row[0])
    finally:
        dbh.close()

    # first matching depth tag wins, same priority as the original chain
    col = "black"
    for depth, colour in (("15M", "cadetblue"),
                          ("30M", "lightblue"),
                          ("50M", "slategray")):
        if depth in track:
            col = colour
            break

    tmp = P.getTempFile(".")
    inf = tmp.name
    try:
        tmp.write("taxa\tabundance\tstatus\n")
        # items() works on both Python 2 and 3 (iteritems() is 2 only)
        for taxa, abundance in estimate.items():
            status = "tp" if taxa in true else "fp"
            tmp.write("%s\t%f\t%s\n" % (taxa, abundance, status))
        tmp.close()

        R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")'''
          % inf)
        R('''library(ggplot2)''')
        # geom_hline takes `yintercept`; `yintersect` is not a valid argument
        R('''ggplot(dat, aes(x = status, y = log2(abundance))) + geom_boxplot(colour = "%s") + geom_hline(yintercept=0, linetype="dashed")'''
          % col)
        R('''ggsave("%s")''' % outfile)
    finally:
        # always remove the temporary table, even if plotting fails
        os.unlink(inf)
コード例 #31
0
ファイル: R.py プロジェクト: CGATOxford/cgat
 def __new__(cls):
     """Create the instance through ``RBase.__new__``, give it an empty
     command history and register it on the class as ``_instance``."""
     instance = RBase.__new__(cls)
     instance._history = []
     cls._instance = instance
     return instance
コード例 #32
0
ファイル: R.py プロジェクト: CGATOxford/cgat
 def __call__(self, string):
     """Record ``string`` in the command history, evaluate it through
     ``RBase.__call__`` and return the result.

     The result was previously discarded, so every call returned None.
     """
     self._history.append(string)
     return RBase.__call__(self, string)