def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    *columns* is a list of fields to obtain.

    *biomart* and *dataset* denote the
    database and dataset to get the data from.

    returns an iterator over rows.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(attributes=rpy2.robjects.vectors.StrVector(columns),
                     mart=mart)

    # result is a dataframe.
    # rx returns a dataframe.
    # rx()[0] returns a vector
    for data in zip(*[result.rx(x)[0] for x in columns]):
        yield dict(zip(columns, data))
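A minimal usage sketch (not part of the original source): it assumes R,
rpy2 and the biomaRt package are installed and that the attribute names
exist in the chosen dataset.

for row in biomart_iterator(["ensembl_gene_id", "hgnc_symbol"]):
    # each row is a dict keyed by the requested columns
    print(row["ensembl_gene_id"], row["hgnc_symbol"])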
Example 2
    def plot(self, matrix, row_headers, col_headers, path):
        '''plot matrix.

        Large matrices are split into several plots.
        '''

        self.debug("HeatmapPlot started")

        self.startPlot()

        R.library('gplots')
        
        R["heatmap.2"]( matrix,
                        trace = 'none',
                        dendrogram = 'none',
                        col=R.bluered(75),
                        symbreaks = True,
                        symkey = True,
                        cexCol = 0.5,
                        cexRow = 0.5,
                        labRow = row_headers,
                        labCol = col_headers,
                        mar = ro.IntVector((10,10)),
                        keysize = 1 )

        self.debug("HeatmapPlot finished")

        return self.endPlot( None, path )
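A standalone sketch of the same heatmap.2 call outside the plotter class
(an illustration, not the original code): it assumes gplots is installed
and uses rpy2's numpy2ri converter to hand a numpy matrix to R.

import numpy
from rpy2.robjects import r as R
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()

R.library('gplots')
m = numpy.random.randn(10, 5)      # toy data matrix
R['pdf']('heatmap.pdf')            # write the plot to a file
R['heatmap.2'](m, trace='none', dendrogram='none',
               col=R.bluered(75), keysize=1)
R['dev.off']()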
Example 3
def tune_model(scenario, params, log):
    irace_command = create_command(scenario, params, log)

    # Run irace
    process = subprocess.Popen(irace_command.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()

    # Load the irace output and transform it to a csv file
    pandas2ri.activate()

    r.load(log.as_posix())

    r.library('irace')
    b = r.getFinalElites(r.iraceResults, n=0)

    with localconverter(default_converter + pandas2ri.converter):
        r_pd_params = b
    out_csv = Path(log.parent, log.stem + '.csv')
    r_pd_params.to_csvfile(out_csv.as_posix(), sep=',')

    # Import into pandas for pythonic cleaning
    py_pd_params = pd.read_csv(out_csv)
    py_pd_params = py_pd_params.drop(columns=['.ID.', '.PARENT.'])

    # Giving the correct order to rows
    correct_table = {}
    for c in py_pd_params.columns:
        rows = []
        for j in py_pd_params[c].keys():
            rows.append(py_pd_params[c][j])
        correct_table[c] = rows

    correct_table = pd.DataFrame(data=correct_table)
    correct_table.to_csv(out_csv)
Example 4
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    *columns* is a list of fields to obtain.

    *biomart* and *dataset* denote the
    database and dataset to get the data from.

    returns an iterator over rows.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(attributes=rpy2.robjects.vectors.StrVector(columns),
                     mart=mart)

    # result is a dataframe.
    # rx returns a dataframe.
    # rx()[0] returns a vector
    for data in zip(*[result.rx(x)[0] for x in columns]):
        yield dict(zip(columns, data))
Example 5
def q_value_analysis(p_values):
    logging.info('q-value analysis on %d p-values', len(p_values))
    from rpy2.robjects import r, FloatVector
    r.library('qvalue')
    ps = FloatVector(p_values)
    qobj = r.qvalue(ps)
    return qobj
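The returned qobj is an R list; a short sketch of pulling the q-values
back into Python (assuming the qvalue result carries a 'qvalues'
element, as in the Bioconductor documentation):

qobj = q_value_analysis([0.01, 0.20, 0.03, 0.50])
qvalues = list(qobj.rx2('qvalues'))   # back to a plain Python list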
Example 6
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    *columns* is a dictionary mapping biomart columns to columns in
    the output table. *biomart* and *dataset* denote the database and
    dataset to get the data from.

    '''

    R.library("biomaRt")

    keys = columns.keys()

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join([columns[x] for x in keys]) + "\n")

    # for x in ("mim_gene_accession", "mim_morbid_accession"):
    #     result[x] = [ ("", y)[y >= 0] for y in result[x] ]

    for data in zip(*[result[x] for x in keys]):
        outf.write("\t".join(map(str, data)) + "\n")

    outf.close()
Example 7
def computeExpressionLevels(infiles, outfiles):
    """normalize data using gcrma libary.
    
    output a file with the R object and
    another as human readable table.
    """

    outfile_r, outfile_table = outfiles

    R.library("simpleaffy")
    R.library("gcrma")

    E.info("reading data")

    raw_data = R("""raw.data = ReadAffy()""")

    E.info("normalization")

    R("""gcrma.eset = call.exprs( raw.data, "%(normalization_method)s" )""" % PARAMS)

    E.info("saving data")
    R("""save( gcrma.eset, raw.data, file = "%s") """ % outfile_r)

    data = R("""as.list(assayData(gcrma.eset))""")["exprs"]
    probesets, headers = R("""dimnames( assayData(gcrma.eset)$exprs )""")
    headers = [re.sub(".CEL", "", x) for x in headers]

    outf = open(outfile_table, "w")
    outf.write("probeset\t%s\n" % "\t".join(headers))

    for probeset, data in zip(probesets, data):
        outf.write("%s\t%s\n" % (probeset, "\t".join(map(str, data))))
    outf.close()
Example 8
def import_cummeRbund_library():
    """
    imports cummeRbund library [r.library('cummeRbund')] or asks to install it otherwise.
    """
    try:
        r.library('cummeRbund')
    
    except RRuntimeError as exc:
        if "there is no package called" in str(exc):
            
            print '\n\nIt looks like you have not installed "cummeRbund" yet.\nWould you like me to try to install it for you now?\n'
            print '(This will require an active internet connection)'
            
            while 1:
                install_cmrbnd = raw_input('y/n: ')
                if install_cmrbnd == ('y'):
                    print '\nOK, I am going to hand you off to the R install process. Answer any prompts it asks you, and I will see you on the other side.\n\n'
                    raw_input('Press "Enter" when ready...')
                    
                    run_cummeRbund_install()
                    
                    print '\nGreat.  Now lets try this again, and hopefully we will be good to go!\n\nSee you soon!'
                    exit(0)
                    
                elif install_cmrbnd == 'n':
                    print '\nThis script requires that cummeRbund be installed. Please install it manually and try again.\n\nGoodbye.'
                    exit(0)
                    
                else:
                    print "Please type only 'y' or 'n'."
        else:
            raise exc
Example 9
def computeExpressionLevels(infiles, outfiles):
    '''normalize data using the gcrma library.

    output one file with the R object and
    another as a human-readable table.
    '''

    outfile_r, outfile_table = outfiles

    R.library("simpleaffy")
    R.library("gcrma")

    E.info("reading data")

    raw_data = R('''raw.data = ReadAffy()''')

    E.info("normalization")

    R('''gcrma.eset = call.exprs( raw.data, "%(normalization_method)s" )''' %
      PARAMS)

    E.info("saving data")
    R('''save( gcrma.eset, raw.data, file = "%s") ''' % outfile_r)

    data = R('''as.list(assayData(gcrma.eset))''')['exprs']
    probesets, headers = R('''dimnames( assayData(gcrma.eset)$exprs )''')
    headers = [re.sub(".CEL", "", x) for x in headers]

    outf = open(outfile_table, "w")
    outf.write("probeset\t%s\n" % "\t".join(headers))

    for probeset, data in zip(probesets, data):
        outf.write("%s\t%s\n" % (probeset, "\t".join(map(str, data))))
    outf.close()
Example 10
def r_pca_plot(x, argAxes=[1, 2], argChoix="ind"):
    # R PCA plot function; the "FactoMineR" package is required.
    # r_vector is an external helper assumed to convert a Python list
    # to an R vector (e.g. robjects.IntVector).
    r.library("FactoMineR")
    rcmd = r["plot.PCA"]
    raxe = r_vector(argAxes)
    return rcmd(x, choix=argChoix, axes=raxe)
Example 11
def r_conPgsql(user, passwd, dbname):
    # create a PostgreSQL driver instance and open one connection.
    r.library("RPostgreSQL")
    rcmd_drv = r["dbDriver"]
    rcmd_con = r["dbConnect"]
    drv = rcmd_drv("PostgreSQL")
    return rcmd_con(drv, user=user, password=passwd, dbname=dbname)
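A hedged usage sketch for r_conPgsql (placeholder credentials and a
hypothetical table name; dbGetQuery comes with the DBI/RPostgreSQL
packages):

con = r_conPgsql("user", "secret", "mydb")             # placeholders
rcmd_query = r["dbGetQuery"]
res = rcmd_query(con, "SELECT count(*) FROM mytable")  # hypothetical table
print(res)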
Example 12
def q_value_analysis(p_values):
    logging.info("q-value analysis on %d p-values", len(p_values))
    from rpy2.robjects import r, FloatVector

    r.library("qvalue")
    ps = FloatVector(p_values)
    qobj = r.qvalue(ps)
    return qobj
Example 13
    def build_bardmap(self, config):
        """
        Build the BARD reporting base maps.

        Parameters:
            config - The XML configuration.
        """

        # The first geolevel is the base geolevel of EVERYTHING
        lbody = LegislativeBody.objects.all()[0]
        basegl = Geolevel.objects.get(id=lbody.get_base_geolevel())
        gconfig = config.xpath('//GeoLevels/GeoLevel[@name="%s"]' %
                               basegl.name)[0]
        shapefile = gconfig.xpath('Shapefile')[0].get('path')
        srs = DataSource(shapefile)[0].srs
        bconfig = config.find('//Reporting/BardConfigs/BardConfig')
        if srs.name == 'WGS_1984_Web_Mercator_Auxiliary_Sphere':
            # because proj4 doesn't have definitions for this ESRI def,
            # but it does understand 3785
            srs = SpatialReference(3785)

        try:
            # We don't need to install rpy unless we're doing this
            from rpy2.robjects import r
            r.library('rgeos')
            logger.debug("Loaded rgeos library.")
            r.library('BARD')
            logger.debug("Loaded BARD library.")
            sdf = r.readShapePoly(shapefile, proj4string=r.CRS(srs.proj))
            logger.debug("Read shapefile '%s'.", shapefile)

            # The following lines perform the bard basemap computation
            # much faster, but require vast amounts of memory. Disabled
            # by default.
            #fib = r.poly_findInBoxGEOS(sdf)
            #logger.debug("Created neighborhood index file.")
            #nb = r.poly2nb(sdf,foundInBox=fib)

            # nb = r.poly2nb(sdf)
            # logger.debug("Computed neighborhoods.")
            # bardmap = r.spatialDataFrame2bardBasemap(sdf)
            bardmap = r.spatialDataFrame2bardBasemap(sdf, keepgeom=r(False))

            logger.debug("Created bardmap.")
            r.writeBardMap(bconfig.get('shape'), bardmap)
            logger.debug("Wrote bardmap to disk.")
        except:
            logger.info("""
ERROR:

The BARD map could not be computed. Please check the configuration settings
and try again.
""")
            logger.debug(
                "The following traceback may provide more information:")
            logger.debug(traceback.format_exc())
            return False
        return True
Example 14
    def build_bardmap(self, config):
        """
        Build the BARD reporting base maps.

        Parameters:
            config - The XML configuration.
        """

        # The first geolevel is the base geolevel of EVERYTHING
        lbody = LegislativeBody.objects.all()[0]
        basegl = Geolevel.objects.get(id=lbody.get_base_geolevel())
        gconfig = config.xpath('//GeoLevels/GeoLevel[@name="%s"]' % basegl.name)[0]
        shapefile = gconfig.xpath('Shapefile')[0].get('path')
        srs = DataSource(shapefile)[0].srs
        bconfig = config.find('//Reporting/BardConfigs/BardConfig')
        if srs.name == 'WGS_1984_Web_Mercator_Auxiliary_Sphere':
            # because proj4 doesn't have definitions for this ESRI def,
            # but it does understand 3785
            srs = SpatialReference(3785)

        try:
            # We don't need to install rpy unless we're doing this
            from rpy2.robjects import r
            r.library('rgeos')
            logger.debug("Loaded rgeos library.")
            r.library('BARD')
            logger.debug("Loaded BARD library.")
            sdf = r.readShapePoly(shapefile, proj4string=r.CRS(srs.proj))
            logger.debug("Read shapefile '%s'.", shapefile)

            # The following lines perform the bard basemap computation
            # much faster, but require vast amounts of memory. Disabled
            # by default.
            #fib = r.poly_findInBoxGEOS(sdf)
            #logger.debug("Created neighborhood index file.")
            #nb = r.poly2nb(sdf,foundInBox=fib)

            # nb = r.poly2nb(sdf)
            # logger.debug("Computed neighborhoods.")
            # bardmap = r.spatialDataFrame2bardBasemap(sdf)
            bardmap = r.spatialDataFrame2bardBasemap(sdf, keepgeom=r(False))

            logger.debug("Created bardmap.")
            r.writeBardMap(bconfig.get('shape'), bardmap)
            logger.debug("Wrote bardmap to disk.")
        except:
            logger.info("""
ERROR:

The BARD map could not be computed. Please check the configuration settings
and try again.
""")
            logger.debug("The following traceback may provide more information:")
            logger.debug(traceback.format_exc())
            return False
        return True
Example 15
def computePresenceCalls(infiles, outfile):
    """
    modified workflow from https://stat.ethz.ch/pipermail/bioconductor/2005-July/009758.html

    Calculate MAS5 detection pvalues and Present/Marginal/Absent calls.

    Note that this analysis only applies to 3' arrays, but not for
    other arrays like HuGene. The simpleaffy package was designed
    to correct for 3' bias due to the 7' oligo dT primers used
    for building cDNAs. It requires PM and MM probes, which might
    not exist on other arrays.
    """

    R.library("simpleaffy")
    R.library("gcrma")

    infile_r, infile_table = infiles

    # load the normalized data
    # (raw.data and gcrma.eset)
    R("""load( '%s' )""" % infile_r)

    # calculate detection calls
    call_eset = R(
        """call.eset = detection.p.val( raw.data, 
                                 alpha1=%(mas5_alpha1)f,
                                 alpha2=%(mas5_alpha2)f,
                  )"""
        % PARAMS
    )

    # output to files
    R(
        """write.table( call.eset$call, '%s.calls', 
                      sep='\t', 
                      quote=FALSE,
                      col.names = NA, row.names = TRUE)"""
        % outfile
    )
    R(
        """write.table( call.eset$pval, '%s.pval', 
                      sep='\t',
                      quote=FALSE,
                      col.names = NA, row.names = TRUE)"""
        % outfile
    )

    # build list of probesets to keep
    # keep all probesets where at least 3 out of 4 replicates
    # are indicated as present (P/M) in at least one treatment group
    R("""absent = rowSums( call.eset$call == 'A')""")
    keep = R("""remove = names(absent[absent <= length(colnames(assayData( gcrma.eset )$exprs))-4] )""")
    outf = open(outfile, "w")
    outf.write("probeset\n")
    outf.write("\n".join(map(str, keep)) + "\n")
    outf.close()
Example 16
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package. Note that since KEGG is no longer publicly available,
    this is not up-to-date and may be removed from Bioconductor in
    future releases.

    '''

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    entrez2ensembl = PipelineBiomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    entrez2ensembl = dict(
        (x['entrezgene'], x['ensembl_gene_id']) for x in entrez2ensembl)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

    # rx2 did not work in rpy2 2.4.2 - workaround uses
    # absolute indices
    for gene_column, gene in enumerate(entrez2path.names):

        try:
            gene = int(gene)
        except ValueError:
            continue

        if gene in entrez2ensembl:
            ensid = entrez2ensembl[gene]

        else:
            continue

        for pathway in entrez2path[gene_column]:
            pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
            pathname = pathid2name[pathid]
            outf.write("\t".join(["kegg", ensid,
                                  str(pathway), pathname, "NA"]) + "\n")
Example 17
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    ''' import the KEGG annotations from the R KEGG.db
    annotations package. Note that since KEGG is no longer
    publicly available, this is not up-to-date and may be removed
    from Bioconductor in future releases '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(attributes=ro.StrVector(
        ["ensembl_gene_id", "entrezgene"]),
                             mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

    for gene in entrez2path.names:

        try:
            gene = int(gene)
        except ValueError:
            continue

        if gene in entrez2ensembl:
            ensid = entrez2ensembl[gene]

        else:
            continue

        for pathway in entrez2path.rx2(str(gene)):
            pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
            pathname = pathid2name[pathid]
            outf.write("\t".join(["kegg", ensid,
                                  str(pathway), pathname, "NA"]) + "\n")
Example 18
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    ''' import the KEGG annotations from the R KEGG.db
    annotations package. Note that since KEGG is no longer
    publicly available, this is not up-to-date and may be removed
    from Bioconductor in future releases '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
                             mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

    for gene in entrez2path.names:

        try:
            gene = int(gene)
        except ValueError:
            continue

        if gene in entrez2ensembl:
            ensid = entrez2ensembl[gene]

        else:
            continue

        for pathway in entrez2path.rx2(str(gene)):
            pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
            pathname = pathid2name[pathid]
            outf.write(
                "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) + "\n")
Example 19
    def __init__(self, *args, **kwargs):
        Renderer.__init__(self, *args, **kwargs)
        Plotter.__init__(self, *args, **kwargs)

        if "statement" not in kwargs:
            raise ValueError("r-ggplot renderer requires a statement option")

        self.statement = kwargs.get('statement')

        if R:
            R.library('ggplot2')
Example 20
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''download a dataset from biomart and output as a
    tab-separated table.

    *columns* is a list of fields to obtain.

    *biomart* and *dataset* denote the
    database and dataset to get the data from.

    returns an iterator over rows.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart,
                     dataset=dataset,
                     host=host,
                     path=path,
                     archive=archive)

    if filters is not None:
        # StrVector lives in rpy2.robjects, not in R's global
        # environment, so it cannot be looked up through R itself
        filter_names = rpy2.robjects.StrVector(filters)
    else:
        filter_names = ""

    if values is not None:
        filter_values = values
    else:
        filter_values = ""

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    for data in zip(*[result[x] for x in range(len(columns))]):
        yield dict(zip(columns, data))
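A sketch of the filtered form (chromosome_name is a standard Ensembl
biomart filter; the attribute list is an example only):

import rpy2.robjects
rows = biomart_iterator(
    ["ensembl_gene_id", "hgnc_symbol"],
    filters=["chromosome_name"],
    values=rpy2.robjects.StrVector(["22"]))   # genes on chromosome 22
for row in rows:
    print(row)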
Example 21
    def add_result_for_graph(self, G):
        embed.embed(G)
        x = embed.get_embedding(self.d, self.scale)
        r.library('mclust')
        mcr = r.Mclust(x, G=self.kRange)
        self.mcr.append(mcr)
        self.khat.append(mcr['G'])

        # networkx's function is get_node_attributes (plural); it
        # returns a dict keyed by node
        trueLabel = list(nx.get_node_attributes(G, 'block').values())
        predLabel = np.array(mcr['classification']).astype(int) - 1

        self.ari.append(adjusted_rand_score(trueLabel, predLabel))
        self.error.append(num_diff_w_perms(trueLabel, predLabel))
Example 22
def computePresenceCalls(infiles, outfile):
    '''
    modified workflow from https://stat.ethz.ch/pipermail/bioconductor/2005-July/009758.html

    Calculate MAS5 detection pvalues and Present/Marginal/Absent calls.

    Note that this analysis only applies to 3' arrays, but not for
    other arrays like HuGene. The simpleaffy package was designed
    to correct for 3' bias due to the 7' oligo dT primers used
    for building cDNAs. It requires PM and MM probes, which might
    not exist on other arrays.
    '''

    R.library("simpleaffy")
    R.library("gcrma")

    infile_r, infile_table = infiles

    # load the normalized data
    # (raw.data and gcrma.eset)
    R('''load( '%s' )''' % infile_r)

    # calculate detection calls
    call_eset = R('''call.eset = detection.p.val( raw.data, 
                                 alpha1=%(mas5_alpha1)f,
                                 alpha2=%(mas5_alpha2)f,
                  )''' % PARAMS)

    # output to files
    R('''write.table( call.eset$call, '%s.calls', 
                      sep='\t', 
                      quote=FALSE,
                      col.names = NA, row.names = TRUE)''' % outfile)
    R('''write.table( call.eset$pval, '%s.pval', 
                      sep='\t',
                      quote=FALSE,
                      col.names = NA, row.names = TRUE)''' % outfile)

    # build list of probesets to keep
    # keep all probesets where at least 3 out of 4 replicates
    # are indicated as present (P/M) in at least one treatment group
    R('''absent = rowSums( call.eset$call == 'A')''')
    keep = R(
        '''remove = names(absent[absent <= length(colnames(assayData( gcrma.eset )$exprs))-4] )'''
    )
    outf = open(outfile, "w")
    outf.write("probeset\n")
    outf.write("\n".join(map(str, keep)) + "\n")
    outf.close()
Example 23
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    Arguments
    ---------
    outfile : string
       Filename of output file
    columns : dict
       Dictionary mapping biomart columns to columns in the output table.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    '''

    R.library("biomaRt")

    keys = columns.keys()

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB

    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join([columns[x] for x in keys]) + "\n")

    # for x in ("mim_gene_accession", "mim_morbid_accession"):
    #     result[x] = [ ("", y)[y >= 0] for y in result[x] ]

    for data in zip(*[result[x] for x in keys]):
        outf.write("\t".join(map(str, data)) + "\n")

    outf.close()
Example 24
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    Arguments
    ---------
    outfile : string
       Filename of output file
    columns : dict
       Dictionary mapping biomart columns to columns in the output table.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    '''

    R.library("biomaRt")

    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB

    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join([columns[x] for x in keys]) + "\n")

    # for x in ("mim_gene_accession", "mim_morbid_accession"):
    #     result[x] = [ ("", y)[y >= 0] for y in result[x] ]

    for data in zip(*[result[x] for x in keys]):
        outf.write("\t".join(map(str, data)) + "\n")

    outf.close()
Example 25
    def rconnect(self, creator=None):
        '''open a connection to the database from within R.'''

        if not self.rdb:

            R.library('RSQLite')

            if creator:
                self.rdb = creator()

            else:
                if self.backend.startswith('sqlite'):
                    self.rdb = R.dbConnect(
                        R.SQLite(),
                        dbname=re.sub("sqlite:///./", "", self.backend))
                else:
                    raise NotImplementedError(
                        "cannot connect to %s in R" % self.backend)
Example 26
def mog_cross_validate(x):
    import rpy2.robjects as robjects
    from rpy2.robjects import r as R
    from rpy2.robjects.packages import importr
    R.library('mclust')
    log_predictions = []
    for training, validation in k_fold_cross_validation(x, options.x_validate_groups):
        r_x = robjects.FloatVector(inv_sigmoid(training))
        clustering = R.Mclust(r_x)
        predictions = R.dens(
            modelName = clustering.rx2('modelName'),
            data = robjects.FloatVector(validation),
            parameters = clustering.rx2('parameters')
        )
        log_predictions.extend(np.log(predictions))
    return log_predictions
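mog_cross_validate relies on a k_fold_cross_validation helper and an
inv_sigmoid transform that are not shown here; a minimal, hypothetical
sketch of such a splitting generator (not the original helper):

def k_fold_cross_validation(items, k):
    '''yield (training, validation) pairs over k folds.'''
    for fold in range(k):
        training = [x for i, x in enumerate(items) if i % k != fold]
        validation = [x for i, x in enumerate(items) if i % k == fold]
        yield training, validation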
Example 27
def estimateExpression(infiles, outfile):
    '''estimate expression levels.'''

    R.library("affy")

    E.info("reading data")

    raw_data = R.ReadAffy(infiles)

    E.info("RMA normalization")

    eset = R.rma(raw_data)

    R.boxplot(raw_data)
    R.boxplot(eset)

    print R.as_list(R.assayData(eset))
Example 28
def estimateExpression(infiles, outfile):
    """estimate expression levels."""

    R.library("affy")

    E.info("reading data")

    raw_data = R.ReadAffy(infiles)

    E.info("RMA normalization")

    eset = R.rma(raw_data)

    R.boxplot(raw_data)
    R.boxplot(eset)

    print R.as_list(R.assayData(eset))
Example 29
def clusterize_r_em(*args):
    """ Clustering and plotting with EM GMM"""
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
        rpy2.robjects.numpy2ri.activate()
    except:
        print "You need rpy2"
        sys.exit(-1)

    r.library("mclust")
    for arg in args:
        model = r.Mclust(arg)
        print model
        print r.summary(model)
        r.quartz("plot")
        r.plot(model, arg)
        print raw_input("any key to pass")
Example 31
    def render(self, dataframe, path):

        R.library('ggplot2')

        # add all indices as columns
        dataframe.reset_index(inplace=True)

        rframe = pandas.rpy.common.convert_to_r_dataframe(dataframe)

        # for the issue below, see:
        # http://stackoverflow.com/questions/12865218/getting-rid-of-asis-class-attribute
        unAsIs = R('''function (x) {
                         if(typeof(x) %in% c("integer","double")) {
                             class(x) <- "numeric"
                             return (x)}
                         else if (typeof(x) == "character") {
                             class(x) <- "character"
                             return (x) }
                         else {
                             return(x) } }''')

        rframe = R["as.data.frame"](R.lapply(rframe, unAsIs))
        R.assign("rframe", rframe)

        # start plot
        R('''gp = ggplot(rframe)''')

        # add aesthetics and geometries
        try:
            pp = R('''gp + %s ''' % self.statement)
        except ValueError as msg:
            raise ValueError(
                "could not interpret R statement: "
                "gp + %s; msg=%s" % (self.statement, msg))

        figname = re.sub('/', '_', path2str(path))
        r = ResultBlock('#$ggplot %s$#' % figname,
                        title=path2str(path))
        r.rggplot = pp
        r.figname = figname

        return ResultBlocks(r)
Example 32
    def fit(self, X_train, y_train, subset_no):
        from rpy2.robjects import r, pandas2ri
        import rpy2.robjects as ro
        from rpy2.robjects.conversion import localconverter

        print('.libPaths( c( "{}" , .libPaths() ) )'.format(scratch_path))
        r('.libPaths( c( "{}" , .libPaths() ) )'.format(scratch_path))
        r('install.packages("rmcfs", repos="https://cloud.r-project.org/")')
        r.library('rmcfs')
        r.library('dplyr')

        original_features = X_train.columns.values
        X_train.columns = [
            'f' + str(i + 1) for i, _ in enumerate(X_train.columns.values)
        ]
        X_train[TARGET] = y_train

        with localconverter(ro.default_converter + pandas2ri.converter):
            r_from_pd_df = ro.conversion.py2rpy(X_train)

        r.assign('df1', r_from_pd_df)
        r('result <- mcfs({} ~ ., df1, cutoffPermutations = {}, seed = 2, threadsNumber = 16)'
          .format(TARGET, self.cutoff_permutations))
        r('RI <- result$RI')
        result = r['result']
        ri = r['RI']

        with localconverter(ro.default_converter + pandas2ri.converter):
            pd_from_r_df = ro.conversion.rpy2py(ri)

        features_to_ri = dict()

        for _, row in pd_from_r_df.iterrows():
            features_to_ri[row['attribute']] = row['RI']

        del X_train[TARGET]
        X_train.columns = original_features
        self.features_importances = [
            features_to_ri['f' + str(i + 1)]
            for i in range(0, len(original_features))
        ]
        print('RMCFS:FEATURE:IMPORTANCES', self.features_importances)
Example 33
def hclust(pts, distance, clust_method="single", use_great_circle=True):
    """ Hierarchical clustering method.  Exposes R hclust function in Python.
    Algorithm:
    - Calculate distances for each pair of input pts
    (use_great_circle:  TRUE then use great_circle else use Euclidean)
    - Cluster the points via hclust with distances and clust_method
    - Cut the resultant clusters via cutree at the desired distance
    - distance should be in kilometers IF use_great_circle=TRUE; otherwise,
    it should be in the same metric as the input pts.
    (see the R docs for sp::spDists)
    - Return the set of sets of points (i.e. the clusters)
    """

    rpy2.robjects.numpy2ri.activate() 

    pt_array = numpy.array(pts)
    r.library('sp')
    sp_dists = r.spDists(pt_array, longlat=use_great_circle)
    dists = r('as.dist')(sp_dists)
    tree = r.hclust(dists, method=clust_method)
    clusters = r.cutree(tree, h=distance)
    
    # this is a little tricky
    # clusters maintains a list of indexes for each point in the original list.  
    # The indexes represent the cluster number
    # For example: 
    #   clusters = [1, 3, 3, 1, 2, 1]
    #
    # where the 0th, 3rd, and last point in the original set
    # belong to cluster number 1
    #
    # We want to return a list of clusters, each containing
    # the index to the original point, so we map them here.  
    #
    # Things are a little more confusing since R counts arrays
    # from 1, python from 0 (hence the "- 1" from the cluster index)
    list_of_pts = [[] for i in range(max(clusters))]
    for j in range(0, len(clusters)):
        list_of_pts[clusters[j] - 1].append(j)

    return list_of_pts
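A toy usage sketch (assumes the sp package is installed; points are
lon/lat pairs, so distance is in kilometers):

pts = [(13.40, 52.52), (13.50, 52.52), (2.35, 48.85)]  # Berlin x2, Paris
clusters = hclust(pts, distance=50)
print(clusters)   # e.g. [[0, 1], [2]]: the two Berlin points cluster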
Example 34
def getProbeset2Gene(database):
    '''build map relating a probeset to an ENSEMBL gene_id'''

    prefix = database[:-len(".db")]
    mapping = prefix + "ENSEMBL"
    R.library(database)

    # map is a Bimap object
    m = R(mapping)

    result = R.toTable(m)

    mapping = collections.defaultdict(list)
    for probeset_id, gene_id in zip(result["probe_id"], result["ensembl_id"]):
        mapping[probeset_id].append(gene_id)

    E.info("obtained %i mappings: probes=%i, genes=%i" %
           (len(result), len(set(
               result["probe_id"])), len(set(result["ensembl_id"]))))
    return mapping
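Usage sketch (assumes the Bioconductor annotation package
hgu133plus2.db is installed):

probe2gene = getProbeset2Gene("hgu133plus2.db")
for probeset_id, gene_ids in list(probe2gene.items())[:5]:
    print(probeset_id, gene_ids)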
Example 35
def getProbeset2Gene(database):
    """build map relating a probeset to an ENSEMBL gene_id"""

    prefix = database[: -len(".db")]
    mapping = prefix + "ENSEMBL"
    R.library(database)

    # map is a Bimap object
    m = R(mapping)

    result = R.toTable(m)

    mapping = collections.defaultdict(list)
    for probeset_id, gene_id in zip(result["probe_id"], result["ensembl_id"]):
        mapping[probeset_id].append(gene_id)

    E.info(
        "obtained %i mappings: probes=%i, genes=%i"
        % (len(result), len(set(result["probe_id"])), len(set(result["ensembl_id"])))
    )
    return mapping
Example 36
    def render(self, dataframe, path):

        R.library('ggplot2')

        # add all indices as columns
        dataframe.reset_index(inplace=True)

        rframe = rpy2.robjects.pandas2ri.py2ri(dataframe)

        # for the issue below, see:
        # http://stackoverflow.com/questions/12865218/getting-rid-of-asis-class-attribute
        unAsIs = R('''function (x) {
                      if("AsIs" %in% class(x)) {
                          class(x) <- class(x)[-match("AsIs", class(x))]
                      }
                      return (x) } ''')

        rframe = R["as.data.frame"](R.lapply(rframe, unAsIs))
        R.assign("rframe", rframe)

        # start plot
        R('''gp = ggplot(rframe)''')

        # add aesthetics and geometries
        try:
            pp = R('''gp + %s ''' % self.statement)
        except ValueError as msg:
            raise ValueError(
                "could not interpret R statement: "
                "gp + %s; msg=%s" % (self.statement, msg))

        figname = re.sub('/', '_', path2str(path))
        r = ResultBlock('#$ggplot %s$#' % figname,
                        title=path2str(path))
        r.rggplot = pp
        r.figname = figname

        return ResultBlocks(r)
Example 37
def clusterize_r_em(*args, **kwargs):
    """ Clustering and plotting with EM GMM"""
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
        rpy2.robjects.numpy2ri.activate()
        from sklearn.decomposition import PCA
    except:
        print "You need rpy2"
        sys.exit(-1)

    r.library("mclust")
    for arg in args:
        if kwargs.get('clf_on_pca', False):
            pca = PCA(2)
            arg = pca.fit(arg).transform(arg)
        model = r.Mclust(arg)
        print model
        print r.summary(model)
        r.quartz("plot")
        r.plot(model, arg)
        print raw_input("press any key to pass")
Example 38
    def render(self, dataframe, path):

        R.library('ggplot2')

        rframe = pandas.rpy.common.convert_to_r_dataframe(dataframe)

        # sometimes the row/column mapping did not work
        # rframe.colnames = dataframe.columns

        unAsIs = R('''function (x) {
                         if(typeof(x) %in% c("integer","double")) {
                             class(x) <- "numeric"
                             return (x)}
                         else if (typeof(x) == "character") {
                             class(x) <- "character"
                             return (x) }
                         else {
                             return(x) } }''')

        rframe = R["as.data.frame"](R.lapply(rframe, unAsIs))

        R.assign("rframe", rframe)
        # start plot
        R('''gp = ggplot(rframe)''')

        # add aesthetics and geometries
        try:
            pp = R('''gp + %s ''' % self.statement)
        except ValueError as msg:
            raise ValueError(
                "could not interpret R statement: "
                "gp + %s; msg=%s" % (self.statement, msg))

        figname = re.sub('/', '_', path2str(path))
        r = ResultBlock('#$ggplot %s$#' % figname,
                        title=path2str(path))
        r.rggplot = pp
        r.figname = figname

        return ResultBlocks(r)
Example 39
def getProbeset2Location(database="hgu133plus2.db"):
    '''build map with genomic coordinates for each probeset.

    The mapping is not necessarily unique.
    '''

    R.library(database)

    prefix = database[:-len(".db")]

    contigs = dict(R(prefix + "CHRLENGTHS"))

    # map is a Bimap object
    result2start = R.toTable(R(prefix + "CHRLOC"))
    result2end = R.toTable(R(prefix + "CHRLOCEND"))

    mapping = collections.defaultdict(list)

    # make sure order is the same
    assert result2start["probe_id"] == result2end["probe_id"]

    for probeset_id, contig, start, end in zip(result2start["probe_id"],
                                               result2start["Chromosome"],
                                               result2start["start_location"],
                                               result2end["end_location"]):

        if start < 0:
            start = contigs[contig] - start
            end = contigs[contig] - end

        mapping[probeset_id].append((contig, start, end))

    E.info("mappings: probes=%i, contigs=%i" % (
        len(set(result2start["probe_id"])),
        len(set(result2start["Chromosome"])),
    ))

    return mapping
Example 40
def getProbeset2Location(database="hgu133plus2.db"):
    '''build map with genomic coordinates for each probeset.

    The mapping is not necessarily unique.
    '''

    R.library(database)

    prefix = database[:-len(".db")]

    contigs = dict(R(prefix + "CHRLENGTHS"))

    # map is a Bimap object
    result2start = R.toTable(R(prefix + "CHRLOC"))
    result2end = R.toTable(R(prefix + "CHRLOCEND"))

    mapping = collections.defaultdict(list)

    # make sure order is the same
    assert result2start["probe_id"] == result2end["probe_id"]

    for probeset_id, contig, start, end in zip(result2start["probe_id"],
                                               result2start["Chromosome"],
                                               result2start["start_location"],
                                               result2end["end_location"]):

        if start < 0:
            start = contigs[contig] - start
            end = contigs[contig] - end

        mapping[probeset_id].append((contig, start, end))

    E.info("mappings: probes=%i, contigs=%i" %
           (len(set(result2start["probe_id"])),
            len(set(result2start["Chromosome"])),
            ))

    return mapping
Example 41
def CalculateEVDParameters(MaxSimilarities):

    import rpy2.robjects as robjects
    from rpy2.robjects import r

    # Create an R object of sorted scores
    maxsims = robjects.FloatVector(sorted(MaxSimilarities))

    # MLE Estimates using "ismev" package
    r.library("ismev")
    gev_fit = robjects.r['gev.fit'](maxsims)

    #Mu = location, Sigma = scale, Xi = shape
    Mu = gev_fit[6][0]
    Sigma = gev_fit[6][1]
    Xi = gev_fit[6][2]

    #Standard errors for the Mu, Sigma and Xi
    eMu = gev_fit[8][0]
    eSigma = gev_fit[8][1]
    eXi = gev_fit[8][2]

    return (Xi, Mu, Sigma, eXi, eMu, eSigma)
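A hedged follow-up sketch: with (Xi, Mu, Sigma) fitted, a tail p-value
for a new maximum score can be computed via scipy, whose genextreme
shape parameter uses the opposite sign convention to ismev's Xi (an
assumption worth double-checking against your fit):

from scipy import stats

def evd_pvalue(score, Xi, Mu, Sigma):
    # scipy's shape c = -Xi relative to the GEV shape fitted by ismev
    return stats.genextreme.sf(score, c=-Xi, loc=Mu, scale=Sigma)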
Example 42
from rpy2.robjects import r
import os
import rpy2.robjects.packages as rpackages
import rpy2.robjects as robj


utils = rpackages.importr('utils')  # load R's base 'utils' package

robj.r('setwd("./")')
r.library('plyr')

robj.r('load("~/MediWeb-master/R_file/main.RData")')
robj.r('table_region<-data.frame()')

##### Category by Region #####

# start
robj.r('start<-function(sido_s, sggu_s){\n'
	'for(i in 1:nrow(table_united)){\n'
	'if((sido_s==as.character(table_united$item.sidoCdNm[i])) && '
	'(sggu_s==as.character(table_united$item.sgguCdNm[i]))) {\n'
	'start_num=i\n'
	'return(start_num)}}}')

# end
robj.r('end<-function(sido_e, sggu_e, start_num){\n'
	'for(i in start_num:nrow(table_united)){\n'
	'if(!((sido_e==as.character(table_united$item.sidoCdNm[i])) && '
	'(sggu_e==as.character(table_united$item.sgguCdNm[i])))) {\n'
	'end_num=i-1\n'
	'return(end_num)}}}')
Example 43
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("bam", ),
                      help="input file format [default=%default].")

    parser.add_option("-w",
                      "--window-size",
                      dest="window_size",
                      type="int",
                      help="window size [default=%default].")

    parser.add_option("-c",
                      "--control-filename",
                      dest="control_filename",
                      type="string",
                      help="filename of input/control data in "
                      "bed format [default=%default].")

    parser.add_option("-t",
                      "--threads",
                      dest="threads",
                      type="int",
                      help="number of threads to use [default=%default].")

    parser.add_option("-q",
                      "--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="fdr threshold [default=%default].")

    parser.add_option("-z",
                      "--spp-z-threshold",
                      dest="z_threshold",
                      type="float",
                      help="z threshold [default=%default].")

    parser.add_option("--bin",
                      dest="bin",
                      type="int",
                      help="bin tags within the specified number "
                      " of basepairs to speed up calculation;"
                      " increasing bin size decreases the accuracy "
                      "of the determined parameters [default=%default]")

    parser.add_option("--spp-srange-min",
                      dest="srange_min",
                      type="float",
                      help="srange gives the possible range for the "
                      " size of the protected region;"
                      " srange should be higher than tag length; "
                      " making the upper boundary too high"
                      " will increase calculation time [%default]")

    parser.add_option("--spp-srange-max",
                      dest="srange_max",
                      type="float",
                      help="srange gives the possible range for the "
                      " size of the protected region;"
                      " srange should be higher than tag length; "
                      " making the upper boundary too high"
                      " will increase calculation time [%default]")

    parser.set_defaults(
        input_format="bam",
        threads=1,
        fdr_threshold=0.05,
        window_size=1000,
        offset=125,
        srange_min=50,
        srange_max=500,
        bin=5,
        z_threshold=3,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please specify a filename with sample data and an output file")

    filename_sample, filename_output = args[0], args[1]
    filename_control = options.control_filename

    # load spp and snow
    R.library('spp')
    R.library('snow')

    # read data
    E.info("reading data")
    R('''chip.data <- read.bam.tags('%s')''' % filename_sample)
    R('''input.data <- read.bam.tags('%s')''' % filename_control)
    R('''cluster = makeCluster( %i )''' % (options.threads))

    E.info("computing binding characteristics")
    # get binding info from cross-correlation profile

    # srange gives the possible range for the size of the protected region;
    # srange should be higher than tag length; making the upper boundary too
    # high will increase calculation time

    # bin - bin tags within the specified number of basepairs to speed
    # up calculation; increasing bin size decreases the accuracy of
    # the determined parameters
    srange_min, srange_max = options.srange_min, options.srange_max
    bin = options.bin
    R('''binding.characteristics <- get.binding.characteristics(chip.data,
    srange=c(%(srange_min)i,%(srange_max)i),
    bin=%(bin)s,
    cluster=cluster);''' % locals())
    # print out binding peak separation distance
    options.stdout.write("shift\t%i\n" %
                         R('''binding.characteristics$peak$x''')[0])

    ##################################################
    ##################################################
    ##################################################
    E.info("plot cross correlation profile")
    # plot cross-correlation profile
    R('''pdf(file="%s.crosscorrelation.pdf",width=5,height=5)''' %
      filename_output)
    R('''par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8);''')
    R('''plot(binding.characteristics$cross.correlation,
    type='l',
    xlab="strand shift",
    ylab="cross-correlation");''')
    R('''abline(v=binding.characteristics$peak$x,lty=2,col=2)''')
    R('''dev.off();''')

    E.info("selecting informative tags based on the binding characteristics")
    # select informative tags based on the binding characteristics
    R('''chip.data <- select.informative.tags(
    chip.data,binding.characteristics);''')
    R('''input.data <- select.informative.tags(
    input.data,binding.characteristics);''')

    E.info("outputting broad peaks")
    window_size, z_threshold = options.window_size, options.z_threshold
    R('''broad.clusters <- get.broad.enrichment.clusters(chip.data,input.data,
    window.size=%(window_size)i,
    z.thr=%(z_threshold)f,
    tag.shift=round(binding.characteristics$peak$x/2))''' % locals())
    # write out in broadPeak format
    R('''write.broadpeak.info(broad.clusters,"%s.broadpeak.txt")''' %
      filename_output)

    # binding detection parameters desired FDR (1%). Alternatively, an
    # E-value can be supplied to the method calls below instead of the
    # fdr parameter the binding.characteristics contains the optimized
    # half-size for binding detection window
    R('''detection.window.halfsize <- binding.characteristics$whs;''')

    # determine binding positions using wtd method
    E.info("determining binding positions using wtd method")
    fdr = options.fdr_threshold
    R('''bp <- find.binding.positions(
    signal.data=chip.data,control.data=input.data,
    fdr=%(fdr)f,whs=detection.window.halfsize,cluster=cluster)''' % locals())
    options.stdout.write(
        "detected_peaks\t%i\n" %
        R('''sum(unlist(lapply(bp$npl,function(d) length(d$x))))''')[0])

    # output detected binding positions
    R('''output.binding.results(bp,"%s.summit.txt");''' % filename_output)

    R('''bp <- add.broad.peak.regions(chip.data,input.data,bp,
    window.size=%(window_size)i,z.thr=%(z_threshold)f)''' % locals())
    # output using narrowPeak format
    R('''write.narrowpeak.binding(bp,"%s.narrowpeak.txt")''' % filename_output)

    # write footer and output benchmark information.
    E.stop()
Example 44
# <headingcell level=4>

# ggplot2 in python with Rpy2

# <markdowncell>

# Thanks to [Fei Yu](http://www.thefeiyu.com/) for this vignette.

# <codecell>

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

r = robjects.r
r.library("ggplot2")
rnorm = r["rnorm"]

dtf = robjects.DataFrame({
    'x': rnorm(300, mean=0) + rnorm(100, mean=3),
    'y': rnorm(300, mean=0) + rnorm(100, mean=3)
})
robjects.globalenv['dtf'] = dtf  # assign to R's environment
r("gp = ggplot(dtf, aes(x, y))")
r("pp = gp + geom_point(size=0.6)")
# r("plot(pp)")    # I can't get this to work on my system, but saving the plot is just as good.
r("ggsave(filename='test.png', plot=pp, scale=0.3)")

from IPython.core.display import Image
Image(filename='test.png')
Example 45
(O_Name2, O_Ext2) = os.path.splitext(O_File2)

TrailFile2       = open(options.TxtFile2,'r')
LineCount2       = len( open(options.TxtFile2,'r').readlines() )
TrailVectorsAll_2 = []
for line2 in TrailFile2:
        line2 = line2.rstrip('\n')
        element2 = line2.split('\t')
        TrailVectorsAll_2.append( [float(element2[0]),float(element2[1])] )
TrailFile2.close()

import rpy2.robjects as robjects
import rpy2.rpy_classic as rpy
from rpy2.robjects import r

r.library("CircStats")

Angle1   = []
Radians1 = []
Mags1    = []
i = 0
while i < len(TrailVectorsAll_1):
    Angle1.append(float(PS_Maths.CalculateVectorAngle(TrailVectorsAll_1[i])))
    Radians1.append(float((math.pi / 180) * PS_Maths.CalculateVectorAngle(TrailVectorsAll_1[i])))
    Mags1.append(PS_Maths.CalculateVectorMagnitude(TrailVectorsAll_1[i]))
    i += 1
rRadians1  = robjects.FloatVector(Radians1)

Angle2   = []
Radians2 = []
Mags2    = []
Example 46
import sys

import rpy2.robjects as ro
from rpy2.robjects import r as R
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.vectors import StrVector


def r2pd_dataframe(df):
    # convert an R data.frame to a pandas DataFrame
    with localconverter(ro.default_converter + pandas2ri.converter):
        pd_from_r_df = ro.conversion.rpy2py(df)
    return pd_from_r_df

def get_fungi_mart(ds):
    return R.useMart(biomart = 'fungi_mart',
                     host = "https://fungi.ensembl.org/",
                     dataset=ds)

def return_bm_df(atts, mart):
    atts_vector = StrVector([x for x in atts])
    BM = R.getBM(attributes = atts_vector, mart = mart)
    return r2pd_dataframe(BM)

base = sys.argv[1]

R.library("biomaRt")

print(R.listMarts(host = "https://fungi.ensembl.org/"))

# Get the fungi marts
mart_names = ['fculmorum_eg_gene' ,'fgraminearum_eg_gene', 'fpseudograminearum_eg_gene']
marts = [get_fungi_mart(mart) for mart in mart_names]
f_culmorum_mart, f_graminearum_mart, f_pseudogram_mart = marts[0], marts[1], marts[2]

# Get the attributes and the filters
f_culmorum_attributes = R.listAttributes(f_culmorum_mart)
f_graminearum_attributes = R.listAttributes(f_graminearum_mart)
f_pseudogram_attributes = R.listAttributes(f_pseudogram_mart)

f_cul_att_df = r2pd_dataframe(f_culmorum_attributes)
gram_df = f_cul_att_df.name[f_cul_att_df.name.str.startswith('f')]
gram_df.to_string()
Example 47
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

class GENES_TO_UNIPROT_CLASS:  # Takes a list of genes and returns uniprots. Details of output in each definition
    "Implementation for the biomaRt (R) gene id to uniprot using the hsapiens_gene_ensembl"

    #Load rpy2 needed libraries
    from rpy2 import robjects
    from rpy2.robjects import r

    #Declare r objects
    c = robjects.r["c"]
    #Load biomaRt
    r.library("biomaRt")
    #Choose biomaRt hsapiens
    ensembl = robjects.r["useMart"]("ensembl", dataset="hsapiens_gene_ensembl")

    def __init__(self, GENES):
        self.robjects = GENES_TO_UNIPROT_CLASS.robjects
        self.c = GENES_TO_UNIPROT_CLASS.c
        self.GENES = GENES
        self.ensembl = GENES_TO_UNIPROT_CLASS.ensembl
        #Vectorize gene list
        self.gene_vector = self.robjects.StrVector(self.GENES)
        #Call biomaRt
        RESULTS = self.robjects.r["getBM"](
            attributes=self.c("hgnc_symbol", "uniprot_swissprot_accession"),
            filters="hgnc_symbol",
            values=self.gene_vector,
            mart=self.ensembl)

        #Make RESULTS pythonic
        hgnc = list(RESULTS.rx2(1))
        uniprot = list(RESULTS.rx2(2))
        self.GENE_UNIPROT_PAIRS = [list(x) for x in zip(hgnc, uniprot)]
        #Filter out records that do not have a uniprot
        self.PRE_GENE_UNIPROT_PAIRS_FILTERED = filter(lambda x: len(x[1]) > 0,
                                                      self.GENE_UNIPROT_PAIRS)

        #Further search against personal database
        LEFTOVER_GENE_SET = filter(
            lambda x: x not in zip(*self.PRE_GENE_UNIPROT_PAIRS_FILTERED)[0],
            self.GENES)
        import urllib2
        import pickle
        REVIEWED_FILE = urllib2.urlopen(
            "https://sites.google.com/site/jdatabases/home/uniprot/DICT_UNIPROT_TO_GENES_REVIEWED.pi?attredirects=0&d=1"
        )
        REVIEWED_DICT = pickle.load(REVIEWED_FILE)

        self.PLUS_GENE_UNIPROT_PAIRS = []
        for gene in LEFTOVER_GENE_SET:
            for key, value in REVIEWED_DICT.iteritems():
                if gene in value:
                    self.PLUS_GENE_UNIPROT_PAIRS.append([gene, key])
                elif gene.lower() in value:
                    # to account for lower-case records; a gene already in
                    # lower case and present earlier would not reach here
                    self.PLUS_GENE_UNIPROT_PAIRS.append([gene, key])

        #Add values found in personal database to biomart found values
        self.GENE_UNIPROT_PAIRS_FILTERED = self.PRE_GENE_UNIPROT_PAIRS_FILTERED + self.PLUS_GENE_UNIPROT_PAIRS

    def has_uniprot_genes(self):
        """Return genes in the query that have a uniprot match."""
        GENE_PAIRS = zip(*self.GENE_UNIPROT_PAIRS_FILTERED)[0]
        self.found = []
        for gene in self.GENES:
            if gene in GENE_PAIRS:
                self.found.append(gene)
        return self.found

    def get_uniprots(self):
        """Return all uniprots corresponding to the genes found by
        has_uniprot_genes."""
        self.UNIPROTS = zip(*self.GENE_UNIPROT_PAIRS_FILTERED)[1]
        return self.UNIPROTS

    def no_uniprots(self):
        """Return genes for which no uniprot was found; see the methods
        below for the possible reasons."""
        GENE_PAIRS = zip(*self.GENE_UNIPROT_PAIRS_FILTERED)[0]
        self.not_found = []
        for gene in self.GENES:
            if gene not in GENE_PAIRS:
                self.not_found.append(gene)
        return self.not_found

    def not_on_database(self):
        """Return genes for which no record is present in the ensembl (hgnc)
        database or the personal database; double-check these."""
        self.not_present = []
        for gene in self.GENES:
            if gene not in zip(
                    *self.GENE_UNIPROT_PAIRS)[0] and gene not in zip(
                        *self.PLUS_GENE_UNIPROT_PAIRS)[0]:
                self.not_present.append(gene)
        return self.not_present

    def empty_gene(self):
        """Return genes that are in the database but have no uniprot record.
        BE WARY OF RESULTS: this relies on the biomaRt output format;
        depending on the size of the input it may give "NA" or leave the
        result empty, giving the false impression that the gene is in the
        biomaRt database when in fact it is not."""
        self.empty = []
        GENE_PAIRS_UNFILTERED = zip(*self.GENE_UNIPROT_PAIRS)[0]
        GENE_PAIRS_FILTERED = zip(*self.GENE_UNIPROT_PAIRS_FILTERED)[0]
        for gene in self.GENES:
            if gene in GENE_PAIRS_UNFILTERED and gene not in GENE_PAIRS_FILTERED:
                self.empty.append(gene)
        return self.empty
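

#Hypothetical usage sketch; the gene symbols below are illustrative and
#not taken from the source (Python 2, to match the class above):
if __name__ == "__main__":
    MAPPER = GENES_TO_UNIPROT_CLASS(["TP53", "BRCA1"])
    print MAPPER.has_uniprot_genes()
    print MAPPER.get_uniprots()
    print MAPPER.no_uniprots()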
Example n. 49
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA."""

__author__ = "Shantanu H. Joshi"
__copyright__ = "Copyright 2014, Shantanu H. Joshi Ahmanson-Lovelace Brain Mapping Center, \
                 University of California Los Angeles"
__email__ = "*****@*****.**"
__credits__ = 'Inspired by the stats package rshape by Roger P. Woods'

import rpy2.robjects as robjects
import numpy as np
from stats_output import StatsOutput
from rpy2.robjects import r
r.library("data.table")
from sys import stdout


def corr_r_func():

    corr_r_funct_str = {'corr_r_func': '''<- function(x, y, methodstr)
                {
                    return(cor.test(x, y, method=methodstr))
                }
                '''}
    return corr_r_funct_str
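
# A minimal sketch, assuming the function-body string above is meant to be
# bound to an R symbol and invoked through rpy2 (the sample vectors are
# illustrative, not from the source):
def corr_r_func_demo():
    robjects.r('corr_r_func ' + corr_r_func()['corr_r_func'])
    return robjects.r['corr_r_func'](
        robjects.FloatVector([1.0, 2.0, 3.0, 4.0]),
        robjects.FloatVector([2.0, 3.0, 5.0, 7.0]),
        "spearman")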


def corr_shape_r(model, sdata):
    statsout = StatsOutput(dim=sdata.phenotype_array.shape[1])
Example n. 50
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-s",
                      "--fragment-size",
                      dest="fragment_size",
                      type="int",
                      help="fragment size, used for the extension parameter "
                      "in Zinba [default=%default].")

    parser.add_option("-m",
                      "--zinba-mappability-dir",
                      dest="mappability_dir",
                      type="string",
                      help="mappability_dir [default=%default].")

    parser.add_option("-b",
                      "--bit-file",
                      dest="bit_filename",
                      type="string",
                      help="2bit genome filename [default=%default].")

    parser.add_option("-c",
                      "--control-filename",
                      dest="control_filename",
                      type="string",
                      help="filename of input/control data in bed format "
                      "[default=%default].")

    parser.add_option("-i",
                      "--zinba-index-dir",
                      dest="index_dir",
                      type="string",
                      help="index directory [default=%default].")

    parser.add_option("-t",
                      "--threads",
                      dest="threads",
                      type="int",
                      help="number of threads to use [default=%default].")

    parser.add_option("-q",
                      "--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="fdr threshold [default=%default].")

    parser.add_option("-a",
                      "--zinba-alignability-threshold",
                      dest="alignability_threshold",
                      type="int",
                      help="alignability threshold [default=%default].")

    parser.add_option("-p",
                      "--aggregate-by-contig",
                      dest="per_contig",
                      action="store_true",
                      help="run analysis per chromosome [default=%default]")

    parser.add_option("-w",
                      "--temp-dir",
                      dest="tempdir",
                      type="string",
                      help="use existing directory as temporary directory "
                      "[default=%default].")

    parser.add_option("--keep-temp",
                      dest="keep_temp",
                      action="store_true",
                      help="keep temporary directory [default=%default]")

    parser.add_option("--action",
                      dest="action",
                      type="choice",
                      choices=("full", "count", "predict", "model"),
                      help="action to perform [default=%default]")

    parser.add_option("--zinba-improvement",
                      dest="improvement",
                      type="float",
                      help="relative improvement of likelihood until "
                      "convergence [default=%default]")

    parser.add_option("--min-insert-size",
                      dest="min_insert_size",
                      type="int",
                      help="minimum insert size for paired end data "
                      "[default=%default]")

    parser.add_option("--max-insert-size",
                      dest="max_insert_size",
                      type="int",
                      help="maximum insert size for paired end data "
                      "[default=%default]")

    parser.set_defaults(
        input_format="bed",
        fragment_size=200,
        mappability_dir=None,
        threads=1,
        alignability_threshold=1,
        bit_filename=None,
        fdr_threshold=0.05,
        tempdir=None,
        winsize=250,
        offset=125,
        cnvWinSize=1e+05,
        cnvOffset=2500,
        per_contig=False,
        keep_temp=False,
        min_insert_size=0,
        max_insert_size=1000,
        filelist="files.list",
        selectchr="chr19",
        action="full",
        improvement=0.00001,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please specify a filename with sample data and an output file")

    filename_sample, filename_output = args[0], args[1]

    filename_sample = os.path.abspath(filename_sample)
    filename_output = os.path.abspath(filename_output)

    if options.control_filename:
        filename_control = os.path.abspath(options.control_filename)
    else:
        filename_control = None

    # load Zinba
    R.library('zinba')

    if not options.tempdir:
        tmpdir = tempfile.mkdtemp()
    else:
        tmpdir = options.tempdir

    E.info("changing to temporary directory %s" % tmpdir)
    os.chdir(tmpdir)

    if options.input_format == "bam":
        E.info("converting bam files to bed")
        if not os.path.exists(os.path.join(tmpdir, "sample.bed")):
            filename_sample = bamToBed(filename_sample,
                                       os.path.join(tmpdir, "sample.bed"))
        else:
            E.info("using existing file %(tmpdir)s/sample.bed" % locals())
            filename_sample = os.path.join(tmpdir, "sample.bed")
        if filename_control:
            if not os.path.exists(os.path.join(tmpdir, "control.bed")):
                filename_control = bamToBed(
                    filename_control, os.path.join(tmpdir, "control.bed"))
            else:
                E.info("using existing file %(tmpdir)s/control.bed" % locals())
                filename_control = os.path.join(tmpdir, "control.bed")

    fragment_size = options.fragment_size
    threads = options.threads
    bit_filename = options.bit_filename
    mappability_dir = options.mappability_dir
    fdr_threshold = options.fdr_threshold
    tol = options.improvement

    contigs = E.run("twoBitInfo %(bit_filename)s %(tmpdir)s/contig_sizes" %
                    locals())
    contig2size = dict([
        x.split()
        for x in IOTools.openFile(os.path.join(tmpdir, "contig_sizes"))
    ])

    outdir = filename_output + "_files"
    E.info('saving intermediate results in %s' % outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    filelist = os.path.join(outdir, filename_output + ".list")
    modelfile = os.path.join(outdir, filename_output + ".model")
    winfile = os.path.join(outdir, filename_output + ".wins")
    winSize = 250
    offset = 125
    cnvWinSize = 100000
    cnvOffset = 0
    winGap = 0
    peakconfidence = 1.0 - fdr_threshold
    selectchr = options.selectchr

    if not os.path.exists(os.path.join(tmpdir, "basecount")):
        E.info("computing counts")

        R('''basealigncount(inputfile='%(filename_sample)s',
        outputfile='%(tmpdir)s/basecount',
        extension=%(fragment_size)i,
        filetype='bed',
        twoBitFile='%(bit_filename)s' )
        ''' % locals())
    else:
        E.info("using existing counts")

    # tried incremental updates
    # for contig, size in contig2size.iteritems():
    #     for size in
    #     fn = os.path.join( tmpdir, "sample_%(contig)s_win%(size)ibp_offset(offset)ibp.txt" % locals() )
    if options.action == "count":

        E.info("computing window counts only - saving results in %s" % outdir)
        R('''buildwindowdata(
                     seq='%(filename_sample)s',
                     align='%(mappability_dir)s',
                     input='%(filename_control)s',
                     twoBit='%(bit_filename)s',
                     winSize=%(winSize)i,
                     offset=%(offset)i,
                     cnvWinSize=%(cnvWinSize)i,
                     cnvOffset=%(cnvOffset)i,
                     filelist='%(filelist)s',
                     filetype='bed',
                     extension=%(fragment_size)s,
                     outdir='%(outdir)s/') ''' % locals())

    elif options.action == "model":

        # The important option is buildwin = 0
        # parameterized for broad == FALSE and input present
        # see zinba.R
        # model selection only on chr19.
        R('''run.zinba( 
                filelist='%(filelist)s',
                formula=NULL,formulaE=NULL,formulaZ=NULL,
                outfile='%(filename_output)s',
                seq='%(filename_sample)s',
                input='%(filename_control)s',
                filetype='bed',  
                align='%(mappability_dir)s',
                twoBit='%(bit_filename)s',
                extension=%(fragment_size)s,
                winSize=%(winSize)i,
                offset=%(offset)i,
                cnvWinSize=%(cnvWinSize)i,
                cnvOffset=%(cnvOffset)i,
                basecountfile='%(tmpdir)s/basecount',
                buildwin=0,
                threshold=%(fdr_threshold)f,
                pquant=1,
                peakconfidence=%(peakconfidence)f,
                winGap=%(winGap)i,
                tol=%(tol)f,
                initmethod="count",
                method="mixture",
                numProc=%(threads)i,
                printFullOut=1,
                interaction=FALSE,
                selectmodel=TRUE,
                selectchr='%(selectchr)s',
                selectcovs=c("input_count"),
                selecttype="complete",
                FDR=TRUE)''' % locals())

    elif options.action == "predict":

        # The important option is buildwin = 0 and selectmodel = FALSE
        # parameterized for broad == FALSE and input present
        # see zinba.R
        # model selection only on chr19.
        if not os.path.exists(modelfile):
            raise OSError("model file %s does not exist" % modelfile)

        E.info("reading model from %s" % modelfile)

        R('''
        final=read.table('%(modelfile)s', header=T, sep="\t")
        final=final[final$fail==0,]
        bestBIC=which.min(final$BIC)
        formula=as.formula(paste("exp_count~",final$formula[bestBIC]))
        formulaE=as.formula(paste("exp_count~",final$formulaE[bestBIC]))
        formulaZ=as.formula(paste("exp_count~",final$formulaZ[bestBIC]))
        cat("Background formula is:\n\t")
        print(formula)
        cat("Enrichment formula is:\n\t")
        print(formulaE)
        cat("Zero-inflated formula is:\n\t")
        print(formulaE)
        ''' % locals())

        E.info("predicting peaks")

        R('''run.zinba(
                filelist='%(filelist)s',
                outfile='%(filename_output)s',
                seq='%(filename_sample)s',
                input='%(filename_control)s',
                filetype='bed',
                align='%(mappability_dir)s',
                twoBit='%(bit_filename)s',
                extension=%(fragment_size)s,
                winSize=%(winSize)i,
                offset=%(offset)i,
                cnvWinSize=%(cnvWinSize)i,
                cnvOffset=%(cnvOffset)i,
                basecountfile='%(tmpdir)s/basecount',
                buildwin=0,
                threshold=%(fdr_threshold)f,
                pquant=1,
                winGap=%(winGap)i,
                initmethod="count",
                selectchr='%(selectchr)s',
                tol=%(tol)f,
                method="mixture",
                numProc=%(threads)i,
                printFullOut=1,
                interaction=FALSE,
                selectmodel=FALSE,
                formula=formula,
                formulaE=formulaE,
                formulaZ=formulaZ,
                peakconfidence=%(peakconfidence)f,
                FDR=TRUE)''' % locals())

    elif options.action == "per_contig":

        E.info("processing per chromosome")
        for contig, size in contig2size.items():
            if contig not in ("chr16", ):
                continue

            E.info("processing contig %s" % contig)
            filename_sample_contig = filename_sample + "_%s" % contig
            filename_control_contig = filename_control + "_%s" % contig
            if not os.path.exists(filename_output + "_files"):
                os.mkdir(filename_output + "_files")
            filename_output_contig = os.path.join(filename_output + "_files",
                                                  contig)
            filename_basecounts_contig = os.path.join(tmpdir,
                                                      "basecount_%s" % contig)

            E.run(
                "grep %(contig)s < %(filename_sample)s > %(filename_sample_contig)s"
                % locals())
            E.run(
                "grep %(contig)s < %(filename_control)s > %(filename_control_contig)s"
                % locals())

            if not os.path.exists(filename_basecounts_contig):
                E.info("computing counts")

                R('''basealigncount( inputfile='%(filename_sample_contig)s',
                                  outputfile='%(filename_basecounts_contig)s',
                                  extension=%(fragment_size)i,
                                  filetype='bed',
                                  twoBitFile='%(bit_filename)s' )
                                  ''' % locals())
            else:
                E.info("using existing counts")

            # run zinba, do not build window data
            R('''zinba( refinepeaks=1,
            seq='%(filename_sample_contig)s',
            input='%(filename_control_contig)s',
            filetype='bed',
            align='%(mappability_dir)s',
            twoBit='%(bit_filename)s',
            outfile='%(filename_output_contig)s',
            extension=%(fragment_size)s,
            basecountfile='%(filename_basecounts_contig)s',
            numProc=%(threads)i,
            threshold=%(fdr_threshold)f,
            broad=FALSE,
            printFullOut=0,
            interaction=FALSE,
            mode='peaks',
            FDR=TRUE) ''' % locals())
    elif options.action == "full":

        # run zinba, build window data and refine peaks

        # Note that zinba() uses 'chr22' to select model
        # which is not present in mouse. So call run.zinba
        # directly.
        R('''run.zinba(
        refinepeaks=1,
        buildwin=1,
        seq='%(filename_sample)s',
        input='%(filename_control)s',
        filetype='bed',
        align='%(mappability_dir)s',
        twoBit='%(bit_filename)s',
        outfile='%(filename_output)s',
        extension=%(fragment_size)s,
        winSize=%(winSize)i,
        offset=%(offset)i,
        basecountfile='%(tmpdir)s/basecount',
        numProc=%(threads)i,
        threshold=%(fdr_threshold)f,
        pquant=1,
        winGap=%(winGap)i,
        selectchr='%(selectchr)s',
        interaction=FALSE,
        method="mixture",
        cnvWinSize=%(cnvWinSize)i,
        cnvOffset=%(cnvOffset)i,
        selectmodel=TRUE,
        selectcovs=c("input_count"),
        selecttype="complete",
        initmethod="count",
        printFullOut=1,
        diff=0,
        pWinSize=200,
        peakconfidence=%(peakconfidence)f,
        FDR=TRUE) ''' % locals())

    if not (options.tempdir or options.keep_temp):
        shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()
Example n. 51
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    .. note::

        Since KEGG is no longer publicly available, this is not
        up-to-date and may be removed from bioconductor in future
        releases.

    The table written to outfile has the following columns:
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name``,
    ``evidence``.

    Arguments
    ---------
    outfile : string
        Output filename in :term:`tsv` format.
    mart : string
        Name of the biomart
    host : string
        Host name of the biomart server
    biomart_dataset : string
        Biomart dataset

    '''

    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")

    # Generates an iterator containing the data from biomart
    entrez2ensembl = Biomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    entrez2ensembl = dict((x['entrezgene'],
                           x['ensembl_gene_id'])
                          for x in entrez2ensembl)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(list(zip(pathnames.names, R.unlist(pathnames))))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

    # rx2 did not work in rpy2 2.4.2 - workaround uses
    # absolute indices
    for gene_column, gene in enumerate(entrez2path.names):

        try:
            gene = int(gene)
        except ValueError:
            continue

        if gene in entrez2ensembl:
            ensid = entrez2ensembl[gene]

        else:
            continue

        for pathway in entrez2path[gene_column]:
            pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
            pathname = pathid2name[pathid]
            outf.write(
                "\t".join(["kegg", ensid, str(pathway),
                           pathname, "NA"]) + "\n")
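

def demoImportKEGGAssignments():
    # Hypothetical invocation, assuming the human Ensembl dataset on the
    # main biomart host (argument values are illustrative, not from the
    # source):
    importKEGGAssignments("kegg_assignments.tsv.gz",
                          mart="ensembl",
                          host="www.biomart.org",
                          biomart_dataset="hsapiens_gene_ensembl")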
Example n. 52
# 1. Select a file
# 2. Analyze and visualize

from rpy2.robjects import r
import os
import rpy2.robjects.packages as rpackages
import rpy2.robjects as robj
# base R packages
utils = rpackages.importr('utils')
file_path = input("Enter the path to the file's directory (wrapped in \" quotes): ")
file_name = input("Enter the file name (wrapped in \" quotes, with its extension): ")

# Set the working directory and load the required libraries
robj.r('setwd('+file_path+')')
robj.r('Sys.setenv(JAVA_HOME="C:/Program Files (x86)/Java/jre1.8.0_161")')
r.library('rJava')
r.library('wordcloud')
r.library('RColorBrewer')
r.library('KoNLP')
#robj.r('useSejongDic()')

# Data analysis - load the file, extract nouns, and apply filters
print("Analyzing data... [loading file]")
robj.r('data1<-readLines('+file_name+', encoding = "utf-8")')
print("Analyzing data... [extracting and filtering nouns]")
robj.r('data2<-sapply(data1, extractNoun, USE.NAMES = F)')
robj.r('data3<-unlist(data2)')
robj.r('data3<-Filter(function(x){nchar(x)>=2}, data3)')
robj.r('data3<-gsub("\\\\d+","",data3)')
robj.r('data3<-gsub("\\\\(","",data3)')
robj.r('data3<-gsub("\\\\)","",data3)')
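
# A hedged sketch of the wordcloud step this pipeline builds toward; the
# frequency threshold and palette are illustrative assumptions, not from
# the source:
robj.r('wordc<-table(data3)')
robj.r('wordcloud(names(wordc), freq=wordc, min.freq=2, colors=brewer.pal(8, "Dark2"))')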
Example n. 53
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''download a dataset from biomart and output as a
    tab-separated table.


    Arguments
    ---------
    columns : dict
       List of fields to obtain.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    filters : list
       List of filter to use
    values : list
       Values of the filters
    archive : bool
       If True, use archived version

    Returns
    -------
    iterator
       Iterator over rows in biomart database. Each row
       is dictionary mapping column names to values.

    '''

    R.library("biomaRt")

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB

    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart,
                         dataset=dataset,
                         path=path,
                         archive=archive)
    else:
        mart = R.useMart(biomart=biomart,
                         dataset=dataset,
                         host=host,
                         path=path,
                         archive=archive)

    if filters is not None:
        filter_names = rpy2.robjects.vectors.StrVector(filters)
    else:
        filter_names = ""

    if values is not None:
        filter_values = values
    else:
        filter_values = ""

    # result is a dataframe
    result = R.getBM(attributes=rpy2.robjects.vectors.StrVector(columns),
                     filters=filter_names,
                     values=filter_values,
                     mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    for data in zip(*[result[x] for x in range(len(columns))]):
        yield dict(zip(columns, data))
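

def demo_biomart_iterator():
    # A hedged usage sketch; the two attribute names are standard Ensembl
    # attributes assumed here for illustration:
    for row in biomart_iterator(("ensembl_gene_id", "hgnc_symbol"),
                                biomart="ensembl",
                                dataset="hsapiens_gene_ensembl"):
        print(row["ensembl_gene_id"], row["hgnc_symbol"])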
Example n. 54
import os
import time
import pickle
from os.path import join, dirname
import pandas as pd
import numpy as np
from rpy2.robjects import r, pandas2ri, numpy2ri
import pandas.rpy.common as com
import multiprocessing as mp
from copy import copy, deepcopy
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
import lawstructural.lawstructural.utils as lu
import lawstructural.lawstructural.constants as lc

r.library('VGAM')
pandas2ri.activate()
numpy2ri.activate()


#TODO: add in tests for first-stage
class FirstStage(object):
    """ Estimate first stage functions for BBL """
    def __init__(self, data_p, react, opts):
        self.data_p = data_p
        self.react = react
        self.opts = opts

    def reaction(self):
        """ Estimate reaction functions """
        print("    * Estimating reaction functions")
Example n. 55
import numpy as np
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr


def Grab_Subset(RFrame, Col, IdxA, IdxB):
    # render the GDS object as an R data frame, slice the requested rows,
    # convert to numpy and keep a single column
    DFrame = r.Table(RFrame)
    subset = DFrame[IdxA:IdxB]
    NdArr = pandas2ri.ri2py(subset)
    Matri = np.matrix(NdArr)
    Cset = Matri[0:(IdxB - IdxA), Col]

    return Cset


base = importr('base')
utils = importr('utils')

base.source("http://www.bioconductor.org/biocLite.R")
biocInstaller = importr("BiocInstaller")
biocInstaller.biocLite("GEOquery")
GEOquery = r.library("GEOquery")

gds1 = r.getGEO("GDS4280")
gds2 = r.getGEO("GDS3329")
gds3 = r.getGEO("GDS4182")
gds4 = r.getGEO("GDS4181")

GDS1_Sub = Grab_Subset(gds1, 2, 26, 14823)
GDS2_Sub = Grab_Subset(gds2, 2, 81, 14823)
GDS3_Sub = Grab_Subset(gds3, 2, 98, 14823)
GDS4_Sub = Grab_Subset(gds4, 2, 82, 14823)

#print(GDS1_Sub)
print(GDS1_Sub.shape)
#print(GDS2_Sub)
print(GDS2_Sub.shape)
Example n. 56
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-e",
                      "--extension",
                      dest="extension",
                      type="int",
                      help="extension size [default=%default].")

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option("-l",
                      "--fragment-length",
                      dest="fragment_length",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option(
        "-s",
        "--saturation-iterations",
        dest="saturation_iterations",
        type="int",
        help="iterations for saturation analysis [default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "rms", "rpm", "all"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w",
                      "--bigwig",
                      dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="hg19",
        genome_file=None,
        extension=400,
        bin_size=50,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    filename_sample = args[0]

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome
    R.library(genome_file)

    tmpdir = tempfile.mkdtemp()

    E.debug("temporary files are in %s" % tmpdir)

    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    if options.input_format == "bam":
        E.info("converting bam files")
        filename_sample = bamToMEDIPS(filename_sample,
                                      os.path.join(tmpdir, "sample.medips"))
    elif options.input_format == "bed":
        E.info("converting bed files")
        filename_sample = bedToMEDIPS(filename_sample,
                                      os.path.join(tmpdir, "sample.medips"))

    E.info("loading data")
    R('''CONTROL.SET = MEDIPS.readAlignedSequences(
                       BSgenome = "%(genome_file)s", 
                       file = "%(filename_sample)s" ) ''' % locals())
    slotnames = (("extend", "extend",
                  "%i"), ("distFunction", "distance_function",
                          "%s"), ("slope", "slope", "%f"),
                 ("fragmentLength", "fragment_length",
                  "%i"), ("bin_size", "bin_size",
                          "%i"), ("seq_pattern", "pattern",
                                  "%s"), ("number_regions", "nregions", "%i"),
                 ("number_pattern", "npatterns",
                  "%i"), ("cali_chr", "calibration_contig",
                          "%s"), ("genome_name", "genome", "%s"))

    E.info("computing genome vector")
    R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET, 
                       bin_size = %(bin_size)i, 
                       extend=%(extension)i )''' % locals())

    E.info("computing CpG positions")
    R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")'''
      )

    E.info("compute coupling vector")
    R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET, 
                       fragmentLength = %(fragment_length)i, 
                       func = "count")''' % locals())

    E.info("compute calibration curve")
    R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

    E.info("normalizing")
    R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

    outfile = IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w")
    outfile.write("category\tvalue\n")

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET, 
                            bin_size = %(bin_size)i, 
                            extend = %(extension)i, 
                            no_iterations = %(saturation_iterations)i, 
                            no_random_iterations = 1)''' % locals())

        R.png(E.getOutputFile("saturation.png"))
        R('''MEDIPS.plotSaturation(sr.control)''')
        R('''dev.off()''')

        R('''write.csv( sr.control$estimation, file ='%s' )''' %
          E.getOutputFile("saturation_estimation.csv"))
        outfile.write("estimated_correlation\t%f\n" %
                      R('''sr.control$maxEstCor''')[1])
        outfile.write("true_correlation\t%f\n" %
                      R('''sr.control$maxTruCor''')[1])

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET, 
                                extend = %(extension)i, 
                                no_iterations = 10)''' % locals())

        R.png(E.getOutputFile("cpg_coverage.png"))
        R('''MEDIPS.plotCoverage(cr.control)''')
        R('''dev.off()''')

        # three rows
        R('''write.csv( cr.control$coveredPos, file ='%s' )''' %
          E.getOutputFile("saturation_coveredpos.csv"))
        # coverage threshold
        # number of CpG covered
        # percentage of CpG covered

        R('''write.csv( cr.control$matrix, file ='%s' )''' %
          E.getOutputFile("saturation_matrix.csv"))

        # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

    if "calibration" in options.toolset or do_all:
        E.info("plotting calibration")
        R.png(E.getOutputFile("calibration.png"))
        R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)'''
          )
        R('''dev.off()''')

    for slotname, label, pattern in slotnames:
        value = tuple(R('''CONTROL.SET@%s''' % slotname))
        if len(value) == 0:
            continue
        outfile.write("%s\t%s\n" % (label, pattern % value[0]))

    outfile.close()

    if "rpm" in options.toolset or do_all:
        outputfile = E.getOutputFile("rpm.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = T, descr = "rpm")'''
          % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    if "rms" in options.toolset or do_all:
        outputfile = E.getOutputFile("rms.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = F, descr = "rms")'''
          % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()
Example n. 57
#!/usr/bin/env python
import numpy
import pandas
from copy import copy, deepcopy

import rpy2.robjects as ro
from rpy2.robjects import r, pandas2ri, numpy2ri
from rpy2.robjects.conversion import localconverter
pandas2ri.activate()
numpy2ri.activate()
r.library("lme4")


class lmer(object):
    """
    Mixed effect regression
    Wrapper around lme4's lmer class to enable:
        - training
        - predicting
    """

    # Class wide constants/parameters

    def __init__(
        self,
        target=None,
        fixed_effects=[],
        re_features=[],
        re_terms=[],
    ):
        # Target
Example n. 58
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend",
                      dest="extension",
                      type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size",
                      dest="shift",
                      type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations",
                      type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment", "dmr",
                               "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w",
                      "--bigwig-file",
                      dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment",
                      dest="treatment_files",
                      type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control",
                      dest="control_files",
                      type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input",
                      dest="input_files",
                      type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata",
                      dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file",
                      dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method",
                      dest="fdr_method",
                      type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa",
                      dest="bwa",
                      action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique",
                      dest="unique",
                      type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms",
                      dest="chroms",
                      type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in csv.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" %
                        (line['chr'], int(line['start']), int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = iotools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x
                                    for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = iotools.open_file(E.get_output_file("enrichment.tsv.gz"),
                                    "w")
        slotnames = (("regions.CG", "regions_CG",
                      "%i"), ("regions.C", "regions_C",
                              "%s"), ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH",
                      "%i"), ("regions.GoGe", "regions_GoGe",
                              "%i"), ("genome.CG", "genome_CG",
                                      "%s"), ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"), ("genome.relH",
                                                      "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    outfile.write("\t")
                    continue
                outfile.write("\t%s" % pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' % ",".join([
                "treatment_R%i" % x
                for x in range(len(options.treatment_files))
            ]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' % ",".join([
                    "control_R%i" % x
                    for x in range(len(options.control_files))
                ]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')), str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')), str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()