def getResult(request):
    """Django view: collect loan-listing fields from a POST request, run them
    through the R ``DECISION`` function, and render ``result.html``.

    Expects POST lists ``ListingId``, ``Title``, ``Months``, ``CreditCode``,
    ``Rate`` (parallel, one entry per listing) and a scalar ``inputAmount``.
    """
    # Gather all data to be computed and convert it into a format R can read
    # (comment translated from Chinese).
    ListingId = request.POST.getlist('ListingId', [])
    Title = request.POST.getlist('Title', [])
    inputAmount = request.POST['inputAmount']
    Months = request.POST.getlist('Months', [])
    CreditCode = request.POST.getlist('CreditCode', [])
    Rate = request.POST.getlist('Rate', [])
    # inputAmount is broadcast so every listing row carries the same amount.
    data = rlc.OrdDict([('ListingId', rob.StrVector(ListingId)),
                        ('Title', rob.StrVector(Title)),
                        ('inputAmount', rob.IntVector([inputAmount] * len(ListingId))),
                        ('Months', rob.IntVector(Months)),
                        ('CreditCode', rob.StrVector(CreditCode)),
                        ('Rate', rob.FloatVector(Rate))])
    inputCalDataFrame = rob.DataFrame(data)
    """导入R"""
    # Source DECISION.R from this module's directory and call DECISION().
    rFilePath = os.path.dirname(os.path.abspath(__file__)) + '/DECISION.R'
    rob.r.source(rFilePath)
    decision = rob.globalenv['DECISION'](inputCalDataFrame)
    decisionDataFrame = pandas2ri.ri2py_dataframe(
        decision)  # convert to a Python DataFrame (translated comment)
    """/导入R """
    # Convert to the output result (translated comment): the first
    # inputAmount plus one dict per result row for the template.
    inputAmount = list(decisionDataFrame['inputAmount'])[0]
    resultList = []
    for index, row in decisionDataFrame.iterrows():
        resultList.append(row.to_dict())
    # NOTE(review): locals() exposes every local variable to the template.
    return render(request, 'result.html', locals())
def to_dataframe(self, data):
    """Build an R data.frame from a list of row dicts.

    Each key found in the rows becomes one column; values are converted
    with the R vector type registered for that key in ``formats``.
    Raises ``Exception`` for keys with no registered conversion.
    """
    # Pivot the list of dicts into {column: [values...]}.
    columns = defaultdict(list)
    for row in data:
        for field in row:
            columns[field].append(row[field])

    for field in columns:
        # Select the R vector constructor appropriate for this column.
        if field in formats._INTS:
            to_vector = robjects.IntVector
        elif field in formats._FLOATS:
            to_vector = robjects.FloatVector
        elif field == formats.DATE:
            # Dates are normalized to standard strings first.
            columns[field] = formats.strfdate_standard_list(columns[field])
            to_vector = robjects.StrVector
        elif field in formats._NO_CONVERSION:
            to_vector = robjects.StrVector
        else:
            raise Exception("Conversion function not found for key %s" % field)
        columns[field] = to_vector(columns[field])

    return robjects.DataFrame(columns)
def _plt_distr(dat, col, title='', splitBy_pfill=True, pfill='label', independentpdf=False, fname='xdistr.pdf'): df = dat[dat[pfill] != 'NA'] ## remove invalid pairs n = len(df) df = { col: robjects.FloatVector(list(df[col])), pfill: robjects.StrVector(list(df[pfill])) } df = robjects.DataFrame(df) pp = ggplot2.ggplot(df) + \ ggplot2.ggtitle('%s [Total = %s]' % (title, n)) ## Plot1: counts if splitBy_pfill: p1 = pp + ggplot2.aes_string(x=col, fill=pfill) else: p1 = pp + ggplot2.aes_string(x=col) ## Plot2: density if splitBy_pfill: p2 = pp + ggplot2.aes_string(x=col, fill=pfill, y='..density..') else: p2 = pp + ggplot2.aes_string(x=col, y='..density..') p2 = p2 + ggplot2.geom_density(alpha=.5, origin=-500) if col == 'distance': p1 = p1 + \ ggplot2.geom_histogram(binwidth=1000, alpha=.5, position='identity', origin=-500) + \ ggplot2.xlim(-1000, 51000) p2 = p2 + \ ggplot2.geom_histogram(binwidth=1000, alpha=.33, position='identity', origin=-500) + \ ggplot2.xlim(-1000, 51000) else: p1 = p1 + \ ggplot2.geom_histogram(alpha=.5, position='identity') p2 = p2 + \ ggplot2.geom_histogram(alpha=.33, position='identity') if col == 'correlation': p1 = p1 + ggplot2.xlim(-1.1, 1.1) p2 = p2 + ggplot2.xlim(-1.1, 1.1) if independentpdf: grdevices = importr('grDevices') grdevices.pdf(file=fname) p1.plot() p2.plot() grdevices.dev_off() else: p1.plot() p2.plot() return
def deseq2_basic(data_frame, numerator=2, denominator=1,
                 category_field='Category', sample_field='Sample',
                 batch_field=None, expression_name_field='Name',
                 counts_field='NumReads'):
    """Run a basic DESeq2 differential-expression analysis from a long-format
    dataframe.

    Args:
        data_frame: long-format pandas dataframe with one row per
            (sample, feature) count.
        numerator/denominator: category values compared via ``_trans``.
        category_field/sample_field/batch_field: metadata column names;
            ``batch_field`` adds a batch term to the design when given.
        expression_name_field: feature-name column (becomes the row index).
        counts_field: column holding the raw counts.

    Returns:
        (dds0, dds1, res, mat, meta): the DESeq dataset before/after
        fitting, the results as a pandas dataframe, the pivoted count
        matrix, and the per-sample metadata.
    """
    # from a dataframe
    # https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2
    design = '~ `' + category_field + '`'
    if batch_field is not None:
        # Batch term first so the category effect is tested last.
        design = '~ `' + batch_field + '` + `' + category_field + '`'
    #print(design)
    design = Formula(design)
    # Pivot long counts into a features x samples matrix.
    mat = data_frame.pivot(columns=sample_field,
                           index=expression_name_field,
                           values=counts_field)
    mfields = [sample_field, category_field]
    if batch_field is not None:
        mfields += [batch_field]
    # One metadata row per sample, aligned with the matrix column order.
    meta = data_frame[mfields].groupby(sample_field).first().loc[mat.columns]
    metaarr = {}
    metaarr[category_field] = robjects.IntVector(meta[category_field].apply(
        lambda x: _trans(x, numerator, denominator)))
    if batch_field is not None:
        metaarr[batch_field] = robjects.IntVector(meta[batch_field])
    # DESeq2 requires integer counts.
    dds0 = deseq.DESeqDataSetFromMatrix(countData=mat.astype(int),
                                        colData=robjects.DataFrame(metaarr),
                                        design=design)
    dds1 = deseq.DESeq(dds0)
    res = rpy2.robjects.pandas2ri.ri2py(as_df(deseq.results(dds1)))
    res.index = mat.index
    res.index.name = expression_name_field
    return (dds0, dds1, res, mat, meta)
def testDim(self):
    """A data.frame built from a two-tag TaggedList is 26 rows x 2 columns."""
    letters = robjects.r.letters
    numbers = robjects.r('1:26')
    tagged = rlc.TaggedList((letters, numbers), tags=('letters', 'numbers'))
    frame = robjects.DataFrame(tagged)
    self.assertEqual(26, frame.nrow)
    self.assertEqual(2, frame.ncol)
def test_init_from_taggedlist():
    """Constructing a DataFrame from a TaggedList yields class 'data.frame'."""
    letters = robjects.r.letters
    numbers = robjects.r('1:26')
    tagged = rlc.TaggedList((letters, numbers), tags=('letters', 'numbers'))
    frame = robjects.DataFrame(tagged)
    assert frame.rclass[0] == 'data.frame'
def analyse_permanova(self, user_request, otu_table, headers, sample_labels,
                      metadata_values, strata_values, sample_ids_from_metadata):
    """Run PERMANOVA (optionally stratified) on an OTU table via vegan in R.

    Args:
        user_request: request context (unused here; kept for interface parity).
        otu_table: 2-D table, rows = samples, columns = OTUs.
        headers: column names for ``otu_table``.
        sample_labels: per-row sample IDs aligned with ``otu_table``.
        metadata_values: grouping values, one per retained sample.
        strata_values: optional strata values; when None, the unstratified
            PERMANOVA is used.
        sample_ids_from_metadata: sample IDs to keep.

    Returns:
        dict with key "permanova" holding the stringified R result.
    """
    print("Starting PERMANOVA")
    groups = robjects.FactorVector(robjects.StrVector(metadata_values))

    # O(1) membership tests instead of scanning the list for every row.
    keep_ids = set(sample_ids_from_metadata)

    # Form an OTU-only table (without IDs): one (header, FloatVector)
    # pair per column, restricted to samples present in the metadata.
    allOTUs = []
    for col in range(len(otu_table[0])):
        colVals = [otu_table[row][col]
                   for row in range(len(otu_table))
                   if sample_labels[row] in keep_ids]
        allOTUs.append((headers[col], robjects.FloatVector(colVals)))

    od = rlc.OrdDict(allOTUs)
    dataf = robjects.DataFrame(od)

    if strata_values is None:
        permanova = self.veganR.betaDiversityPERMANOVA(dataf, groups)
    else:
        strata = robjects.FactorVector(robjects.StrVector(strata_values))
        permanova = self.veganR.betaDiversityPERMANOVAWithStrata(dataf, groups, strata)

    abundancesObj = {}
    abundancesObj["permanova"] = str(permanova)
    return abundancesObj
def plot(args):
    """Plot data parsed from logfiles previously.

    Loads the JSON produced by the parse step, builds an R data frame and
    enters an interactive ggplot2 render/menu loop until the user quits.

    Raises:
        RuntimeError: if the input file cannot be read or is not valid JSON.
    """
    try:
        with open(args.input) as fin:
            data = json.load(fin)
    except IOError as e:
        logger.error("Could not read '{0}'".format(args.input))
        logger.debug("IOError: {0}".format(e))
        raise RuntimeError()
    except Exception as e:
        logger.error("Invalid input file. Expected JSON")
        logger.debug("JSON Error: {0}".format(e))
        raise RuntimeError()

    # Default to the first parser of the first logfile entry.
    # next(iter(...)) works on both Python 2 and Python 3; the original
    # data.values()[0].keys()[0] fails on Python 3 dict views.
    parser = next(iter(next(iter(data.values())))) if args.p is None else args.p

    dataf = util.filter_and_flatten(data, args.f, parser, label_files=True)
    r_dataf = ro.DataFrame(dataf.asRObjects)
    gp = ggplot2.ggplot(r_dataf)
    plotargs = get_ggplot_args(args)
    while True:
        render_plot(gp, plotargs)
        try:
            plotargs = print_menu(plotargs, dataf)
        except SavePlotException as e:
            try:
                ro.r("ggsave(filename='{0}')".format(e.filename))
            except IOError as io:
                # logging.warn is a deprecated alias of logging.warning.
                logging.warning("Error saving plot: {0}".format(io))
        except StopIteration:
            # User chose to exit the menu.
            return
def _create_R_dataframe(self, job_ads, include_columns):
    """Convert job ads to an R dataframe.

    Arguments
    ----------
    job_ads : list[:class:`JobAd`]
        List of :class:`JobAd` instances.
    include_columns : list[str]
        Names of the columns included in the dataframe.

    Returns
    ----------
    :class:`robjects.DataFrame`
        Dataframe with one row per job ad; the "relevant" column is
        integer-typed, all others are I()-wrapped strings.
    """
    if not job_ads:
        raise Exception("No job ads to convert to R dataframe.")

    # Restructure into {column: vector}, stripping diacritics from values.
    columns = {}
    for name in include_columns:
        cleaned = [self._remove_diacritics(ad[name]) for ad in job_ads]
        if name == "relevant":
            columns[name] = IntVector(cleaned)
        else:
            # I() keeps strings as-is instead of letting R factorize them.
            columns[name] = self._base.I(StrVector(cleaned))

    return robjects.DataFrame(columns)
def dict_list_to_df(data, base=None):
    """Converts a list of dictionaries to an rpy2 data frame"""
    if base is None:
        base = rpackages.importr('base')

    vectors = dict()
    # Column set is taken from the first row.
    for key in data[0].keys():
        # Pull this key's value out of every row, then map None to R's NA.
        values = substitute_NA([row[key] for row in data], key)
        # Convert the column into the registered rpy2 vector type.
        if key in formats._INTS:
            vec = robjects.IntVector(values)
        elif key in formats._FLOATS:
            vec = robjects.FloatVector(values)
        elif key == formats.DATE:
            # Dates go through strings and R's as.Date.
            date_strings = [d.strftime(formats.DMY) for d in values]
            vec = base.as_Date(robjects.StrVector(date_strings), formats.DMY)
        else:
            vec = robjects.StrVector(values)
        vectors[key] = vec

    return (robjects.DataFrame(vectors))
def anova_shape_r_nonoptimal(model, sdata):
    """Fit full and reduced linear models in R for every phenotype column and
    collect nested-ANOVA p-values and the unique-term coefficient.

    Python 2 code (uses ``xrange``). For each column i of
    ``sdata.phenotype_array`` the column becomes the model response; the
    full vs. reduced model comparison gives the p-value, signed by the
    direction of the ``model.unique`` coefficient.
    """
    pre_data_frame = sdata.create_r_pre_data_frame(model)
    statsout = StatsOutput(dim=sdata.phenotype_array.shape[1])
    for i in xrange(sdata.phenotype_array.shape[1]):
        # Response vector for this phenotype column.
        pre_data_frame['response'] = robjects.FloatVector(
            sdata.phenotype_array[:, i])
        dataframe = robjects.DataFrame(pre_data_frame)
        robj = robjects.r
        fit_full = robj.lm(robjects.Formula('response' + ' ~ ' + model.fullmodel),
                           data=dataframe)
        fit_reduced = robj.lm(robjects.Formula('response' + ' ~ ' + model.nullmodel),
                              data=dataframe)
        # Nested model comparison (F test between full and reduced fits).
        model_diff = robjects.r.anova(fit_full, fit_reduced)
        # Sign of the unique coefficient gives the effect direction.
        idx_unique = fit_full.rx2('coefficients').names.index(model.unique)
        direction = np.sign(fit_full.rx2('coefficients')[idx_unique])
        idx_pvalues = model_diff.names.index('Pr(>F)')
        # Row 1 of the anova table holds the comparison's p-value.
        statsout.pvalues[i] = model_diff[idx_pvalues][1]
        statsout.pvalues_signed[i] = direction * model_diff[idx_pvalues][1]
        # NOTE(review): tvalues stores the raw coefficient, not a t statistic.
        statsout.tvalues[i] = fit_full.rx2('coefficients')[idx_unique]
    return statsout
def CSRtoDCG(sparse_matrix):
    """Convert a scipy CSR sparse matrix to an R sparse Matrix object.

    The CSR matrix is densified and flattened to 1-D, wrapped in an R
    data.frame, and rebuilt via MatrixInterface.Matrix with the original
    shape and sparse=True.
    """
    # NOTE(review): wrapping a flat numpy array in DataFrame is unusual —
    # confirm MatrixInterface.Matrix expects this layout (column-major vs
    # row-major fill is not visible from here).
    data = robjects.DataFrame(sparse_matrix.toarray().flatten())
    nrows, ncols = sparse_matrix.shape
    return MatrixInterface.Matrix(data, nrow=nrows, ncol=ncols, sparse=True)
def write(self, data):
    """Serialize ``data`` into R objects in the R global environment and
    persist everything there to ``self.fname`` via R's save().

    'data' payloads become a single named data.frame; 'task' payloads are
    written as individual top-level R variables.
    """
    group=self.get_data_group(data)
    dest=robjects.globalenv
    if group == 'data':
        datavals = data['data']
        ordering = data['ordering']
        attrlist = []
        # nameind walks names/types in lock-step with the emitted columns.
        nameind = 0
        names = data['names']
        types = data['types']
        for cur_feat in ordering:
            if len(datavals[cur_feat].shape) > 1:
                # 2-D feature: each row of the feature matrix becomes its
                # own column in the R data.frame.
                for k in range(datavals[cur_feat].shape[0]):
                    if str(types[nameind]).startswith('nominal'):
                        # Nominal data is stored as an R factor.
                        attrlist.append((names[nameind],
                                         robjects.FactorVector(robjects.StrVector(datavals[cur_feat][k]))))
                    else:
                        attrlist.append((names[nameind], datavals[cur_feat][k]))
                    nameind += 1
            else:
                if str(types[nameind]).startswith('nominal'):
                    attrlist.append((names[nameind],
                                     robjects.FactorVector(robjects.StrVector(datavals[cur_feat]))))
                else:
                    attrlist.append((names[nameind], datavals[cur_feat]))
                nameind += 1
        # OrdDict keeps the column order defined by 'ordering'.
        dest[data['name']] = robjects.DataFrame(rlc.OrdDict(attrlist))
    elif group == 'task':
        d=data[group]
        for k in list(d.keys()):
            dest[k] = d[k]
    # Save every object currently in the destination environment.
    robjects.r.save(*list(robjects.r.ls(dest)), file=self.fname)
def pearsons(self, parm1, parm2):
    """Pearson correlation between two parameters of the selected data.

    Columns with |skewness| > 1 are log10-transformed before correlating.
    Returns (correlation coefficient, p-value) taken from Hmisc::rcorr's
    output matrices.
    """
    vectorDict = {}
    dataframe = self.getSelectParmData()
    names = dataframe.names
    # Pull the two requested columns out of the R data.frame by name.
    vectorDict[parm1] = dataframe[names.index(parm1)]
    vectorDict[parm2] = dataframe[names.index(parm2)]
    # Reduce heavy skew with a log10 transform before correlating.
    if self.skewness(vectorDict[parm1]) > 1 or self.skewness(
            vectorDict[parm1]) < -1:
        vectorDict[parm1] = base.log10(vectorDict[parm1])
    if self.skewness(vectorDict[parm2]) > 1 or self.skewness(
            vectorDict[parm2]) < -1:
        vectorDict[parm2] = base.log10(vectorDict[parm2])
    newDataframe = robjects.DataFrame(vectorDict)
    matrix = base.as_matrix(newDataframe)
    cor = hmisc.rcorr(matrix, type="pearson")
    #return stats.cor(newDataframe,use="pairwise.complete.obs",method="pearson")
    # Return the correlation coefficient and the p-value as extracted from
    # the resulting matrices (off-diagonal [0][1] entries).
    return cor[0][1], cor[2][1]
def makePlot(grdevices, plotName, samp_set1_vals, samp_set2_vals, image_file_type):
    """Jitter-plot two sample sets side by side and write to PDF or PNG.

    Values from the first set are labelled "set1", from the second "set2";
    the label becomes the x axis and the value the y axis.
    """
    labels = ["set1"] * len(samp_set1_vals) + ["set2"] * len(samp_set2_vals)
    values = samp_set1_vals + samp_set2_vals
    dframe = robjects.DataFrame({
        "sample": robjects.StrVector(labels),
        "value": robjects.FloatVector(values)
    })

    plot_spec = ggplot2.ggplot(dframe) + \
        ggplot2.aes_string(x="sample", y='value') + \
        ggplot2.geom_jitter(position=ggplot2.position_jitter(width=0.2, height=0.01)) + \
        ggplot2.theme_bw()

    # Pick the output device from the requested file type.
    if image_file_type == "pdf":
        grdevices.pdf(file=plotName)
    else:
        grdevices.png(file=plotName, width=512, height=512)
    plot_spec.plot()
    grdevices.dev_off()
def bargraph_language(results):
    """For each language, draw a dodged bar chart of lines-of-code per
    (problem, variation) and write it to bargraph-loc-lang-<language>.pdf.

    ``results`` maps (language, problem, variation) -> lines of code;
    ``languages`` and ``pretty_varis`` are module-level lookups.
    """
    r = robjects.r
    for language in languages:
        varis = []
        probs = []
        locs = []
        # Collect only the rows belonging to this language.
        for (lang, prob, var) in results.keys():
            if lang == language:
                loc = results[(lang, prob, var)]
                varis.append(pretty_varis[var])
                probs.append(prob)
                locs.append(loc)
        # One PDF device per language.
        r.pdf('bargraph-loc-lang-' + language + '.pdf',
              height=pdf_height(), width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Lines': IntVector(locs),
        })
        #print (df)
        gp = ggplot2.ggplot(df)
        # Fixed problem order on the x axis via scale_x_discrete limits.
        pp = gp + \
            ggplot2.aes_string (x='Problem', y='Lines', fill='Variation') + \
            ggplot2.geom_bar (position='dodge', stat='identity') + \
            ggplot2_options () + \
            ggplot2_colors () + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') +\
            robjects.r('ylab("Lines of Code")')
        pp.plot()
        r['dev.off']()
def make_output(tss_cov, out_prefix, upstream, downstream):
    """Write raw TSS coverage to <prefix>_raw.txt and plot full and zoomed
    coverage curves to <prefix>_full.pdf / <prefix>_zoom.pdf.

    Python 2 code (uses ``print >>``). ``tss_cov`` is indexed so that
    position i relative to the TSS maps to tss_cov[upstream + i].
    """
    # dump raw counts to file
    raw_out = open('%s_raw.txt' % out_prefix,'w')
    for i in range(-upstream,downstream+1):
        print >> raw_out, '%d\t%e' % (i, tss_cov[upstream+i])
    raw_out.close()
    # make plot data structures
    tss_i = ro.IntVector(range(-upstream,downstream+1))
    cov = ro.FloatVector(tss_cov)
    df = ro.DataFrame({'tss_i':tss_i, 'cov':cov})
    # construct full plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='tss_i', y='cov') + \
        ggplot2.geom_point() + \
        ggplot2.scale_x_continuous('TSS index') + \
        ggplot2.scale_y_continuous('Coverage')
    # plot to file
    grdevices.pdf(file='%s_full.pdf' % out_prefix)
    gp.plot()
    grdevices.dev_off()
    # construct zoomed plot (x limited to +/- 1 kb around the TSS)
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='tss_i', y='cov') + \
        ggplot2.geom_point() + \
        ggplot2.scale_x_continuous('TSS index',limits=ro.IntVector([-1000,1000])) + \
        ggplot2.scale_y_continuous('Coverage')
    # plot to file
    grdevices.pdf(file='%s_zoom.pdf' % out_prefix)
    gp.plot()
    grdevices.dev_off()
def testNewFromTaggedList(self):
    """A DataFrame built from a TaggedList carries R class 'data.frame'."""
    letters = robjects.r.letters
    numbers = robjects.r('1:26')
    tagged = rlc.TaggedList((letters, numbers), tags=('letters', 'numbers'))
    frame = robjects.DataFrame(tagged)
    self.assertEqual("data.frame", frame.rclass[0])
def test_index(self): expectedIndex = 1 # testing dataframe dict = { 'test1': R.IntVector((12, 12, 15)), 'test2': R.IntVector((32, 4, 12)), 'test3': R.IntVector((3, 12, 26)) } # note that test1 has 12 in row 1 and row 2 testFrame = R.DataFrame(dict) # because the dict from which the dataframe is made is not ordened, the testframe is ordened first to make sure that the column 'test2' is always at the same position # however, to do tis I'm using the index function that I'm testing, which might or might not be a right thing to do testFrame = testFrame.rx[True, R.r['with']( testFrame, R.r['order'](R.IntVector([ rFunctions.index(testFrame, 'test1'), rFunctions.index(testFrame, 'test2'), rFunctions.index(testFrame, 'test3') ])), )] actualDataFrameIndex = rFunctions.index(testFrame, 'test2') # testing matrix (same values as the dataframe) testMatrix = R.r.matrix(R.IntVector([12, 12, 15, 32, 4, 12, 3, 12, 26]), nrow=3) testMatrix.colnames = R.StrVector(['test1', 'test2', 'test3']) actualMatrixIndex = rFunctions.index(testMatrix, 'test2') self.assertEqual(expectedIndex, actualDataFrameIndex) self.assertEqual(expectedIndex, actualMatrixIndex)
def getdata():
    """Flask endpoint: load OHLC market data for a year, run R's getdata()
    on it, and return chart series (bars, SMA, CCI, dates) as JSON."""
    date = request.form['date']
    # NOTE(review): the submitted date is immediately overwritten with a
    # hard-coded '2015' — looks like leftover debug code; confirm intent.
    date = '2015'
    print('ok')
    print(date)
    datar = data[date]
    datar = robjects.DataFrame(datar)
    # Sampling/aggregation frequency passed to the R helper.
    freq = 5
    datar = robjects.r['getdata'](datar, freq)
    datar = pandas2ri.ri2py(datar)
    dates = datar['Date']
    dates = dates.tolist()
    # Candlestick bars as [Open, Close, Low, High] rows.
    bar = datar[['Open', 'Close', 'Low', 'High']]
    bar = np.array(bar)
    bar = bar.tolist()
    cci = datar['cci']
    cci = cci.tolist()
    cci = [round(c, 2) for c in cci]
    sma = datar['sma']
    sma = sma.tolist()
    option = {'bar': bar, 'sma': sma, 'cci': cci, 'dates': dates}
    return jsonify(option)
def Python_df_to_R_df(self, Python_df):
    """Convert a pandas DataFrame into an R data.frame.

    Args:
        Python_df: the pandas DataFrame to convert.

    Returns:
        The equivalent rpy2 R data.frame.
    """
    pandas2ri.activate()
    # The localconverter scopes the pandas<->R conversion rules to this
    # block; the original pre-assignment of an empty robjects.DataFrame({})
    # was dead code (immediately overwritten) and has been removed.
    with localconverter(robjects.default_converter + pandas2ri.converter):
        r_from_pd_df = robjects.conversion.py2rpy(Python_df)
    return r_from_pd_df
def test_dim():
    """A DataFrame built from a two-tag TaggedList is 26 rows x 2 columns."""
    letters = robjects.r.letters
    numbers = robjects.r('1:26')
    tagged = rlc.TaggedList((letters, numbers), tags=('letters', 'numbers'))
    frame = robjects.DataFrame(tagged)
    assert frame.nrow == 26
    assert frame.ncol == 2
def get_response_matrix(self):
    """Build an R data.frame of per-question student responses
    (1 = correct, 0 = incorrect), skipping discarded questions.

    Columns are numbered by ``matrix_index`` so discarded questions leave
    no gaps in the output frame.
    """
    matrix = {}
    matrix_index = 0
    # For each question:
    for question_index in range(self.test_length):
        question = self.questions[question_index]
        # Cannot have questions where either 100% or 0% were correct, as ltm will crash.
        # This also excludes questions the user has opted to discard.
        if not question.discard:
            # Header value.
            question_response_vector = []
            # Retrieve all the responses for each student.
            for j in range(len(self.students)):
                # question_response_vector.append(self.students[j].is_right(question_index))
                question_response_vector.append(
                    int(self.students[j].is_right(question_index)))
                # question_response_vector.append(1)
            matrix_index += 1
        else:
            # Ignore the exclusions.
            pass
        # Convert to a vector.
        # NOTE(review): this assignment runs even for discarded questions,
        # re-writing the previous vector at the same index (harmless), and
        # would raise NameError if the very first question is discarded —
        # confirm whether it belongs inside the if-branch.
        # matrix[question_index + 1] = robjects.BoolVector(question_response_vector)
        matrix[matrix_index] = robjects.IntVector(question_response_vector)
    # Convert the dictionary of vectors to a dataframe.
    response_matrix = robjects.DataFrame(matrix)
    return response_matrix
def calc_size_factors(self):
    """Estimate DESeq size factors for the instance's count table.

    Mutates ``self._count_df`` (rounds and casts counts to int) and
    returns the size factors as a pandas Series indexed by column.
    """
    # DESeq's estimator requires integer counts.
    self._count_df = np.round(self._count_df, decimals=0)
    self._count_df = self._count_df.astype(int)
    r_count_df = robjects.DataFrame(self._count_df)
    # Drop column names so estimateSizeFactorsForMatrix sees a bare matrix.
    r_count_df.colnames = robjects.rinterface.NULL
    r_size_factors = r.estimateSizeFactorsForMatrix(r_count_df)
    return pd.Series(r_size_factors, index=self._count_df.columns)
def convert_dataframe_columns(df, strings_as_factors=False):
    """
    Essentially the same as pandas.spy.common.convert_to_r_dataframe
    except we don't convert the index into strings — we are just grabbing
    the column data here.
    """
    import rpy2.rlike.container as rlc

    columns = rlc.OrdDict()
    #FIXME: This doesn't handle MultiIndex
    for column in df:
        series = df[column]
        dtype = series.dtype.type
        # Replace missing entries with the R NA marker for this dtype.
        cells = [
            cell if pd.notnull(cell) else NA_TYPES[dtype]
            for cell in series
        ]
        vector = VECTOR_TYPES[dtype](cells)
        if not strings_as_factors:
            # Wrap in I() so R keeps strings as-is instead of factorizing.
            identity = robjects.baseenv.get("I")
            vector = identity(vector)
        columns[column] = vector

    return robjects.DataFrame(columns)
def make_manhattan(grdevices, data, raw_pvals_vector, snps_to_highlight,
                   padj_cutoff, out_manhattan, title="", xlable="", xlim="-"):
    """Draw a Manhattan plot with R's qqman and write it to a PDF.

    Python 2 code (uses dict.iterkeys). ``data`` maps (chr, pos) ->
    chromosome number; ``raw_pvals_vector`` is aligned with the sorted
    (chr, pos) order. ``xlim`` is "min,max" or "-" for the default axis.
    """
    snp_names = []
    snp_pos = []
    chr_names = []
    # Build SNP name/position/chromosome vectors in sorted key order.
    for chr, pos in sorted(data.iterkeys()):
        snp_pos.append(pos)
        chr_names.append(data[chr, pos])
        snp_names.append("%s_%d" % (chr, pos))
    od_raw = rlc.OrdDict([("SNP", robjects.StrVector(snp_names)),
                          ("CHR", robjects.IntVector(chr_names)),
                          ("BP", robjects.IntVector(snp_pos)),
                          ("P", robjects.FloatVector(raw_pvals_vector))])
    # Alternate chromosome colours used by qqman.
    color_vector = robjects.StrVector(["blue4", "orange3"])
    sig_snps = robjects.StrVector(snps_to_highlight)
    qqman = rpackages.importr('qqman')
    grdevices.pdf(out_manhattan)
    if xlim != "-":
        # Restrict the x axis to the "min,max" range given in xlim.
        xmin = int(xlim.split(",")[0])
        xmax = int(xlim.split(",")[1])
        qqman.manhattan(robjects.DataFrame(od_raw),
                        highlight=sig_snps,
                        col=color_vector,
                        suggestiveline=False,
                        genomewideline=-1 * math.log10(padj_cutoff),
                        xlim=robjects.IntVector([xmin, xmax]),
                        xlab=xlable,
                        main=title)
    else:
        # Default view: fixed y range and Drosophila-style chromosome labels.
        qqman.manhattan(robjects.DataFrame(od_raw),
                        highlight=sig_snps,
                        col=color_vector,
                        suggestiveline=False,
                        genomewideline=-1 * math.log10(padj_cutoff),
                        main=title,
                        ylim=robjects.IntVector([0, 10]),
                        chrlabs=robjects.StrVector(
                            ["2L", "2R", "3L", "3R", "X"]))
    grdevices.dev_off()
def run(self, data, regression, resources=None):
    """ The method prints out summary of the BMA procedure and creates an imageplot.
    If resources has an entry 'bma_imageplot_filename', the imageplot is sent to this file as pdf.
    The method does not return any useful results - it is a tool for variable selection.
    Once you selected your variables, use estimate_linear_regression for further usage of the coefficients.
    Expects an entry 'outcome' in resources that provides the values of the dependent variable.
    'data' is a 2D numpy array of the actual data (nobservations x ncoefficients),
    it can be created by Dataset.create_regression_data_for_estimation(...).
    'regression' is an instance of a regression class.

    Python 2 code (old-style raise syntax).
    """
    r = robjects.r
    if data.ndim < 2:
        raise StandardError, "Argument 'data' must be a 2D numpy array."
    nobs = data.shape[0]
    nvar = data.shape[1]
    constant_position = resources.get(
        "constant_position", array([], dtype='int32'))  #position for intercept
    if constant_position.size == 0:  #position for intercept
        constant_position = -1
        nvalues = nvar
    else:
        constant_position = constant_position[0]
        nvalues = nvar + 1
    beta = zeros(nvalues).astype(float32)
    coef_names = resources.get("coefficient_names", nvar * [])
    # One named column per coefficient for the R data.frame.
    data_for_r = {}
    for icoef in range(len(coef_names)):
        data_for_r[coef_names[icoef]] = data[:, icoef]
    bma = importr("BMA")
    d = robjects.DataFrame(data_for_r)
    try:
        bma_params = {
            'x': d,
            'y': resources["outcome"],
            'glm.family': "gaussian",
            'strict': 1
        }
        #fit = bma.bic_glm(x=d, y=resources["outcome"], glm_family="gaussian", strict=1)
        fit = bma.bic_glm(**bma_params)
        fit[20] = ''  # to have less output in the summary
        r.summary(fit)
        # Send the imageplot to a PDF when a filename is provided,
        # otherwise to an interactive X11 device.
        filename = resources.get('bma_imageplot_filename', None)
        if filename is not None:
            r.pdf(file=filename)
            bma.imageplot_bma(fit)
            r['dev.off']()
        else:
            r.X11()
            bma.imageplot_bma(fit)
    except:
        # NOTE(review): bare except swallows every error from the R call
        # (including KeyboardInterrupt) and only logs a warning — confirm
        # this best-effort behavior is intentional.
        logger.log_warning("Error in BMA procedure.")
    return {}
def make_output_and(te_tss_cov, control_te_tss_cov, out_prefix, upstream, downstream):
    """For a whitelist of TE families, dump raw main/control TSS coverage to
    <prefix>_raw/ and plot overlaid coverage curves to <prefix>_plot/.

    Python 2 code (``print >>``, ``list * int`` on range()). Coverage is
    indexed so position i relative to the TSS maps to cov[upstream + i].
    """
    # clean raw counts dir
    if os.path.isdir('%s_raw' % out_prefix):
        shutil.rmtree('%s_raw' % out_prefix)
    os.mkdir('%s_raw' % out_prefix)
    # dump raw counts to file, restricted to the whitelisted TE names/classes
    for te in te_tss_cov:
        if te[0] in [
                'n', '*', 'HERVH-int', 'L2a', 'AluSx', 'AluJb', 'MIRb', 'LTR7'
        ] and te[1] in [
                'n', '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1', 'LTR/ERVL-MaLR',
                'LINE/L2', 'LTR/ERVL', 'SINE/MIR', 'DNA/hAT-Charlie',
                'LTR/ERVK', 'DNA/TcMar-Tigger'
        ]:
            raw_out = open(
                '%s_raw/%s_%s.txt' % (out_prefix, te[0].replace('/', '_'),
                                      te[1].replace('/', '_')), 'w')
            for i in range(-upstream, downstream + 1):
                print >> raw_out, '%d\t%e\t%e' % (i, te_tss_cov[te][
                    upstream + i], control_te_tss_cov[te][upstream + i])
            raw_out.close()
    # clean plot dirs
    if os.path.isdir('%s_plot' % out_prefix):
        shutil.rmtree('%s_plot' % out_prefix)
    os.mkdir('%s_plot' % out_prefix)
    # make data structures shared by all plots: x index repeated twice
    # (main + control) and matching series labels
    tss_i = ro.IntVector(2 * range(-upstream, downstream + 1))
    labels = ro.StrVector(['Main'] * (upstream + downstream + 1) +
                          ['Control'] * (upstream + downstream + 1))
    for te in te_tss_cov:
        if te[0] in [
                'n', '*', 'HERVH-int', 'L2a', 'AluSx', 'AluJb', 'MIRb', 'LTR7'
        ] and te[1] in [
                'n', '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1', 'LTR/ERVL-MaLR',
                'LINE/L2', 'LTR/ERVL', 'SINE/MIR', 'DNA/hAT-Charlie',
                'LTR/ERVK', 'DNA/TcMar-Tigger'
        ]:
            # Concatenate main coverage then control coverage for this TE.
            cov = ro.FloatVector(te_tss_cov[te] + control_te_tss_cov[te])
            df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})
            # construct full plot
            gp = ggplot2.ggplot(df) + \
                ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
                ggplot2.geom_point() + \
                ggplot2.scale_x_continuous('TSS index') + \
                ggplot2.scale_y_continuous('Coverage') + \
                ggplot2.scale_colour_discrete('')
            # plot to file
            grdevices.pdf(
                file='%s_plot/%s_%s.pdf' % (out_prefix, te[0].replace('/', '_'),
                                            te[1].replace('/', '_')))
            gp.plot()
            grdevices.dev_off()
def ortholog_blast(
        head_id, sequence_file, blast_out_value_tsv_file, read_function,
        write_function, blast_db_path, blast_out_xml_path,
        blast_out_asn_path, blast_out_txt_path, rbase):
    """Self-BLAST a set of MGG gene sequences and tabulate per-gene identity
    values for their best non-self hits.

    Results are cached in ``blast_out_value_tsv_file``; if it already
    exists it is read back (via ``read_function``) instead of re-running
    BLAST. Otherwise a BLAST database and XML output are produced (if
    missing), the XML is parsed with NCBIXML, and an R data.frame of
    (gene_name, blast_value, MGG_head) rows is rbind-ed together, written
    out with ``write_function`` and returned.
    """
    # The read step needs revising (comment translated from Chinese).
    if blast_out_value_tsv_file.exists() is True:
        # Cached result: read the TSV back through the supplied R reader.
        return(read_function(
            str(blast_out_value_tsv_file),
            header = True,
            sep = "\t",
            **{'stringsAsFactors': False},
            **{'check.names': False}
        ))
    blast_db_file=blast_db_path/(head_id+"_blast_head_db")
    blast_out_xml_file=blast_out_xml_path/(head_id+"_blast_out.xml")
    blast_out_asn_file=blast_out_asn_path/(head_id+"_blast_out.asn")
    blast_out_txt_file=blast_out_txt_path/(head_id+"_blast_out.txt")
    if blast_out_xml_file.exists() is False:
        # Build the BLAST database and run the search only when needed.
        blastdb(sequence_file,blast_db_file)
        blast(blast_db_file,sequence_file,blast_out_xml_file,blast_out_asn_file,blast_out_txt_file)
    with open(blast_out_xml_file) as xml_fl:
        # R list accumulating one data.frame per query gene (1-based index).
        R_blast_vlaue_list=rbase.list()
        i=1
        for record in NCBIXML.parse(xml_fl):
            gene_name_list=[]
            blast_value_list=[]
            identity_perscent=0
            gene_name=record.query.split()[0]
            # NOTE(review): `break` stops at the first non-MGG query —
            # confirm `continue` was not intended.
            if gene_name[0:3]!="MGG":
                break
            if record.alignments:
                if len(record.alignments)>1:
                    for alignment in record.alignments:
                        max_flag=-1
                        # Skip the query's hit against itself.
                        if alignment.hit_def==gene_name:continue
                        for hsp in alignment.hsps:
                            # max_flag limits this to the first HSP only.
                            if max_flag > -1:
                                break
                            identity_perscent=hsp.identities/hsp.align_length
                            max_flag=max_flag+20
                            blast_value_list.append(identity_perscent)
                            gene_name_list.append(alignment.hit_def)
                    # One row per retained hit, all tagged with the query gene.
                    MGG_head_id=[gene_name]*len(gene_name_list)
                    R_blast_vlaue_list.rx2[i]=robjects.DataFrame({
                        "gene_name":robjects.StrVector(gene_name_list),
                        "blast_value":robjects.FloatVector(blast_value_list),
                        "MGG_head":robjects.StrVector(MGG_head_id)
                    })
                    i=i+1
    # Stack all per-gene frames into a single data.frame and persist it.
    R_blast_vlaue_df=rbase.do_call("rbind",R_blast_vlaue_list)
    write_function(R_blast_vlaue_df,**{'file': str(blast_out_value_tsv_file)},**{'append': False},**{'quote': False},**{'sep': "\t"},**{'row.names': False},**{'col.names': True})
    return R_blast_vlaue_df
def preprocess_df_R(self, dataframe):
    """Convert a pandas DataFrame of numeric columns into an R data.frame.

    Every column is coerced to an R FloatVector (non-numeric columns
    will fail inside rpy2 — the original asked "What happens if we pass
    the wrong type?").
    """
    columns = {
        name: robjects.FloatVector(dataframe[name])
        for name in dataframe.columns
    }
    return robjects.DataFrame(columns)