def classifyTestData(testFilePath, modelRoot):
    """
    Calls traverseDecisionTreeModel() to classify the test data on the trained model
    and generates the confusion matrix and error counts at the given depth.
    :param testFilePath: Path to the test file
    :param modelRoot: Root node of the decision tree of the trained model
    """
    correctlyClassifiedInstances = 0
    incorrectlyClassifiedInstances = 0
    testDataList = []
    testFile = open(testFilePath, newline='')
    csvObject = csv.reader(testFile)
    label = featureList[len(featureList) - 1]
    classLabels = featureAndValueMapping.get(label)
    classLabelCount = len(classLabels)
    ConfusionMatrix = [[0 for _ in range(int(classLabelCount))] for _ in range(int(classLabelCount))]
    for row in csvObject:
        predictedLabel = traverseDecisionTreeModel(row, modelRoot)
        ConfusionMatrix[int(row[len(row) - 1]) - 1][int(predictedLabel) - 1] += 1
        if predictedLabel == row[len(row) - 1]:
            correctlyClassifiedInstances += 1
        else:
            incorrectlyClassifiedInstances += 1
    testFile.close()
    df = DataFrame(ConfusionMatrix)
    df.columns = classLabels
    df.index = classLabels
    print("Confusion Matrix ::\n")
    print(df)
    print("Correctly Classified Instances ", correctlyClassifiedInstances)
    print("Incorrectly Classified Instances ", incorrectlyClassifiedInstances)
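# For comparison, a hedged sketch of the same confusion-matrix bookkeeping done with
# pandas.crosstab; the `actual`/`predicted` lists below are made-up stand-ins for the
# CSV class labels and the outputs of traverseDecisionTreeModel() above.
import pandas as pd

actual = ["1", "2", "2", "1", "3"]
predicted = ["1", "2", "1", "1", "3"]

confusion = pd.crosstab(pd.Series(actual, name="actual"),
                        pd.Series(predicted, name="predicted"))
correct = sum(a == p for a, p in zip(actual, predicted))
print(confusion)
print("Correctly classified:", correct)
print("Incorrectly classified:", len(actual) - correct)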
def _unstack_frame(obj, level): from pandas.core.internals import BlockManager, make_block if obj._is_mixed_type: unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy obj.index, level=level, value_columns=obj.columns) new_columns = unstacker.get_new_columns() new_index = unstacker.get_new_index() new_axes = [new_columns, new_index] new_blocks = [] mask_blocks = [] for blk in obj._data.blocks: bunstacker = _Unstacker(blk.values.T, obj.index, level=level, value_columns=blk.items) new_items = bunstacker.get_new_columns() new_values, mask = bunstacker.get_new_values() mblk = make_block(mask.T, new_items, new_columns) mask_blocks.append(mblk) newb = make_block(new_values.T, new_items, new_columns) new_blocks.append(newb) result = DataFrame(BlockManager(new_blocks, new_axes)) mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) return result.ix[:, mask_frame.sum(0) > 0] else: unstacker = _Unstacker(obj.values, obj.index, level=level, value_columns=obj.columns) return unstacker.get_result()
def test_read_empty_dta(self):
    empty_ds = DataFrame(columns=['unit'])
    # GH 7369, make sure can read a 0-obs dta file
    with tm.ensure_clean() as path:
        empty_ds.to_stata(path, write_index=False)
        empty_ds2 = read_stata(path)
        tm.assert_frame_equal(empty_ds, empty_ds2)
def feature_engineering(raw_data): input_data = raw_data[['Date','AdjClose','AdjVolume']].dropna() train_ratio = 0.8 savedata= DataFrame(input_data) savedata.to_csv('/home/peng/workspace/datafortrainCao.csv', header=0) #=========================================================================== # Vol_5 = index_cal().VOL_n(input_data, 5) # Vol_10 = index_cal().VOL_n(input_data, 10) # Vol_15 = index_cal().VOL_n(input_data, 15) # Vol_20 = index_cal().VOL_n(input_data, 20) # RDV_5 = index_cal().RDV_n(input_data, 5) # RDV_10 = index_cal().RDV_n(input_data, 10) # RDV_15 = index_cal().RDV_n(input_data, 15) # RDV_20 = index_cal().RDV_n(input_data, 20) #=========================================================================== EMA15 = index_cal().EMAn(input_data, 15) RDP_5 = index_cal().RDP_n(input_data, 5) RDP_10 = index_cal().RDP_n(input_data, 10) RDP_15 = index_cal().RDP_n(input_data, 15) RDP_20 = index_cal().RDP_n(input_data, 20) RDP_plus_5 = index_cal().RDP_plus_n(input_data, 5) all_data = mergeColumnByDate(RDP_5,RDP_10,RDP_15,RDP_20,EMA15,RDP_plus_5) features = all_data[['RDP-5','RDP-10','RDP-15','RDP-20','EMA15']] features = PCA().fit_transform(features.values) (x_train, x_test) = divideTrainTest(features, train_ratio) objectives = all_data['RDP+5'].values (y_train,y_real) = divideTrainTest(objectives, train_ratio) return (x_train,y_train,x_test,y_real)
def get_daily_normals(self, start_date=None, end_date=None, stamp_year=2001):
    """
    :type start_date: datetime.datetime
    :type end_date: datetime.datetime
    :rtype: list, list
    """
    self.stamp_day_dates = pandas.DatetimeIndex(start=datetime(stamp_year, 1, 1),
                                                end=date(stamp_year, 12, 31),
                                                freq=pandas.datetools.offsets.Day())

    if start_date is None:
        start_date = self.time[0]

    if end_date is None:
        end_date = self.time[-1]

    di = pandas.DatetimeIndex(data=self.time)
    df = DataFrame(data=self.data, index=di, columns=["values"])
    df = df.select(lambda d: start_date <= d <= end_date)
    df_mean = df.groupby(by=lambda d: (d.day, d.month)).mean()

    return self.stamp_day_dates, df_mean.ix[[(d.day, d.month) for d in self.stamp_day_dates], "values"]
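# The method above relies on APIs that have since been removed (pandas.datetools,
# DataFrame.select, .ix). As a point of reference, a minimal standalone sketch of the
# same daily-normals idea with the current pandas API; the index and values below are
# synthetic stand-ins for self.time / self.data.
import numpy as np
import pandas as pd

times = pd.date_range("2000-01-01", "2002-12-31", freq="D")
df = pd.DataFrame({"values": np.random.rand(len(times))}, index=times)

# average across years for each (month, day) pair, then stamp onto one reference year
normals = df.groupby([df.index.month, df.index.day]).mean()
stamp_days = pd.date_range("2001-01-01", "2001-12-31", freq="D")
daily_normals = normals.loc[[(d.month, d.day) for d in stamp_days], "values"]
print(daily_normals.head())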
def getList(self, week, club, colList, filename):
    s = pd.read_csv(filename)
    df2 = DataFrame(s)
    df3 = DataFrame(s)
    columns = df2.columns
    xlist = list()
    for c in columns:
        if c.upper().find("PRICE ADJUSTMENT") == -1:
            if c.find(week) != -1:
                xlist.append(str(c))
    indexList = list()
    for xcolumn in xlist:
        colist = list()
        colist.append(xcolumn)
        df4 = DataFrame(df3, columns=colist)[~df3[xcolumn].isnull()]
        for row in df4.iterrows():
            if row[1][0] == club:
                indexList.append(row[0])
    fin = DataFrame(df2, index=indexList, columns=colList)
    if fin.empty:
        return fin
    fin["Camp"] = club
    fin["Week"] = week
    return fin
def plot(self):
    """
    Plots 2 graphs. One for the N-period moving average, lower and upper bands.
    One for P/L and position.
    """
    columns = {"Upper Bands": self.upper_bands,
               "Lower Bands": self.lower_bands,
               "Moving Means": self.moving_means,
               "Opening Prices": self.prices}
    df = DataFrame(columns, index=self.dates)
    df.plot()

    fig = plt.figure(num=None, figsize=(18, 10), dpi=80, facecolor='w', edgecolor='k')
    fig.add_subplot(121)
    trans_dates = [tran.date for tran in self.transactions]
    # we negate the value here to show profit/loss
    trans = Series([-tran.value() for tran in self.transactions], index=trans_dates)
    position = Series([tran.units for tran in self.transactions], index=trans_dates)
    position.cumsum().plot(label="Position")
    plt.xlabel("Date")
    plt.ylabel("Position")
    plt.title("Position over Time")
    plt.legend(loc="best")

    fig.add_subplot(122)
    trans.cumsum().plot(label="P/L")
    plt.xlabel("Date")
    plt.ylabel("Profit/Loss")
    plt.title("Profit and Loss over Time")
    plt.legend(loc="best")
    plt.show()
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    index_vals = self[index]
    column_vals = self[columns]
    mindex = MultiIndex.from_arrays([index_vals, column_vals])

    try:
        mindex._verify_integrity()
    except Exception:
        raise Exception("duplicate index/column pairs!")

    if values is None:
        items = self.columns - [index, columns]
        mat = self.reindex(columns=items).values
    else:
        items = [values]
        mat = np.atleast_2d(self[values].values).T

    stacked = DataFrame(mat, index=mindex, columns=items)

    if not mindex.is_lexsorted():
        stacked = stacked.sortlevel(level=0)

    unstacked = stacked.unstack()
    if values is not None:
        unstacked.columns = unstacked.columns.droplevel(0)
    return unstacked
def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, nw_lags, nobs, df, nw_overlap): from pandas.core.frame import group_agg xx_inv = math.inv(xx) yv = y.values if cluster_axis is None: if nw_lags is None: return xx_inv * (rmse ** 2) else: resid = yv - np.dot(x.values, beta) m = (x.values.T * resid).T xeps = math.newey_west(m, nw_lags, nobs, df, nw_overlap) return np.dot(xx_inv, np.dot(xeps, xx_inv)) else: Xb = np.dot(x.values, beta).reshape((len(x.values), 1)) resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=["resid"]) if cluster_axis == 1: x = x.swaplevel(0, 1).sortlevel(0) resid = resid.swaplevel(0, 1).sortlevel(0) m = group_agg(x.values * resid.values, x.index._bounds, lambda x: np.sum(x, axis=0)) if nw_lags is None: nw_lags = 0 xox = 0 for i in range(len(x.index.levels[0])): xox += math.newey_west(m[i : i + 1], nw_lags, nobs, df, nw_overlap) return np.dot(xx_inv, np.dot(xox, xx_inv))
def test_missing_value_generator(self): types = ('b','h','l') df = DataFrame([[0.0]],columns=['float_']) with tm.ensure_clean() as path: df.to_stata(path) with StataReader(path) as rdr: valid_range = rdr.VALID_RANGE expected_values = ['.' + chr(97 + i) for i in range(26)] expected_values.insert(0, '.') for t in types: offset = valid_range[t][1] for i in range(0,27): val = StataMissingValue(offset+1+i) self.assertTrue(val.string == expected_values[i]) # Test extremes for floats val = StataMissingValue(struct.unpack('<f',b'\x00\x00\x00\x7f')[0]) self.assertTrue(val.string == '.') val = StataMissingValue(struct.unpack('<f',b'\x00\xd0\x00\x7f')[0]) self.assertTrue(val.string == '.z') # Test extremes for floats val = StataMissingValue(struct.unpack('<d',b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]) self.assertTrue(val.string == '.') val = StataMissingValue(struct.unpack('<d',b'\x00\x00\x00\x00\x00\x1a\xe0\x7f')[0]) self.assertTrue(val.string == '.z')
def export_converted_values(self):
    """
    Converts per-100g values to per-serving values.
    Once this function is invoked, a new file is generated which serves as the database.
    This function only needs to be called once.
    :return:
    """
    import math

    file_converted = self.file_converted_values
    data_file = self.file_database
    data = self.read_csv(data_file)
    converted_data = list()
    for item in data.values:
        converted_list = list(item[0:2])
        sub_item = item[2:50]
        for nutrient in sub_item:
            if math.isnan(nutrient):
                nutrient = 0
            # convert from per-100g to per-serving using the last value of the slice
            converted_list.append(nutrient * sub_item[47] / 100)
        converted_list.append(item[50])
        converted_data.append(converted_list)
    if len(self.cols) == 0:
        for col_name in list(data._info_axis._data):
            self.cols.append(col_name)
    df = DataFrame(data=converted_data, columns=self.cols)
    df.to_csv(file_converted, index=False)
    print('File has been exported')
def get_result(self): if self._is_series: if self.axis == 0: new_data = com._concat_compat([x.get_values() for x in self.objs]) name = com._consensus_name_attr(self.objs) return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat') else: data = dict(zip(range(len(self.objs)), self.objs)) index, columns = self.new_axes tmpdf = DataFrame(data, index=index) if columns is not None: tmpdf.columns = columns return tmpdf.__finalize__(self, method='concat') else: mgrs_indexers = [] for obj in self.objs: mgr = obj._data indexers = {} for ax, new_labels in enumerate(self.new_axes): if ax == self.axis: # Suppress reindexing on concat axis continue obj_labels = mgr.axes[ax] if not new_labels.equals(obj_labels): indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy) if not self.copy: new_data._consolidate_inplace() return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat')
def test_read_write_dta12(self): original = DataFrame( [(1, 2, 3, 4, 5, 6)], columns=[ "astringwithmorethan32characters_1", "astringwithmorethan32characters_2", "+", "-", "short", "delete", ], ) formatted = DataFrame( [(1, 2, 3, 4, 5, 6)], columns=[ "astringwithmorethan32characters_", "_0astringwithmorethan32character", "_", "_1_", "_short", "_delete", ], ) formatted.index.name = "index" formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: original.to_stata(path, None) tm.assert_equal(len(w), 1) # should get a warning for that format. written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted)
def _wrap_aggregated_output(self, output, mask, comp_ids): agg_axis = 0 if self.axis == 1 else 1 agg_labels = self._obj_with_exclusions._get_axis(agg_axis) if len(output) == len(agg_labels): output_keys = agg_labels else: output_keys = sorted(output) try: output_keys.sort() except Exception: # pragma: no cover pass if isinstance(agg_labels, MultiIndex): output_keys = MultiIndex.from_tuples(output_keys, names=agg_labels.names) if not self.as_index: result = DataFrame(output, columns=output_keys) group_levels = self._get_group_levels(mask, comp_ids) for i, (name, labels) in enumerate(group_levels): result.insert(i, name, labels) result = result.consolidate() else: index = self._get_multi_index(mask, comp_ids) result = DataFrame(output, index=index, columns=output_keys) if self.axis == 1: result = result.T return result
def test_missing_value_generator(self): types = ("b", "h", "l") df = DataFrame([[0.0]], columns=["float_"]) with tm.ensure_clean() as path: df.to_stata(path) with StataReader(path) as rdr: valid_range = rdr.VALID_RANGE expected_values = ["." + chr(97 + i) for i in range(26)] expected_values.insert(0, ".") for t in types: offset = valid_range[t][1] for i in range(0, 27): val = StataMissingValue(offset + 1 + i) self.assertTrue(val.string == expected_values[i]) # Test extremes for floats val = StataMissingValue(struct.unpack("<f", b"\x00\x00\x00\x7f")[0]) self.assertTrue(val.string == ".") val = StataMissingValue(struct.unpack("<f", b"\x00\xd0\x00\x7f")[0]) self.assertTrue(val.string == ".z") # Test extremes for floats val = StataMissingValue(struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]) self.assertTrue(val.string == ".") val = StataMissingValue(struct.unpack("<d", b"\x00\x00\x00\x00\x00\x1a\xe0\x7f")[0]) self.assertTrue(val.string == ".z")
def stack_sparse_frame(frame): """ Only makes sense when fill_value is NaN """ lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)] nobs = sum(lengths) # this is pretty fast minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) inds_to_concat = [] vals_to_concat = [] # TODO: Figure out whether this can be reached. # I think this currently can't be reached because you can't build a SparseDataFrame # with a non-np.NaN fill value (fails earlier). for _, series in compat.iteritems(frame): if not np.isnan(series.fill_value): raise TypeError('This routine assumes NaN fill value') int_index = series.sp_index.to_int_index() inds_to_concat.append(int_index.indices) vals_to_concat.append(series.sp_values) major_labels = np.concatenate(inds_to_concat) stacked_values = np.concatenate(vals_to_concat) index = MultiIndex(levels=[frame.index, frame.columns], labels=[major_labels, minor_labels], verify_integrity=False) lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=['foo']) return lp.sortlevel(level=0)
def stack_sparse_frame(frame): """ Only makes sense when fill_value is NaN """ lengths = [s.sp_index.npoints for _, s in frame.iteritems()] nobs = sum(lengths) # this is pretty fast minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) inds_to_concat = [] vals_to_concat = [] for _, series in frame.iteritems(): if not np.isnan(series.fill_value): raise Exception('This routine assumes NaN fill value') int_index = series.sp_index.to_int_index() inds_to_concat.append(int_index.indices) vals_to_concat.append(series.sp_values) major_labels = np.concatenate(inds_to_concat) stacked_values = np.concatenate(vals_to_concat) index = MultiIndex(levels=[frame.index, frame.columns], labels=[major_labels, minor_labels]) lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=['foo']) return lp.sortlevel(level=0)
def test_read_write_dta12(self): original = DataFrame([(1, 2, 3, 4, 5, 6)], columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-', 'short', 'delete']) formatted = DataFrame([(1, 2, 3, 4, 5, 6)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_', '_short', '_delete']) formatted.index.name = 'index' formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: original.to_stata(path, None) tm.assert_equal(len(w), 1) # should get a warning for that format. written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    index_vals = self[index]
    column_vals = self[columns]
    mindex = MultiIndex.from_arrays([index_vals, column_vals],
                                    names=[index, columns])

    if values is None:
        items = self.columns - [index, columns]
        mat = self.reindex(columns=items).values
    else:
        items = [values]
        mat = np.atleast_2d(self[values].values).T

    stacked = DataFrame(mat, index=mindex, columns=items)

    if not mindex.is_lexsorted():
        stacked = stacked.sortlevel(level=0)

    unstacked = stacked.unstack()
    if values is not None:
        unstacked.columns = unstacked.columns.droplevel(0)
    return unstacked
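# This is an early internal implementation; a short usage sketch of the public
# DataFrame.pivot it backs, on illustrative data.
import pandas as pd

df = pd.DataFrame({"date": ["d1", "d1", "d2", "d2"],
                   "ticker": ["A", "B", "A", "B"],
                   "price": [1.0, 2.0, 1.5, 2.5]})
wide = df.pivot(index="date", columns="ticker", values="price")
print(wide)  # one row per date, one column per ticker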
def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return Series([]) key_names = [ping.name for ping in self.groupings] if isinstance(values[0], Series): if not_indexed_same: data_dict = dict(zip(keys, values)) result = DataFrame(data_dict).T if len(self.groupings) > 1: result.index = MultiIndex.from_tuples(keys, names=key_names) return result else: cat_values = np.concatenate([x.values for x in values]) cat_index = values[0].index if len(values) > 1: cat_index = cat_index.append([x.index for x in values[1:]]) return Series(cat_values, index=cat_index) elif isinstance(values[0], DataFrame): # possible that Series -> DataFrame by applied function return self._wrap_frames(keys, values, not_indexed_same=not_indexed_same) else: if len(self.groupings) > 1: index = MultiIndex.from_tuples(keys, names=key_names) return Series(values, index) else: return Series(values, keys)
def testEnsembleForecastWeightCombinesForecasts(self):
    result = self.weight(self.forecasts)
    self.assertIsInstance(result, Forecast)
    for i in [0, 1, 5, 10, 19]:
        expected = [fcst.mean.iloc[i] for fcst in self.forecasts]
        expected = DataFrame(expected)
        self.assertTrue(result.mean.iloc[i].equals(expected.mean()))
def getCamps(self, week):
    s = pd.read_csv("D:/UTDallasStudy/summer.csv")
    df2 = DataFrame(s)
    df3 = DataFrame(s)
    columns = df2.columns
    x = set()
    xlist = list()
    ylist = list()
    for c in columns:
        if c.upper().find("PRICE ADJUSTMENT") == -1:
            if c.find(week) != -1:
                # keep only the column name before any trailing ".1", ".2", ... suffix
                x.add(str(c).split(".")[0])
                ylist.append(c)
            else:
                x.add(str(c))
    for x1 in x:
        xlist.append(x1)
    xlist.sort()
    campset = set()
    for y in ylist:
        clist = list()
        clist.append(y)
        res = DataFrame(df3, columns=clist)[~df3[y].isnull()]
        for row in res.iterrows():
            campName = row[1][0]
            campset.add(campName)
    camplist = list()
    for camp in campset:
        camplist.append(camp)
    camplist.sort()
    return camplist
def generate_input_df(self, n_topics, vocab_size, document_length, n_docs, previous_vocab=None, vocab_prefix=None, df_outfile=None, vocab_outfile=None, n_bags=1): print "Generating input DF" # word_dists is the topic x document_length matrix word_dists = self.generate_word_dists(n_topics, vocab_size, document_length) # generate each document x terms vector docs = np.zeros((vocab_size, n_docs), dtype=int64) for i in range(n_docs): docs[:, i] = self.generate_document(word_dists, n_topics, vocab_size, document_length) if previous_vocab is not None: width = vocab_size/n_topics high = int(document_length / width) # randomly initialises the previous_vocab part additional = np.random.randint(high, size=(len(previous_vocab), n_docs)) docs = np.vstack((additional, docs)) df = DataFrame(docs) df = df.transpose() print df.shape if self.make_plot: self._plot_nicely(df, 'Documents X Terms', 'Terms', 'Docs') if df_outfile is not None: df.to_csv(df_outfile) print "Generating vocabularies" # initialises vocab to either previous vocab or a blank list if previous_vocab is not None: vocab = previous_vocab.tolist() else: vocab = [] # add new words for n in range(vocab_size): if vocab_prefix is None: word = "word_" + str(n) else: word = vocab_prefix + "_word_" + str(n) # if more than one bag, then initialise word type too if n_bags > 1: word_type = np.random.randint(n_bags) tup = (word, word_type) vocab.append(tup) else: vocab.append(word) # save to txt vocab = np.array(vocab) if vocab_outfile is not None: np.savetxt(vocab_outfile, vocab, fmt='%s') return df, vocab
def get_result(self): # series only if self._is_series: # stack blocks if self.axis == 0: new_data = com._concat_compat([x._values for x in self.objs]) name = com._consensus_name_attr(self.objs) return (Series(new_data, index=self.new_axes[0], name=name, dtype=new_data.dtype) .__finalize__(self, method='concat')) # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) index, columns = self.new_axes tmpdf = DataFrame(data, index=index) # checks if the column variable already stores valid column # names (because set via the 'key' argument in the 'concat' # function call. If that's not the case, use the series names # as column names if (columns.equals(Index(np.arange(len(self.objs)))) and not self.ignore_index): columns = np.array([data[i].name for i in range(len(data))], dtype='object') indexer = isnull(columns) if indexer.any(): columns[indexer] = np.arange(len(indexer[indexer])) tmpdf.columns = columns return tmpdf.__finalize__(self, method='concat') # combine block managers else: mgrs_indexers = [] for obj in self.objs: mgr = obj._data indexers = {} for ax, new_labels in enumerate(self.new_axes): if ax == self.axis: # Suppress reindexing on concat axis continue obj_labels = mgr.axes[ax] if not new_labels.equals(obj_labels): indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy) if not self.copy: new_data._consolidate_inplace() return (self.objs[0]._from_axes(new_data, self.new_axes) .__finalize__(self, method='concat'))
def test_write_missing_strings(self):
    original = DataFrame([["1"], [None]], columns=["foo"])
    expected = DataFrame([["1"], [""]], columns=["foo"])
    expected.index.name = "index"
    with tm.ensure_clean() as path:
        original.to_stata(path)
        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
def test_no_index(self):
    columns = ["x", "y"]
    original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns)
    original.index.name = "index_not_written"
    with tm.ensure_clean() as path:
        original.to_stata(path, write_index=False)
        written_and_read_again = self.read_dta(path)
        tm.assertRaises(KeyError, lambda: written_and_read_again["index_not_written"])
class XLSDataFrameWriter(object):
    def __init__(self, records, columns):
        self.dataframe = DataFrame(records, columns=columns)

    def write_to_excel(self, excel_writer, sheet_name, header=False, index=False):
        self.dataframe.to_excel(excel_writer, sheet_name, header=header, index=index)
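# Hedged usage example for the wrapper above. pd.ExcelWriter is the real pandas entry
# point (it requires an Excel engine such as openpyxl); the records, column names, and
# output file are invented sample data.
import pandas as pd

records = [(1, "a"), (2, "b")]
writer_obj = XLSDataFrameWriter(records, columns=["id", "name"])
with pd.ExcelWriter("output.xlsx") as excel_writer:
    writer_obj.write_to_excel(excel_writer, "Sheet1", header=True, index=False)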
def test_column_order_plus_index(self):
    query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
    col_order = ['STRING_3', 'STRING_2']
    result_frame = gbq.read_gbq(query, project_id=PROJECT_ID,
                                index_col='STRING_1', col_order=col_order)
    correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']})
    correct_frame.set_index('STRING_1', inplace=True)
    correct_frame = correct_frame[col_order]
    tm.assert_frame_equal(result_frame, correct_frame)
def test_column_order_plus_index(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" col_order = ["STRING_3", "STRING_2"] result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col="STRING_1", col_order=col_order) correct_frame = DataFrame({"STRING_1": ["a"], "STRING_2": ["b"], "STRING_3": ["c"]}) correct_frame.set_index("STRING_1", inplace=True) correct_frame = correct_frame[col_order] tm.assert_frame_equal(result_frame, correct_frame)
def test_excessively_long_string(self):
    str_lens = (1, 244, 500)
    s = {}
    for str_len in str_lens:
        s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
    original = DataFrame(s)
    with tm.assertRaises(ValueError):
        with tm.ensure_clean() as path:
            original.to_stata(path)
def rpy2py_dataframe(obj):
    items = OrderedDict((k, rpy2py(v) if isinstance(v, Sexp) else v)
                        for k, v in obj.items())
    res = PandasDataFrame.from_dict(items)
    res.index = obj.rownames
    return res
def makeNewsDataCsv(cls, cur=None, start_date=None, end_date=None, basic_path=None, word_trend_file=None, news_file=None, output_file=None, stock_id=None): if cur == None or start_date == None or end_date == None or word_trend_file is None or output_file == None or stock_id == None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) news_path = os.path.join(basic_path, news_file) word_trend_path = os.path.join(basic_path, word_trend_file) output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) columns = [ "stock_id", "date", "opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount", "rate" ] + ["news_pos_num", "news_neg_num"] data = {} for k in columns: data[k] = [] pd.DataFrame(data).to_csv(output_path, index=False, columns=columns) word_trend = {} word_trend_temp = pd.read_csv(word_trend_path) for k in word_trend_temp["0"].keys(): word_trend[word_trend_temp["0"][k]] = [ word_trend_temp["1"][k], word_trend_temp["2"][k] ] p_up = word_trend['total_words'][0] / (word_trend['total_words'][0] + word_trend['total_words'][1]) p_down = word_trend['total_words'][1] / (word_trend['total_words'][0] + word_trend['total_words'][1]) cur.execute( "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) count = cur.fetchall() count = count[0][0] skip = 100 slimit = 0 while slimit < count: cur.execute( "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id, start_date, end_date, 0 if slimit - 1 < 0 else slimit - 1, skip if slimit - 1 < 0 else skip + 1)) slimit += skip history_tt = cur.fetchall() history_t = [] for h in history_tt: history_t.append([ int(h[0]), float(h[1]), float(h[2]), float(h[3]), float(h[4]), float(h[5]), float(h[6]), float(h[7]), float(h[8]), str(h[9]) ]) del history_tt history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'opening': history_temp[1], 'closing': history_temp[2], 'difference': history_temp[3], 'percentage_difference': history_temp[4], 'lowest': history_temp[5], 'highest': history_temp[6], 'volume': history_temp[7], 'amount': history_temp[8], 'date': history_temp[9] } del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by=['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) ''' ''' sdate = str(history['date'][history['date'].keys()[0]]) edate = str(history['date'][history['date'].keys()[-1]]) # sdate = datetime.datetime.strptime(sdate,'%Y-%m-%d') # sdate = (sdate - datetime.timedelta(days=0)).strftime('%Y-%m-%d') cur.execute( "SELECT GROUP_CONCAT(id SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' group by time" % (sdate, edate)) news_temp = cur.fetchall() news_by_date = {} news_by_id = {} for n in news_temp: news_by_date[str(n[1])] = n[0].split(",") for nid in news_by_date[str(n[1])]: news_by_id[nid] = None del news_temp nid_len = len(news_by_id) reader = pd.read_csv(news_path, chunksize=1000) for sentences in reader: if nid_len > 0: for k in sentences['1'].keys(): nid = str(sentences['0'][k]) if nid in news_by_id and news_by_id[nid] == None: news_by_id[nid] = 
str(sentences['1'][k]).split(" ") wp_up = p_up wp_down = p_down for w in news_by_id[nid]: if w not in word_trend: wp_up *= (1 / word_trend['total_words'][0]) wp_down *= (1 / word_trend['total_words'][1]) else: if word_trend[w][0] > 0: wp_up *= word_trend[w][0] else: wp_up *= (1 / word_trend['total_words'][0]) if word_trend[w][1] > 0: wp_down *= word_trend[w][1] else: wp_down *= ( 1 / word_trend['total_words'][1]) while True: if wp_up < 1 and wp_down < 1: wp_up *= 10 wp_down *= 10 else: break news_by_id[nid] = [ wp_up / (wp_up + wp_down), -1 * wp_down / (wp_up + wp_down) ] nid_len -= 1 if nid_len <= 0: break else: break reader.close() del reader, sentences for d in news_by_date: sumn = [0, 0] for nid in news_by_date[d]: sumn[0] += news_by_id[nid][0] sumn[1] += news_by_id[nid][1] le = len(news_by_date[d]) if le > 0: sumn[0] /= le sumn[1] /= le news_by_date[d] = sumn print(d) history['news_pos_num'] = 0 history['news_neg_num'] = 0 for i in history.index: history.loc[i, 'rate'] = str( np.round(float(history['rate'][i]), 2)) if str(history['date'][i]) in news_by_date: history.loc[i, 'news_pos_num'] = str( np.round( float(news_by_date[str(history['date'][i])][0]), 2)) history.loc[i, 'news_neg_num'] = str( np.round( float(news_by_date[str(history['date'][i])][1]), 2)) else: history.loc[i, 'news_pos_num'] = "0" history.loc[i, 'news_neg_num'] = "0" #将经过标准化的数据处理成训练集和测试集可接受的形式 def func_train_data(data_stock): if cls.groupby_skip == False: cls.groupby_skip = True return None print("正在处理的股票代码:%06s" % data_stock.name) data = {} for k in columns: data[k] = [] for i in range(len(data_stock) - 1): for k in data: data[k].append(data_stock.iloc[i][k]) pd.DataFrame(data).to_csv(output_path, index=False, header=False, mode="a", columns=columns) g_stock = history.groupby(by=["stock_id"]) #清空接收路径下的文件,初始化列名 cls.groupby_skip = False g_stock.apply(func_train_data)
if lista_nombres[i][-2] == lista_nombres[i][1]:
    n2 = ""
medico = clases.Medico(n1, n2, ap1, ap2, lista_ruts[i], lista_edad[i],
                       lista_emails[i], lista_numero[i], lista_especialidad[i])
lista_medicos.append(medico)

clinica_objeto = clases.Clinica("Clinica de la Salud", "Público",
                                "Avenida Verdadera #123, Rancagua", "",
                                lista_medicos, lista_pacientes)

lista_citas = []
cita_vacia = clases.Cita("", "", "", "")
cita_csv = pd.read_csv('./datos/Citas.csv')
cita_csv = DataFrame(cita_csv)
codigo = cita_csv["codigo"].values
rut_paciente = cita_csv["rut paciente"].values
rut_medico = cita_csv["rut medico"].values
fecha_citada = cita_csv["fecha citada"].values
fecha_creacion = cita_csv["fecha de creacion"].values
modalidad = cita_csv["modalidad"].values
prestacion = cita_csv["prestacion"].values
confirmada = cita_csv["confirmada"].values
tiempo_restante = cita_csv["tiempo restante"].values

for i in range(len(codigo)):
    cita_vacia.setCodigo(codigo[i])
    cita_vacia.setPaciente(clinica_objeto.buscarPaciente(rut_paciente[i])[0])
    cita_vacia.setMedico(clinica_objeto.buscarMedico(rut_medico[i])[0])
    cita_vacia.setFechaCitada(parser.parse(fecha_citada[i]))
model.fit(x=X_train, y=y_train, epochs=3, batch_size=128, verbose=2, validation_split=0.1)

# predict
y_predict = model.predict(X_test)
# convert predictions back to labels
y_predict_label = label2tag(predictions=y_predict, y=y)
# compute accuracy
Y_test = label2tag(predictions=y_test, y=y)
print(sum([y_predict_label[i] == Y_test[i] for i in range(len(y_predict))]) / len(y_predict))

# load another test set, run predictions, and export the results
filename = 'xiaomi5a.csv'
test_data = pd.read_csv(filename)
x = test_data['comment']
X_cut = cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
X_seq = text2seq(texts_cut=X_cut, maxlen=maxlen, tokenizer=tokenizer)
X_seq = np.array(X_seq)
y_predict = model.predict(X_seq)
y_predict_label = label2tag(predictions=y_predict, y=y)

# convert the Series to a DataFrame before export
out_x = x.to_frame(name=None)
out_y = DataFrame(y_predict_label)
out_x.to_csv('x.csv')
out_y.to_csv('y.csv')
# model
starttime = datetime.datetime.now()  # calculate time
sample_model = KMeans(n_clusters=10).fit(images_train_sample)  # K-Means
endtime = datetime.datetime.now()  # calculate time
scikit_learn_execution_time = (endtime - starttime).seconds
print('scikit-learn execution time:', scikit_learn_execution_time)  # measured ~429s

# objective function value
cluster = sample_model.labels_
objective_function_value = sample_model.inertia_  # 394810072745.4526
print('scikit-learn objective function value:', objective_function_value)

# accuracy
crosstable_data = {'label': labels_train_sample, 'cluster': list(cluster)}
df = DataFrame(crosstable_data)
crosstable = pd.crosstab(index=df['label'], columns=df['cluster'])
scikit_accuracy = sum(crosstable.max(axis=0)) / sum(crosstable.sum())  # 0.22124
print('scikit-learn accuracy:', scikit_accuracy)

# PART 3: my kmeans
# model
starttime_2 = datetime.datetime.now()  # calculate start time
cluster_center, cluster_assign = Kmeans(array(images_train_sample), 10)
def test_stata_doc_examples(self):
    with tm.ensure_clean() as path:
        df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
        df.to_stata(path)
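# The doc example exercised above amounts to a simple write/read round trip. A minimal
# sketch using to_stata/read_stata; the temporary-file handling here is simplified and
# assumed, not taken from the test suite.
import os
import tempfile
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 2), columns=list("AB"))
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "example.dta")
    df.to_stata(path)
    back = pd.read_stata(path)  # the written index comes back as an 'index' column by default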
target = cl.astype('int')
print(target)

# split into training and test data
train_X, test_X, train_y, test_y = train_test_split(data, target, train_size=0.9, random_state=42)
print(train_y)

# build the classifier
clf = neighbors.KNeighborsClassifier(n_neighbors=25)
data_clf = clf.fit(train_X, train_y)

# predict
test_y_predicted = data_clf.predict(test_X)
"""print(test_y_predicted)
# ground truth
print(test_y)"""

# performance
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print("accuracy : ", accuracy)
precision = metrics.precision_score(test_y, test_y_predicted, average='macro')
print("precision : ", precision)
recall = metrics.recall_score(test_y, test_y_predicted, average='macro')
print("recall : ", recall)
f_measure = 2 * (precision * recall / (precision + recall))
print("f_measure : ", f_measure)

output = {'click': test_y_predicted}
output = DataFrame(output)
output.to_csv('output.csv', sep=',', index=0)
def predict_role(ps): fd = pd.read_csv('player_label.csv') df_obj = fd.label fd.label = df_obj.apply(lambda x: str(x).strip()) print(fd.label) test_set = fd[['label']] train_set = fd[[ 'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes' ]] train_set = train_set[1:] test_set = test_set[1:] from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(train_set, test_set, test_size=0.33, random_state=12) from sklearn.naive_bayes import MultinomialNB clf_NB = MultinomialNB().fit(x_train, y_train) predicted = clf_NB.predict(x_test) import numpy as np from sklearn import metrics print("#################### NB ######################") confusion_matrix_NB = metrics.confusion_matrix(y_test, predicted) print(confusion_matrix_NB) accuracy_NB = metrics.accuracy_score(y_test, predicted) print(accuracy_NB) # print(metrics.classification_report(y_test, predicted)) print("##############################################") from sklearn import tree clf_tree = tree.DecisionTreeClassifier().fit(x_train, y_train) predicted = clf_tree.predict(x_test) print("#################### Decision Tree ######################") print(metrics.confusion_matrix(y_test, predicted)) accuracy_DT = metrics.accuracy_score(y_test, predicted) print(accuracy_DT) # print(metrics.classification_report(y_test, predicted)) print("##############################################") from sklearn.linear_model import SGDClassifier clf_SGD = SGDClassifier().fit(x_train, y_train) predicted = clf_SGD.predict(x_test) print("#################### SGD Classifier ######################") print(metrics.confusion_matrix(y_test, predicted)) accuracy_SGD = metrics.accuracy_score(y_test, predicted) print(accuracy_SGD) print("##############################################") from pandas.core.frame import DataFrame predict_data = DataFrame(ps) print( "----------------------&&&&&&&&&&&&&&&&&&&&&&&&&&&&&-----------------------" ) print(predict_data) print( "----------------------&&&&&&&&&&&&&&&&&&&&&&&&&&&&&-----------------------" ) predict_data = predict_data.iloc[:, 7:] print( "---------------------- become 38 ----------------------------------------" ) print(predict_data) print( "---------------------- become 38 ----------------------------------------" ) accuracy_list = [accuracy_NB, accuracy_DT, accuracy_SGD] if max(accuracy_list) == accuracy_NB: clf_model = clf_NB elif max(accuracy_list) == accuracy_DT: clf_model = clf_tree elif max(accuracy_list) == accuracy_SGD: clf_model = clf_SGD predicted = clf_model.predict(predict_data) print("************* model selection ****************") print(clf_model) pd.value_counts(predicted) print(predicted) print(type(predicted)) return predicted.tolist()
def __init__(self, data=None, index=None, columns=None, default_kind=None, default_fill_value=None, dtype=None, copy=False): # pick up the defaults from the Sparse structures if isinstance(data, SparseDataFrame): if index is None: index = data.index if columns is None: columns = data.columns if default_fill_value is None: default_fill_value = data.default_fill_value if default_kind is None: default_kind = data.default_kind elif isinstance(data, (SparseSeries, SparseArray)): if index is None: index = data.index if default_fill_value is None: default_fill_value = data.fill_value if columns is None and hasattr(data, 'name'): columns = [data.name] if columns is None: raise Exception("cannot pass a series w/o a name or columns") data = {columns[0]: data} if default_fill_value is None: default_fill_value = np.nan if default_kind is None: default_kind = 'block' self._default_kind = default_kind self._default_fill_value = default_fill_value if is_scipy_sparse(data): mgr = self._init_spmatrix(data, index, columns, dtype=dtype, fill_value=default_fill_value) elif isinstance(data, dict): mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, (np.ndarray, list)): mgr = self._init_matrix(data, index, columns, dtype=dtype) elif isinstance(data, SparseDataFrame): mgr = self._init_mgr(data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy) elif isinstance(data, DataFrame): mgr = self._init_dict(data, data.index, data.columns, dtype=dtype) elif isinstance(data, Series): mgr = self._init_dict(data.to_frame(), data.index, columns=None, dtype=dtype) elif isinstance(data, BlockManager): mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) elif data is None: data = DataFrame() if index is None: index = Index([]) else: index = ensure_index(index) if columns is None: columns = Index([]) else: for c in columns: data[c] = SparseArray(np.nan, index=index, kind=self._default_kind, fill_value=self._default_fill_value) mgr = to_manager(data, columns, index) if dtype is not None: mgr = mgr.astype(dtype) else: msg = ('SparseDataFrame called with unknown type "{data_type}" ' 'for data argument') raise TypeError(msg.format(data_type=type(data).__name__)) generic.NDFrame.__init__(self, mgr)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False, dtype=None): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is not None: dummy_strs = [ u'{prefix}{sep}{level}' if isinstance(v, text_type) else '{prefix}{sep}{level}' for v in levels ] dummy_cols = [ dummy_str.format(prefix=prefix, sep=prefix_sep, level=v) for dummy_str, v in zip(dummy_strs, levels) ] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code in enumerate(codes): if code == -1: # Blank entries if not dummy_na and code == -1, #GH4446 continue sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, dtype=dtype) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def get_empty_frame(data) -> DataFrame:
    if isinstance(data, Series):
        index = data.index
    else:
        index = np.arange(len(data))
    return DataFrame(index=index)
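# For context, a small illustration of when this empty frame is returned by the public
# pd.get_dummies: an all-NaN input with dummy_na=False keeps the original index but
# produces no dummy columns. The series below is illustrative data.
import numpy as np
import pandas as pd

s = pd.Series([np.nan, np.nan, np.nan])
empty = pd.get_dummies(s)
print(empty.shape)  # (3, 0): original index preserved, no dummy columns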
def makeBindexDataCsv(cls, cur=None, start_date=None, end_date=None, basic_path=None, output_file=None, word_count=20, stock_id=None, ranking_type='tfidf'): if cur == None or start_date == None or end_date == None or output_file == None or stock_id == None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) if word_count < 0: word_count = 20 if ranking_type not in ["tfidf", "textrank"]: ranking_type = "tfidf" output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) words = cls.getImportVocab(cur, count=20, ranking_type=ranking_type) word_count = len(words) for i in range(len(words)): words[i] = "'" + words[i] + "'" words_str = ",".join(words) del words word_key_list = [] for i in range(1, word_count + 1): word_key_list.append("word%s" % i) columns = [ "stock_id", "date", "opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount", "rate" ] + word_key_list data = {} for k in columns: data[k] = [] pd.DataFrame(data).to_csv(output_path, index=False, columns=columns) cur.execute( "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) count = cur.fetchall() count = count[0][0] skip = 50 slimit = 0 while slimit < count: cur.execute( "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id, start_date, end_date, 0 if slimit - 1 < 0 else slimit - 1, skip if slimit - 1 < 0 else skip + 1)) slimit += skip history_tt = cur.fetchall() history_t = [] for h in history_tt: history_t.append([ int(h[0]), float(h[1]), float(h[2]), float(h[3]), float(h[4]), float(h[5]), float(h[6]), float(h[7]), float(h[8]), str(h[9]) ]) del history_tt sdate = str(history_t[0][9]) edate = str(history_t[-1][9]) sdate = datetime.datetime.strptime(sdate, '%Y-%m-%d') sdate = (sdate - datetime.timedelta(days=1)).strftime('%Y-%m-%d') cur.execute( "SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc" % (words_str, sdate, edate)) bindex = cur.fetchall() bindex_t = [] bindex_vec = 0 cur_date = None if len(bindex) > 0: cur_date = str(bindex[0][2]) bix = [] bix_item = [cur_date] if len(bindex) > 0: for bi in bindex: if str(bi[2]) != cur_date: cur_date = str(bi[2]) bix.append(bix_item) bix_item = [cur_date] bix_temp = json.loads(bi[1]) bix_item.append(bix_temp['all']['0']) bix.append(bix_item) del bindex bindex = {} for k in range(1, len(bix)): b_t = [] for kk in range(1, len(bix[k])): if int(bix[k][kk]) != 0 and int(bix[k - 1][kk]) != 0: b_t.append( str( np.round( float(100 * (int(bix[k][kk]) / int(bix[k - 1][kk]) - 1)), 2))) else: b_t.append(str(0.01)) bindex[bix[k][0]] = b_t del bix for i in range(len(history_t)): history_t[i] += bindex[history_t[i][9]] history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'opening': history_temp[1], 'closing': history_temp[2], 'difference': history_temp[3], 'percentage_difference': history_temp[4], 'lowest': history_temp[5], 'highest': history_temp[6], 'volume': history_temp[7], 'amount': history_temp[8], 'date': history_temp[9] } for i in range(10, 10 + word_count): history["word%s" % (i - 9)] = history_temp[i] del history_t, history_temp history = DataFrame(history) 
g_history = history.groupby(by=['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) for i in history.index: history.loc[i, 'rate'] = str( np.round(float(history['rate'][i]), 2)) #将经过标准化的数据处理成训练集和测试集可接受的形式 def func_train_data(data_stock): if cls.groupby_skip == False: cls.groupby_skip = True return None print("正在处理的股票代码:%06s" % data_stock.name) data = {} for k in columns: data[k] = [] for i in range(len(data_stock) - 1): for k in data: data[k].append(data_stock.iloc[i][k]) pd.DataFrame(data).to_csv(output_path, index=False, header=False, mode="a", columns=columns) g_stock = history.groupby(by=["stock_id"]) #清空接收路径下的文件,初始化列名 cls.groupby_skip = False g_stock.apply(func_train_data)
def _get_dummies_1d( data, prefix, prefix_sep="_", dummy_na=False, sparse=False, drop_first=False, dtype=None, ): from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data) -> DataFrame: if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels] index: Optional[Index] if isinstance(data, Series): index = data.index else: index = None if sparse: fill_value: Union[bool, float, int] if is_integer_dtype(dtype): fill_value = 0 elif dtype == bool: fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices: List[List] = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype, ) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
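# A short behavioural sketch of the public pd.get_dummies options this helper
# implements (dummy_na, drop_first, prefix), on illustrative data.
import numpy as np
import pandas as pd

s = pd.Series(["a", "b", np.nan, "a"])
print(pd.get_dummies(s))                   # the NaN row gets no active dummy
print(pd.get_dummies(s, dummy_na=True))    # adds an explicit NaN column
print(pd.get_dummies(s, drop_first=True))  # drops the first level ("a")
print(pd.get_dummies(s, prefix="col", prefix_sep="_"))  # columns col_a, col_b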
                 start_days)
    else:
        fitCurve(US_data[start_days:], title, 0, fill_dates, start_days)
    pass


confirmed_cases_since_Jan_22 = [
    1, 1, 2, 2, 5, 5, 5, 5, 5, 7, 8, 8, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12,
    13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 51, 51, 57, 58, 60, 68, 74, 98,
    118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632,
    6421, 7783, 13677, 19100, 25489, 33276, 43847, 53740, 65778, 83836, 101657,
    121478, 140886, 161807, 188172, 213372, 243453, 275586, 308850, 337072,
    366614, 396263
]  # until April 7th

US_data = DataFrame({0: confirmed_cases_since_Jan_22})
ori_len = len(US_data)
start_days = 0
last_date = datetime.datetime(2020, 1, 22) + datetime.timedelta(days=(len(US_data) - 1))
last_date_str = last_date.strftime('%Y-%m-%d')
plotConfirmedCases(US_data, start_days, last_date_str)
fill_dates = 0
ratio = 1.0
US_data = fillData(US_data, fill_dates, ratio)
title = "US_best_scenario_pred"
go(start_days, fill_dates, ratio, US_data, title)
def makeOriginDataCsv(cls, cur=None, start_date=None, end_date=None, basic_path=None, output_file=None, stock_id=None): #初始化源文件路径和存储文件路径 if cur is None or start_date is None or end_date is None or output_file is None or stock_id is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) data = cur.execute( "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) data = cur.fetchall() if len(data) == 0: return None res = [] for d in data: res.append([ int(d[0]), int(d[1]), str(d[2]), float(d[3]), float(d[4]), float(d[5]), float(d[6]), float(d[7]), float(d[8]), float(d[9]), float(d[10]) ]) new_data = [] for d in zip(*res): new_data.append(d) origin_data = { 'id': new_data[0], 'stock_id': new_data[1], 'date': new_data[2], 'opening': new_data[3], 'closing': new_data[4], 'difference': new_data[5], 'percentage_difference': new_data[6], 'lowest': new_data[7], 'highest': new_data[8], 'volume': new_data[9], 'amount': new_data[10] } #读取原始数据,只保留需要使用的列 total_data = DataFrame(origin_data) total_data.sort_values(by=['stock_id', 'date'], inplace=True) #根据股票代码分组 g_stock_num = total_data.groupby(by=["stock_id"]) total_data["rate"] = 100 * (g_stock_num.shift(0)["closing"] / g_stock_num.shift(1)["closing"] - 1) for i in total_data.index: total_data.loc[i, 'rate'] = str( np.round(float(total_data['rate'][i]), 2)) #重新调整列的顺序,为接下来处理成输入、输出形式做准备 columns = [ "stock_id", "date", "opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount", "rate" ] total_data = total_data[columns] def func_train_data(data_one_stock_num): if cls.groupby_skip == False: cls.groupby_skip = True return None print("正在处理的股票代码:%06s" % data_one_stock_num.name) data = { "stock_id": [], "date": [], "opening": [], "closing": [], "difference": [], "percentage_difference": [], "lowest": [], "highest": [], "volume": [], "amount": [], "rate": [] } for i in range(len(data_one_stock_num.index) - 1): for k in data: data[k].append(data_one_stock_num.iloc[i][k]) pd.DataFrame(data).to_csv(output_path, index=False, columns=columns) total_data1 = total_data.dropna() total_data2 = total_data1.drop( total_data1[(total_data1.rate == 'nan')].index) g_stock_num = total_data2.groupby(by=["stock_id"]) #清空接收路径下的文件,初始化列名 cls.groupby_skip = False g_stock_num.apply(func_train_data)