def test_append_aware(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assertEqual(ts_result.index.tz, rng1.tz) rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) utc = rng1.tz self.assertEqual(utc, ts_result.index.tz) rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Central') ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assertEqual(utc, ts_result.index.tz)
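# Series.append, used throughout these snippets, was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the documented replacement. A minimal sketch of the first case above
# (the same time zone on both sides) written with concat; illustrative only, not part of any
# test suite:
import pandas as pd

idx1 = pd.DatetimeIndex(["2011-01-01 01:00"], tz="US/Eastern")
idx2 = pd.DatetimeIndex(["2011-01-01 02:00"], tz="US/Eastern")
ts1 = pd.Series([1.0], index=idx1)
ts2 = pd.Series([2.0], index=idx2)
assert pd.concat([ts1, ts2]).index.tz == idx1.tz  # the shared zone is preserved
# (For differing zones, newer pandas coerces the result to object dtype rather than UTC;
# see the GH#7795 note in the modern variant of this test further down.)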
def test_setitem_ambiguous_keyerror():
    s = Series(lrange(10), index=lrange(0, 20, 2))

    # equivalent of an append
    s2 = s.copy()
    s2[1] = 5
    expected = s.append(Series([5], index=[1]))
    assert_series_equal(s2, expected)

    s2 = s.copy()
    s2.loc[1] = 5
    expected = s.append(Series([5], index=[1]))
    assert_series_equal(s2, expected)
def test_dt_accessor_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] expected_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] s = Series(date_range(freq='D', start=datetime(1998, 1, 1), periods=365)) english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert s.dt.weekday_name[day] == eng_name assert s.dt.day_name(locale=time_locale)[day] == name s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) s = Series(date_range(freq='M', start='2012', end='2013')) result = s.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) # work around https://github.com/pandas-dev/pandas/issues/22342 if not compat.PY2: result = result.str.normalize("NFD") expected = expected.str.normalize("NFD") tm.assert_series_equal(result, expected) for s_date, expected in zip(s, expected_months): result = s_date.month_name(locale=time_locale) expected = expected.capitalize() if not compat.PY2: result = unicodedata.normalize("NFD", result) expected = unicodedata.normalize("NFD", expected) assert result == expected s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1])
def build_and_connect_devices(nest_instance, devices, nest_nodes):
    # Build devices by their population (Series)
    # and target nodes (Series) for faster reading
    nest_devices = Series()
    for device in ensure_list(devices):
        # For every distinct quantity to be measured from NEST or stimulated towards NEST nodes...
        dev_names = device.get("names", None)
        if dev_names is None:
            # If no devices' names are given...
            nest_devices = nest_devices.append(
                build_and_connect_devices_one_to_one(nest_instance, device, nest_nodes))
        else:
            nest_devices = nest_devices.append(
                build_and_connect_devices_one_to_many(nest_instance, device, nest_nodes, dev_names))
    return nest_devices
def get_frequency_lists(df):
    # Keep only the drawn lottery numbers (exclude the bonus number)
    num_with_no_bonus = df.loc[:, 'drwtNo1': 'drwtNo6']
    # Collect the six number columns into a single Series
    full_value = Series()
    for idx in num_with_no_bonus:
        full_value = full_value.append(num_with_no_bonus[idx])
    # Series holding the occurrence count of every drawn number
    full_value_count = full_value.value_counts()
    # Set of numbers for each frequency
    number_list = []
    # List of frequencies
    freq_list = []
    # Count of numbers per frequency (Y-value array for the chart)
    freq_number_count_list = []
    for value in full_value_count.unique():
        temp_df = full_value_count[full_value_count == value]
        temp_df.index.values.sort()
        number_list.append(temp_df.index.values)
        freq_list.append(value)
        freq_number_count_list.append(temp_df.count())
    return number_list, freq_list, freq_number_count_list
def run_one_baseline(ssp,inputs,finalhhframe,ini_pop_desc,ssp_pop,year,countrycode,characteristics,inc,inimin,ini_year,wbreg,data2day,food_share_data,istoobig=False): shareag,sharemanu = correct_shares(inputs['shareag'],inputs['sharemanu']) shareemp = inputs['shareemp'] future_pop_desc,pop_0014=build_new_description(ini_pop_desc,ssp_pop,ssp,year,countrycode,shareag,sharemanu,shareemp) if istoobig: weights_proj = Series() finalhhframe1,finalhhframe2 = finalhhframe ini_weights_sum = (finalhhframe1['weight']*finalhhframe1['nbpeople']).sum()+(finalhhframe2['weight']*finalhhframe2['nbpeople']).sum() for finalhhframehalf in [finalhhframe1,finalhhframe2]: characteristicshalf = keep_characteristics_to_reweight(finalhhframehalf) ini_weightshalf = finalhhframehalf['weight']*finalhhframehalf['nbpeople'] ratio = sum(ini_weightshalf)/ini_weights_sum ini_pop_deschalf = calc_pop_desc(characteristicshalf,ini_weightshalf) weights_projh = find_new_weights(characteristicshalf,ini_weightshalf,future_pop_desc*ratio) weights_proj = weights_proj.append(weights_projh) finalhhframe = concat([finalhhframe1,finalhhframe2],axis=0) weights_proj = DataFrame(weights_proj,index=finalhhframe.index,columns=["weight"]) else: ini_weights = finalhhframe['weight'] weights_proj = find_new_weights(characteristics,ini_weights,future_pop_desc) weights_proj = DataFrame(weights_proj,index=finalhhframe.index,columns=["weight"]) futurehhframe = futurehh(finalhhframe,pop_0014) futurehhframe['weight'] = weights_proj["weight"] income_proj,futureinc = future_income_simple_no_cc(inputs,year,finalhhframe,futurehhframe,inc,inimin,ini_year) income_proj.fillna(0, inplace=True) futurehhframe['Y'] = income_proj return futurehhframe,futureinc
def location(self):
    i = 0
    ser = Series([])
    for x in list(val_segment_dict.values())[self.start: self.end]:
        # for x in val_segment_dict.get(key, []):
        ser = ser.append(Series([len(x)]))
    return ser.values, ser.cumsum().values
def test_append_aware_naive(self): rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assertTrue(ts_result.index.equals(ts1.index.asobject.append(ts2.index.asobject))) # mixed rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = lrange(100) ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assertTrue(ts_result.index.equals(ts1.index.asobject.append(ts2.index)))
def _clean_data(X, y): clean_X = DataFrame(columns=X.columns) clean_y = Series(name=y.name) skf = StratifiedKFold(n_splits=MajorityFiltering.k_folds, shuffle=True) for train_idxs, val_idxs in skf.split(X=range(len(y)), y=y): train_X = DataHelper.select_rows(X, train_idxs, copy=False) train_y = DataHelper.select_rows(y, train_idxs, copy=False) ensemble = MajorityFiltering.get_ensemble() ensemble.fit(train_X, train_y) val_X = DataHelper.select_rows(X, val_idxs, copy=False) predictions = ensemble.predict(val_X) maintain_idxs = [val_idxs[i] for i in range(len(val_idxs)) \ if predictions[i]==y.iloc[val_idxs[i]]] maintain_X = DataHelper.select_rows(X, maintain_idxs, copy=True) maintain_y = DataHelper.select_rows(y, maintain_idxs, copy=True) clean_X = clean_X.append(maintain_X, verify_integrity=True, sort=False) clean_y = clean_y.append(maintain_y, verify_integrity=True) return clean_X, clean_y
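# _clean_data above grows clean_X / clean_y by calling .append inside the cross-validation loop,
# which re-copies the accumulated data on every fold, and DataFrame.append itself is gone in
# pandas 2.0. A hedged sketch of the usual alternative: collect the per-fold pieces in plain
# lists and concatenate once at the end (verify_integrity and sort are also pd.concat
# arguments). The helper name below is illustrative, not the author's:
import pandas as pd

def concat_clean_parts(clean_x_parts, clean_y_parts):
    # clean_x_parts: list of per-fold DataFrames; clean_y_parts: list of per-fold Series
    clean_X = pd.concat(clean_x_parts, verify_integrity=True, sort=False)
    clean_y = pd.concat(clean_y_parts, verify_integrity=True)
    return clean_X, clean_y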
def count_options(gear):
    options = Series()
    for slot in ['Armour', 'Relic', 'Mod', 'Enh', 'Ear', 'Aug']:
        print(slot)
        n_options = len(gear[(gear['Slot'] == slot) & (gear['N'] == 'TBC')])
        options = options.append(Series({slot: n_options}))
    return options
def test_append_tz_explicit_pytz(self): # see gh-2938 from pytz import timezone as timezone rng = date_range( "5/8/2012 1:45", periods=10, freq="5T", tz=timezone("US/Eastern") ) rng2 = date_range( "5/8/2012 2:35", periods=10, freq="5T", tz=timezone("US/Eastern") ) rng3 = date_range( "5/8/2012 1:45", periods=20, freq="5T", tz=timezone("US/Eastern") ) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) result = ts.append(ts2) result_df = df.append(df2) tm.assert_index_equal(result.index, rng3) tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) tm.assert_index_equal(appended, rng3)
def test_append_concat(self): rng = date_range('5/8/2012 1:45', periods=10, freq='5T') ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) result = ts.append(ts) result_df = df.append(df) ex_index = DatetimeIndex(np.tile(rng.values, 2)) tm.assert_index_equal(result.index, ex_index) tm.assert_index_equal(result_df.index, ex_index) appended = rng.append(rng) tm.assert_index_equal(appended, ex_index) appended = rng.append([rng, rng]) ex_index = DatetimeIndex(np.tile(rng.values, 3)) tm.assert_index_equal(appended, ex_index) # different index names rng1 = rng.copy() rng2 = rng.copy() rng1.name = 'foo' rng2.name = 'bar' assert rng1.append(rng1).name == 'foo' assert rng1.append(rng2).name is None
def read_normal_data(self) -> TrafficSequence: traffic_sequences = [ self._make_traffic_sequence(pcap_file, ranges, "benign") for pcap_file, ranges in self.subset["benign"].items() if os.path.exists(os.path.join(self.dataset_dir, pcap_file)) ] if len(traffic_sequences) == 1: return traffic_sequences[0] # if more than one traffic sequences are present, join them into one. joined_ids = [ id_item for id_item in itertools.chain(*map(lambda seq: seq.ids, traffic_sequences)) ] joined_labels = Series() for traffic_sequence in traffic_sequences: joined_labels = joined_labels.append(traffic_sequence.labels) joined_reader = itertools.chain( *map(lambda seq: seq.packet_reader, traffic_sequences) ) parts = {"all": joined_ids} # make sure that the same pcaps, even in different order, result in the same traffic sequence name; regardless # of the used test pcaps name_identifier = ",".join( sorted([t.name.split(".pcap")[0] for t in traffic_sequences]) ) return TrafficSequence( name=f"benign@UNSW-NB15:%s" % name_identifier, labels=joined_labels, packet_reader=joined_reader, parts=parts, ids=joined_ids, )
def read_normal_data(self) -> TrafficSequence: traffic_sequences = [ self._make_traffic_sequence(pcap_file, ranges) for pcap_file, ranges in self.subset["benign"].items() ] if len(traffic_sequences) == 1: return traffic_sequences[0] # if more than one traffic sequences are present, join them into one. joined_ids = [ id_item for id_item in itertools.chain(*map(lambda seq: seq.ids, traffic_sequences)) ] joined_labels = Series() for traffic_sequence in traffic_sequences: joined_labels = joined_labels.append(traffic_sequence.labels) joined_reader = itertools.chain( *map(lambda seq: seq.packet_reader, traffic_sequences) ) parts = {"all": joined_ids} return TrafficSequence( name=f"benign@CIC-IDS-2017:{self.subset_name}", labels=joined_labels, packet_reader=joined_reader, parts=parts, ids=joined_ids, )
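# Both read_normal_data variants above build joined_labels by appending one labels Series per
# traffic sequence. pd.concat takes a list of Series directly, so the loop reduces to a single
# call; a small sketch, assuming each sequence object exposes a .labels Series as in the code
# above:
import pandas as pd

def join_sequence_labels(traffic_sequences):
    return pd.concat([seq.labels for seq in traffic_sequences])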
def asseries(self, session):
    datasets = self.datasets(session)
    data = Series()
    for src in datasets:
        s = src.asseries(self.start, self.end)
        data = data.append(s)
    return data.sort_index()
def test_concatlike_common_coerce_to_pandas_object(self): # GH 13626 # result must be Timestamp/Timedelta, not datetime.datetime/timedelta dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) tdi = pd.TimedeltaIndex(["1 days", "2 days"]) exp = Index( [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timedelta("1 days"), pd.Timedelta("2 days"), ] ) res = dti.append(tdi) tm.assert_index_equal(res, exp) assert isinstance(res[0], pd.Timestamp) assert isinstance(res[-1], pd.Timedelta) dts = Series(dti) tds = Series(tdi) res = dts.append(tds) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) assert isinstance(res.iloc[0], pd.Timestamp) assert isinstance(res.iloc[-1], pd.Timedelta) res = pd.concat([dts, tds]) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) assert isinstance(res.iloc[0], pd.Timestamp) assert isinstance(res.iloc[-1], pd.Timedelta)
def test_append_concat_tz_explicit_pytz(self): # GH 2938 tm._skip_if_no_pytz() from pytz import timezone as timezone rng = date_range('5/8/2012 1:45', periods=10, freq='5T', tz=timezone('US/Eastern')) rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', tz=timezone('US/Eastern')) rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', tz=timezone('US/Eastern')) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) result = ts.append(ts2) result_df = df.append(df2) tm.assert_index_equal(result.index, rng3) tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) tm.assert_index_equal(appended, rng3)
def run_years(end_year, stat_str, num_sims=100, thresholds=[0.05, 0.5, 0.95]): results = {} ages, ignore_years = parse_string(stat_str) start_year = datetime.datetime.today().year + 1 for threshold in thresholds: results[threshold] = {(start_year - 1): 0} for year in range(start_year, end_year + 1): # print year probs = [] years = year - datetime.datetime.today().year for age in ages: probs.append(deathprob(age, years)) sims = build_sims(num_sims, len(ages)) death_counts = count_deaths(probs, sims) death_rates = death_thresholds(death_counts, thresholds) for threshold in thresholds: results[threshold][year] = death_rates[threshold] ages = Series(ages).abs() ages = [ int(x) for x in ages ] birthyears = datetime.datetime.today().year - np.array(ages) # Being very lazy about math here births = np.bincount(birthyears).cumsum()[min(birthyears):] birthseries = Series(births, index=range(min(birthyears),len(births)+min(birthyears))) interimyrs = range(max(birthyears) + 1, start_year) birthseries = birthseries.append(Series(len(ages), index=interimyrs)) results['births'] = (len(ages) - birthseries).to_dict() return results
def cost(self, gene):
    data = self.decodeGene(gene)
    d1 = Series(data.student1.tolist(), index=data.student2)
    d2 = Series(data.student2.tolist(), index=data.student1)
    data = d1.append(d2).sort_index()
    cost = np.where(data == self.pref.iloc[:, 0], 0,
                    np.where(data == self.pref.iloc[:, 1], 1, 3))
    return cost.sum()
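# cost() builds one lookup Series that maps every student to their assigned partner in both
# directions before scoring it against the preference table. Only the d1.append(d2) step relies
# on the removed Series.append; a one-line sketch of the replacement, with d1 and d2 as defined
# in the method above (helper name is illustrative):
import pandas as pd

def combine_pairings(d1, d2):
    return pd.concat([d1, d2]).sort_index()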
def data_validation(series): """ add gaps sum & nan sum to the time series Args: series (pandas.Series): time-series Returns: pandas.DataFrame: tags with columns 'nans', 'gaps', ... """ ts = series.copy() first_index = ts.index[0].replace(day=1, month=1, hour=0, minute=0) if first_index not in series.index: ts = Series(index=[first_index]).append(ts) last_index = ts.index[-1].replace(day=31, month=12, hour=23, minute=59) if last_index not in ts.index: ts = ts.append(Series(index=[last_index])) if ts.index.has_duplicates: # very slow an large data sets ts = ts[~ts.index.duplicated()].copy() if not ts.index.is_monotonic_increasing: raise UserWarning( 'Series has not monotonic increasing of the timestamps.') ts = ts.sort_index() tags = DataFrame(index=ts.index) tags['nans'] = isna(ts).astype(int) tags = tags.reindex(tags.asfreq('T').index) tags['gaps'] = isna(ts.fillna(0).reindex(tags.index)).astype(int) return tags
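# data_validation pads the series with an all-NaN entry at the start of the first year and at
# the end of the last one before reindexing to minute frequency. A sketch of just that padding
# step without Series.append (removed in pandas 2.0); the helper name is illustrative and ts is
# assumed to be a datetime-indexed Series as in the function above:
import numpy as np
import pandas as pd

def pad_year_bounds(ts):
    first_index = ts.index[0].replace(day=1, month=1, hour=0, minute=0)
    last_index = ts.index[-1].replace(day=31, month=12, hour=23, minute=59)
    pieces = [ts]
    if first_index not in ts.index:
        pieces.insert(0, pd.Series([np.nan], index=[first_index]))
    if last_index not in ts.index:
        pieces.append(pd.Series([np.nan], index=[last_index]))
    return pd.concat(pieces)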
def test_append_concat_tz_dateutil(self): # see gh-2938 rng = date_range('5/8/2012 1:45', periods=10, freq='5T', tz='dateutil/US/Eastern') rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', tz='dateutil/US/Eastern') rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', tz='dateutil/US/Eastern') ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) result = ts.append(ts2) result_df = df.append(df2) tm.assert_index_equal(result.index, rng3) tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) tm.assert_index_equal(appended, rng3)
def train_data_construct(bins, train_set, iteration, realtime = False): train_bins = defaultdict(tuple) print 'start to construct the train data bins' if realtime: idx = 0 for bin in bins: if len(bin) > 0: feature_bin = DataFrame() lable_bin = Series() for uid in bin: tmp = train_set[train_set['product_uid'] == int(uid)] if not tmp.empty: feature_bin = feature_bin.append(tmp) # should drop the relevance data here lable_bin = lable_bin.append(tmp['relevance']) train_bins[idx] = (feature_bin,lable_bin) print len(train_bins[idx][0]), ' entries in bin', idx # if idx == 0: # feature_bin.to_csv('feature_bin.csv') idx += 1 f1 = file('../data/train_bins'+str(iteration)+'.pkl','wb') pk.dump(train_bins,f1) else: f1 = file('../data/train_bins'+str(iteration)+'.pkl','rb') train_bins=pk.load(f1) print 'finish constructing training bins' return train_bins
def test_series_append_dst(self): rng1 = date_range("1/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") rng2 = date_range("8/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") ser1 = Series([1, 2, 3], index=rng1) ser2 = Series([10, 11, 12], index=rng2) ts_result = ser1.append(ser2) exp_index = DatetimeIndex( [ "2016-01-01 01:00", "2016-01-01 02:00", "2016-01-01 03:00", "2016-08-01 01:00", "2016-08-01 02:00", "2016-08-01 03:00", ], tz="US/Eastern", ) exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz
def split_train_test(X, Y, size=0.2):
    X = DataFrame(X)
    Y = Series(Y)
    folds = int(1 / size)
    kfold = StratifiedKFold(Y, n_folds=folds, random_state=1)
    x_train = DataFrame()
    y_test = Series()
    y_train = Series()
    x_test = DataFrame()
    for train, test in kfold:
        x_train = x_train.append(X.iloc[train])
        y_train = y_train.append(Y.iloc[train])
        y_test = y_test.append(Y.iloc[test])
        x_test = x_test.append(X.iloc[test])
        break
    return x_train, y_train, x_test, y_test
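# split_train_test only ever uses the first StratifiedKFold fold (note the break), so it is
# effectively a single stratified split with test fraction `size`. Under a newer scikit-learn
# the old cross_validation API and the incremental appends can be swapped for train_test_split
# with stratify; a sketch under that assumption, not the author's original approach:
from pandas import DataFrame, Series
from sklearn.model_selection import train_test_split

def split_train_test_stratified(X, Y, size=0.2):
    X, Y = DataFrame(X), Series(Y)
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=size, stratify=Y, random_state=1)
    return x_train, y_train, x_test, y_test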
def fit(self, dataset: NumpyOrPandas): """Estimate label frequencies and create encoding dicts. Args: dataset: Pandas or Numpy dataset of categorical features. """ # set transformer names and add checks LAMLTransformer.fit(self, dataset) # set transformer features # convert to accepted dtype and get attributes roles = dataset.roles subs = self._get_df(dataset) self.dicts = {} for i in subs.columns: role = roles[i] try: flg_number = np.issubdtype(role.dtype, np.number) except TypeError: flg_number = False if not flg_number: co = role.unknown cnts = subs[i].value_counts(dropna=True) cnts = cnts[cnts > co].reset_index() cnts = Series(cnts["index"].astype(str).rank().values, index=cnts["index"].values) cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan])) self.dicts[i] = cnts return self
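# fit() extends each per-column mapping with one extra row keyed by np.nan so that unseen or
# missing values map to rank len(cnts) + 1. A minimal sketch of that last step using pd.concat
# instead of the removed Series.append, assuming cnts is the rank Series built above (helper
# name is illustrative):
import numpy as np
import pandas as pd

def add_nan_rank(cnts):
    return pd.concat([cnts, pd.Series([cnts.shape[0] + 1], index=[np.nan])])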
def get_occupancy(df, x_coord): # pick an arbitrary Tues SWTues = df[(df['start date'] == '2013-04-30 00:00:00') & (df['x coordinate'] == x_coord)] SWTues['start date'] = pd.to_datetime(SWTues['start date']) SWTues['end date'] = pd.to_datetime(SWTues['end date']) # make a datetime for today at midnight ts_now=pd.to_datetime("2013-10-05") SWTues['start time'] = pd.to_datetime(SWTues['start time']) # defaults to today (5th Oct) with correct hour:min SWTues['start time'] =SWTues['start time'] - ts_now # get relative difference, leaves a timedelta SWTues['start datetime'] = SWTues['start time'] + SWTues['start date'] # combine date with midnight & timedelta to get new datetime SWTues['end time'] = pd.to_datetime(SWTues['end time']) # defaults to today (5th Oct) with correct hour:min SWTues['end time'] =SWTues['end time'] - ts_now # get relative difference, leaves a timedelta SWTues['end datetime'] = SWTues['end time'] + SWTues['end date'] # combine date with midnight & timedelta to get new datetime s1 = Series(np.ones(SWTues.shape[0]), index=SWTues['start datetime']) s2 = Series(-1*np.ones(SWTues.shape[0]), index=SWTues['end datetime']) mean_occ = s1.append(s2).sort_index().cumsum().mean() max_occ = SWTues['Spaces'].max() tariff = SWTues['Tariff'].max() max_stay = SWTues['Max Stay'].max() total_takings = SWTues['amount paid'].sum() occ_prop = mean_occ/max_occ return mean_occ, max_occ, occ_prop, tariff, max_stay, total_takings
def test_concatlike_common_period_diff_freq_to_object(self): # GH 13221 pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") exp = Index( [ pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), pd.Period("2012-01-01", freq="D"), pd.Period("2012-02-01", freq="D"), ], dtype=object, ) res = pi1.append(pi2) tm.assert_index_equal(res, exp) ps1 = Series(pi1) ps2 = Series(pi2) res = ps1.append(ps2) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) res = pd.concat([ps1, ps2]) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_append_concat(self): rng = date_range('5/8/2012 1:45', periods=10, freq='5T') ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) result = ts.append(ts) result_df = df.append(df) ex_index = DatetimeIndex(np.tile(rng.values, 2)) tm.assert_index_equal(result.index, ex_index) tm.assert_index_equal(result_df.index, ex_index) appended = rng.append(rng) tm.assert_index_equal(appended, ex_index) appended = rng.append([rng, rng]) ex_index = DatetimeIndex(np.tile(rng.values, 3)) tm.assert_index_equal(appended, ex_index) # different index names rng1 = rng.copy() rng2 = rng.copy() rng1.name = 'foo' rng2.name = 'bar' self.assertEqual(rng1.append(rng1).name, 'foo') assert rng1.append(rng2).name is None
def test_concatlike_common_period_mixed_dt_to_object(self): # GH 13221 # different datetimelike pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") tdi = pd.TimedeltaIndex(["1 days", "2 days"]) exp = Index( [ pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), pd.Timedelta("1 days"), pd.Timedelta("2 days"), ], dtype=object, ) res = pi1.append(tdi) tm.assert_index_equal(res, exp) ps1 = Series(pi1) tds = Series(tdi) res = ps1.append(tds) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) res = pd.concat([ps1, tds]) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) # inverse exp = Index( [ pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), ], dtype=object, ) res = tdi.append(pi1) tm.assert_index_equal(res, exp) ps1 = Series(pi1) tds = Series(tdi) res = tds.append(ps1) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) res = pd.concat([tds, ps1]) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False): ts = timestat("Parsing Discogs Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet)) io = fileIO() ######################################################################################## # Previous DB Data ######################################################################################## if not fileUtil(self.disc.getDBModValFilename(modVal)).exists: tsDB = timestat("Creating New DB For ModVal={0}".format(modVal)) dbdata = Series({}) ts.stop() else: tsDB = timestat("Loading ModVal={0} DB Data".format(modVal)) dbdata = self.disc.getDBModValData(modVal) tsDB.stop() ######################################################################################## # Previous Media Data ######################################################################################## previousMetadata = self.disc.getMetadataAlbumData(modVal) ######################################################################################## # Artist Search Data (No Media) ######################################################################################## tsDB = timestat("Loading Artist Search Data For ModVal={0}".format(modVal)) artistSearchFilenames = self.getArtistRawFiles(datatype="search", expr=expr, force=True) artistSearchFilename = [x for x in artistSearchFilenames if fileUtil(x).basename == "artistData-{0}".format(modVal)] if len(artistSearchFilename) == 1: artistSearchData = io.get(artistSearchFilename[0]) else: raise ValueError("Could not find Discogs API Artist Search Data") tsDB.stop() N = artistSearchData.shape[0] modValue = 5000 if N >= 50000 else 1000 nSave = 0 tsParse = timestat("Parsing {0} Searched For Discogs API Artists".format(N)) Nnew = 0 for i,(artistID,artistData) in enumerate(artistSearchData.iterrows()): if (i+1) % modValue == 0 or (i+1) == N: tsParse.update(n=i+1, N=N) if dbdata.get(artistID) is not None: continue artistAPIData = {"Artist": artistData, "Albums": previousMetadata.get(artistID, {})} dbdata = dbdata.append(Series({artistID: self.artist.getData(artistAPIData)})) Nnew += 1 if Nnew > 0: print("Saving [{0}/{1}] {2} Entries To {3}".format(len(dbdata), len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal))) self.disc.saveDBModValData(modVal=modVal, idata=dbdata) else: print("Not saving any of the new data") ts.stop()
def _get_holidays(self, use_usd: bool = True) -> List[date]: if self.holiday_calendar: return self.holiday_calendar try: currencies = self.currencies or [] if use_usd: currencies.append('USD') if self.exchanges: cached_data = _cache.get( hashkey(use_usd, str(currencies), str(self.exchanges))) if cached_data: return cached_data holidays = Series() if self.exchanges: self.exchanges = [ x.value if isinstance(x, ExchangeCode) else x.upper() for x in self.exchanges ] exchange_query = GsDataApi.build_query(start=DATE_LOW_LIMIT, end=DATE_HIGH_LIMIT, exchange=self.exchanges) data = GsDataApi.query_data(exchange_query, 'HOLIDAY') holidays = holidays.append( Series(to_datetime(DataFrame(data)['date']).dt.date)) if len(currencies): currencies = [ x.value if isinstance(x, Currency) else x.upper() for x in currencies ] currency_query = GsDataApi.build_query(start=DATE_LOW_LIMIT, end=DATE_HIGH_LIMIT, currency=currencies) data = GsDataApi.query_data(currency_query, 'HOLIDAY_CURRENCY') holidays = holidays.append( Series(to_datetime(DataFrame(data)['date']).dt.date)) holidays = holidays.unique().tolist() _cache[hashkey(use_usd, str(currencies), str(self.exchanges))] = holidays return holidays except Exception as e: _logger.warning( 'Unable to fetch holiday calendar. Try passing your own when applying a rule.', e) return []
def _get_holidays(self) -> List[date]: if self.holiday_calendar is not None: if self.usd_calendar is None: return self.holiday_calendar return list(set().union(self.holiday_calendar, self.usd_calendar)) try: holidays = Series(dtype=object) currencies = ['USD' ] if self.currencies is None else self.currencies cached_data = _cache.get( hashkey(str(currencies), str(self.exchanges))) if cached_data: return cached_data if self.exchanges: exchanges = [ x.value if isinstance(x, ExchangeCode) else x.upper() for x in self.exchanges ] exchange_query = GsDataApi.build_query(start=DATE_LOW_LIMIT, end=DATE_HIGH_LIMIT, exchange=exchanges) data = GsDataApi.query_data(exchange_query, 'HOLIDAY') holidays = holidays.append( Series(to_datetime(DataFrame(data)['date']).dt.date)) if len(currencies): currencies = [ x.value if isinstance(x, Currency) else x.upper() for x in currencies ] currency_query = GsDataApi.build_query(start=DATE_LOW_LIMIT, end=DATE_HIGH_LIMIT, currency=currencies) data = GsDataApi.query_data(currency_query, 'HOLIDAY_CURRENCY') holidays = holidays.append( Series(to_datetime(DataFrame(data)['date']).dt.date)) holidays = holidays.unique().tolist() _cache[hashkey(str(currencies), str(self.exchanges))] = holidays return holidays except Exception as e: _logger.warning( 'Unable to fetch holiday calendar. Try passing your own when applying a rule.', e) return []
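# Both _get_holidays variants start from an empty Series and append the exchange and currency
# query results to it. A hedged sketch of the same aggregation that avoids the empty starting
# Series and the removed Series.append; the frames list stands in for whatever HOLIDAY /
# HOLIDAY_CURRENCY query results were fetched, so the names here are illustrative:
import pandas as pd

def collect_holiday_dates(frames):
    # frames: list of DataFrames that each contain a 'date' column
    pieces = [pd.to_datetime(df["date"]).dt.date for df in frames]
    if not pieces:
        return []
    return pd.concat(pieces).unique().tolist()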
def _build_and_connect_devices(self, devices):
    # Build devices by the variable model they measure or stimulate (Series),
    # population (Series),
    # and target node (Series) for faster reading
    _devices = Series()
    for device in devices:
        _devices = _devices.append(self.build_and_connect_devices(device))
    return _devices
def update_series():
    ser = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
    print('Created from a list plus an index:\n', ser)
    s = ser.drop('c')
    # print('Result after dropping one label:\n', s)
    s = ser.drop(['a', 'c'])
    # print('Result after dropping several labels:\n', s)
    ser.pop('d')
    # ser.pop(0)      # dropping by position: invalid key
    # ser.pop([0, 1]) # dropping several at once: invalid key
    print('pop removes in place and modifies the source Series:\n', ser)
    ser[0] = 1000
    ser['f'] = 2000
    ser2 = Series([100, 200], index=['x', 'y'])
    ser.append(ser2)
    print('Modified Series:\n', ser)
    print(' Series append:\n', ser.append(ser2))
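# As the two prints at the end of update_series show, ser.append(ser2) returns a new Series and
# leaves ser itself unchanged. On pandas 2.0+ the method no longer exists; the equivalent
# non-mutating call is pd.concat. A small self-contained sketch mirroring that behaviour:
import pandas as pd

ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
ser2 = pd.Series([100, 200], index=["x", "y"])
print(pd.concat([ser, ser2]))   # combined copy
print(ser)                      # the original Series is untouched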
def test_setitem_ambiguous_keyerror(indexer_sl):
    s = Series(range(10), index=list(range(0, 20, 2)))

    # equivalent of an append
    s2 = s.copy()
    indexer_sl(s2)[1] = 5
    expected = s.append(Series([5], index=[1]))
    tm.assert_series_equal(s2, expected)
def Fractions(df, symbols, colnames):
    from pandas import Series
    vec = Series()
    for col in colnames:
        valcounts = df[col].value_counts(normalize=True)
        valcounts = valcounts.reindex(symbols[col], fill_value=0.)
        vec = vec.append(valcounts.rename(lambda i: col + '_' + str(i)))
        pass
    return vec
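# Fractions flattens one value_counts(normalize=True) vector per column into a single Series
# whose labels are '<column>_<symbol>'. A toy invocation, purely illustrative (the df, symbols
# and colnames below are made up, and it must run under a pandas version that still has
# Series.append):
from pandas import DataFrame

df = DataFrame({"color": ["r", "r", "g"], "size": ["s", "m", "m"]})
symbols = {"color": ["r", "g", "b"], "size": ["s", "m"]}
vec = Fractions(df, symbols, ["color", "size"])
# vec now has index ['color_r', 'color_g', 'color_b', 'size_s', 'size_m'] holding the
# normalized frequencies; 'color_b' is filled with 0.0 by the reindex.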
def _build_and_connect_devices(self, devices): """Method to build and connect input or output devices, organized by - the variable they measure or stimulate (pandas.Series), and the - population(s) (pandas.Series), and - brain region nodes (pandas.Series) they target.""" _devices = Series() for device in devices: _devices = _devices.append(self.build_and_connect_devices(device)) return _devices
def data_process1(file_url):
    # read file
    df = read_file(file_url, sheet_name='Sheet1')
    data_replace = df.replace(np.nan, '')
    """group cut"""
    intergenic = str('lncRNA, intergenic')
    df2 = df[df['gene class'].str.contains(intergenic, na=False)]
    df2 = df2.drop_duplicates('gene name')
    """group cut"""
    """Split the 'alias symbol' column on '|'"""
    newDF = df2['alias symbol'].str.split("|", expand=True)
    newDF = newDF.fillna(".")
    col_size = len(newDF.columns)
    # array_alias=[]
    namefile = Series()
    current_number = 0
    while current_number < col_size:
        namefile1 = newDF[~newDF[current_number].str.contains('\.', na=False)]
        namefile1 = namefile1[current_number]
        namefile = namefile.append(namefile1)
        current_number += 1
    namefile2 = namefile
    """Drop '.' placeholder entries"""
    genename2 = df2[~df2['gene name'].str.contains('\.', na=False)]
    genename2 = genename2['gene name']
    """Combine the alias names with the gene names"""
    namefile = namefile.append(genename2)
    """CSV"""
    # down_path = os.path.join(os.path.abspath(os.path.dirname(os.path.dirname(__file__))), 'tmp/download')
    # data_file.save(os.path.join(upload_path, data_filename))
    timestampe = time.strftime("%Y%m%d%H%M%S", time.localtime())
    csv_file_name = 'GENANAME' + '_' + timestampe + '.csv'
    down_path = os.path.join(app.root_path, 'tmp/download')
    # csv_file_name = 'GENANAME.csv'
    csv_file_save_url = os.path.join(down_path, csv_file_name)
    csv_file = namefile.to_csv(
        # r"C:\WDD\OUTPUT\GENANAME.CSV"
        csv_file_save_url)
    # return csv_file_save_url
    return data_replace, csv_file_name
def test_series_append_aware_naive(self): rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) expected = ser1.index.astype(object).append(ser2.index.astype(object)) assert ts_result.index.equals(expected) # mixed rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = range(100) ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) expected = ser1.index.astype(object).append(ser2.index) assert ts_result.index.equals(expected)
def test_append_aware_naive(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assert_(ts_result.index.equals( ts1.index.asobject.append(ts2.index.asobject))) #mixed rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = range(100) ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assert_(ts_result.index.equals( ts1.index.asobject.append(ts2.index)))
def test_series_append_aware_naive(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) expected = ser1.index.astype(object).append(ser2.index.astype(object)) assert ts_result.index.equals(expected) # mixed rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = range(100) ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) expected = ser1.index.astype(object).append(ser2.index) assert ts_result.index.equals(expected)
def get_timeframe(data: Dict[str, DataFrame]) -> Tuple[arrow.Arrow, arrow.Arrow]:
    """
    Get the maximum timeframe for the given backtest data
    :param data: dictionary with preprocessed backtesting data
    :return: tuple containing min_date, max_date
    """
    all_dates = Series([])
    for pair, pair_data in data.items():
        all_dates = all_dates.append(pair_data['date'])
    all_dates.sort_values(inplace=True)
    return arrow.get(all_dates.iloc[0]), arrow.get(all_dates.iloc[-1])
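# get_timeframe only needs the overall minimum and maximum of the per-pair 'date' columns, so
# the append loop can be replaced by one pd.concat (Series.append is gone in pandas 2.0). A
# hedged sketch with the same shape of input, assuming each frame carries a 'date' column as
# above; the function name is illustrative:
import arrow
import pandas as pd

def get_timeframe_concat(data):
    all_dates = pd.concat([pair_data["date"] for pair_data in data.values()])
    return arrow.get(all_dates.min()), arrow.get(all_dates.max())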
def extractJSONData(dict1):
    ser1 = Series(dict1['MonitoredVehicleJourney'])
    ser1 = ser1.append(Series({'RecordedAtTime': dict1['RecordedAtTime']}))
    nextStops = ser1['OnwardCalls']['OnwardCall'][0]
    ser1.drop('OnwardCalls', inplace=True)
    ser1 = unnest(ser1)
    nextStops = unnest(Series(nextStops))
    nextStops.index = 'NextStop' + nextStops.index.values
    ser1 = pd.concat((ser1, nextStops))
    df_row = DataFrame(ser1).transpose()
    return(df_row)
def test_series_append_aware(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], tz='US/Eastern') exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], tz='UTC') exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) utc = rng1.tz assert utc == ts_result.index.tz # GH#7795 # different tz coerces to object dtype, not UTC rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Central') ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), Timestamp('1/1/2011 02:00', tz='US/Central')]) exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp)
def main(): # Get links to survey pages home_url = "http://www.igmchicago.org/igm-economic-experts-panel" home_contents = get_page_contents(home_url) urls = re.findall( r"<h2><a href=\"(\S+?results\?SurveyID=\S+?)\"", home_contents) urls = ["http://www.igmchicago.org" + url for url in urls] # Loop through survey pages df = DataFrame() question_count = 0 for url in reversed(urls): contents = get_page_contents(url) questions = re.findall(r"surveyQuestion\">([\s\S]+?)</h3>", contents) responder_list = re.findall( r"\?id=([\d]+)?\">([\s\w.]+?)</a>", contents) responses = re.findall( r"<span class=\"option-[\d]+?\">([\s\w.]+?)</span>", contents) num_responders = len(responses) / len(questions) # Loop through sub-questions (A, B, etc) within each page for i, question in enumerate(questions): question = clean_string(question) question_count += 1 print(question) # Restrict range to responses for this sub-question rng = (i * num_responders, (i + 1) * num_responders) # Collect sub-question, its url suffix, and the responses prefix = "(%03d" % question_count + ") " q_responses = Series( responses[rng[0]:rng[1]], index=responder_list[rng[0]:rng[1]]) q_url_suffix = re.findall("=(.+)", url)[0] q_responses = q_responses.append( Series([q_url_suffix], index=['q_url_suffix'])) q_responses.name = prefix + question.strip() # Add question data to dataframe df = df.join(q_responses, how='outer') # Move responder id from index to column, only after all joins are complete df['responder_id'] = [pair[0] for pair in df.index] df.index = [pair[1] if type(pair) == tuple else pair for pair in df.index] # Write to file df.to_json("survey_results.json")
def compute_summary(self, combined_df):
    combined_mean = combined_df.mean()
    average_cons = combined_mean['consistency']
    average_ambi = combined_mean['ambiguity']
    # completeness = self.compute_completeness()
    noise = self.compute_noise()
    series = Series([average_cons, average_ambi])
    series.index = ['Average Consistency', 'Average Ambiguity']
    series_2 = Series(noise)
    series_2.index = [i.title() for i in series_2.index]
    series = series.append(series_2)
    return series
def test_series_append_dst(self): rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', tz='US/Eastern') rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', tz='US/Eastern') ser1 = Series([1, 2, 3], index=rng1) ser2 = Series([10, 11, 12], index=rng2) ts_result = ser1.append(ser2) exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', '2016-01-01 03:00', '2016-08-01 01:00', '2016-08-01 02:00', '2016-08-01 03:00'], tz='US/Eastern') exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz
class RolingMeanAlgorithm( Algorithm ):
    def __init__(self, smallAvg, bigAvg):
        self.df = Series()
        self.smallAvg = smallAvg
        self.bigAvg = bigAvg
        self.was = 0

    # def getInputSettings( self ):
    #     """This method would return an InputSettings object describing the requirements
    #     the input data has to satisfy; for now I think this could be:
    #     a) the minimum span of data
    #     b) the data step - the time interval between consecutive values
    #     x) you probably have ideas for more parameters"""
    #     raise NotImplementedError( "Should have implemented this" )

    def getBuySignals( self, measurement ):
        """Input: a data frame with the parameters required via InputSettings.
        Output: 0, 1, -1 - whether to buy and in which direction; not sure yet whether
        we will have algorithms that emit -1/1 signals.
        """
        #print (self.df)
        self.df = self.df.append(Series(measurement['ask'], index=['a']))
        #print measurement.name
        #sys.exit(1)
        if self.df.shape[0] == 20:
            curBig = rolling_mean(self.df, self.bigAvg)
            curSmall = rolling_mean(self.df[(self.bigAvg-self.smallAvg):], self.smallAvg)
            # print "1=========================="
            # print curBig[-1]
            # print curSmall[-1]
            if curBig[-1] < curSmall[-1]:
                #print "128=========================="
                self.df = self.df[1:]
                return [128, measurement.name]
            else:
                #print "129=========================="
                self.df = self.df[1:]
                return [130, measurement.name]
        else:
            return [129, measurement.name]
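# getBuySignals relies on the pandas.rolling_mean free function, which newer pandas no longer
# ships (alongside the removed Series.append). On any pandas from 0.18 onwards the moving
# averages are spelled as a .rolling(...).mean() chain instead; a sketch of just that
# computation, with toy prices and window sizes standing in for self.bigAvg / self.smallAvg:
import pandas as pd

prices = pd.Series([1.10, 1.11, 1.12, 1.11, 1.13, 1.14])
big = prices.rolling(window=5).mean()
small = prices.rolling(window=3).mean()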
def test_datetimeindex(self): from pandas import date_range, NaT, Timestamp index = date_range("20130102", periods=6) s = Series(1, index=index) result = s.to_string() self.assertTrue("2013-01-02" in result) # nat in index s2 = Series(2, index=[Timestamp("20130111"), NaT]) s = s2.append(s) result = s.to_string() self.assertTrue("NaT" in result) # nat in summary result = str(s2.index) self.assertTrue("NaT" in result)
def entropy_n(self, calc_dkl=True): """ Calcualte the entropy of neighborhoods conditional on each bin and overall Parameters ---------- calc_dkl : boolean, default True Calculate KL Divergence at the same time (faster) Returns ------- entropy : DataFrame """ df_city = [] if calc_dkl: df_city_dkl = [] cols = self._filter().columns bin_num = len(cols) for name, group in self._grouped(): group_tot = group[self.tot_col].sum() p_n = DataFrame( [group[self.tot_col]/group_tot]*bin_num).transpose() H_n = Series(self._entropy(p_n)[0], index=['H(n)'], name=name) group_filtered = self._filter(df=group) H_n_y = Series( self._entropy(group_filtered), index=cols, name=name) H_n_y.index = ['H(n|y)_' + str(item) for item in H_n_y.index] if calc_dkl: DKL_n = self._dkl_n_group(group_filtered, name, cols, p_n) df_city_dkl.append(DKL_n) df_city.append(H_n_y.append(H_n)) self.H_n = DataFrame(df_city) if calc_dkl: self.DKL_n = DataFrame(df_city_dkl) # Return all the data return concat([self.H_n, self.DKL_n], axis=1) return self.H_n
def test_append_concat_tz_explicit_pytz(self): # see gh-2938 from pytz import timezone as timezone rng = date_range('5/8/2012 1:45', periods=10, freq='5T', tz=timezone('US/Eastern')) rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', tz=timezone('US/Eastern')) rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', tz=timezone('US/Eastern')) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) result = ts.append(ts2) result_df = df.append(df2) tm.assert_index_equal(result.index, rng3) tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) tm.assert_index_equal(appended, rng3)
train_y = outcome.iloc[train_idx] test_y = outcome.iloc[test_idx] train_x_t = text.iloc[train_idx] test_x_t = text.iloc[test_idx] train_x_v = df.iloc[train_idx, :] test_x_v = df.iloc[test_idx, :] tf = tf_vectorizer.fit_transform(train_x_t.tolist()) train_x = hstack([csr_matrix(train_x_v), tf], format='csr') tf = tf_vectorizer.fit_transform(test_x_t.tolist()) test_x = hstack([csr_matrix(test_x_v), tf], format='csr') test_x_list.append(test_x) test_y_all = test_y_all.append(test_y, ignore_index=True) _ = sgdr.partial_fit(train_x, train_y, classes=[0.0, 1.0]) print time() - t1 test_x = vstack(test_x_list, format='csr') test_y = test_y_all.to_frame('original') test_y['predicted'] = sgdr.predict_proba(test_x)[1] mad = (test_y['original'] - test_y['predicted']).abs().median() type_s = (test_y['original'].gt(0.0) == test_y['predicted'].gt(0.0)).mean() fn = (test_y['original'].gt(0.0) & test_y['predicted'].le(0.0)).mean() fp = (test_y['original'].le(0.0) & test_y['predicted'].gt(0.0)).mean()
feeder = ForexBi5DataFeeder(date(2012, 1, 1),date(2012, 1, 2), 'EURUSD') #broker = Broker() #wallet = Wallet(25000, BuyForOneTenthOfWalletBuyStrategy(), SellIfUpBy10OrDownBy5SellStrategy(), broker) alg = RolingMeanAlgorithm(5,20) #sim = Simulator(date(2012, 1, 1), date(2012, 1, 2), wallet, alg, feeder) dataFrame = DataFrame() signals = Series() curFeed = feeder.getData() while curFeed is not None: dataFrame = dataFrame.append([curFeed]) #print curFeed signal = alg.getBuySignals(curFeed) signals = signals.append(Series(signal[0], index = [signal[1]])) curFeed = feeder.getData() dataFrame['ask'].plot(); #rolling_mean(dataFrame['ask'], 20).plot() #rolling_mean(dataFrame['ask'], 5).plot() #print rolling_mean(dataFrame['ask'], 20)[20:30] #print rolling_mean(dataFrame['ask'], 5)[20:30] #print signals[20:30] signals.plot() plt.show()
def get_tdx_day_to_df_last(code, dayl=1, type=0, dt=None, ptype="low", dl=None): """ :param code:999999 :param dayl:Duration Days :param type:TDX type :param dt: Datetime :param ptype:low or high :return:Series or df """ # dayl=int(dayl) # type=int(type) # print "t:",dayl,"type",type if not type == 0: f = lambda x: str((1000000 - int(x))) if x.startswith("0") else x code = f(code) code_u = cct.code_to_symbol(code) day_path = day_dir % "sh" if code.startswith(("5", "6", "9")) else day_dir % "sz" p_day_dir = day_path.replace("/", path_sep).replace("\\", path_sep) # p_exp_dir=exp_dir.replace('/',path_sep).replace('\\',path_sep) # print p_day_dir,p_exp_dir file_path = p_day_dir + code_u + ".day" if not os.path.exists(file_path): ds = Series( {"code": code, "date": cct.get_today(), "open": 0, "high": 0, "low": 0, "close": 0, "amount": 0, "vol": 0} ) return ds ofile = file(file_path, "rb") b = 0 e = 32 if dayl == 1 and dt == None: log.debug("%s" % (dayl == 1 and dt == None)) fileSize = os.path.getsize(file_path) if fileSize < 32: print "why", code ofile.seek(-e, 2) buf = ofile.read() ofile.close() a = unpack("IIIIIfII", buf[b:e]) tdate = str(a[0])[:4] + "-" + str(a[0])[4:6] + "-" + str(a[0])[6:8] topen = float(a[1] / 100.0) thigh = float(a[2] / 100.0) tlow = float(a[3] / 100.0) tclose = float(a[4] / 100.0) amount = float(a[5] / 10.0) tvol = int(a[6]) # int # tpre = int(a[7]) # back dt_list = Series( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) return dt_list elif dayl == 1 and dt is not None and dl is not None: log.debug("dt:%s" % (dt)) dt_list = [] # if len(str(dt)) == 8: # dt = cct.day8_to_day10(dt) # else: # dt=get_duration_price_date(code, ptype=ptype, dt=dt) # print ("dt:%s"%dt) fileSize = os.path.getsize(file_path) if fileSize < 32: print "why", code b = fileSize ofile.seek(-fileSize, 2) no = int(fileSize / e) # if no < newstockdayl: # return Series() # print no,b,day_cout,fileSize buf = ofile.read() ofile.close() # print repr(buf) # df=pd.DataFrame() for i in xrange(no): a = unpack("IIIIIfII", buf[-e:b]) tdate = str(a[0])[:4] + "-" + str(a[0])[4:6] + "-" + str(a[0])[6:8] topen = float(a[1] / 100.0) thigh = float(a[2] / 100.0) tlow = float(a[3] / 100.0) tclose = float(a[4] / 100.0) amount = float(a[5] / 10.0) tvol = int(a[6]) # int # tpre = int(a[7]) # back dt_list.append( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) # print series # dSeries.append(series) # dSeries.append(Series({'code':code,'date':tdate,'open':topen,'high':thigh,'low':tlow,'close':tclose,'amount':amount,'vol':tvol,'pre':tpre})) b = b - 32 e = e + 32 # print tdate,dt if tdate < dt: # print "why" break df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) # print "len:%s %s"%(len(df),fileSize) df = df.set_index("date") dt = get_duration_price_date(code, ptype=ptype, dt=dt, df=df, dl=dl) log.debug("last_dt:%s" % dt) dd = df[df.index == dt] if len(dd) > 0: dd = dd[:1] dt = dd.index.values[0] dd = dd.T[dt] dd["date"] = dt else: log.warning("no < dt:NULL") dd = Series() # dd = Series( # {'code': code, 'date': cct.get_today(), 'open': 0, 'high': 0, 'low': 0, 'close': 0, 'amount': 0, # 'vol': 0}) return dd else: dt_list = [] fileSize = os.path.getsize(file_path) # print fileSize day_cout = abs(e * int(dayl)) # print day_cout if day_cout > fileSize: b = fileSize ofile.seek(-fileSize, 2) no = int(fileSize / e) else: no = int(dayl) b = day_cout ofile.seek(-day_cout, 2) # 
print no,b,day_cout,fileSize buf = ofile.read() ofile.close() # print repr(buf) # df=pd.DataFrame() for i in xrange(no): a = unpack("IIIIIfII", buf[-e:b]) tdate = str(a[0])[:4] + "-" + str(a[0])[4:6] + "-" + str(a[0])[6:8] topen = float(a[1] / 100.0) thigh = float(a[2] / 100.0) tlow = float(a[3] / 100.0) tclose = float(a[4] / 100.0) amount = float(a[5] / 10.0) tvol = int(a[6]) # int # tpre = int(a[7]) # back dt_list.append( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) # print series # dSeries.append(series) # dSeries.append(Series({'code':code,'date':tdate,'open':topen,'high':thigh,'low':tlow,'close':tclose,'amount':amount,'vol':tvol,'pre':tpre})) b = b - 32 e = e + 32 df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) df = df.set_index("date") return df
def get_tdx_Exp_day_to_df(code, type="f", start=None, end=None, dt=None, dl=None): # start=cct.day8_to_day10(start) # end=cct.day8_to_day10(end) # day_path = day_dir % 'sh' if code[:1] in ['5', '6', '9'] else day_dir % 'sz' code_u = cct.code_to_symbol(code) log.debug("code:%s code_u:%s" % (code, code_u)) if type == "f": file_path = exp_path + "forwardp" + path_sep + code_u.upper() + ".txt" elif type == "b": file_path = exp_path + "backp" + path_sep + code_u.upper() + ".txt" else: return None log.debug("daypath:%s" % file_path) # p_day_dir = day_path.replace('/', path_sep).replace('\\', path_sep) # p_exp_dir = exp_dir.replace('/', path_sep).replace('\\', path_sep) # print p_day_dir,p_exp_dir if not os.path.exists(file_path): # ds = Series( # {'code': code, 'date': cct.get_today(), 'open': 0, 'high': 0, 'low': 0, 'close': 0, 'amount': 0, # 'vol': 0}) ds = pd.DataFrame() log.error("file_path:not exists") return ds # ofile = open(file_path, 'rb') if dt is None and dl is None: ofile = open(file_path, "rb") buf = ofile.readlines() ofile.close() num = len(buf) no = num - 1 dt_list = [] for i in xrange(no): a = buf[i].split(",") # 01/15/2016,27.57,28.15,26.30,26.97,714833.15,1946604544.000 # da=a[0].split('/') tdate = a[0] # tdate = str(a[0])[:4] + '-' + str(a[0])[4:6] + '-' + str(a[0])[6:8] # tdate=dt.strftime('%Y-%m-%d') topen = float(a[1]) thigh = float(a[2]) tlow = float(a[3]) tclose = float(a[4]) # tvol = round(float(a[5]) / 10, 2) tvol = float(a[5]) amount = round(float(a[6].replace("\r\n", "")), 1) # int # tpre = int(a[7]) # back if int(amount) == 0: continue dt_list.append( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) # if dt is not None and tdate < dt: # break df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) # df.sort_index(ascending=False, inplace=True) if start is not None and end is not None: df = df[(df.date >= start) & (df.date <= end)] elif end is not None: df = df[df.date <= end] elif start is not None: df = df[df.date >= start] df = df.set_index("date") return df elif int(dl) == 1: # fileSize = os.path.getsize(file_path) # if fileSize < 60 * newstockdayl: # return Series() data = cct.read_last_lines(file_path, int(dl) + 3) data_l = data.split("\n") dt_list = Series() data_l.reverse() log.debug("day 1:%s" % data_l) for line in data_l: a = line.split(",") # 01/15/2016,27.57,28.15,26.30,26.97,714833.15,1946604544.000 # da=a[0].split('/') log.debug("day 1 len(a):%s a:%s" % (len(a), a)) if len(a) > 5: tdate = a[0] log.debug("day 1 tdate:%s" % tdate) # tdate = str(a[0])[:4] + '-' + str(a[0])[4:6] + '-' + str(a[0])[6:8] # tdate=dt.strftime('%Y-%m-%d') topen = float(a[1]) thigh = float(a[2]) tlow = float(a[3]) tclose = float(a[4]) # tvol = round(float(a[5]) / 10, 2) tvol = float(a[5]) amount = round(float(a[6].replace("\r\n", "")), 1) # int # tpre = int(a[7]) # back if int(amount) == 0: continue dt_list = Series( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) break else: continue # if dt is not None and tdate < dt: # break # df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) # df = df.set_index('date') return dt_list else: fileSize = os.path.getsize(file_path) # if fileSize < 60 * newstockdayl: # return Series() data = cct.read_last_lines(file_path, int(dl) + 2) dt_list = [] data_l = data.split("\n") data_l.reverse() for line in data_l: a = line.split(",") # 01/15/2016,27.57,28.15,26.30,26.97,714833.15,1946604544.000 # 
da=a[0].split('/') if len(a) > 5: tdate = a[0] # tdate = str(a[0])[:4] + '-' + str(a[0])[4:6] + '-' + str(a[0])[6:8] # tdate=dt.strftime('%Y-%m-%d') topen = float(a[1]) thigh = float(a[2]) tlow = float(a[3]) tclose = float(a[4]) tvol = round(float(a[5]) / 10, 2) amount = round(float(a[6].replace("\r\n", "")), 1) # int # tpre = int(a[7]) # back if int(amount) == 0: continue dt_list.append( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) else: continue # if dt is not None and tdate < dt: # break df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) # df.sort_index(ascending=False, inplace=True) # if start is not None and end is not None: # df = df[(df.date >= start) & (df.date <= end)] # elif end is not None: # df = df[df.date <= end] # elif start is not None: # df = df[df.date >= start] df = df.set_index("date") # print "time:",(time.time()-time_s)*1000 return df
"silent": 1, "thread": 1, "seed": 1301 } num_boost_round = 1000 print("Train a XGBoost model") X_train, X_valid = train_test_split(train, test_size=0.01, random_state=10) y_train = np.log1p(X_train.Sales) y_valid = np.log1p(X_valid.Sales) dtrain = xgb.DMatrix(X_train[features], y_train) dvalid = xgb.DMatrix(X_valid[features], y_valid) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True) print("Validating") predict = gbm.predict(xgb.DMatrix(X_valid[features])) error = rmspe(X_valid.Sales.values, np.expm1(predict)) print('RMSPE: {:.6f}'.format(error)) print("Make predictions on the test set") dtest = xgb.DMatrix(test[features]) ytest = gbm.predict(dtest) sub = Series() sub = sub.append(Series(np.expm1(ytest), index = test.Id)) sub = sub.append(Series(0, index = closedId)) # Make Submission sub = pd.DataFrame({"Id": sub.index, "Sales": sub.values}) sub.to_csv("xgboost_submission2.csv", index=False)
# get relative difference, leaves a timedelta v['start_time'] = v['start_time'] - ts_now v['end_time'] = v['end_time'] - ts_now # combine date with midnight & timedelta to get new datetime v['start_datetime'] = v['start_time'] + v['start_date'] v['end_datetime'] = v['end_time'] + v['end_date'] start_date = pd.to_datetime(Series(v.start_datetime)) start_date.sort() end_date = pd.to_datetime(Series(v.end_datetime)) end_date.sort() ts1 = Series(np.ones(len(start_date)), start_date) ts2 = Series(-1*np.ones(len(end_date)), end_date) ts = ts1.append(ts2) us = ts.sort_index() cs = us.cumsum() mpl.rc('figure', figsize = (10, 8)) cs.plot() plt.show()
def do_eBBQ(self, variations= None, plot_fig = False, modes = None, Pj_from_current = True, junc_rect = [], junc_lines = None, junc_len = [], junc_LJ_var_name = [], dielectrics = None, seams = None, surface = False, calc_Hamiltonian = False,pJ_method = 'J_surf_mag', save_mesh_stats = True): """ Pj_from_current: Multi-junction calculation of energy participation ratio matrix based on <I_J>. Current is integrated average of J_surf by default: (zkm 3/29/16) Will calculate the Pj matrix for the selected modes for the given junctions junc_rect array & length of juuncs junc_rect = ['junc_rect1', 'junc_rect2'] name of junc rectangles to integrate H over junc_lines = ['junc_line1', 'junc_line2'] used to define the current flow direction, arbitrary, doesnt really matter that much, just need a line there junc_len = [0.0001] lenght of junc = lenght of junc_line #TODO: could now get rid of this and use the line [specify in SI units; i.e., meters] junc_LJ_var_name = ['LJ1', 'LJ2'] pJ_method = 'J_surf_mag' - takes the avg. Jsurf over the rect. Make sure you have seeded lots of tets here. i recommend starting with 4 across smallest dimension. Assumptions: Low dissipation (high-Q). Right now, we assume that there are no lumped capcitors to simply calculations. Not required. We assume that there are only lumped inductors, so that U_tot = U_E+U_H+U_L and U_C =0, so that U_tot = 2*U_E; Other parameters: seams = ['seam1', 'seam2'] (seams needs to be a list of strings) variations = ['0', '1'] A variation is a combination of project/design variables in an optimetric sweep """ self.Pj_from_current = Pj_from_current; meta_data = {}; assert(type(junc_LJ_var_name) == list), "Please pass junc_LJ_var_name as a list " if Pj_from_current : print_color(' Setup: ' + self.setup.name); self.PJ_multi_sol = {} # this is where the result will go if seams is not None: self.seams = seams; meta_data['seams'] = seams; if dielectrics is not None: self.dielectrics = dielectrics; meta_data['dielectrics'] = dielectrics; if variations is None: variations = (['-1'] if self.listvariations == (u'',) else [str(i) for i in range(self.nvariations)] ) if modes is None: modes = range(self.nmodes) if self.latest_h5_path is not None and self.append_analysis:shutil.copyfile(self.latest_h5_path, self.data_filename); self.h5file = hdf = pd.HDFStore(self.data_filename); self.variations = variations; self.modes = modes; self.njunc = len(junc_rect) meta_data['junc_rect'] = junc_rect; meta_data['junc_lines'] = junc_lines; meta_data['junc_len'] = junc_len; meta_data['junc_LJ_var_name'] = junc_LJ_var_name; meta_data['pJ_method'] = pJ_method; mesh_stats = self.mesh_stats = [] for ii, variation in enumerate(variations): print_color( 'variation : ' + variation + ' / ' + str(self.nvariations-1), bg = 44, newline = False ) self.lv = self.get_lv(variation) if (variation+'/hfss_variables') in hdf.keys() and self.append_analysis: print_NoNewLine(' previously analyzed ...\n'); \ continue; print_NoNewLine( ' NOT analyzed\n' ); time.sleep(0.5) hdf[variation+'/hfss_variables'] = self.hfss_variables[variation] = varz \ = pd.Series(self.get_variables(variation=variation)) freqs_bare_dict, freqs_bare_vals = self.get_freqs_bare(variation) # get bare freqs from HFSS self.pjs={}; var_sol_accum = [] for mode in modes: sol = Series({'freq' : freqs_bare_vals[mode]*10**-9, 'modeQ' : freqs_bare_dict['Q_'+str(mode)] }) self.omega = 2*np.pi*freqs_bare_vals[mode] # this should really be passed as argument to the functions rather than a property of the calss I would say print ' 
Mode \x1b[0;30;46m ' + str(mode) + ' \x1b[0m / ' + str(self.nmodes-1)+' calculating:' self.solutions.set_mode(mode+1, 0) self.fields = self.setup.get_fields() print_NoNewLine(' U_H ...'); sol['U_H'] = self.U_H = self.calc_U_H(variation) print_NoNewLine(' U_E'); sol['U_E'] = self.U_E = self.calc_U_E(variation) print( " => U_L = %.3f%%" %( (self.U_E - self.U_H )/(2*self.U_E)) ) if self.Pj_from_current: self.LJs = [ ureg.Quantity(varz['_'+LJvar_nm]).to_base_units().magnitude for LJvar_nm in junc_LJ_var_name] meta_data['LJs'] = dict(zip(junc_LJ_var_name, self.LJs)) print ' I -> p_{mJ} ...' sol_PJ = self.calc_Pjs_from_I_for_mode(variation, self.U_H, self.U_E, self.LJs, junc_rect, junc_len, method = pJ_method, freq = freqs_bare_vals[mode]*10**-9, calc_sign = junc_lines) sol = sol.append(sol_PJ) if self.njunc == 1: # Single-junction method using global U_H and U_E; assert(type(junc_LJ_var_name) == list and len(junc_LJ_var_name) == 1), "Please pass junc_LJ_var_name as array of 1 element for a single junction; e.g., junc_LJ_var_name = ['junc1']" #lj = 1E-3*ureg.Quantity(varz['_'+junc_LJ_var_name]).to_base_units().magnitude sol['pj1'] = self.get_p_j(mode) self.pjs.update(sol['pj1']) # convinience function for single junction case if seams is not None: # get seam Q for seam in seams: sol = sol.append(self.get_Qseam(seam,mode,variation)) if dielectrics is not None: # get Q dielectric for dielectric in dielectrics: sol = sol.append(self.get_Qdielectric(dielectric, mode, variation)) if surface is True: # get Q surface sol = sol.append( self.get_Qsurface(mode, variation) ) var_sol_accum +=[sol] #TODO: add metadata to the Dataframe & save it # such as what are the junc_rect names and Lj values etc. (e.g., http://stackoverflow.com/questions/29129095/save-additional-attributes-in-pandas-dataframe/29130146#29130146) hdf[variation+'/eBBQ_solution'] = self.sols[variation] \ = pd.DataFrame(var_sol_accum, index = modes) hdf[variation+'/meta_data'] = self.meta_data[variation] \ = Series(meta_data) if save_mesh_stats: msh = self.setup.get_mesh_stats(self.listvariations[ureg(variation)]) mesh_stats += [msh] if msh is not None: hdf[variation+'/mesh_stats'] = msh # returns dataframe conv = self.setup.get_convergence(self.listvariations[ureg(variation)]) # returns dataframe #print 'conv.' if conv is not None: hdf[variation+'/convergence'] = conv self.h5file.close() self.bbq_analysis = BbqAnalysis(self.data_filename, variations=self.variations) #TODO: to be implemented below # if plot_fig: # self.bbq_analysis.plot_Hparams(modes=self.modes) # self.bbq_analysis.print_Hparams(modes=self.modes) return