def test_append_aware(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assertEqual(ts_result.index.tz, rng1.tz) rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) utc = rng1.tz self.assertEqual(utc, ts_result.index.tz) rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Central') ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assertEqual(utc, ts_result.index.tz)
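# Series.append, used throughout these snippets, was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the documented replacement. A minimal sketch of the first case above
# (the same time zone on both sides) written with concat; illustrative only, not part of any
# test suite:
import pandas as pd

idx1 = pd.DatetimeIndex(["2011-01-01 01:00"], tz="US/Eastern")
idx2 = pd.DatetimeIndex(["2011-01-01 02:00"], tz="US/Eastern")
ts1 = pd.Series([1.0], index=idx1)
ts2 = pd.Series([2.0], index=idx2)
assert pd.concat([ts1, ts2]).index.tz == idx1.tz  # the shared zone is preserved
# (For differing zones, newer pandas coerces the result to object dtype rather than UTC;
# see the GH#7795 note in the modern variant of this test further down.)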
def test_setitem_ambiguous_keyerror():
    s = Series(lrange(10), index=lrange(0, 20, 2))

    # equivalent of an append
    s2 = s.copy()
    s2[1] = 5
    expected = s.append(Series([5], index=[1]))
    assert_series_equal(s2, expected)

    s2 = s.copy()
    s2.loc[1] = 5
    expected = s.append(Series([5], index=[1]))
    assert_series_equal(s2, expected)
def test_dt_accessor_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] expected_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] s = Series(date_range(freq='D', start=datetime(1998, 1, 1), periods=365)) english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert s.dt.weekday_name[day] == eng_name assert s.dt.day_name(locale=time_locale)[day] == name s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) s = Series(date_range(freq='M', start='2012', end='2013')) result = s.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) # work around https://github.com/pandas-dev/pandas/issues/22342 if not compat.PY2: result = result.str.normalize("NFD") expected = expected.str.normalize("NFD") tm.assert_series_equal(result, expected) for s_date, expected in zip(s, expected_months): result = s_date.month_name(locale=time_locale) expected = expected.capitalize() if not compat.PY2: result = unicodedata.normalize("NFD", result) expected = unicodedata.normalize("NFD", expected) assert result == expected s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1])
def build_and_connect_devices(nest_instance, devices, nest_nodes):
    # Build devices by their population (Series)
    # and target nodes (Series) for faster reading
    nest_devices = Series()
    for device in ensure_list(devices):
        # For every distinct quantity to be measured from NEST or stimulated towards NEST nodes...
        dev_names = device.get("names", None)
        if dev_names is None:
            # If no devices' names are given...
            nest_devices = nest_devices.append(
                build_and_connect_devices_one_to_one(nest_instance, device, nest_nodes))
        else:
            nest_devices = nest_devices.append(
                build_and_connect_devices_one_to_many(nest_instance, device, nest_nodes, dev_names))
    return nest_devices
def get_frequency_lists(df):
    # Keep only the drawn lottery numbers (exclude the bonus number)
    num_with_no_bonus = df.loc[:, 'drwtNo1': 'drwtNo6']
    # Collect the six number columns into a single Series
    full_value = Series()
    for idx in num_with_no_bonus:
        full_value = full_value.append(num_with_no_bonus[idx])
    # Series holding the occurrence count of every drawn number
    full_value_count = full_value.value_counts()
    # Set of numbers for each frequency
    number_list = []
    # List of frequencies
    freq_list = []
    # Count of numbers per frequency (Y-value array for the chart)
    freq_number_count_list = []
    for value in full_value_count.unique():
        temp_df = full_value_count[full_value_count == value]
        temp_df.index.values.sort()
        number_list.append(temp_df.index.values)
        freq_list.append(value)
        freq_number_count_list.append(temp_df.count())
    return number_list, freq_list, freq_number_count_list
def run_one_baseline(ssp,inputs,finalhhframe,ini_pop_desc,ssp_pop,year,countrycode,characteristics,inc,inimin,ini_year,wbreg,data2day,food_share_data,istoobig=False): shareag,sharemanu = correct_shares(inputs['shareag'],inputs['sharemanu']) shareemp = inputs['shareemp'] future_pop_desc,pop_0014=build_new_description(ini_pop_desc,ssp_pop,ssp,year,countrycode,shareag,sharemanu,shareemp) if istoobig: weights_proj = Series() finalhhframe1,finalhhframe2 = finalhhframe ini_weights_sum = (finalhhframe1['weight']*finalhhframe1['nbpeople']).sum()+(finalhhframe2['weight']*finalhhframe2['nbpeople']).sum() for finalhhframehalf in [finalhhframe1,finalhhframe2]: characteristicshalf = keep_characteristics_to_reweight(finalhhframehalf) ini_weightshalf = finalhhframehalf['weight']*finalhhframehalf['nbpeople'] ratio = sum(ini_weightshalf)/ini_weights_sum ini_pop_deschalf = calc_pop_desc(characteristicshalf,ini_weightshalf) weights_projh = find_new_weights(characteristicshalf,ini_weightshalf,future_pop_desc*ratio) weights_proj = weights_proj.append(weights_projh) finalhhframe = concat([finalhhframe1,finalhhframe2],axis=0) weights_proj = DataFrame(weights_proj,index=finalhhframe.index,columns=["weight"]) else: ini_weights = finalhhframe['weight'] weights_proj = find_new_weights(characteristics,ini_weights,future_pop_desc) weights_proj = DataFrame(weights_proj,index=finalhhframe.index,columns=["weight"]) futurehhframe = futurehh(finalhhframe,pop_0014) futurehhframe['weight'] = weights_proj["weight"] income_proj,futureinc = future_income_simple_no_cc(inputs,year,finalhhframe,futurehhframe,inc,inimin,ini_year) income_proj.fillna(0, inplace=True) futurehhframe['Y'] = income_proj return futurehhframe,futureinc
def location(self):
    i = 0
    ser = Series([])
    for x in list(val_segment_dict.values())[self.start: self.end]:
        # for x in val_segment_dict.get(key, []):
        ser = ser.append(Series([len(x)]))
    return ser.values, ser.cumsum().values
def test_append_aware_naive(self): rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assertTrue(ts_result.index.equals(ts1.index.asobject.append(ts2.index.asobject))) # mixed rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = lrange(100) ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assertTrue(ts_result.index.equals(ts1.index.asobject.append(ts2.index)))
def _clean_data(X, y): clean_X = DataFrame(columns=X.columns) clean_y = Series(name=y.name) skf = StratifiedKFold(n_splits=MajorityFiltering.k_folds, shuffle=True) for train_idxs, val_idxs in skf.split(X=range(len(y)), y=y): train_X = DataHelper.select_rows(X, train_idxs, copy=False) train_y = DataHelper.select_rows(y, train_idxs, copy=False) ensemble = MajorityFiltering.get_ensemble() ensemble.fit(train_X, train_y) val_X = DataHelper.select_rows(X, val_idxs, copy=False) predictions = ensemble.predict(val_X) maintain_idxs = [val_idxs[i] for i in range(len(val_idxs)) \ if predictions[i]==y.iloc[val_idxs[i]]] maintain_X = DataHelper.select_rows(X, maintain_idxs, copy=True) maintain_y = DataHelper.select_rows(y, maintain_idxs, copy=True) clean_X = clean_X.append(maintain_X, verify_integrity=True, sort=False) clean_y = clean_y.append(maintain_y, verify_integrity=True) return clean_X, clean_y
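# _clean_data above grows clean_X / clean_y by calling .append inside the cross-validation loop,
# which re-copies the accumulated data on every fold, and DataFrame.append itself is gone in
# pandas 2.0. A hedged sketch of the usual alternative: collect the per-fold pieces in plain
# lists and concatenate once at the end (verify_integrity and sort are also pd.concat
# arguments). The helper name below is illustrative, not the author's:
import pandas as pd

def concat_clean_parts(clean_x_parts, clean_y_parts):
    # clean_x_parts: list of per-fold DataFrames; clean_y_parts: list of per-fold Series
    clean_X = pd.concat(clean_x_parts, verify_integrity=True, sort=False)
    clean_y = pd.concat(clean_y_parts, verify_integrity=True)
    return clean_X, clean_y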
def count_options(gear):
    options = Series()
    for slot in ['Armour', 'Relic', 'Mod', 'Enh', 'Ear', 'Aug']:
        print(slot)
        n_options = len(gear[(gear['Slot'] == slot) & (gear['N'] == 'TBC')])
        options = options.append(Series({slot: n_options}))
    return options
def test_append_tz_explicit_pytz(self): # see gh-2938 from pytz import timezone as timezone rng = date_range( "5/8/2012 1:45", periods=10, freq="5T", tz=timezone("US/Eastern") ) rng2 = date_range( "5/8/2012 2:35", periods=10, freq="5T", tz=timezone("US/Eastern") ) rng3 = date_range( "5/8/2012 1:45", periods=20, freq="5T", tz=timezone("US/Eastern") ) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) result = ts.append(ts2) result_df = df.append(df2) tm.assert_index_equal(result.index, rng3) tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) tm.assert_index_equal(appended, rng3)
def test_append_concat(self): rng = date_range('5/8/2012 1:45', periods=10, freq='5T') ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) result = ts.append(ts) result_df = df.append(df) ex_index = DatetimeIndex(np.tile(rng.values, 2)) tm.assert_index_equal(result.index, ex_index) tm.assert_index_equal(result_df.index, ex_index) appended = rng.append(rng) tm.assert_index_equal(appended, ex_index) appended = rng.append([rng, rng]) ex_index = DatetimeIndex(np.tile(rng.values, 3)) tm.assert_index_equal(appended, ex_index) # different index names rng1 = rng.copy() rng2 = rng.copy() rng1.name = 'foo' rng2.name = 'bar' assert rng1.append(rng1).name == 'foo' assert rng1.append(rng2).name is None
def read_normal_data(self) -> TrafficSequence: traffic_sequences = [ self._make_traffic_sequence(pcap_file, ranges, "benign") for pcap_file, ranges in self.subset["benign"].items() if os.path.exists(os.path.join(self.dataset_dir, pcap_file)) ] if len(traffic_sequences) == 1: return traffic_sequences[0] # if more than one traffic sequences are present, join them into one. joined_ids = [ id_item for id_item in itertools.chain(*map(lambda seq: seq.ids, traffic_sequences)) ] joined_labels = Series() for traffic_sequence in traffic_sequences: joined_labels = joined_labels.append(traffic_sequence.labels) joined_reader = itertools.chain( *map(lambda seq: seq.packet_reader, traffic_sequences) ) parts = {"all": joined_ids} # make sure that the same pcaps, even in different order, result in the same traffic sequence name; regardless # of the used test pcaps name_identifier = ",".join( sorted([t.name.split(".pcap")[0] for t in traffic_sequences]) ) return TrafficSequence( name=f"benign@UNSW-NB15:%s" % name_identifier, labels=joined_labels, packet_reader=joined_reader, parts=parts, ids=joined_ids, )
def read_normal_data(self) -> TrafficSequence: traffic_sequences = [ self._make_traffic_sequence(pcap_file, ranges) for pcap_file, ranges in self.subset["benign"].items() ] if len(traffic_sequences) == 1: return traffic_sequences[0] # if more than one traffic sequences are present, join them into one. joined_ids = [ id_item for id_item in itertools.chain(*map(lambda seq: seq.ids, traffic_sequences)) ] joined_labels = Series() for traffic_sequence in traffic_sequences: joined_labels = joined_labels.append(traffic_sequence.labels) joined_reader = itertools.chain( *map(lambda seq: seq.packet_reader, traffic_sequences) ) parts = {"all": joined_ids} return TrafficSequence( name=f"benign@CIC-IDS-2017:{self.subset_name}", labels=joined_labels, packet_reader=joined_reader, parts=parts, ids=joined_ids, )
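# Both read_normal_data variants above build joined_labels by appending one labels Series per
# traffic sequence. pd.concat takes a list of Series directly, so the loop reduces to a single
# call; a small sketch, assuming each sequence object exposes a .labels Series as in the code
# above:
import pandas as pd

def join_sequence_labels(traffic_sequences):
    return pd.concat([seq.labels for seq in traffic_sequences])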
def asseries(self, session):
    datasets = self.datasets(session)
    data = Series()
    for src in datasets:
        s = src.asseries(self.start, self.end)
        data = data.append(s)
    return data.sort_index()
def test_concatlike_common_coerce_to_pandas_object(self): # GH 13626 # result must be Timestamp/Timedelta, not datetime.datetime/timedelta dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) tdi = pd.TimedeltaIndex(["1 days", "2 days"]) exp = Index( [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timedelta("1 days"), pd.Timedelta("2 days"), ] ) res = dti.append(tdi) tm.assert_index_equal(res, exp) assert isinstance(res[0], pd.Timestamp) assert isinstance(res[-1], pd.Timedelta) dts = Series(dti) tds = Series(tdi) res = dts.append(tds) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) assert isinstance(res.iloc[0], pd.Timestamp) assert isinstance(res.iloc[-1], pd.Timedelta) res = pd.concat([dts, tds]) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) assert isinstance(res.iloc[0], pd.Timestamp) assert isinstance(res.iloc[-1], pd.Timedelta)
def test_append_concat_tz_explicit_pytz(self): # GH 2938 tm._skip_if_no_pytz() from pytz import timezone as timezone rng = date_range('5/8/2012 1:45', periods=10, freq='5T', tz=timezone('US/Eastern')) rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', tz=timezone('US/Eastern')) rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', tz=timezone('US/Eastern')) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) result = ts.append(ts2) result_df = df.append(df2) tm.assert_index_equal(result.index, rng3) tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) tm.assert_index_equal(appended, rng3)
def run_years(end_year, stat_str, num_sims=100, thresholds=[0.05, 0.5, 0.95]): results = {} ages, ignore_years = parse_string(stat_str) start_year = datetime.datetime.today().year + 1 for threshold in thresholds: results[threshold] = {(start_year - 1): 0} for year in range(start_year, end_year + 1): # print year probs = [] years = year - datetime.datetime.today().year for age in ages: probs.append(deathprob(age, years)) sims = build_sims(num_sims, len(ages)) death_counts = count_deaths(probs, sims) death_rates = death_thresholds(death_counts, thresholds) for threshold in thresholds: results[threshold][year] = death_rates[threshold] ages = Series(ages).abs() ages = [ int(x) for x in ages ] birthyears = datetime.datetime.today().year - np.array(ages) # Being very lazy about math here births = np.bincount(birthyears).cumsum()[min(birthyears):] birthseries = Series(births, index=range(min(birthyears),len(births)+min(birthyears))) interimyrs = range(max(birthyears) + 1, start_year) birthseries = birthseries.append(Series(len(ages), index=interimyrs)) results['births'] = (len(ages) - birthseries).to_dict() return results
def cost(self, gene):
    data = self.decodeGene(gene)
    d1 = Series(data.student1.tolist(), index=data.student2)
    d2 = Series(data.student2.tolist(), index=data.student1)
    data = d1.append(d2).sort_index()
    cost = np.where(data == self.pref.iloc[:, 0], 0,
                    np.where(data == self.pref.iloc[:, 1], 1, 3))
    return cost.sum()
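# cost() builds one lookup Series that maps every student to their assigned partner in both
# directions before scoring it against the preference table. Only the d1.append(d2) step relies
# on the removed Series.append; a one-line sketch of the replacement, with d1 and d2 as defined
# in the method above (helper name is illustrative):
import pandas as pd

def combine_pairings(d1, d2):
    return pd.concat([d1, d2]).sort_index()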
def data_validation(series): """ add gaps sum & nan sum to the time series Args: series (pandas.Series): time-series Returns: pandas.DataFrame: tags with columns 'nans', 'gaps', ... """ ts = series.copy() first_index = ts.index[0].replace(day=1, month=1, hour=0, minute=0) if first_index not in series.index: ts = Series(index=[first_index]).append(ts) last_index = ts.index[-1].replace(day=31, month=12, hour=23, minute=59) if last_index not in ts.index: ts = ts.append(Series(index=[last_index])) if ts.index.has_duplicates: # very slow an large data sets ts = ts[~ts.index.duplicated()].copy() if not ts.index.is_monotonic_increasing: raise UserWarning( 'Series has not monotonic increasing of the timestamps.') ts = ts.sort_index() tags = DataFrame(index=ts.index) tags['nans'] = isna(ts).astype(int) tags = tags.reindex(tags.asfreq('T').index) tags['gaps'] = isna(ts.fillna(0).reindex(tags.index)).astype(int) return tags
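# data_validation pads the series with an all-NaN entry at the start of the first year and at
# the end of the last one before reindexing to minute frequency. A sketch of just that padding
# step without Series.append (removed in pandas 2.0); the helper name is illustrative and ts is
# assumed to be a datetime-indexed Series as in the function above:
import numpy as np
import pandas as pd

def pad_year_bounds(ts):
    first_index = ts.index[0].replace(day=1, month=1, hour=0, minute=0)
    last_index = ts.index[-1].replace(day=31, month=12, hour=23, minute=59)
    pieces = [ts]
    if first_index not in ts.index:
        pieces.insert(0, pd.Series([np.nan], index=[first_index]))
    if last_index not in ts.index:
        pieces.append(pd.Series([np.nan], index=[last_index]))
    return pd.concat(pieces)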
def test_append_concat_tz_dateutil(self): # see gh-2938 rng = date_range('5/8/2012 1:45', periods=10, freq='5T', tz='dateutil/US/Eastern') rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', tz='dateutil/US/Eastern') rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', tz='dateutil/US/Eastern') ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) result = ts.append(ts2) result_df = df.append(df2) tm.assert_index_equal(result.index, rng3) tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) tm.assert_index_equal(appended, rng3)
def train_data_construct(bins, train_set, iteration, realtime = False): train_bins = defaultdict(tuple) print 'start to construct the train data bins' if realtime: idx = 0 for bin in bins: if len(bin) > 0: feature_bin = DataFrame() lable_bin = Series() for uid in bin: tmp = train_set[train_set['product_uid'] == int(uid)] if not tmp.empty: feature_bin = feature_bin.append(tmp) # should drop the relevance data here lable_bin = lable_bin.append(tmp['relevance']) train_bins[idx] = (feature_bin,lable_bin) print len(train_bins[idx][0]), ' entries in bin', idx # if idx == 0: # feature_bin.to_csv('feature_bin.csv') idx += 1 f1 = file('../data/train_bins'+str(iteration)+'.pkl','wb') pk.dump(train_bins,f1) else: f1 = file('../data/train_bins'+str(iteration)+'.pkl','rb') train_bins=pk.load(f1) print 'finish constructing training bins' return train_bins
def test_series_append_dst(self): rng1 = date_range("1/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") rng2 = date_range("8/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") ser1 = Series([1, 2, 3], index=rng1) ser2 = Series([10, 11, 12], index=rng2) ts_result = ser1.append(ser2) exp_index = DatetimeIndex( [ "2016-01-01 01:00", "2016-01-01 02:00", "2016-01-01 03:00", "2016-08-01 01:00", "2016-08-01 02:00", "2016-08-01 03:00", ], tz="US/Eastern", ) exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz
def split_train_test(X, Y, size=0.2):
    X = DataFrame(X)
    Y = Series(Y)
    folds = int(1 / size)
    kfold = StratifiedKFold(Y, n_folds=folds, random_state=1)
    x_train = DataFrame()
    y_test = Series()
    y_train = Series()
    x_test = DataFrame()
    for train, test in kfold:
        x_train = x_train.append(X.iloc[train])
        y_train = y_train.append(Y.iloc[train])
        y_test = y_test.append(Y.iloc[test])
        x_test = x_test.append(X.iloc[test])
        break
    return x_train, y_train, x_test, y_test
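# split_train_test only ever uses the first StratifiedKFold fold (note the break), so it is
# effectively a single stratified split with test fraction `size`. Under a newer scikit-learn
# the old cross_validation API and the incremental appends can be swapped for train_test_split
# with stratify; a sketch under that assumption, not the author's original approach:
from pandas import DataFrame, Series
from sklearn.model_selection import train_test_split

def split_train_test_stratified(X, Y, size=0.2):
    X, Y = DataFrame(X), Series(Y)
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=size, stratify=Y, random_state=1)
    return x_train, y_train, x_test, y_test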
def fit(self, dataset: NumpyOrPandas): """Estimate label frequencies and create encoding dicts. Args: dataset: Pandas or Numpy dataset of categorical features. """ # set transformer names and add checks LAMLTransformer.fit(self, dataset) # set transformer features # convert to accepted dtype and get attributes roles = dataset.roles subs = self._get_df(dataset) self.dicts = {} for i in subs.columns: role = roles[i] try: flg_number = np.issubdtype(role.dtype, np.number) except TypeError: flg_number = False if not flg_number: co = role.unknown cnts = subs[i].value_counts(dropna=True) cnts = cnts[cnts > co].reset_index() cnts = Series(cnts["index"].astype(str).rank().values, index=cnts["index"].values) cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan])) self.dicts[i] = cnts return self
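# fit() extends each per-column mapping with one extra row keyed by np.nan so that unseen or
# missing values map to rank len(cnts) + 1. A minimal sketch of that last step using pd.concat
# instead of the removed Series.append, assuming cnts is the rank Series built above (helper
# name is illustrative):
import numpy as np
import pandas as pd

def add_nan_rank(cnts):
    return pd.concat([cnts, pd.Series([cnts.shape[0] + 1], index=[np.nan])])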
def get_occupancy(df, x_coord): # pick an arbitrary Tues SWTues = df[(df['start date'] == '2013-04-30 00:00:00') & (df['x coordinate'] == x_coord)] SWTues['start date'] = pd.to_datetime(SWTues['start date']) SWTues['end date'] = pd.to_datetime(SWTues['end date']) # make a datetime for today at midnight ts_now=pd.to_datetime("2013-10-05") SWTues['start time'] = pd.to_datetime(SWTues['start time']) # defaults to today (5th Oct) with correct hour:min SWTues['start time'] =SWTues['start time'] - ts_now # get relative difference, leaves a timedelta SWTues['start datetime'] = SWTues['start time'] + SWTues['start date'] # combine date with midnight & timedelta to get new datetime SWTues['end time'] = pd.to_datetime(SWTues['end time']) # defaults to today (5th Oct) with correct hour:min SWTues['end time'] =SWTues['end time'] - ts_now # get relative difference, leaves a timedelta SWTues['end datetime'] = SWTues['end time'] + SWTues['end date'] # combine date with midnight & timedelta to get new datetime s1 = Series(np.ones(SWTues.shape[0]), index=SWTues['start datetime']) s2 = Series(-1*np.ones(SWTues.shape[0]), index=SWTues['end datetime']) mean_occ = s1.append(s2).sort_index().cumsum().mean() max_occ = SWTues['Spaces'].max() tariff = SWTues['Tariff'].max() max_stay = SWTues['Max Stay'].max() total_takings = SWTues['amount paid'].sum() occ_prop = mean_occ/max_occ return mean_occ, max_occ, occ_prop, tariff, max_stay, total_takings
def test_concatlike_common_period_diff_freq_to_object(self): # GH 13221 pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") exp = Index( [ pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), pd.Period("2012-01-01", freq="D"), pd.Period("2012-02-01", freq="D"), ], dtype=object, ) res = pi1.append(pi2) tm.assert_index_equal(res, exp) ps1 = Series(pi1) ps2 = Series(pi2) res = ps1.append(ps2) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) res = pd.concat([ps1, ps2]) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def test_append_concat(self): rng = date_range('5/8/2012 1:45', periods=10, freq='5T') ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) result = ts.append(ts) result_df = df.append(df) ex_index = DatetimeIndex(np.tile(rng.values, 2)) tm.assert_index_equal(result.index, ex_index) tm.assert_index_equal(result_df.index, ex_index) appended = rng.append(rng) tm.assert_index_equal(appended, ex_index) appended = rng.append([rng, rng]) ex_index = DatetimeIndex(np.tile(rng.values, 3)) tm.assert_index_equal(appended, ex_index) # different index names rng1 = rng.copy() rng2 = rng.copy() rng1.name = 'foo' rng2.name = 'bar' self.assertEqual(rng1.append(rng1).name, 'foo') assert rng1.append(rng2).name is None
def test_concatlike_common_period_mixed_dt_to_object(self): # GH 13221 # different datetimelike pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") tdi = pd.TimedeltaIndex(["1 days", "2 days"]) exp = Index( [ pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), pd.Timedelta("1 days"), pd.Timedelta("2 days"), ], dtype=object, ) res = pi1.append(tdi) tm.assert_index_equal(res, exp) ps1 = Series(pi1) tds = Series(tdi) res = ps1.append(tds) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) res = pd.concat([ps1, tds]) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) # inverse exp = Index( [ pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Period("2011-01", freq="M"), pd.Period("2011-02", freq="M"), ], dtype=object, ) res = tdi.append(pi1) tm.assert_index_equal(res, exp) ps1 = Series(pi1) tds = Series(tdi) res = tds.append(ps1) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) res = pd.concat([tds, ps1]) tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1]))
def parseSearch(self, modVal, expr=None, force=False, debug=False, quiet=False): ts = timestat("Parsing Discogs Search ModVal={0} Files(expr=\'{1}\', force={2}, debug={3}, quiet={4})".format(modVal, expr, force, debug, quiet)) io = fileIO() ######################################################################################## # Previous DB Data ######################################################################################## if not fileUtil(self.disc.getDBModValFilename(modVal)).exists: tsDB = timestat("Creating New DB For ModVal={0}".format(modVal)) dbdata = Series({}) ts.stop() else: tsDB = timestat("Loading ModVal={0} DB Data".format(modVal)) dbdata = self.disc.getDBModValData(modVal) tsDB.stop() ######################################################################################## # Previous Media Data ######################################################################################## previousMetadata = self.disc.getMetadataAlbumData(modVal) ######################################################################################## # Artist Search Data (No Media) ######################################################################################## tsDB = timestat("Loading Artist Search Data For ModVal={0}".format(modVal)) artistSearchFilenames = self.getArtistRawFiles(datatype="search", expr=expr, force=True) artistSearchFilename = [x for x in artistSearchFilenames if fileUtil(x).basename == "artistData-{0}".format(modVal)] if len(artistSearchFilename) == 1: artistSearchData = io.get(artistSearchFilename[0]) else: raise ValueError("Could not find Discogs API Artist Search Data") tsDB.stop() N = artistSearchData.shape[0] modValue = 5000 if N >= 50000 else 1000 nSave = 0 tsParse = timestat("Parsing {0} Searched For Discogs API Artists".format(N)) Nnew = 0 for i,(artistID,artistData) in enumerate(artistSearchData.iterrows()): if (i+1) % modValue == 0 or (i+1) == N: tsParse.update(n=i+1, N=N) if dbdata.get(artistID) is not None: continue artistAPIData = {"Artist": artistData, "Albums": previousMetadata.get(artistID, {})} dbdata = dbdata.append(Series({artistID: self.artist.getData(artistAPIData)})) Nnew += 1 if Nnew > 0: print("Saving [{0}/{1}] {2} Entries To {3}".format(len(dbdata), len(dbdata), "ID Data", self.disc.getDBModValFilename(modVal))) self.disc.saveDBModValData(modVal=modVal, idata=dbdata) else: print("Not saving any of the new data") ts.stop()
def _get_holidays(self, use_usd: bool = True) -> List[date]: if self.holiday_calendar: return self.holiday_calendar try: currencies = self.currencies or [] if use_usd: currencies.append('USD') if self.exchanges: cached_data = _cache.get( hashkey(use_usd, str(currencies), str(self.exchanges))) if cached_data: return cached_data holidays = Series() if self.exchanges: self.exchanges = [ x.value if isinstance(x, ExchangeCode) else x.upper() for x in self.exchanges ] exchange_query = GsDataApi.build_query(start=DATE_LOW_LIMIT, end=DATE_HIGH_LIMIT, exchange=self.exchanges) data = GsDataApi.query_data(exchange_query, 'HOLIDAY') holidays = holidays.append( Series(to_datetime(DataFrame(data)['date']).dt.date)) if len(currencies): currencies = [ x.value if isinstance(x, Currency) else x.upper() for x in currencies ] currency_query = GsDataApi.build_query(start=DATE_LOW_LIMIT, end=DATE_HIGH_LIMIT, currency=currencies) data = GsDataApi.query_data(currency_query, 'HOLIDAY_CURRENCY') holidays = holidays.append( Series(to_datetime(DataFrame(data)['date']).dt.date)) holidays = holidays.unique().tolist() _cache[hashkey(use_usd, str(currencies), str(self.exchanges))] = holidays return holidays except Exception as e: _logger.warning( 'Unable to fetch holiday calendar. Try passing your own when applying a rule.', e) return []
def _get_holidays(self) -> List[date]: if self.holiday_calendar is not None: if self.usd_calendar is None: return self.holiday_calendar return list(set().union(self.holiday_calendar, self.usd_calendar)) try: holidays = Series(dtype=object) currencies = ['USD' ] if self.currencies is None else self.currencies cached_data = _cache.get( hashkey(str(currencies), str(self.exchanges))) if cached_data: return cached_data if self.exchanges: exchanges = [ x.value if isinstance(x, ExchangeCode) else x.upper() for x in self.exchanges ] exchange_query = GsDataApi.build_query(start=DATE_LOW_LIMIT, end=DATE_HIGH_LIMIT, exchange=exchanges) data = GsDataApi.query_data(exchange_query, 'HOLIDAY') holidays = holidays.append( Series(to_datetime(DataFrame(data)['date']).dt.date)) if len(currencies): currencies = [ x.value if isinstance(x, Currency) else x.upper() for x in currencies ] currency_query = GsDataApi.build_query(start=DATE_LOW_LIMIT, end=DATE_HIGH_LIMIT, currency=currencies) data = GsDataApi.query_data(currency_query, 'HOLIDAY_CURRENCY') holidays = holidays.append( Series(to_datetime(DataFrame(data)['date']).dt.date)) holidays = holidays.unique().tolist() _cache[hashkey(str(currencies), str(self.exchanges))] = holidays return holidays except Exception as e: _logger.warning( 'Unable to fetch holiday calendar. Try passing your own when applying a rule.', e) return []
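# Both _get_holidays variants start from an empty Series and append the exchange and currency
# query results to it. A hedged sketch of the same aggregation that avoids the empty starting
# Series and the removed Series.append; the frames list stands in for whatever HOLIDAY /
# HOLIDAY_CURRENCY query results were fetched, so the names here are illustrative:
import pandas as pd

def collect_holiday_dates(frames):
    # frames: list of DataFrames that each contain a 'date' column
    pieces = [pd.to_datetime(df["date"]).dt.date for df in frames]
    if not pieces:
        return []
    return pd.concat(pieces).unique().tolist()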
def _build_and_connect_devices(self, devices):
    # Build devices by the variable model they measure or stimulate (Series),
    # population (Series),
    # and target node (Series) for faster reading
    _devices = Series()
    for device in devices:
        _devices = _devices.append(self.build_and_connect_devices(device))
    return _devices
def update_series():
    ser = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
    print('Created from a list plus an index:\n', ser)
    s = ser.drop('c')
    # print('Result after dropping one label:\n', s)
    s = ser.drop(['a', 'c'])
    # print('Result after dropping several labels:\n', s)
    ser.pop('d')
    # ser.pop(0)      # dropping by position: invalid key
    # ser.pop([0, 1]) # dropping several at once: invalid key
    print('pop removes in place and modifies the source Series:\n', ser)
    ser[0] = 1000
    ser['f'] = 2000
    ser2 = Series([100, 200], index=['x', 'y'])
    ser.append(ser2)
    print('Modified Series:\n', ser)
    print(' Series append:\n', ser.append(ser2))
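# As the two prints at the end of update_series show, ser.append(ser2) returns a new Series and
# leaves ser itself unchanged. On pandas 2.0+ the method no longer exists; the equivalent
# non-mutating call is pd.concat. A small self-contained sketch mirroring that behaviour:
import pandas as pd

ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
ser2 = pd.Series([100, 200], index=["x", "y"])
print(pd.concat([ser, ser2]))   # combined copy
print(ser)                      # the original Series is untouched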
def test_setitem_ambiguous_keyerror(indexer_sl):
    s = Series(range(10), index=list(range(0, 20, 2)))

    # equivalent of an append
    s2 = s.copy()
    indexer_sl(s2)[1] = 5
    expected = s.append(Series([5], index=[1]))
    tm.assert_series_equal(s2, expected)
def Fractions(df, symbols, colnames):
    from pandas import Series
    vec = Series()
    for col in colnames:
        valcounts = df[col].value_counts(normalize=True)
        valcounts = valcounts.reindex(symbols[col], fill_value=0.)
        vec = vec.append(valcounts.rename(lambda i: col + '_' + str(i)))
        pass
    return vec
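# Fractions flattens one value_counts(normalize=True) vector per column into a single Series
# whose labels are '<column>_<symbol>'. A toy invocation, purely illustrative (the df, symbols
# and colnames below are made up, and it must run under a pandas version that still has
# Series.append):
from pandas import DataFrame

df = DataFrame({"color": ["r", "r", "g"], "size": ["s", "m", "m"]})
symbols = {"color": ["r", "g", "b"], "size": ["s", "m"]}
vec = Fractions(df, symbols, ["color", "size"])
# vec now has index ['color_r', 'color_g', 'color_b', 'size_s', 'size_m'] holding the
# normalized frequencies; 'color_b' is filled with 0.0 by the reindex.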
def _build_and_connect_devices(self, devices): """Method to build and connect input or output devices, organized by - the variable they measure or stimulate (pandas.Series), and the - population(s) (pandas.Series), and - brain region nodes (pandas.Series) they target.""" _devices = Series() for device in devices: _devices = _devices.append(self.build_and_connect_devices(device)) return _devices
def data_process1(file_url):
    # read file
    df = read_file(file_url, sheet_name='Sheet1')
    data_replace = df.replace(np.nan, '')
    """group cut"""
    intergenic = str('lncRNA, intergenic')
    df2 = df[df['gene class'].str.contains(intergenic, na=False)]
    df2 = df2.drop_duplicates('gene name')
    """group cut"""
    """Split the 'alias symbol' column on '|'"""
    newDF = df2['alias symbol'].str.split("|", expand=True)
    newDF = newDF.fillna(".")
    col_size = len(newDF.columns)
    # array_alias=[]
    namefile = Series()
    current_number = 0
    while current_number < col_size:
        namefile1 = newDF[~newDF[current_number].str.contains('\.', na=False)]
        namefile1 = namefile1[current_number]
        namefile = namefile.append(namefile1)
        current_number += 1
    namefile2 = namefile
    """Drop '.' placeholder entries"""
    genename2 = df2[~df2['gene name'].str.contains('\.', na=False)]
    genename2 = genename2['gene name']
    """Combine the alias names with the gene names"""
    namefile = namefile.append(genename2)
    """CSV"""
    # down_path = os.path.join(os.path.abspath(os.path.dirname(os.path.dirname(__file__))), 'tmp/download')
    # data_file.save(os.path.join(upload_path, data_filename))
    timestampe = time.strftime("%Y%m%d%H%M%S", time.localtime())
    csv_file_name = 'GENANAME' + '_' + timestampe + '.csv'
    down_path = os.path.join(app.root_path, 'tmp/download')
    # csv_file_name = 'GENANAME.csv'
    csv_file_save_url = os.path.join(down_path, csv_file_name)
    csv_file = namefile.to_csv(
        # r"C:\WDD\OUTPUT\GENANAME.CSV"
        csv_file_save_url)
    # return csv_file_save_url
    return data_replace, csv_file_name
def test_series_append_aware_naive(self): rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) expected = ser1.index.astype(object).append(ser2.index.astype(object)) assert ts_result.index.equals(expected) # mixed rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = range(100) ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) expected = ser1.index.astype(object).append(ser2.index) assert ts_result.index.equals(expected)
def test_append_aware_naive(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assert_(ts_result.index.equals( ts1.index.asobject.append(ts2.index.asobject))) #mixed rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = range(100) ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) self.assert_(ts_result.index.equals( ts1.index.asobject.append(ts2.index)))
def test_series_append_aware_naive(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) expected = ser1.index.astype(object).append(ser2.index.astype(object)) assert ts_result.index.equals(expected) # mixed rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = range(100) ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) expected = ser1.index.astype(object).append(ser2.index) assert ts_result.index.equals(expected)
def get_timeframe(data: Dict[str, DataFrame]) -> Tuple[arrow.Arrow, arrow.Arrow]:
    """
    Get the maximum timeframe for the given backtest data
    :param data: dictionary with preprocessed backtesting data
    :return: tuple containing min_date, max_date
    """
    all_dates = Series([])
    for pair, pair_data in data.items():
        all_dates = all_dates.append(pair_data['date'])
    all_dates.sort_values(inplace=True)
    return arrow.get(all_dates.iloc[0]), arrow.get(all_dates.iloc[-1])
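# get_timeframe only needs the overall minimum and maximum of the per-pair 'date' columns, so
# the append loop can be replaced by one pd.concat (Series.append is gone in pandas 2.0). A
# hedged sketch with the same shape of input, assuming each frame carries a 'date' column as
# above; the function name is illustrative:
import arrow
import pandas as pd

def get_timeframe_concat(data):
    all_dates = pd.concat([pair_data["date"] for pair_data in data.values()])
    return arrow.get(all_dates.min()), arrow.get(all_dates.max())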
def extractJSONData(dict1):
    ser1 = Series(dict1['MonitoredVehicleJourney'])
    ser1 = ser1.append(Series({'RecordedAtTime': dict1['RecordedAtTime']}))
    nextStops = ser1['OnwardCalls']['OnwardCall'][0]
    ser1.drop('OnwardCalls', inplace=True)
    ser1 = unnest(ser1)
    nextStops = unnest(Series(nextStops))
    nextStops.index = 'NextStop' + nextStops.index.values
    ser1 = pd.concat((ser1, nextStops))
    df_row = DataFrame(ser1).transpose()
    return(df_row)
def test_series_append_aware(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], tz='US/Eastern') exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], tz='UTC') exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) utc = rng1.tz assert utc == ts_result.index.tz # GH#7795 # different tz coerces to object dtype, not UTC rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Central') ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), Timestamp('1/1/2011 02:00', tz='US/Central')]) exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp)
def main(): # Get links to survey pages home_url = "http://www.igmchicago.org/igm-economic-experts-panel" home_contents = get_page_contents(home_url) urls = re.findall( r"<h2><a href=\"(\S+?results\?SurveyID=\S+?)\"", home_contents) urls = ["http://www.igmchicago.org" + url for url in urls] # Loop through survey pages df = DataFrame() question_count = 0 for url in reversed(urls): contents = get_page_contents(url) questions = re.findall(r"surveyQuestion\">([\s\S]+?)</h3>", contents) responder_list = re.findall( r"\?id=([\d]+)?\">([\s\w.]+?)</a>", contents) responses = re.findall( r"<span class=\"option-[\d]+?\">([\s\w.]+?)</span>", contents) num_responders = len(responses) / len(questions) # Loop through sub-questions (A, B, etc) within each page for i, question in enumerate(questions): question = clean_string(question) question_count += 1 print(question) # Restrict range to responses for this sub-question rng = (i * num_responders, (i + 1) * num_responders) # Collect sub-question, its url suffix, and the responses prefix = "(%03d" % question_count + ") " q_responses = Series( responses[rng[0]:rng[1]], index=responder_list[rng[0]:rng[1]]) q_url_suffix = re.findall("=(.+)", url)[0] q_responses = q_responses.append( Series([q_url_suffix], index=['q_url_suffix'])) q_responses.name = prefix + question.strip() # Add question data to dataframe df = df.join(q_responses, how='outer') # Move responder id from index to column, only after all joins are complete df['responder_id'] = [pair[0] for pair in df.index] df.index = [pair[1] if type(pair) == tuple else pair for pair in df.index] # Write to file df.to_json("survey_results.json")
def compute_summary(self, combined_df):
    combined_mean = combined_df.mean()
    average_cons = combined_mean['consistency']
    average_ambi = combined_mean['ambiguity']
    # completeness = self.compute_completeness()
    noise = self.compute_noise()
    series = Series([average_cons, average_ambi])
    series.index = ['Average Consistency', 'Average Ambiguity']
    series_2 = Series(noise)
    series_2.index = [i.title() for i in series_2.index]
    series = series.append(series_2)
    return series
def test_series_append_dst(self): rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', tz='US/Eastern') rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', tz='US/Eastern') ser1 = Series([1, 2, 3], index=rng1) ser2 = Series([10, 11, 12], index=rng2) ts_result = ser1.append(ser2) exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', '2016-01-01 03:00', '2016-08-01 01:00', '2016-08-01 02:00', '2016-08-01 03:00'], tz='US/Eastern') exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz
class RolingMeanAlgorithm( Algorithm ):
    def __init__(self, smallAvg, bigAvg):
        self.df = Series()
        self.smallAvg = smallAvg
        self.bigAvg = bigAvg
        self.was = 0

    # def getInputSettings( self ):
    #     """This method would return an InputSettings object describing the requirements
    #     the input data has to satisfy; for now I think this could be:
    #     a) the minimum span of data
    #     b) the data step - the time interval between consecutive values
    #     x) you probably have ideas for more parameters"""
    #     raise NotImplementedError( "Should have implemented this" )

    def getBuySignals( self, measurement ):
        """Input: a data frame with the parameters required via InputSettings.
        Output: 0, 1, -1 - whether to buy and in which direction; not sure yet whether
        we will have algorithms that emit -1/1 signals.
        """
        #print (self.df)
        self.df = self.df.append(Series(measurement['ask'], index=['a']))
        #print measurement.name
        #sys.exit(1)
        if self.df.shape[0] == 20:
            curBig = rolling_mean(self.df, self.bigAvg)
            curSmall = rolling_mean(self.df[(self.bigAvg-self.smallAvg):], self.smallAvg)
            # print "1=========================="
            # print curBig[-1]
            # print curSmall[-1]
            if curBig[-1] < curSmall[-1]:
                #print "128=========================="
                self.df = self.df[1:]
                return [128, measurement.name]
            else:
                #print "129=========================="
                self.df = self.df[1:]
                return [130, measurement.name]
        else:
            return [129, measurement.name]
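# getBuySignals relies on the pandas.rolling_mean free function, which newer pandas no longer
# ships (alongside the removed Series.append). On any pandas from 0.18 onwards the moving
# averages are spelled as a .rolling(...).mean() chain instead; a sketch of just that
# computation, with toy prices and window sizes standing in for self.bigAvg / self.smallAvg:
import pandas as pd

prices = pd.Series([1.10, 1.11, 1.12, 1.11, 1.13, 1.14])
big = prices.rolling(window=5).mean()
small = prices.rolling(window=3).mean()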
def test_datetimeindex(self): from pandas import date_range, NaT, Timestamp index = date_range("20130102", periods=6) s = Series(1, index=index) result = s.to_string() self.assertTrue("2013-01-02" in result) # nat in index s2 = Series(2, index=[Timestamp("20130111"), NaT]) s = s2.append(s) result = s.to_string() self.assertTrue("NaT" in result) # nat in summary result = str(s2.index) self.assertTrue("NaT" in result)
def entropy_n(self, calc_dkl=True): """ Calcualte the entropy of neighborhoods conditional on each bin and overall Parameters ---------- calc_dkl : boolean, default True Calculate KL Divergence at the same time (faster) Returns ------- entropy : DataFrame """ df_city = [] if calc_dkl: df_city_dkl = [] cols = self._filter().columns bin_num = len(cols) for name, group in self._grouped(): group_tot = group[self.tot_col].sum() p_n = DataFrame( [group[self.tot_col]/group_tot]*bin_num).transpose() H_n = Series(self._entropy(p_n)[0], index=['H(n)'], name=name) group_filtered = self._filter(df=group) H_n_y = Series( self._entropy(group_filtered), index=cols, name=name) H_n_y.index = ['H(n|y)_' + str(item) for item in H_n_y.index] if calc_dkl: DKL_n = self._dkl_n_group(group_filtered, name, cols, p_n) df_city_dkl.append(DKL_n) df_city.append(H_n_y.append(H_n)) self.H_n = DataFrame(df_city) if calc_dkl: self.DKL_n = DataFrame(df_city_dkl) # Return all the data return concat([self.H_n, self.DKL_n], axis=1) return self.H_n
def test_append_concat_tz_explicit_pytz(self): # see gh-2938 from pytz import timezone as timezone rng = date_range('5/8/2012 1:45', periods=10, freq='5T', tz=timezone('US/Eastern')) rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', tz=timezone('US/Eastern')) rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', tz=timezone('US/Eastern')) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) result = ts.append(ts2) result_df = df.append(df2) tm.assert_index_equal(result.index, rng3) tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) tm.assert_index_equal(appended, rng3)
train_y = outcome.iloc[train_idx] test_y = outcome.iloc[test_idx] train_x_t = text.iloc[train_idx] test_x_t = text.iloc[test_idx] train_x_v = df.iloc[train_idx, :] test_x_v = df.iloc[test_idx, :] tf = tf_vectorizer.fit_transform(train_x_t.tolist()) train_x = hstack([csr_matrix(train_x_v), tf], format='csr') tf = tf_vectorizer.fit_transform(test_x_t.tolist()) test_x = hstack([csr_matrix(test_x_v), tf], format='csr') test_x_list.append(test_x) test_y_all = test_y_all.append(test_y, ignore_index=True) _ = sgdr.partial_fit(train_x, train_y, classes=[0.0, 1.0]) print time() - t1 test_x = vstack(test_x_list, format='csr') test_y = test_y_all.to_frame('original') test_y['predicted'] = sgdr.predict_proba(test_x)[1] mad = (test_y['original'] - test_y['predicted']).abs().median() type_s = (test_y['original'].gt(0.0) == test_y['predicted'].gt(0.0)).mean() fn = (test_y['original'].gt(0.0) & test_y['predicted'].le(0.0)).mean() fp = (test_y['original'].le(0.0) & test_y['predicted'].gt(0.0)).mean()
feeder = ForexBi5DataFeeder(date(2012, 1, 1),date(2012, 1, 2), 'EURUSD') #broker = Broker() #wallet = Wallet(25000, BuyForOneTenthOfWalletBuyStrategy(), SellIfUpBy10OrDownBy5SellStrategy(), broker) alg = RolingMeanAlgorithm(5,20) #sim = Simulator(date(2012, 1, 1), date(2012, 1, 2), wallet, alg, feeder) dataFrame = DataFrame() signals = Series() curFeed = feeder.getData() while curFeed is not None: dataFrame = dataFrame.append([curFeed]) #print curFeed signal = alg.getBuySignals(curFeed) signals = signals.append(Series(signal[0], index = [signal[1]])) curFeed = feeder.getData() dataFrame['ask'].plot(); #rolling_mean(dataFrame['ask'], 20).plot() #rolling_mean(dataFrame['ask'], 5).plot() #print rolling_mean(dataFrame['ask'], 20)[20:30] #print rolling_mean(dataFrame['ask'], 5)[20:30] #print signals[20:30] signals.plot() plt.show()
def get_tdx_day_to_df_last(code, dayl=1, type=0, dt=None, ptype="low", dl=None): """ :param code:999999 :param dayl:Duration Days :param type:TDX type :param dt: Datetime :param ptype:low or high :return:Series or df """ # dayl=int(dayl) # type=int(type) # print "t:",dayl,"type",type if not type == 0: f = lambda x: str((1000000 - int(x))) if x.startswith("0") else x code = f(code) code_u = cct.code_to_symbol(code) day_path = day_dir % "sh" if code.startswith(("5", "6", "9")) else day_dir % "sz" p_day_dir = day_path.replace("/", path_sep).replace("\\", path_sep) # p_exp_dir=exp_dir.replace('/',path_sep).replace('\\',path_sep) # print p_day_dir,p_exp_dir file_path = p_day_dir + code_u + ".day" if not os.path.exists(file_path): ds = Series( {"code": code, "date": cct.get_today(), "open": 0, "high": 0, "low": 0, "close": 0, "amount": 0, "vol": 0} ) return ds ofile = file(file_path, "rb") b = 0 e = 32 if dayl == 1 and dt == None: log.debug("%s" % (dayl == 1 and dt == None)) fileSize = os.path.getsize(file_path) if fileSize < 32: print "why", code ofile.seek(-e, 2) buf = ofile.read() ofile.close() a = unpack("IIIIIfII", buf[b:e]) tdate = str(a[0])[:4] + "-" + str(a[0])[4:6] + "-" + str(a[0])[6:8] topen = float(a[1] / 100.0) thigh = float(a[2] / 100.0) tlow = float(a[3] / 100.0) tclose = float(a[4] / 100.0) amount = float(a[5] / 10.0) tvol = int(a[6]) # int # tpre = int(a[7]) # back dt_list = Series( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) return dt_list elif dayl == 1 and dt is not None and dl is not None: log.debug("dt:%s" % (dt)) dt_list = [] # if len(str(dt)) == 8: # dt = cct.day8_to_day10(dt) # else: # dt=get_duration_price_date(code, ptype=ptype, dt=dt) # print ("dt:%s"%dt) fileSize = os.path.getsize(file_path) if fileSize < 32: print "why", code b = fileSize ofile.seek(-fileSize, 2) no = int(fileSize / e) # if no < newstockdayl: # return Series() # print no,b,day_cout,fileSize buf = ofile.read() ofile.close() # print repr(buf) # df=pd.DataFrame() for i in xrange(no): a = unpack("IIIIIfII", buf[-e:b]) tdate = str(a[0])[:4] + "-" + str(a[0])[4:6] + "-" + str(a[0])[6:8] topen = float(a[1] / 100.0) thigh = float(a[2] / 100.0) tlow = float(a[3] / 100.0) tclose = float(a[4] / 100.0) amount = float(a[5] / 10.0) tvol = int(a[6]) # int # tpre = int(a[7]) # back dt_list.append( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) # print series # dSeries.append(series) # dSeries.append(Series({'code':code,'date':tdate,'open':topen,'high':thigh,'low':tlow,'close':tclose,'amount':amount,'vol':tvol,'pre':tpre})) b = b - 32 e = e + 32 # print tdate,dt if tdate < dt: # print "why" break df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) # print "len:%s %s"%(len(df),fileSize) df = df.set_index("date") dt = get_duration_price_date(code, ptype=ptype, dt=dt, df=df, dl=dl) log.debug("last_dt:%s" % dt) dd = df[df.index == dt] if len(dd) > 0: dd = dd[:1] dt = dd.index.values[0] dd = dd.T[dt] dd["date"] = dt else: log.warning("no < dt:NULL") dd = Series() # dd = Series( # {'code': code, 'date': cct.get_today(), 'open': 0, 'high': 0, 'low': 0, 'close': 0, 'amount': 0, # 'vol': 0}) return dd else: dt_list = [] fileSize = os.path.getsize(file_path) # print fileSize day_cout = abs(e * int(dayl)) # print day_cout if day_cout > fileSize: b = fileSize ofile.seek(-fileSize, 2) no = int(fileSize / e) else: no = int(dayl) b = day_cout ofile.seek(-day_cout, 2) # 
print no,b,day_cout,fileSize buf = ofile.read() ofile.close() # print repr(buf) # df=pd.DataFrame() for i in xrange(no): a = unpack("IIIIIfII", buf[-e:b]) tdate = str(a[0])[:4] + "-" + str(a[0])[4:6] + "-" + str(a[0])[6:8] topen = float(a[1] / 100.0) thigh = float(a[2] / 100.0) tlow = float(a[3] / 100.0) tclose = float(a[4] / 100.0) amount = float(a[5] / 10.0) tvol = int(a[6]) # int # tpre = int(a[7]) # back dt_list.append( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) # print series # dSeries.append(series) # dSeries.append(Series({'code':code,'date':tdate,'open':topen,'high':thigh,'low':tlow,'close':tclose,'amount':amount,'vol':tvol,'pre':tpre})) b = b - 32 e = e + 32 df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) df = df.set_index("date") return df
def get_tdx_Exp_day_to_df(code, type="f", start=None, end=None, dt=None, dl=None): # start=cct.day8_to_day10(start) # end=cct.day8_to_day10(end) # day_path = day_dir % 'sh' if code[:1] in ['5', '6', '9'] else day_dir % 'sz' code_u = cct.code_to_symbol(code) log.debug("code:%s code_u:%s" % (code, code_u)) if type == "f": file_path = exp_path + "forwardp" + path_sep + code_u.upper() + ".txt" elif type == "b": file_path = exp_path + "backp" + path_sep + code_u.upper() + ".txt" else: return None log.debug("daypath:%s" % file_path) # p_day_dir = day_path.replace('/', path_sep).replace('\\', path_sep) # p_exp_dir = exp_dir.replace('/', path_sep).replace('\\', path_sep) # print p_day_dir,p_exp_dir if not os.path.exists(file_path): # ds = Series( # {'code': code, 'date': cct.get_today(), 'open': 0, 'high': 0, 'low': 0, 'close': 0, 'amount': 0, # 'vol': 0}) ds = pd.DataFrame() log.error("file_path:not exists") return ds # ofile = open(file_path, 'rb') if dt is None and dl is None: ofile = open(file_path, "rb") buf = ofile.readlines() ofile.close() num = len(buf) no = num - 1 dt_list = [] for i in xrange(no): a = buf[i].split(",") # 01/15/2016,27.57,28.15,26.30,26.97,714833.15,1946604544.000 # da=a[0].split('/') tdate = a[0] # tdate = str(a[0])[:4] + '-' + str(a[0])[4:6] + '-' + str(a[0])[6:8] # tdate=dt.strftime('%Y-%m-%d') topen = float(a[1]) thigh = float(a[2]) tlow = float(a[3]) tclose = float(a[4]) # tvol = round(float(a[5]) / 10, 2) tvol = float(a[5]) amount = round(float(a[6].replace("\r\n", "")), 1) # int # tpre = int(a[7]) # back if int(amount) == 0: continue dt_list.append( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) # if dt is not None and tdate < dt: # break df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) # df.sort_index(ascending=False, inplace=True) if start is not None and end is not None: df = df[(df.date >= start) & (df.date <= end)] elif end is not None: df = df[df.date <= end] elif start is not None: df = df[df.date >= start] df = df.set_index("date") return df elif int(dl) == 1: # fileSize = os.path.getsize(file_path) # if fileSize < 60 * newstockdayl: # return Series() data = cct.read_last_lines(file_path, int(dl) + 3) data_l = data.split("\n") dt_list = Series() data_l.reverse() log.debug("day 1:%s" % data_l) for line in data_l: a = line.split(",") # 01/15/2016,27.57,28.15,26.30,26.97,714833.15,1946604544.000 # da=a[0].split('/') log.debug("day 1 len(a):%s a:%s" % (len(a), a)) if len(a) > 5: tdate = a[0] log.debug("day 1 tdate:%s" % tdate) # tdate = str(a[0])[:4] + '-' + str(a[0])[4:6] + '-' + str(a[0])[6:8] # tdate=dt.strftime('%Y-%m-%d') topen = float(a[1]) thigh = float(a[2]) tlow = float(a[3]) tclose = float(a[4]) # tvol = round(float(a[5]) / 10, 2) tvol = float(a[5]) amount = round(float(a[6].replace("\r\n", "")), 1) # int # tpre = int(a[7]) # back if int(amount) == 0: continue dt_list = Series( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) break else: continue # if dt is not None and tdate < dt: # break # df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) # df = df.set_index('date') return dt_list else: fileSize = os.path.getsize(file_path) # if fileSize < 60 * newstockdayl: # return Series() data = cct.read_last_lines(file_path, int(dl) + 2) dt_list = [] data_l = data.split("\n") data_l.reverse() for line in data_l: a = line.split(",") # 01/15/2016,27.57,28.15,26.30,26.97,714833.15,1946604544.000 # 
da=a[0].split('/') if len(a) > 5: tdate = a[0] # tdate = str(a[0])[:4] + '-' + str(a[0])[4:6] + '-' + str(a[0])[6:8] # tdate=dt.strftime('%Y-%m-%d') topen = float(a[1]) thigh = float(a[2]) tlow = float(a[3]) tclose = float(a[4]) tvol = round(float(a[5]) / 10, 2) amount = round(float(a[6].replace("\r\n", "")), 1) # int # tpre = int(a[7]) # back if int(amount) == 0: continue dt_list.append( { "code": code, "date": tdate, "open": topen, "high": thigh, "low": tlow, "close": tclose, "amount": amount, "vol": tvol, } ) else: continue # if dt is not None and tdate < dt: # break df = pd.DataFrame(dt_list, columns=ct.TDX_Day_columns) # df.sort_index(ascending=False, inplace=True) # if start is not None and end is not None: # df = df[(df.date >= start) & (df.date <= end)] # elif end is not None: # df = df[df.date <= end] # elif start is not None: # df = df[df.date >= start] df = df.set_index("date") # print "time:",(time.time()-time_s)*1000 return df
"silent": 1, "thread": 1, "seed": 1301 } num_boost_round = 1000 print("Train a XGBoost model") X_train, X_valid = train_test_split(train, test_size=0.01, random_state=10) y_train = np.log1p(X_train.Sales) y_valid = np.log1p(X_valid.Sales) dtrain = xgb.DMatrix(X_train[features], y_train) dvalid = xgb.DMatrix(X_valid[features], y_valid) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True) print("Validating") predict = gbm.predict(xgb.DMatrix(X_valid[features])) error = rmspe(X_valid.Sales.values, np.expm1(predict)) print('RMSPE: {:.6f}'.format(error)) print("Make predictions on the test set") dtest = xgb.DMatrix(test[features]) ytest = gbm.predict(dtest) sub = Series() sub = sub.append(Series(np.expm1(ytest), index = test.Id)) sub = sub.append(Series(0, index = closedId)) # Make Submission sub = pd.DataFrame({"Id": sub.index, "Sales": sub.values}) sub.to_csv("xgboost_submission2.csv", index=False)
# get relative difference, leaves a timedelta v['start_time'] = v['start_time'] - ts_now v['end_time'] = v['end_time'] - ts_now # combine date with midnight & timedelta to get new datetime v['start_datetime'] = v['start_time'] + v['start_date'] v['end_datetime'] = v['end_time'] + v['end_date'] start_date = pd.to_datetime(Series(v.start_datetime)) start_date.sort() end_date = pd.to_datetime(Series(v.end_datetime)) end_date.sort() ts1 = Series(np.ones(len(start_date)), start_date) ts2 = Series(-1*np.ones(len(end_date)), end_date) ts = ts1.append(ts2) us = ts.sort_index() cs = us.cumsum() mpl.rc('figure', figsize = (10, 8)) cs.plot() plt.show()
def do_eBBQ(self, variations= None, plot_fig = False, modes = None, Pj_from_current = True, junc_rect = [], junc_lines = None, junc_len = [], junc_LJ_var_name = [], dielectrics = None, seams = None, surface = False, calc_Hamiltonian = False,pJ_method = 'J_surf_mag', save_mesh_stats = True): """ Pj_from_current: Multi-junction calculation of energy participation ratio matrix based on <I_J>. Current is integrated average of J_surf by default: (zkm 3/29/16) Will calculate the Pj matrix for the selected modes for the given junctions junc_rect array & length of juuncs junc_rect = ['junc_rect1', 'junc_rect2'] name of junc rectangles to integrate H over junc_lines = ['junc_line1', 'junc_line2'] used to define the current flow direction, arbitrary, doesnt really matter that much, just need a line there junc_len = [0.0001] lenght of junc = lenght of junc_line #TODO: could now get rid of this and use the line [specify in SI units; i.e., meters] junc_LJ_var_name = ['LJ1', 'LJ2'] pJ_method = 'J_surf_mag' - takes the avg. Jsurf over the rect. Make sure you have seeded lots of tets here. i recommend starting with 4 across smallest dimension. Assumptions: Low dissipation (high-Q). Right now, we assume that there are no lumped capcitors to simply calculations. Not required. We assume that there are only lumped inductors, so that U_tot = U_E+U_H+U_L and U_C =0, so that U_tot = 2*U_E; Other parameters: seams = ['seam1', 'seam2'] (seams needs to be a list of strings) variations = ['0', '1'] A variation is a combination of project/design variables in an optimetric sweep """ self.Pj_from_current = Pj_from_current; meta_data = {}; assert(type(junc_LJ_var_name) == list), "Please pass junc_LJ_var_name as a list " if Pj_from_current : print_color(' Setup: ' + self.setup.name); self.PJ_multi_sol = {} # this is where the result will go if seams is not None: self.seams = seams; meta_data['seams'] = seams; if dielectrics is not None: self.dielectrics = dielectrics; meta_data['dielectrics'] = dielectrics; if variations is None: variations = (['-1'] if self.listvariations == (u'',) else [str(i) for i in range(self.nvariations)] ) if modes is None: modes = range(self.nmodes) if self.latest_h5_path is not None and self.append_analysis:shutil.copyfile(self.latest_h5_path, self.data_filename); self.h5file = hdf = pd.HDFStore(self.data_filename); self.variations = variations; self.modes = modes; self.njunc = len(junc_rect) meta_data['junc_rect'] = junc_rect; meta_data['junc_lines'] = junc_lines; meta_data['junc_len'] = junc_len; meta_data['junc_LJ_var_name'] = junc_LJ_var_name; meta_data['pJ_method'] = pJ_method; mesh_stats = self.mesh_stats = [] for ii, variation in enumerate(variations): print_color( 'variation : ' + variation + ' / ' + str(self.nvariations-1), bg = 44, newline = False ) self.lv = self.get_lv(variation) if (variation+'/hfss_variables') in hdf.keys() and self.append_analysis: print_NoNewLine(' previously analyzed ...\n'); \ continue; print_NoNewLine( ' NOT analyzed\n' ); time.sleep(0.5) hdf[variation+'/hfss_variables'] = self.hfss_variables[variation] = varz \ = pd.Series(self.get_variables(variation=variation)) freqs_bare_dict, freqs_bare_vals = self.get_freqs_bare(variation) # get bare freqs from HFSS self.pjs={}; var_sol_accum = [] for mode in modes: sol = Series({'freq' : freqs_bare_vals[mode]*10**-9, 'modeQ' : freqs_bare_dict['Q_'+str(mode)] }) self.omega = 2*np.pi*freqs_bare_vals[mode] # this should really be passed as argument to the functions rather than a property of the calss I would say print ' 
Mode \x1b[0;30;46m ' + str(mode) + ' \x1b[0m / ' + str(self.nmodes-1)+' calculating:' self.solutions.set_mode(mode+1, 0) self.fields = self.setup.get_fields() print_NoNewLine(' U_H ...'); sol['U_H'] = self.U_H = self.calc_U_H(variation) print_NoNewLine(' U_E'); sol['U_E'] = self.U_E = self.calc_U_E(variation) print( " => U_L = %.3f%%" %( (self.U_E - self.U_H )/(2*self.U_E)) ) if self.Pj_from_current: self.LJs = [ ureg.Quantity(varz['_'+LJvar_nm]).to_base_units().magnitude for LJvar_nm in junc_LJ_var_name] meta_data['LJs'] = dict(zip(junc_LJ_var_name, self.LJs)) print ' I -> p_{mJ} ...' sol_PJ = self.calc_Pjs_from_I_for_mode(variation, self.U_H, self.U_E, self.LJs, junc_rect, junc_len, method = pJ_method, freq = freqs_bare_vals[mode]*10**-9, calc_sign = junc_lines) sol = sol.append(sol_PJ) if self.njunc == 1: # Single-junction method using global U_H and U_E; assert(type(junc_LJ_var_name) == list and len(junc_LJ_var_name) == 1), "Please pass junc_LJ_var_name as array of 1 element for a single junction; e.g., junc_LJ_var_name = ['junc1']" #lj = 1E-3*ureg.Quantity(varz['_'+junc_LJ_var_name]).to_base_units().magnitude sol['pj1'] = self.get_p_j(mode) self.pjs.update(sol['pj1']) # convinience function for single junction case if seams is not None: # get seam Q for seam in seams: sol = sol.append(self.get_Qseam(seam,mode,variation)) if dielectrics is not None: # get Q dielectric for dielectric in dielectrics: sol = sol.append(self.get_Qdielectric(dielectric, mode, variation)) if surface is True: # get Q surface sol = sol.append( self.get_Qsurface(mode, variation) ) var_sol_accum +=[sol] #TODO: add metadata to the Dataframe & save it # such as what are the junc_rect names and Lj values etc. (e.g., http://stackoverflow.com/questions/29129095/save-additional-attributes-in-pandas-dataframe/29130146#29130146) hdf[variation+'/eBBQ_solution'] = self.sols[variation] \ = pd.DataFrame(var_sol_accum, index = modes) hdf[variation+'/meta_data'] = self.meta_data[variation] \ = Series(meta_data) if save_mesh_stats: msh = self.setup.get_mesh_stats(self.listvariations[ureg(variation)]) mesh_stats += [msh] if msh is not None: hdf[variation+'/mesh_stats'] = msh # returns dataframe conv = self.setup.get_convergence(self.listvariations[ureg(variation)]) # returns dataframe #print 'conv.' if conv is not None: hdf[variation+'/convergence'] = conv self.h5file.close() self.bbq_analysis = BbqAnalysis(self.data_filename, variations=self.variations) #TODO: to be implemented below # if plot_fig: # self.bbq_analysis.plot_Hparams(modes=self.modes) # self.bbq_analysis.print_Hparams(modes=self.modes) return