Example #1
    def makeOriginDataCsv(cls,
                          cur=None,
                          start_date=None,
                          end_date=None,
                          basic_path=None,
                          output_file=None,
                          stock_id=None):
        # Initialize the source file path and the output file path
        if cur is None or start_date is None or end_date is None or output_file is None or stock_id is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        data = cur.execute(
            "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        data = cur.fetchall()
        if len(data) == 0:
            return None

        res = []
        for d in data:
            res.append([
                int(d[0]),
                int(d[1]),
                str(d[2]),
                float(d[3]),
                float(d[4]),
                float(d[5]),
                float(d[6]),
                float(d[7]),
                float(d[8]),
                float(d[9]),
                float(d[10])
            ])
        new_data = []
        for d in zip(*res):
            new_data.append(d)
        origin_data = {
            'id': new_data[0],
            'stock_id': new_data[1],
            'date': new_data[2],
            'opening': new_data[3],
            'closing': new_data[4],
            'difference': new_data[5],
            'percentage_difference': new_data[6],
            'lowest': new_data[7],
            'highest': new_data[8],
            'volume': new_data[9],
            'amount': new_data[10]
        }

        # Read the raw data, keeping only the columns we need
        total_data = DataFrame(origin_data)
        total_data.sort_values(by=['stock_id', 'date'], inplace=True)
        # Group by stock code
        g_stock_num = total_data.groupby(by=["stock_id"])
        total_data["rate"] = 100 * (g_stock_num.shift(0)["closing"] /
                                    g_stock_num.shift(1)["closing"] - 1)
        for i in total_data.index:
            total_data.loc[i, 'rate'] = str(
                np.round(float(total_data['rate'][i]), 2))
        # Reorder the columns to prepare for building the input/output samples
        columns = [
            "stock_id", "date", "opening", "closing", "difference",
            "percentage_difference", "lowest", "highest", "volume", "amount",
            "rate"
        ]
        total_data = total_data[columns]

        def func_train_data(data_one_stock_num):
            if cls.groupby_skip == False:
                cls.groupby_skip = True
                return None
            print("正在处理的股票代码:%06s" % data_one_stock_num.name)
            data = {
                "stock_id": [],
                "date": [],
                "opening": [],
                "closing": [],
                "difference": [],
                "percentage_difference": [],
                "lowest": [],
                "highest": [],
                "volume": [],
                "amount": [],
                "rate": []
            }
            for i in range(len(data_one_stock_num.index) - 1):
                for k in data:
                    data[k].append(data_one_stock_num.iloc[i][k])
            pd.DataFrame(data).to_csv(output_path,
                                      index=False,
                                      columns=columns)

        total_data1 = total_data.dropna()
        total_data2 = total_data1.drop(
            total_data1[(total_data1.rate == 'nan')].index)
        g_stock_num = total_data2.groupby(by=["stock_id"])
        # Clear the file at the output path and initialize the column names
        cls.groupby_skip = False
        g_stock_num.apply(func_train_data)
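
A self-contained sketch of the per-stock "rate" column computed above (percentage change of the closing price within each stock_id group), using toy data as an assumption:

# Self-contained sketch (toy data) of the per-stock "rate" column computed above:
# percentage change of the closing price within each stock_id group.
import pandas as pd

df = pd.DataFrame({
    "stock_id": [1, 1, 1, 2, 2],
    "closing": [10.0, 10.5, 10.3, 20.0, 19.0],
})
g = df.groupby("stock_id")
df["rate"] = (100 * (g["closing"].shift(0) / g["closing"].shift(1) - 1)).round(2)
print(df)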
Example #2
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index(
            [f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: bool | float | int
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        # TODO: overload concat with Literal for axis
        out = cast(DataFrame, out)
        return out

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
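
A minimal sketch of the public pd.get_dummies call that this internal helper backs, with a toy Series as an assumption:

# Minimal sketch (toy data) of the public API backed by _get_dummies_1d.
import pandas as pd

s = pd.Series(["a", "b", "a", None])
# dummy_na adds a column for missing values; drop_first drops the first level.
print(pd.get_dummies(s, prefix="cat", dummy_na=True, drop_first=False))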
Example #3
    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        dtype=None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        verbose=False,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=True,
        mangle_dupe_cols=True,
        **kwds,
    ):

        validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(dict.fromkeys(sheets).keys())

        output = {}

        for asheetname in sheets:
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = fill_mi_header(data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if header is None:
                    offset = 0
                elif not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    squeeze=squeeze,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    mangle_dupe_cols=mangle_dupe_cols,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname
                        ].columns.set_names(header_names)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
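
The parse method above backs the public read_excel entry point; a minimal usage sketch, assuming a hypothetical workbook.xlsx:

# Minimal usage sketch of the public entry point backed by parse();
# "workbook.xlsx" and the sheet names are hypothetical.
import pandas as pd

df = pd.read_excel("workbook.xlsx", sheet_name=0, header=0, index_col=None)
frames = pd.read_excel("workbook.xlsx", sheet_name=["Sheet1", "Sheet2"])  # dict of DataFrames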
Example #4
def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel:

        We generally want to convert the level number into a level name, except
        when columns do not have names, in which case we must leave as a level
        number
        """
        if level_num in columns.names:
            return columns.names[level_num]
        else:
            if columns.names[level_num] is None:
                return level_num
            else:
                return columns.names[level_num]

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(
            zip(*[
                lev.take(lab) for lev, lab in zip(this.columns.levels[:-1],
                                                  this.columns.labels[:-1])
            ]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_labels = sorted(set(this.columns.labels[-1]))
    level_vals_used = level_vals[level_labels]
    levsize = len(level_labels)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_mixed_type:
                value_slice = this.loc[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_labels.append(np.tile(level_labels, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(levels=new_levels,
                           labels=new_labels,
                           names=new_names,
                           verify_integrity=False)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
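
A minimal sketch of the public behaviour this helper implements (stacking one level of a MultiIndex column axis into the rows), with a toy frame as an assumption:

# Minimal sketch (toy data) of stacking one MultiIndex column level into the rows.
import numpy as np
import pandas as pd

columns = pd.MultiIndex.from_product([["A", "B"], ["x", "y"]])
df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=columns)
print(df.stack(level=-1))  # the inner column level becomes the innermost row level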
Example #5
    def parse_csv(self):
        # Read the csv file, and skip the first row as it's a long string label name
        survey_data = pd.read_csv(self.in_filename)[1:]

        bb_survey_flags = [
            '2',
            '<strong>B. I want to record my experiences during the day today (please complete before going to bed).</strong>'
        ]
        # Before sleep survey data
        bb_survey = survey_data.loc[
            survey_data['QID20'] !=
            '<strong>A. I want to record my sleep last night (please complete upon awakening).</strong>']

        # Upon awakening survey data
        ab_survey = survey_data.loc[
            survey_data['QID20'] ==
            '<strong>A. I want to record my sleep last night (please complete upon awakening).</strong>']

        # Define a before sleep DataFrame
        bb_df = DataFrame()
        bb_df['User'] = bb_survey['V3']
        bb_df['Date'] = bb_survey['V8'].apply(to_ymdstr)
        bb_df['Day'] = bb_survey['V8'].apply(find_weekday_ymdhms)
        # Create empty submission times first, fill it later
        bb_df['MULT'] = ''
        bb_df['NAPN'] = bb_survey['QID27'].fillna(BLANK_E)
        bb_df['NAPT'] = bb_survey['QID11#2_1'].fillna(0).apply(
            hour_to_mins) + bb_survey['QID11#1_1'].fillna(0).apply(str_to_int)
        bb_df['ALN'] = bb_survey['QID15#3_1_1_TEXT'].fillna(BLANK_E)

        # ALT
        alt_series = bb_survey['QID15#2_1'].fillna(
            BLANK_NA) + ":" + bb_survey['QID15#1_1'].fillna(MM_ZERO)
        bb_df['ALT'] = alt_series.apply(fill_for_hhmm)

        bb_df['CAFN'] = bb_survey['QID23#3_1_1_TEXT'].fillna(BLANK_E)

        # CAFT
        caft_series = bb_survey['QID23#2_1'].fillna(
            BLANK_NA) + ":" + bb_survey['QID23#1_1'].fillna(MM_ZERO)
        bb_df['CAFT'] = caft_series.apply(fill_for_hhmm)

        # Parse SMED
        smed_df = DataFrame()
        smed_df['SMED'] = bb_survey['QID18']

        smed_df['SMED1'] = bb_survey['QID17#3_1_1_TEXT'].fillna(BLANK_E)
        smed_df['SMED1T_HH'] = bb_survey['QID17#2_1'].fillna(BLANK_NA)
        smed_df['SMED1T_MM'] = bb_survey['QID17#1_1'].fillna(MM_ZERO)

        smed_df['SMED2'] = bb_survey['QID17#3_2_1_TEXT'].fillna(BLANK_E)
        smed_df['SMED2T_HH'] = bb_survey['QID17#2_2'].fillna(BLANK_NA)
        smed_df['SMED2T_MM'] = bb_survey['QID17#1_2'].fillna(MM_ZERO)

        smed_df['SMED3'] = bb_survey['QID17#3_3_1_TEXT'].fillna(BLANK_E)
        smed_df['SMED3T_HH'] = bb_survey['QID17#2_3'].fillna(BLANK_NA)
        smed_df['SMED3T_MM'] = bb_survey['QID17#1_3'].fillna(MM_ZERO)
        smed_df = smed_df.apply(process_smed, axis=1)

        bb_df['SMED'] = smed_df['SMED']
        bb_df['SMED1'] = smed_df['SMED1']
        bb_df['SMED1T'] = smed_df['SMED1T']
        bb_df['SMED2'] = smed_df['SMED2']
        bb_df['SMED2T'] = smed_df['SMED2T']
        bb_df['SMED3'] = smed_df['SMED3']
        bb_df['SMED3T'] = smed_df['SMED3T']
        bb_df['NOTEBB'] = bb_survey['QID19'].fillna(BLANK_E)
        bb_df['ATTEMPT'] = ''
        bb_df['BT'] = ''
        bb_df['LO'] = ''
        bb_df['WT'] = ''
        bb_df['RT'] = ''
        bb_df['SOL'] = ''
        bb_df['SNZ'] = ''
        bb_df['TST'] = ''
        bb_df['WASON'] = ''
        bb_df['WASOT'] = ''
        bb_df['EA'] = ''
        bb_df['EAT'] = ''
        bb_df['SQ'] = ''
        bb_df['REST'] = ''
        bb_df['NOTEWU'] = ''
        bb_df['TIB'] = ''
        bb_df['SE1'] = ''
        bb_df['SE2'] = ''
        # process MULT
        bb_df['MULT'] = self.process_mult(bb_df)
        # test code
        # bb_df.to_csv('before_bed_survey.csv', index=False)
        # End of before sleep

        # Start for Upon awakening
        ab_df = DataFrame()
        ab_df['User'] = ab_survey['V3']
        ab_df['Date'] = ab_survey['V8'].apply(reduce_one_day_ymdstr)
        ab_df['Day'] = ab_df['Date'].apply(find_weekday_ymd)
        # submission times
        ab_df['MULT'] = ''
        ab_df['NAPN'] = ''
        ab_df['NAPT'] = ''
        ab_df['ALN'] = ''
        ab_df['ALT'] = ''
        ab_df['CAFN'] = ''
        ab_df['CAFT'] = ''
        ab_df['SMED'] = ''
        ab_df['SMED1'] = ''
        ab_df['SMED1T'] = ''
        ab_df['SMED2'] = ''
        ab_df['SMED2T'] = ''
        ab_df['SMED3'] = ''
        ab_df['SMED3T'] = ''
        ab_df['NOTEBB'] = ''

        tmp_ab_df = DataFrame()
        tmp_ab_df['Date'] = ab_df['Date']
        tmp_ab_df['ATTEMPT'] = ab_survey['QID24'].fillna('Yes').apply(
            check_for_attempt)
        tmp_ab_df['BT'] = ab_survey['QID2#2_1'].fillna(
            BLANK_NA) + ":" + ab_survey['QID2#1_1'].fillna(MM_ZERO)
        tmp_ab_df['LO'] = ab_survey['QID2#2_2'].fillna(
            BLANK_NA) + ":" + ab_survey['QID2#1_2'].fillna(MM_ZERO)
        tmp_ab_df['WT'] = ab_survey['QID2#2_3'].fillna(
            BLANK_NA) + ":" + ab_survey['QID2#1_3'].fillna(MM_ZERO)
        tmp_ab_df['RT'] = ab_survey['QID2#2_4'].fillna(
            BLANK_NA) + ":" + ab_survey['QID2#1_4'].fillna(MM_ZERO)
        tmp_ab_df['SOL'] = ab_survey['QID3#2_1'].fillna(0).apply(
            hour_to_mins) + ab_survey['QID3#1_1'].fillna(0).apply(str_to_int)
        tmp_ab_df['SNZ'] = ab_survey['QID3#2_2'].fillna(0).apply(
            hour_to_mins) + ab_survey['QID3#1_2'].fillna(0).apply(str_to_int)
        tmp_ab_df['TST'] = ab_survey['QID3#2_3'].fillna(0).apply(
            hour_to_mins) + ab_survey['QID3#1_3'].fillna(0).apply(str_to_int)
        tmp_ab_df['WASON'] = ab_survey['QID6#3_1_1_TEXT'].fillna(BLANK_E)
        tmp_ab_df['WASOT'] = ab_survey['QID6#2_1'].fillna(0).apply(
            hour_to_mins) + ab_survey['QID6#1_1'].fillna(0).apply(str_to_int)

        tmp_ab_df['EA'] = ab_survey['QID26'].fillna(BLANK_E)

        tmp_ab_df['EAT'] = ab_survey['QID7#2_1'].fillna(0).apply(
            hour_to_mins) + ab_survey['QID7#1_1'].fillna(0).apply(str_to_int)

        tmp_ab_df['SQ'] = ab_survey['QID5'].apply(fill_for_rank)
        tmp_ab_df['REST'] = ab_survey['QID8'].apply(fill_for_rank)

        tmp_ab_df = tmp_ab_df.apply(process_awaken, axis=1)

        ab_df['ATTEMPT'] = tmp_ab_df['ATTEMPT']
        ab_df['BT'] = tmp_ab_df['BT']
        ab_df['LO'] = tmp_ab_df['LO']
        ab_df['WT'] = tmp_ab_df['WT']
        ab_df['RT'] = tmp_ab_df['RT']
        ab_df['SOL'] = tmp_ab_df['SOL']
        ab_df['SNZ'] = tmp_ab_df['SNZ']
        ab_df['TST'] = tmp_ab_df['TST']
        ab_df['WASON'] = tmp_ab_df['WASON']
        ab_df['WASOT'] = tmp_ab_df['WASOT']
        ab_df['EA'] = tmp_ab_df['EA']
        ab_df['EAT'] = tmp_ab_df['EAT']
        ab_df['SQ'] = tmp_ab_df['SQ']
        ab_df['REST'] = tmp_ab_df['REST']
        ab_df['NOTEWU'] = ab_survey['QID28'].fillna(BLANK_E)
        ab_df['TIB'] = tmp_ab_df['TIB']
        ab_df['SE1'] = tmp_ab_df['SE1']
        ab_df['SE2'] = tmp_ab_df['SE2']

        # test code
        # ab_df.to_csv('after_bed_survey.csv', index=False)

        # Process MULT
        ab_df['MULT'] = self.process_mult(ab_df)

        # Merge two types of surveys together
        self.survey_new_csv = bb_df.append(ab_df, ignore_index=True)

        # sorting it first
        self.survey_new_csv = self.survey_new_csv.sort(['User', 'Date'],
                                                       ascending=[1, 1])

        # the combined_duplicated_dfs list will hold the combined duplicated records
        combined_duplicated_dfs = []
        #  get all unique patient ids
        self.patient_ids = self.survey_new_csv.User.unique().tolist()

        for index, row in self.survey_new_csv.iterrows():
            user_id = row['User']
            date = row['Date']
            mult = row['MULT']
            key = '{}'.format(user_id) + '{}'.format(date) + '{}'.format(mult)
            found_index = self.get_survey_data_from_dict(key)
            # TODO: remove this temporary solution
            # if user_id == '1504':
            #     print('-- Removed the USER ID 1504 Record temporarily due to generate pdf error in R')
            #     self.survey_new_csv.drop([index], inplace=True)

            if found_index is None:
                self.set_survey_data_in_dict(key, index)
            else:
                duplicated_df = DataFrame(self.survey_new_csv,
                                          index=[found_index, index])
                # print('------ duplicated df: {}'.format(duplicated_df))
                # we drop these duplicated df records
                self.survey_new_csv.drop([found_index, index], inplace=True)

                # combines these two duplicated dfs into one
                combined_df = self.combine_rows(duplicated_df)
                # print('------ combined df: {}'.format(combined_df))
                #  append this into combined_duplicated_dfs list
                combined_duplicated_dfs.append(combined_df)
        # concat these combined duplicated df list
        all_duplicated = pd.concat(combined_duplicated_dfs)
        # append it into survey new csv file
        self.survey_new_csv = self.survey_new_csv.append(all_duplicated,
                                                         ignore_index=True)
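
DataFrame.append and DataFrame.sort used above were removed in later pandas releases; a minimal sketch of the modern equivalents, with toy frames as assumptions:

# Minimal sketch (toy data): modern equivalents of the removed DataFrame.append
# and DataFrame.sort calls used above.
import pandas as pd

bb = pd.DataFrame({"User": ["1001"], "Date": ["2020-01-02"]})
ab = pd.DataFrame({"User": ["1001"], "Date": ["2020-01-01"]})
combined = pd.concat([bb, ab], ignore_index=True)
combined = combined.sort_values(["User", "Date"], ascending=[True, True])
print(combined)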
Example #6
rp_nat_ = []
rps_nat_ = []
tp_nat_ = []
tps_nat_ = []
dps_nat_ = []
alt_nat_ = []

for i in range(len(flow_num_str)):
    savename_for = "/Users/user/Desktop/plot/latency_for_" + flow_num_str[
        i] + ".csv"
    savename_nat = "/Users/user/Desktop/plot/latency_nat_" + flow_num_str[
        i] + ".csv"
    savename_for_rp = "/Users/user/Desktop/plot/throughput_for_" + flow_num_str[
        i] + ".csv"
    savename_nat_rp = "/Users/user/Desktop/plot/throughput_nat_" + flow_num_str[
        i] + ".csv"
    rp_for, tp_for, tps_for, dps_for, alt_for = read_files(
        fileName_for[i], flow_num[i])
    alt_for.to_csv(savename_for)
    rp_nat, tp_nat, tps_nat, dps_nat, alt_nat = read_files(
        fileName_nat[i], flow_num[i])
    rps_for, rps_nat = get_throughput_speed(fileName_for[i], fileName_nat[i])
    a = {'0': rps_for}
    b = {'0': rps_nat}
    rp_df_for = DataFrame(a)
    rp_df_nat = DataFrame(b)
    rp_df_for.to_csv(savename_for_rp)
    rp_df_nat.to_csv(savename_nat_rp)

    alt_nat.to_csv(savename_nat)
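
A minimal sketch of building these output paths with os.path.join instead of repeated string concatenation; the base directory and flow label are assumptions:

# Minimal sketch (hypothetical base_dir and flow label) of building the output
# paths with os.path.join instead of string concatenation.
import os

base_dir = "/Users/user/Desktop/plot"
flow = "64"  # stands in for flow_num_str[i]
savename_for = os.path.join(base_dir, f"latency_for_{flow}.csv")
savename_nat_rp = os.path.join(base_dir, f"throughput_nat_{flow}.csv")
print(savename_for, savename_nat_rp)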
Example #7
# It is important to check the description of the dataset we access using the code below
Datatoread='F-F_Research_Data_Factors_daily'
sdate='2017-07-01'
edate='2018-06-30'

ds_factors = web.DataReader(Datatoread,'famafrench',start=sdate,end=edate) # ds_factors[0] holds the factor returns table
print('\nKEYS\n{}'.format(ds_factors.keys()))
print('DATASET DESCRIPTION \n {}'.format(ds_factors['DESCR']))

#ds_factors[0].head()
#copy the right dict for later examination
dfFactor = ds_factors[0].copy()/100
#dfFirm = ds_factors[4].copy()
#dfFirm['Offical_total'] = dfFirm.apply(lambda x: x.sum(), axis=1)
_ff=DataFrame(dfFactor)
_ff=_ff.reset_index()
#Data processing
### Not necessarily needed in your case
_ff=DataFrame(dfFactor)
_ff=_ff.reset_index()
factor='SMB'
wfactor='WSMB'

_ff=_ff[['Date',factor]]
_ff.rename(columns = {'Date':'date'}, inplace = True) 

# Suppose we are reading from Excel: it returns a DataFrame (sheet_name can be a direct int). Adjust the file name and path for your setup.
infile='F:\\RA_Fama_French_Factor\\five_factor_model\\SIZE_HML\\Daily_SIZE_HML_TEST072018.xlsx'
strlist = infile.split('.')
stitle=strlist[0]
Example #8
    def test_stata_doc_examples(self):
        with tm.ensure_clean() as path:
            df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
            df.to_stata(path)
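
The doc-example test above only writes the file; a minimal round-trip sketch, assuming a temporary directory is acceptable:

# Minimal round-trip sketch for the Stata writer exercised above; the temporary
# directory handling is an assumption for illustration.
import os
import tempfile

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 2), columns=list("AB"))
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "example.dta")
    df.to_stata(path)
    back = pd.read_stata(path)
print(back.head())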
Example #9
#model
starttime = datetime.datetime.now()  # calculate time
sample_model = KMeans(n_clusters=10).fit(images_train_sample)  # K-Means
endtime = datetime.datetime.now()  # calculate time
scikit_learn_execution_time = (endtime - starttime).seconds
print('scikit-learn execution time:',
      scikit_learn_execution_time)  # calculate time: 429s

#objective function value
cluster = sample_model.labels_
objective_function_value = sample_model.inertia_  #394810072745.4526
print('scikit-learn objective function value:', objective_function_value)

#accuracy
crosstable_data = {'label': labels_train_sample, 'cluster': list(cluster)}
df = DataFrame(crosstable_data)
crosstable = pd.crosstab(index=df['label'], columns=df['cluster'])
scikit_accuracy = sum(crosstable.max(axis=0)) / sum(crosstable.sum())  #0.22124
print('scikit-learn accuracy:', scikit_accuracy)

#PART3 my kmeans
######
#####
####
###
##
#

#model
starttime_2 = datetime.datetime.now()  # Calculate start time
cluster_center, cluster_assign = Kmeans(array(images_train_sample), 10)
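
A self-contained sketch of the crosstab-based cluster accuracy computed above, with toy labels and assignments as assumptions:

# Self-contained sketch (toy data) of the crosstab "accuracy" used above: each
# cluster is credited with its majority label, then divided by the total count.
import pandas as pd

labels = [0, 0, 1, 1, 2, 2]
clusters = [1, 1, 0, 0, 2, 1]
crosstable = pd.crosstab(index=pd.Series(labels, name="label"),
                         columns=pd.Series(clusters, name="cluster"))
accuracy = crosstable.max(axis=0).sum() / crosstable.to_numpy().sum()
print(crosstable)
print(accuracy)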
Example #10
model.fit(x=X_train,
          y=y_train,
          epochs=3,
          batch_size=128,
          verbose=2,
          validation_split=0.1)
# Predict
y_predict = model.predict(X_test)
# Convert the prediction results
y_predict_label = label2tag(predictions=y_predict, y=y)
# Compute the accuracy
Y_test = label2tag(predictions=y_test, y=y)
print(
    sum([y_predict_label[i] == Y_test[i]
         for i in range(len(y_predict))]) / len(y_predict))

# Load another test set, predict on it, and export the results
filename = 'xiaomi5a.csv'
test_data = pd.read_csv(filename)
x = test_data['comment']
X_cut = cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
X_seq = text2seq(texts_cut=X_cut, maxlen=maxlen, tokenizer=tokenizer)
X_seq = np.array(X_seq)
y_predict = model.predict(X_seq)
y_predict_label = label2tag(predictions=y_predict, y=y)
# Convert the Series to a DataFrame
out_x = x.to_frame(name=None)
out_y = DataFrame(y_predict_label)
out_x.to_csv('x.csv')
out_y.to_csv('y.csv')
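
A minimal sketch of the Series-to-DataFrame conversions used at the end of this example, with toy values as assumptions:

# Minimal sketch (toy data) of the Series -> DataFrame conversions used above.
import pandas as pd

x = pd.Series(["good phone", "bad battery"], name="comment")
out_x = x.to_frame()                   # keeps the Series name as the column name
out_y = pd.DataFrame(["pos", "neg"])   # wraps a list into a single-column frame
print(out_x)
print(out_y)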
Example #11
    if lista_nombres[i][-2] == lista_nombres[i][1]:
        n2 = ""

    medico = clases.Medico(n1, n2, ap1, ap2, lista_ruts[i], lista_edad[i],
                           lista_emails[i], lista_numero[i],
                           lista_especialidad[i])

    lista_medicos.append(medico)
clinica_objeto = clases.Clinica("Clinica de la Salud", "Público",
                                "Avenida Verdadera #123, Rancagua", "",
                                lista_medicos, lista_pacientes)

lista_citas = []
cita_vacia = clases.Cita("", "", "", "")
cita_csv = pd.read_csv('./datos/Citas.csv')
cita_csv = DataFrame(cita_csv)
codigo = cita_csv["codigo"].values
rut_paciente = cita_csv["rut paciente"].values
rut_medico = cita_csv["rut medico"].values
fecha_citada = cita_csv["fecha citada"].values
fecha_creacion = cita_csv["fecha de creacion"].values
modalidad = cita_csv["modalidad"].values
prestacion = cita_csv["prestacion"].values
confirmada = cita_csv["confirmada"].values
tiempo_restante = cita_csv["tiempo restante"].values

for i in range(len(codigo)):
    cita_vacia.setCodigo(codigo[i])
    cita_vacia.setPaciente(clinica_objeto.buscarPaciente(rut_paciente[i])[0])
    cita_vacia.setMedico(clinica_objeto.buscarMedico(rut_medico[i])[0])
    cita_vacia.setFechaCitada(parser.parse(fecha_citada[i]))
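
pd.read_csv already returns a DataFrame, so the extra DataFrame(...) wrap above is redundant; a minimal sketch of pulling the same columns directly, assuming the same CSV layout:

# Minimal sketch: read_csv already returns a DataFrame, so the extra
# DataFrame(cita_csv) wrap is redundant. The path and columns mirror the example.
import pandas as pd

cita_csv = pd.read_csv('./datos/Citas.csv')
codigo = cita_csv["codigo"].values
rut_paciente = cita_csv["rut paciente"].values
print(len(codigo), len(rut_paciente))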
Example #12
    def _parse_excel(self,
                     sheetname=0,
                     header=0,
                     skiprows=None,
                     names=None,
                     skip_footer=0,
                     index_col=None,
                     has_index_names=None,
                     parse_cols=None,
                     parse_dates=False,
                     date_parser=None,
                     na_values=None,
                     thousands=None,
                     convert_float=True,
                     true_values=None,
                     false_values=None,
                     verbose=False,
                     dtype=None,
                     squeeze=False,
                     **kwds):

        skipfooter = kwds.pop('skipfooter', None)
        if skipfooter is not None:
            skip_footer = skipfooter

        _validate_header_arg(header)
        if has_index_names is not None:
            warn(
                "\nThe has_index_names argument is deprecated; index names "
                "will be automatically inferred based on index_col.\n"
                "This argmument is still necessary if reading Excel output "
                "from 0.16.2 or prior with index names.",
                FutureWarning,
                stacklevel=3)

        if 'chunksize' in kwds:
            raise NotImplementedError("chunksize keyword of read_excel "
                                      "is not implemented")

        if parse_dates is True and index_col is None:
            warn("The 'parse_dates=True' keyword of read_excel was provided"
                 " without an 'index_col' keyword value.")

        def _parse_cell(cell_contents, cell_typ):
            """converts the contents of the cell into a pandas
               appropriate object"""

            if cell_typ == XL_CELL_DATE:

                if xlrd_0_9_3:
                    # Use the newer xlrd datetime handling.
                    try:
                        cell_contents = \
                            xldate.xldate_as_datetime(cell_contents,
                                                      epoch1904)
                    except OverflowError:
                        return cell_contents
                    # Excel doesn't distinguish between dates and time,
                    # so we treat dates on the epoch as times only.
                    # Also, Excel supports 1900 and 1904 epochs.
                    year = (cell_contents.timetuple())[0:3]
                    if ((not epoch1904 and year == (1899, 12, 31))
                            or (epoch1904 and year == (1904, 1, 1))):
                        cell_contents = time(cell_contents.hour,
                                             cell_contents.minute,
                                             cell_contents.second,
                                             cell_contents.microsecond)
                else:
                    # Use the xlrd <= 0.9.2 date handling.
                    try:
                        dt = xldate.xldate_as_tuple(cell_contents, epoch1904)

                    except xldate.XLDateTooLarge:
                        return cell_contents

                    if dt[0] < MINYEAR:
                        cell_contents = time(*dt[3:])
                    else:
                        cell_contents = datetime(*dt)

            elif cell_typ == XL_CELL_ERROR:
                cell_contents = np.nan
            elif cell_typ == XL_CELL_BOOLEAN:
                cell_contents = bool(cell_contents)
            elif convert_float and cell_typ == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                val = int(cell_contents)
                if val == cell_contents:
                    cell_contents = val
            return cell_contents

        ret_dict = False
        if isinstance(sheetname, list):
            sheets = sheetname
            ret_dict = True
        elif sheetname is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheetname]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())
        output = OrderedDict()

        import xlrd
        from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        # xlrd >= 0.9.3 can return datetime objects directly.
        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            xlrd_0_9_3 = True
        else:
            xlrd_0_9_3 = False

        # Keep sheetname to maintain backwards compatibility.
        for asheetname in sheets:
            if verbose:
                print("Reading sheet %s" % asheetname)
            if isinstance(asheetname, compat.string_types):
                sheet = self.book.sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.book.sheet_by_index(asheetname)

            data = []
            should_parse = {}

            if sheet.nrows > 5000:
                raise Exception(
                    "The raw file contains more than 5000 rows. Please check if it is correct or split the files (max: 5000 rows) for upload"
                )
            elif kwds.get('MaxTest'):
                continue

            for i in range(sheet.nrows):

                row = []
                for j, (value, typ) in enumerate(
                        zip(sheet.row_values(i), sheet.row_types(i))):
                    if parse_cols is not None and j not in should_parse:
                        should_parse[j] = self._should_parse(j, parse_cols)

                    if parse_cols is None or should_parse[j]:
                        row.append(_parse_cell(value, typ))
                data.append(row)
#            output[asheetname] = data
            if sheet.nrows == 0:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None:
                if is_list_like(header):
                    header_names = []
                    control_row = [True for x in data[0]]
                    for row in header:
                        if is_integer(skiprows):
                            row += skiprows

                        data[row], control_row = _fill_mi_header(
                            data[row], control_row)
                        header_name, data[row] = _pop_header_name(
                            data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # forward fill values for MultiIndex index
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                for col in index_col:
                    last = data[offset][col]
                    for row in range(offset + 1, len(data)):
                        if data[row][col] == '' or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

            if is_list_like(header) and len(header) > 1:
                has_index_names = True

            if kwds.get('parsed'):
                try:
                    parser = TextParser(data,
                                        header=header,
                                        index_col=index_col,
                                        has_index_names=has_index_names,
                                        na_values=na_values,
                                        thousands=thousands,
                                        parse_dates=parse_dates,
                                        date_parser=date_parser,
                                        true_values=true_values,
                                        false_values=false_values,
                                        skiprows=skiprows,
                                        skipfooter=skip_footer,
                                        squeeze=squeeze,
                                        dtype=dtype,
                                        **kwds)
                    output[asheetname] = parser.read()
                    if names is not None:
                        output[asheetname].columns = names
                    if not squeeze or isinstance(output[asheetname],
                                                 DataFrame):
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)
                except EmptyDataError:
                    # No Data, return an empty DataFrame
                    output[asheetname] = DataFrame()
            else:
                output[asheetname] = data

        if ret_dict or kwds.get('MaxTest'):
            return output
        else:
            return output[asheetname]
Example #13
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels]

    index: Optional[Index]
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: Union[bool, float, int]
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: List[List] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
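
This variant also covers the sparse output path; a minimal sketch of the corresponding public call, with a toy Series as an assumption:

# Minimal sketch (toy data) of the sparse code path above via the public API.
import pandas as pd

s = pd.Series(["a", "b", "a", "c"])
sparse_dummies = pd.get_dummies(s, sparse=True)  # columns backed by SparseArray
print(sparse_dummies.dtypes)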
Example #14
    def makeNewsDataCsv(cls,
                        cur=None,
                        start_date=None,
                        end_date=None,
                        basic_path=None,
                        word_trend_file=None,
                        news_file=None,
                        output_file=None,
                        stock_id=None):
        if cur is None or start_date is None or end_date is None or word_trend_file is None or output_file is None or stock_id is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        news_path = os.path.join(basic_path, news_file)
        word_trend_path = os.path.join(basic_path, word_trend_file)
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        columns = [
            "stock_id", "date", "opening", "closing", "difference",
            "percentage_difference", "lowest", "highest", "volume", "amount",
            "rate"
        ] + ["news_pos_num", "news_neg_num"]
        data = {}
        for k in columns:
            data[k] = []
        pd.DataFrame(data).to_csv(output_path, index=False, columns=columns)

        word_trend = {}
        word_trend_temp = pd.read_csv(word_trend_path)
        for k in word_trend_temp["0"].keys():
            word_trend[word_trend_temp["0"][k]] = [
                word_trend_temp["1"][k], word_trend_temp["2"][k]
            ]
        p_up = word_trend['total_words'][0] / (word_trend['total_words'][0] +
                                               word_trend['total_words'][1])
        p_down = word_trend['total_words'][1] / (word_trend['total_words'][0] +
                                                 word_trend['total_words'][1])

        cur.execute(
            "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        count = cur.fetchall()
        count = count[0][0]

        skip = 100
        slimit = 0
        while slimit < count:
            cur.execute(
                "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d "
                % (stock_id, start_date, end_date, 0 if slimit - 1 < 0 else
                   slimit - 1, skip if slimit - 1 < 0 else skip + 1))
            slimit += skip
            history_tt = cur.fetchall()
            history_t = []
            for h in history_tt:
                history_t.append([
                    int(h[0]),
                    float(h[1]),
                    float(h[2]),
                    float(h[3]),
                    float(h[4]),
                    float(h[5]),
                    float(h[6]),
                    float(h[7]),
                    float(h[8]),
                    str(h[9])
                ])
            del history_tt

            history_temp = []
            for h in zip(*history_t):
                history_temp.append(h)
            history = {
                'stock_id': history_temp[0],
                'opening': history_temp[1],
                'closing': history_temp[2],
                'difference': history_temp[3],
                'percentage_difference': history_temp[4],
                'lowest': history_temp[5],
                'highest': history_temp[6],
                'volume': history_temp[7],
                'amount': history_temp[8],
                'date': history_temp[9]
            }
            del history_t, history_temp
            history = DataFrame(history)
            g_history = history.groupby(by=['stock_id'])
            # 0.01 -> 1 %, kept to 2 decimal places
            history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                     g_history.shift(1)["closing"] - 1)
            history.dropna(axis=0,
                           how='any',
                           thresh=None,
                           subset=None,
                           inplace=True)
            '''
            '''
            sdate = str(history['date'][history['date'].keys()[0]])
            edate = str(history['date'][history['date'].keys()[-1]])
            # sdate = datetime.datetime.strptime(sdate,'%Y-%m-%d')
            # sdate = (sdate - datetime.timedelta(days=0)).strftime('%Y-%m-%d')
            cur.execute(
                "SELECT GROUP_CONCAT(id  SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' group by time"
                % (sdate, edate))
            news_temp = cur.fetchall()
            news_by_date = {}
            news_by_id = {}
            for n in news_temp:
                news_by_date[str(n[1])] = n[0].split(",")
                for nid in news_by_date[str(n[1])]:
                    news_by_id[nid] = None
            del news_temp

            nid_len = len(news_by_id)
            reader = pd.read_csv(news_path, chunksize=1000)
            for sentences in reader:
                if nid_len > 0:
                    for k in sentences['1'].keys():
                        nid = str(sentences['0'][k])
                        if nid in news_by_id and news_by_id[nid] == None:
                            news_by_id[nid] = str(sentences['1'][k]).split(" ")
                            wp_up = p_up
                            wp_down = p_down
                            for w in news_by_id[nid]:
                                if w not in word_trend:
                                    wp_up *= (1 / word_trend['total_words'][0])
                                    wp_down *= (1 /
                                                word_trend['total_words'][1])
                                else:
                                    if word_trend[w][0] > 0:
                                        wp_up *= word_trend[w][0]
                                    else:
                                        wp_up *= (1 /
                                                  word_trend['total_words'][0])

                                    if word_trend[w][1] > 0:
                                        wp_down *= word_trend[w][1]
                                    else:
                                        wp_down *= (
                                            1 / word_trend['total_words'][1])
                                while True:
                                    if wp_up < 1 and wp_down < 1:
                                        wp_up *= 10
                                        wp_down *= 10
                                    else:
                                        break

                            news_by_id[nid] = [
                                wp_up / (wp_up + wp_down),
                                -1 * wp_down / (wp_up + wp_down)
                            ]
                            nid_len -= 1
                            if nid_len <= 0:
                                break
                else:
                    break
            reader.close()
            del reader, sentences

            for d in news_by_date:
                sumn = [0, 0]
                for nid in news_by_date[d]:
                    sumn[0] += news_by_id[nid][0]
                    sumn[1] += news_by_id[nid][1]
                le = len(news_by_date[d])
                if le > 0:
                    sumn[0] /= le
                    sumn[1] /= le
                news_by_date[d] = sumn
                print(d)

            history['news_pos_num'] = 0
            history['news_neg_num'] = 0
            for i in history.index:
                history.loc[i, 'rate'] = str(
                    np.round(float(history['rate'][i]), 2))
                if str(history['date'][i]) in news_by_date:
                    history.loc[i, 'news_pos_num'] = str(
                        np.round(
                            float(news_by_date[str(history['date'][i])][0]),
                            2))
                    history.loc[i, 'news_neg_num'] = str(
                        np.round(
                            float(news_by_date[str(history['date'][i])][1]),
                            2))
                else:
                    history.loc[i, 'news_pos_num'] = "0"
                    history.loc[i, 'news_neg_num'] = "0"

            # Convert the normalized data into the form expected by the training and test sets
            def func_train_data(data_stock):
                if cls.groupby_skip == False:
                    cls.groupby_skip = True
                    return None
                print("正在处理的股票代码:%06s" % data_stock.name)

                data = {}
                for k in columns:
                    data[k] = []
                for i in range(len(data_stock) - 1):
                    for k in data:
                        data[k].append(data_stock.iloc[i][k])
                pd.DataFrame(data).to_csv(output_path,
                                          index=False,
                                          header=False,
                                          mode="a",
                                          columns=columns)

            g_stock = history.groupby(by=["stock_id"])
            # Clear the file at the output path and initialize the column names
            cls.groupby_skip = False
            g_stock.apply(func_train_data)
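
A minimal sketch of the chunked CSV iteration pattern used above for the news file, assuming a hypothetical news.csv:

# Minimal sketch of the chunked read used above; "news.csv" is a hypothetical
# file standing in for news_path.
import pandas as pd

total_rows = 0
with pd.read_csv("news.csv", chunksize=1000) as reader:
    for chunk in reader:  # each chunk is a DataFrame of at most 1000 rows
        total_rows += len(chunk)
print(total_rows)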
Example #15
def pivot_annual(series, freq=None):
    """
    Deprecated. Use ``pivot_table`` instead.

    Group a series by years, taking leap years into account.

    The output has as many rows as distinct years in the original series,
    and as many columns as the length of a leap year in the units corresponding
    to the original frequency (366 for daily frequency, 366*24 for hourly...).
    The first column of the output corresponds to Jan. 1st, 00:00:00,
    while the last column corresponds to Dec, 31st, 23:59:59.
    Entries corresponding to Feb. 29th are masked for non-leap years.

    For example, if the initial series has a daily frequency, the 59th column
    of the output always corresponds to Feb. 28th, the 61st column to Mar. 1st,
    and the 60th column is masked for non-leap years.
    With an hourly initial frequency, the (59*24)th column of the output always
    correspond to Feb. 28th 23:00, the (61*24)th column to Mar. 1st, 00:00, and
    the 24 columns between (59*24) and (61*24) are masked.

    If the original frequency is less than daily, the output is equivalent to
    ``series.convert('A', func=None)``.

    Parameters
    ----------
    series : Series
    freq : string or None, default None

    Returns
    -------
    annual : DataFrame
    """

    msg = "pivot_annual is deprecated. Use pivot_table instead"
    warnings.warn(msg, FutureWarning)

    index = series.index
    year = index.year
    years = algorithms.unique1d(year)

    if freq is not None:
        freq = freq.upper()
    else:
        freq = series.index.freq

    if freq == 'D':
        width = 366
        offset = np.asarray(index.dayofyear) - 1

        # adjust for leap year
        offset[(~isleapyear(year)) & (offset >= 59)] += 1

        columns = lrange(1, 367)
        # todo: strings like 1/1, 1/25, etc.?
    elif freq in ('M', 'BM'):
        width = 12
        offset = np.asarray(index.month) - 1
        columns = lrange(1, 13)
    elif freq == 'H':
        width = 8784
        grouped = series.groupby(series.index.year)
        defaulted = grouped.apply(lambda x: x.reset_index(drop=True))
        defaulted.index = defaulted.index.droplevel(0)
        offset = np.asarray(defaulted.index)
        offset[~isleapyear(year) & (offset >= 1416)] += 24
        columns = lrange(1, 8785)
    else:
        raise NotImplementedError(freq)

    flat_index = (year - years.min()) * width + offset
    flat_index = _ensure_platform_int(flat_index)

    values = np.empty((len(years), width))
    values.fill(np.nan)
    values.put(flat_index, series.values)

    return DataFrame(values, index=years, columns=columns)
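
The docstring points to pivot_table as the replacement; a hedged sketch of one way to build a year-by-day-of-year table with it (the leap-year alignment of the deprecated function is not reproduced):

# Hedged sketch of a pivot_table replacement: one row per year, one column per
# day of year. The leap-year masking of pivot_annual is not reproduced here.
import numpy as np
import pandas as pd

idx = pd.date_range("2015-01-01", "2016-12-31", freq="D")
series = pd.Series(np.arange(len(idx), dtype=float), index=idx)
annual = pd.pivot_table(
    pd.DataFrame({"value": series, "year": idx.year, "doy": idx.dayofyear}),
    values="value", index="year", columns="doy",
)
print(annual.shape)  # (2, 366)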
Example #16
target = cl.astype('int')
print (target)
# Split into training and test sets
train_X, test_X, train_y, test_y = train_test_split(data, target, train_size = 0.9, random_state = 42)
print (train_y)

# Build the classifier
clf = neighbors.KNeighborsClassifier(n_neighbors = 25)
data_clf = clf.fit(train_X, train_y)

# Predict
test_y_predicted = data_clf.predict(test_X)
"""print (test_y_predicted)

# 標準答案
print (test_y)"""

# Performance metrics
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print("accuracy : ", accuracy)
precision = metrics.precision_score(test_y, test_y_predicted, average='macro')
print("precision : ", precision)
recall = metrics.recall_score(test_y, test_y_predicted, average='macro')
print("recall : ", recall)
f_measure = 2 * (precision * recall / (precision + recall))
print("f_measure : ", f_measure)

output = {'click' : test_y_predicted}
output = DataFrame(output)
output.to_csv('output.csv', sep=',', index = 0)
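
scikit-learn can also compute the F-measure directly; a minimal sketch, noting that f1_score(average='macro') averages per-class F1 and may differ slightly from the harmonic mean of macro precision and recall computed above:

# Minimal sketch (toy labels): f1_score(average="macro") averages per-class F1,
# which can differ slightly from the harmonic mean of macro precision and recall.
from sklearn import metrics

y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 1, 1, 2, 1, 0]
print(metrics.f1_score(y_true, y_pred, average="macro"))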
Example #17
def on_data(context: Context):

    if datetime.datetime.strftime(context.now,
                                  '%Y-%m-%d') not in context.month_begin:
        return
    # Fetch CSI 300 index data
    price = get_reg_kdata(reg_idx=context.reg_kdata[0],
                          length=1,
                          fill_up=True,
                          df=True)
    index = get_reg_kdata(reg_idx=context.reg_kdata[0],
                          target_indices=300,
                          length=context.long + context.Len - 1,
                          fill_up=False,
                          df=True)
    factor = get_reg_factor(reg_idx=context.reg_factor[0],
                            target_indices=(),
                            length=5,
                            df=True)
    if price['close'].isna().any():
        return
    """
    计算沪深300指数的长短期波动率,以长期波动率为门限,若短期波动率突破,
    则降低股票池持仓为50%
    """
    index['ret'] = index.groupby('target_idx')['close'].apply(
        lambda x: (x - x.shift()) / x.shift())
    index = index.fillna(0)  # replace NaN with 0
    ret = index.ret.values.astype(float)
    StdDev = talib.STDDEV(ret, timeperiod=context.Len, nbdev=1)
    StdDev = DataFrame({"a": StdDev})
    StdDev = StdDev.dropna()
    std = StdDev['a'].tolist()
    std_short = np.mean(std[-14:])
    bound = np.mean(std)

    # factors are registered at daily frequency by default
    factor = factor.dropna(subset=['date'])  # drop rows with invalid dates
    factor['code'] = factor['target_idx'].apply(
        lambda x: context.target_list[x])  # map the 0,1,2,... indices back to the corresponding stock codes
    factor['month'] = factor['date'].apply(lambda x: int(
        str(x)[0:4] + str(x)[5:7]))  # add a month column (e.g. 201701), keeping only year and month
    factor_name = factor['factor'].drop_duplicates().tolist()  # extract the factor names as a list
    # group factor by ['target_idx', 'month', 'factor'] and take the last row of each group,
    # i.e. each stock's factor values at the end of each month
    factor_month = factor.groupby(
        ['target_idx', 'month',
         'factor']).apply(lambda x: x.iloc[-1])[['date',
                                                 'value']].reset_index()
    # add all factor names as new columns
    factor_month1 = factor_month.groupby(['target_idx',
                                          'month']).apply(deal).reset_index()
    """
    取最后一个月(当前时间)
    """
    test = factor_month1.groupby('target_idx').apply(lambda x: x.iloc[-1])
    scaler = StandardScaler()  # standardization

    X_test = test[factor_name]
    X_test = X_test.fillna(0).values
    X_test = scaler.fit_transform(X_test)  # standardize the factors

    # prediction
    model = pickle.load(open("XGboost_ret0.06_5factor.pickle.dat", "rb"))
    y_pred = model.predict(X_test)
    y_pred1 = pd.DataFrame(y_pred, columns=['label'])
    idx_list = list(y_pred1[y_pred1['label'] == 1].index)
    print(idx_list)

    positions = context.account().positions
    # risk control based on volatility
    if std_short > bound:
        for target_idx in positions.loc[positions['volume_long'] > 0,
                                        'target_idx'].astype(int):
            if target_idx == 300:
                pass
            else:
                volume = positions['volume_long'].iloc[target_idx]
                order_volume(account_idx=0,
                             target_idx=target_idx,
                             volume=int(volume * 0.5),
                             side=2,
                             position_effect=2,
                             order_type=2,
                             price=0)

    if len(idx_list) == 0:  # if no stock is in the target pool, sell everything
        for target_idx in positions.loc[positions['volume_long'] > 0,
                                        'target_idx'].astype(int):
            if target_idx == 300:
                pass
            else:
                volume = positions['volume_long'].iloc[target_idx]
                order_volume(account_idx=0,
                             target_idx=target_idx,
                             volume=int(volume),
                             side=2,
                             position_effect=2,
                             order_type=2,
                             price=0)

    else:
        # close positions in stocks that are not in the target pool
        for target_idx in positions.target_idx.astype(int):
            if target_idx not in idx_list:
                if positions['volume_long'].iloc[target_idx] > 0:
                    volume = positions['volume_long'].iloc[target_idx]
                    order_volume(account_idx=0,
                                 target_idx=target_idx,
                                 volume=int(volume),
                                 side=2,
                                 position_effect=2,
                                 order_type=2,
                                 price=0)

        # weight assigned to each stock
        percent_b = context.ratio / len(idx_list)
        # print(percent_b)
        # buy the stocks in the target pool
        for target_idx in idx_list:
            if target_idx == 300:
                pass
            else:
                order_target_percent(account_idx=0,
                                     target_idx=target_idx,
                                     target_percent=percent_b,
                                     side=1,
                                     order_type=2)

        print(positions.loc[positions['volume_long'] > 0, 'code'].tolist())
Exemple #18
0
def predict_role(ps):
    fd = pd.read_csv('player_label.csv')

    df_obj = fd.label
    fd.label = df_obj.apply(lambda x: str(x).strip())
    print(fd.label)
    test_set = fd[['label']]
    train_set = fd[[
        'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing',
        'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve',
        'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration',
        'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power',
        'jumping', 'stamina', 'strength', 'long_shots', 'aggression',
        'interceptions', 'positioning', 'vision', 'penalties', 'marking',
        'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling',
        'gk_kicking', 'gk_positioning', 'gk_reflexes'
    ]]
    train_set = train_set[1:]
    test_set = test_set[1:]

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(train_set,
                                                        test_set,
                                                        test_size=0.33,
                                                        random_state=12)

    from sklearn.naive_bayes import MultinomialNB
    clf_NB = MultinomialNB().fit(x_train, y_train)
    predicted = clf_NB.predict(x_test)
    import numpy as np
    from sklearn import metrics
    print("#################### NB ######################")
    confusion_matrix_NB = metrics.confusion_matrix(y_test, predicted)
    print(confusion_matrix_NB)
    accuracy_NB = metrics.accuracy_score(y_test, predicted)
    print(accuracy_NB)
    #	print(metrics.classification_report(y_test, predicted))

    print("##############################################")

    from sklearn import tree
    clf_tree = tree.DecisionTreeClassifier().fit(x_train, y_train)
    predicted = clf_tree.predict(x_test)
    print("#################### Decision Tree ######################")
    print(metrics.confusion_matrix(y_test, predicted))
    accuracy_DT = metrics.accuracy_score(y_test, predicted)
    print(accuracy_DT)
    #	print(metrics.classification_report(y_test, predicted))
    print("##############################################")

    from sklearn.linear_model import SGDClassifier
    clf_SGD = SGDClassifier().fit(x_train, y_train)
    predicted = clf_SGD.predict(x_test)
    print("#################### SGD Classifier ######################")
    print(metrics.confusion_matrix(y_test, predicted))
    accuracy_SGD = metrics.accuracy_score(y_test, predicted)
    print(accuracy_SGD)
    print("##############################################")

    from pandas.core.frame import DataFrame
    predict_data = DataFrame(ps)

    print(
        "----------------------&&&&&&&&&&&&&&&&&&&&&&&&&&&&&-----------------------"
    )
    print(predict_data)
    print(
        "----------------------&&&&&&&&&&&&&&&&&&&&&&&&&&&&&-----------------------"
    )

    predict_data = predict_data.iloc[:, 7:]
    print(
        "---------------------- become 38  ----------------------------------------"
    )
    print(predict_data)
    print(
        "---------------------- become 38  ----------------------------------------"
    )

    accuracy_list = [accuracy_NB, accuracy_DT, accuracy_SGD]
    if max(accuracy_list) == accuracy_NB:
        clf_model = clf_NB
    elif max(accuracy_list) == accuracy_DT:
        clf_model = clf_tree
    elif max(accuracy_list) == accuracy_SGD:
        clf_model = clf_SGD
    predicted = clf_model.predict(predict_data)
    print("************* model selection ****************")
    print(clf_model)
    print(pd.value_counts(predicted))
    print(predicted)
    print(type(predicted))
    return predicted.tolist()
Exemple #19
0
    def recommend(self, userID:int, portFolioModel:DataFrame, argumentsDict:Dict[str,object]):
        if type(userID) is not int and type(userID) is not np.int64:
            raise ValueError("Argument userID isn't type int.")
        if type(portFolioModel) is not DataFrame:
            raise ValueError("Argument portFolioModel isn't type DataFrame.")
        if type(argumentsDict) is not dict:
            raise ValueError("Argument argumentsDict isn't type dict.")

        numberOfItems:int = argumentsDict[self.ARG_NUMBER_OF_AGGR_ITEMS]

        recomItemIDsWithRspR1Ser:Series = self._recommender.recommend(userID, numberOfItems=numberOfItems, argumentsDict=argumentsDict)

        recomItemIDsAggr1:List[int]
        recomItemIDsWithRspAggr1:Series
        recomItemIDsAggr1, recomItemIDsWithRspAggr1 = self._portfolio1Aggr.recommend(userID, portFolioModel, argumentsDict=argumentsDict)
        print(recomItemIDsWithRspAggr1)

        aggrBanditsResp = countAggrBanditsResponsibility(recomItemIDsWithRspAggr1, portFolioModel)
        #aggrBanditsResp = countAggrDHondtResponsibility(dict(recomItemIDsWithRspAggr1), portFolioModel)
        aggrBanditsRespSer:Series = Series(dict(aggrBanditsResp))

        recomItemIDsNegativeSer:Series = Series(self._penaltyTool.getPenaltiesOfItemIDs(userID, self._history))
        if len(recomItemIDsNegativeSer) > 0:
            finalNegScores = normalize(np.expand_dims(recomItemIDsNegativeSer.values, axis=0))[0, :]
            recomItemIDsNegativeSer:Series = Series(finalNegScores.tolist(), index=recomItemIDsNegativeSer.index)
        #print(a)

        #recomItemIDsNegative = normalize(np.expand_dims(recomItemIDsNegative, axis=0))[0, :]

        inputItemIDsDict:dict = {"input1":recomItemIDsWithRspR1Ser,
                                 "input2":aggrBanditsRespSer,
                                 "negative":recomItemIDsNegativeSer}

        aggItemIDsWithRelevanceSer:Series = self._aggrHier.runWithResponsibility(inputItemIDsDict, DataFrame(), userID, numberOfItems, argumentsDict)

        aggItemIDs:List[int] = list(aggItemIDsWithRelevanceSer.index)

        aggItemIDsWithRelevance:List = [(itemI, dict(recomItemIDsWithRspAggr1).get(itemI, {})) for itemI in aggItemIDs]

        return (aggItemIDs, aggItemIDsWithRelevance)
Exemple #20
0
    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 default_kind=None,
                 default_fill_value=None,
                 dtype=None,
                 copy=False):

        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            if columns is None and hasattr(data, 'name'):
                columns = [data.name]
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            data = {columns[0]: data}

        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = 'block'

        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        if is_scipy_sparse(data):
            mgr = self._init_spmatrix(data,
                                      index,
                                      columns,
                                      dtype=dtype,
                                      fill_value=default_fill_value)
        elif isinstance(data, dict):
            mgr = self._init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns, dtype=dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
                                 dtype=dtype,
                                 copy=copy)
        elif isinstance(data, DataFrame):
            mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
        elif isinstance(data, Series):
            mgr = self._init_dict(data.to_frame(),
                                  data.index,
                                  columns=None,
                                  dtype=dtype)
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data,
                                 axes=dict(index=index, columns=columns),
                                 dtype=dtype,
                                 copy=copy)
        elif data is None:
            data = DataFrame()

            if index is None:
                index = Index([])
            else:
                index = ensure_index(index)

            if columns is None:
                columns = Index([])
            else:
                for c in columns:
                    data[c] = SparseArray(np.nan,
                                          index=index,
                                          kind=self._default_kind,
                                          fill_value=self._default_fill_value)
            mgr = to_manager(data, columns, index)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        else:
            msg = ('SparseDataFrame called with unknown type "{data_type}" '
                   'for data argument')
            raise TypeError(msg.format(data_type=type(data).__name__))

        generic.NDFrame.__init__(self, mgr)
Exemple #21
0
    def get_chunk(self, rows=None):
        if rows is not None and self.skip_footer:
            raise ValueError('skip_footer not supported for iteration')

        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        if len(content) == 0:  # pragma: no cover
            if self.index_col is not None:
                if np.isscalar(self.index_col):
                    index = Index([], name=self.index_name)
                else:
                    index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                                   names=self.index_name)
            else:
                index = Index([])

            return DataFrame(index=index, columns=self.columns)

        zipped_content = list(lib.to_object_array(content).T)

        # no index column specified, so infer that's what is wanted
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = zipped_content.pop(self.index_col)
            else:  # given a list of index
                index = []
                for idx in self.index_col:
                    index.append(zipped_content[idx])
                # remove index items from content and columns, don't pop in loop
                for i in reversed(sorted(self.index_col)):
                    zipped_content.pop(i)

            if np.isscalar(self.index_col):
                if self.parse_dates:
                    index = lib.try_parse_dates(index, parser=self.date_parser)
                index = Index(_convert_types(index, self.na_values),
                              name=self.index_name)
            else:
                arrays = []
                for arr in index:
                    if self.parse_dates:
                        arr = lib.try_parse_dates(arr, parser=self.date_parser)
                    arrays.append(_convert_types(arr, self.na_values))
                index = MultiIndex.from_arrays(arrays, names=self.index_name)
        else:
            index = Index(np.arange(len(content)))

        if not index._verify_integrity():
            dups = index.get_duplicates()
            raise Exception('Index has duplicates: %s' % str(dups))

        if len(self.columns) != len(zipped_content):
            raise Exception('wrong number of columns')

        data = dict((k, v) for k, v in zip(self.columns, zipped_content))

        # apply converters
        for col, f in self.converters.iteritems():
            if isinstance(col, int) and col not in self.columns:
                col = self.columns[col]
            data[col] = np.vectorize(f)(data[col])

        data = _convert_to_ndarrays(data, self.na_values)

        return DataFrame(data=data, columns=self.columns, index=index)
Exemple #22
0
    def granger_causality(self):
        """Returns the f-stats and p-values from the Granger Causality Test.

        If the data consists of columns x1, x2, x3, then we perform the
        following regressions:

        x1 ~ L(x2, x3)
        x1 ~ L(x1, x3)
        x1 ~ L(x1, x2)

        The f-stats of these results are placed in the 'x1' column of the
        returned DataFrame.  We then repeat for x2, x3.

        Returns
        -------
        Dict, where 'f-stat' returns the DataFrame containing the f-stats,
        and 'p-value' returns the DataFrame containing the corresponding
        p-values of the f-stats.
        """
        from pandas.stats.api import ols
        from scipy.stats import f

        d = {}
        for col in self._columns:
            d[col] = {}
            for i in xrange(1, 1 + self._p):
                lagged_data = self._lagged_data[i].filter(self._columns -
                                                          [col])

                for key, value in lagged_data.iteritems():
                    d[col][_make_param_name(i, key)] = value

        f_stat_dict = {}
        p_value_dict = {}

        for col, y in self._data.iteritems():
            ssr_full = (self.resid[col]**2).sum()

            f_stats = []
            p_values = []

            for col2 in self._columns:
                result = ols(y=y, x=d[col2])

                resid = result.resid
                ssr_reduced = (resid**2).sum()

                M = self._p
                N = self._nobs
                K = self._k * self._p + 1
                f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
                f_stats.append(f_stat)

                p_value = f.sf(f_stat, M, N - K)
                p_values.append(p_value)

            f_stat_dict[col] = Series(f_stats, self._columns)
            p_value_dict[col] = Series(p_values, self._columns)

        f_stat_mat = DataFrame(f_stat_dict)
        p_value_mat = DataFrame(p_value_dict)

        return {
            'f-stat': f_stat_mat,
            'p-value': p_value_mat,
        }
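As a standalone illustration of the statistic assembled in the loop above, this sketch (the function name and sample numbers are made up) recomputes the same restricted-vs-full F-stat and p-value from the two residual sums of squares:

from scipy.stats import f

def granger_f_stat(ssr_reduced, ssr_full, n_lags, n_obs, n_vars):
    # M restrictions (the dropped variable's lags), K parameters in the full model
    M = n_lags
    K = n_vars * n_lags + 1
    f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (n_obs - K))
    return f_stat, f.sf(f_stat, M, n_obs - K)

# e.g. granger_f_stat(ssr_reduced=12.4, ssr_full=10.9, n_lags=2, n_obs=200, n_vars=3)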
Exemple #23
0
def _get_dummies_1d(data,
                    prefix,
                    prefix_sep='_',
                    dummy_na=False,
                    sparse=False,
                    drop_first=False):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_strs = [
            u'{prefix}{sep}{level}'
            if isinstance(v, text_type) else '{prefix}{sep}{level}'
            for v in levels
        ]
        dummy_cols = [
            dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
            for dummy_str, v in zip(dummy_strs, levels)
        ]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0,
                               dtype=np.uint8)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series,
                              index=index,
                              columns=dummy_cols,
                              default_fill_value=0,
                              dtype=np.uint8)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
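For context, _get_dummies_1d is the per-column worker behind pd.get_dummies; a small usage sketch of the public API (illustrative values) shows how the dummy_na and drop_first branches above surface to callers:

import pandas as pd

s = pd.Series(['a', 'b', None, 'a'])

# default: NaN rows become all-zero indicator rows
print(pd.get_dummies(s, prefix='cat', prefix_sep='_'))

# dummy_na=True adds an explicit NaN column; drop_first=True drops the first
# level to avoid perfect collinearity
print(pd.get_dummies(s, prefix='cat', prefix_sep='_', dummy_na=True, drop_first=True))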
Exemple #24
0
    def __init__(self, data=None, index=None, columns=None, default_kind=None,
                 default_fill_value=None, dtype=None, copy=False):

        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            if columns is None and hasattr(data, 'name'):
                columns = [data.name]
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            data = {columns[0]: data}

        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = 'block'

        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        if isinstance(data, dict):
            mgr = self._init_dict(data, index, columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif isinstance(data, DataFrame):
            mgr = self._init_dict(data, data.index, data.columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif data is None:
            data = DataFrame()

            if index is None:
                index = Index([])
            else:
                index = _ensure_index(index)

            if columns is None:
                columns = Index([])
            else:
                for c in columns:
                    data[c] = SparseArray(np.nan, index=index,
                                          kind=self._default_kind,
                                          fill_value=self._default_fill_value)
            mgr = to_manager(data, columns, index)
            if dtype is not None:
                mgr = mgr.astype(dtype)

        NDFrame.__init__(self, mgr)
Exemple #25
0
    def combine_rows(self, to_combined_df):
        tmp_df = DataFrame()
        # the first row supplies the identifying fields
        tmp_df['User'] = to_combined_df['User'].values[:1]
        tmp_df['Date'] = to_combined_df['Date'].values[:1]
        tmp_df['Day'] = to_combined_df['Day'].values[:1]
        tmp_df['MULT'] = to_combined_df['MULT'].values[:1]
        # for every remaining field, prefer the first row's value and fall back
        # to the second row when the first is empty/missing
        fallback_cols = [
            'NAPN', 'NAPT', 'ALN', 'ALT', 'CAFN', 'CAFT', 'SMED', 'SMED1',
            'SMED1T', 'SMED2', 'SMED2T', 'SMED3', 'SMED3T', 'NOTEBB',
            'ATTEMPT', 'BT', 'LO', 'WT', 'RT', 'SOL', 'SNZ', 'TST', 'WASON',
            'WASOT', 'EA', 'EAT', 'SQ', 'REST', 'NOTEWU', 'TIB', 'SE1', 'SE2'
        ]
        for col in fallback_cols:
            values = to_combined_df[col].fillna('').values
            tmp_df[col] = values[0] if values[0] != '' else values[1]
        return tmp_df
Exemple #26
0
 def _unstack_vector(self, vec, index=None):
     if index is None:
         index = self._y_trans.index
     panel = DataFrame(vec, index=index, columns=['dummy'])
     return panel.to_panel()['dummy']
Exemple #27
0
 def get_empty_frame(data) -> DataFrame:
     if isinstance(data, Series):
         index = data.index
     else:
         index = np.arange(len(data))
     return DataFrame(index=index)
Exemple #28
0
 def __init__(self, records, columns):
     self.dataframe = DataFrame(records, columns=columns)
'''
from pandas.core.frame import DataFrame
from printheader import print_header

cols = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
index = ['a', 'b', 'c', 'd', 'e', 'f']
values = [
    [100, 110, 120, 130, 140],
    [200, 210, 220, 230, 240],
    [300, 310, 320, 330, 340],
    [400, 410, 420, 430, 440],
    [500, 510, 520, 530, 540],
    [600, 610, 620, 630, 640],
]
print_header('values:')
print(values, '\n\n')

df = DataFrame(values, index=index, columns=cols)
print_header('DataFrame df')
print(df, '\n')

df2 = df.drop(['beta', 'delta'], axis=1)
print_header("After dropping beta and delta:")
print(df2, '\n')

print_header("After dropping rows b, c, and e")
dfx = df.drop(['b', 'c', 'e'], inplace=True)
print(df)
print(df['a':'d'])
print(dfx)
Exemple #30
0
    def makeBindexDataCsv(cls,
                          cur=None,
                          start_date=None,
                          end_date=None,
                          basic_path=None,
                          output_file=None,
                          word_count=20,
                          stock_id=None,
                          ranking_type='tfidf'):
        if cur is None or start_date is None or end_date is None or output_file is None or stock_id is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        if word_count < 0:
            word_count = 20
        if ranking_type not in ["tfidf", "textrank"]:
            ranking_type = "tfidf"
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        words = cls.getImportVocab(cur, count=20, ranking_type=ranking_type)
        word_count = len(words)
        for i in range(len(words)):
            words[i] = "'" + words[i] + "'"
        words_str = ",".join(words)
        del words

        word_key_list = []
        for i in range(1, word_count + 1):
            word_key_list.append("word%s" % i)
        columns = [
            "stock_id", "date", "opening", "closing", "difference",
            "percentage_difference", "lowest", "highest", "volume", "amount",
            "rate"
        ] + word_key_list
        data = {}
        for k in columns:
            data[k] = []
        pd.DataFrame(data).to_csv(output_path, index=False, columns=columns)

        cur.execute(
            "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        count = cur.fetchall()
        count = count[0][0]

        skip = 50
        slimit = 0
        while slimit < count:
            cur.execute(
                "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d "
                % (stock_id, start_date, end_date, 0 if slimit - 1 < 0 else
                   slimit - 1, skip if slimit - 1 < 0 else skip + 1))
            slimit += skip
            history_tt = cur.fetchall()
            history_t = []
            for h in history_tt:
                history_t.append([
                    int(h[0]),
                    float(h[1]),
                    float(h[2]),
                    float(h[3]),
                    float(h[4]),
                    float(h[5]),
                    float(h[6]),
                    float(h[7]),
                    float(h[8]),
                    str(h[9])
                ])
            del history_tt

            sdate = str(history_t[0][9])
            edate = str(history_t[-1][9])
            sdate = datetime.datetime.strptime(sdate, '%Y-%m-%d')
            sdate = (sdate - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            cur.execute(
                "SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc"
                % (words_str, sdate, edate))
            bindex = cur.fetchall()
            bindex_t = []
            bindex_vec = 0
            cur_date = None
            if len(bindex) > 0:
                cur_date = str(bindex[0][2])
            bix = []
            bix_item = [cur_date]
            if len(bindex) > 0:
                for bi in bindex:
                    if str(bi[2]) != cur_date:
                        cur_date = str(bi[2])
                        bix.append(bix_item)
                        bix_item = [cur_date]
                    bix_temp = json.loads(bi[1])
                    bix_item.append(bix_temp['all']['0'])
                bix.append(bix_item)
            del bindex

            bindex = {}
            for k in range(1, len(bix)):
                b_t = []
                for kk in range(1, len(bix[k])):
                    if int(bix[k][kk]) != 0 and int(bix[k - 1][kk]) != 0:
                        b_t.append(
                            str(
                                np.round(
                                    float(100 * (int(bix[k][kk]) /
                                                 int(bix[k - 1][kk]) - 1)),
                                    2)))
                    else:
                        b_t.append(str(0.01))
                bindex[bix[k][0]] = b_t
            del bix

            for i in range(len(history_t)):
                history_t[i] += bindex[history_t[i][9]]
            history_temp = []
            for h in zip(*history_t):
                history_temp.append(h)
            history = {
                'stock_id': history_temp[0],
                'opening': history_temp[1],
                'closing': history_temp[2],
                'difference': history_temp[3],
                'percentage_difference': history_temp[4],
                'lowest': history_temp[5],
                'highest': history_temp[6],
                'volume': history_temp[7],
                'amount': history_temp[8],
                'date': history_temp[9]
            }
            for i in range(10, 10 + word_count):
                history["word%s" % (i - 9)] = history_temp[i]
            del history_t, history_temp
            history = DataFrame(history)
            g_history = history.groupby(by=['stock_id'])
            # 0.01 -> 1%, keep 2 decimal places
            history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                     g_history.shift(1)["closing"] - 1)
            history.dropna(axis=0,
                           how='any',
                           thresh=None,
                           subset=None,
                           inplace=True)
            for i in history.index:
                history.loc[i, 'rate'] = str(
                    np.round(float(history['rate'][i]), 2))

            # reshape the normalized data into a form acceptable for training and test sets
            def func_train_data(data_stock):
                if cls.groupby_skip == False:
                    cls.groupby_skip = True
                    return None
                print("正在处理的股票代码:%06s" % data_stock.name)

                data = {}
                for k in columns:
                    data[k] = []
                for i in range(len(data_stock) - 1):
                    for k in data:
                        data[k].append(data_stock.iloc[i][k])
                pd.DataFrame(data).to_csv(output_path,
                                          index=False,
                                          header=False,
                                          mode="a",
                                          columns=columns)

            g_stock = history.groupby(by=["stock_id"])
            # the output file was cleared and the column names initialized above
            cls.groupby_skip = False
            g_stock.apply(func_train_data)
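A minimal standalone sketch (made-up numbers) of the groupby/shift rate computation used above, showing the day-over-day percentage change of the closing price within each stock:

import pandas as pd

df = pd.DataFrame({'stock_id': [1, 1, 1, 2, 2],
                   'closing': [10.0, 10.5, 10.29, 20.0, 19.0]})
g = df.groupby('stock_id')
# rate: percent change of 'closing' relative to the previous row of the same stock
df['rate'] = (100 * (g['closing'].shift(0) / g['closing'].shift(1) - 1)).round(2)
print(df)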