Example #1
 def get_data_frame(self):
     data = DataFrame(self.data)
     data.columns = ['学校', '考试方式', '院系所', '', '专业',
                     '学习方式', '研究方向', '指导教师', '拟招生人数', '备注']
     data.drop(labels='', axis=1, inplace=True)
     data.to_csv(self.provinceName + "研究生招生信息.csv",
                 encoding="utf_8_sig", index=False)
def process_data(data):
    drop_list = []  #labels of the rows we need to drop
    new_zip = []
    new_AMT = []
    new_DT = []
    #make a dataframe
    header=['CMTE_ID','AMNDT_IND','RPT_TP','TRANSACTION_PGI','IMAGE_NUM','TRANSACTION_TP',\
            'ENTITY_TP','NAME','CITY','STATE','ZIP_CODE','EMPLOYER','OCCUPATION','TRANSACTION_DT',\
            'TRANSACTION_AMT','OTHER_ID','TRAN_ID','FILE_NUM','MEMO_CD','MEMO_TEXT','SUB_ID']
    df = DataFrame(data, columns=header)
    #make new dataframe with the information we need
    df = df[[
        'CMTE_ID', 'NAME', 'ZIP_CODE', 'TRANSACTION_DT', 'TRANSACTION_AMT',
        'OTHER_ID'
    ]]
    #convert the TRANSACTION_AMT strings to int
    for i in df['TRANSACTION_AMT']:
        new_AMT.append(int(i))
    df['TRANSACTION_AMT'] = new_AMT
    #remove invalid data
    for i in list(df.index.values):
        if df.loc[i]['CMTE_ID']=='' or validate_name(df.loc[i]['NAME'])==False \
        or (len(df.loc[i]['ZIP_CODE'])>=5)==False or validate_date(df.loc[i]['TRANSACTION_DT'])!=True \
        or df.loc[i]['TRANSACTION_AMT']=='' or df.loc[i]['OTHER_ID']!='':
            drop_list.append(i)
        else:
            new_zip.append(df.loc[i]['ZIP_CODE'][0:5])
            new_DT.append(int(df.loc[i]['TRANSACTION_DT'][4:]))
    #drop the invalid rows collected above
    df.drop(drop_list, inplace=True)
    #replace the columns with the cleaned values
    df['ZIP_CODE'] = new_zip
    df['TRANSACTION_DT'] = new_DT
    return df
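process_data leans on two validators, validate_name and validate_date, that are not shown above. A minimal sketch of what they might look like, assuming NAME must be plain alphabetic text and TRANSACTION_DT an MMDDYYYY string (which is consistent with the [4:] slice used above to pull out the year):

import re
from datetime import datetime

def validate_name(name):
    # Hypothetical check: non-empty letters plus common name punctuation only
    return bool(re.fullmatch(r"[A-Za-z ,.'-]+", name.strip()))

def validate_date(date_str):
    # Hypothetical check: an MMDDYYYY string that parses to a real calendar date
    try:
        datetime.strptime(date_str, "%m%d%Y")
        return True
    except ValueError:
        return False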
Example #3
def condense_heatmap(df_input: DataFrame, new_order: List[str]) -> DataFrame:
    """
    Converts the dataframe of stored enrichment scores into the condensed heatmap format.
    """
    df_input = df_input.copy()
    df_input.drop(['Position'], axis=1, inplace=True)

    # Group by sequence and aminoacid, and then pivot table
    df_grouped = df_input.groupby(['Sequence', 'Aminoacid'], sort=False).mean()
    df_pivoted = df_grouped.pivot_table(values='Score',
                                        index='Aminoacid',
                                        columns='Sequence')
    df_pivoted.reset_index(drop=False, inplace=True)

    # Sort the y axis in the desired order
    df_pivoted['Aminoacid'] = Categorical(df_pivoted['Aminoacid'], new_order)
    df_pivoted = df_pivoted.sort_values(by=['Aminoacid'])

    # Sort the x axis in the desired order
    x_order = return_common_elements(new_order, list(df_pivoted.columns))

    # Drop amino acid column
    data_dropped = df_pivoted.drop(['Aminoacid'], axis=1)

    return data_dropped[x_order]
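condense_heatmap also depends on a helper, return_common_elements, that is not included here. A minimal sketch, assuming it simply keeps the items of the first list that also occur in the second, preserving the first list's order:

def return_common_elements(primary, secondary):
    # Keep the elements of primary that also appear in secondary, in primary's order
    return [item for item in primary if item in secondary]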
Example #4
def _drop_single_point(move_data: DataFrame, label_new_tid: str,
                       label_id: str):
    """
    Removes trajectory with single point.

    Parameters
    ----------
    move_data: dataframe
        dataframe with trajectories
    label_new_tid : str
        The label of the column containing the ids of the formed segments.
        This is the new split id.
    label_id : str
         Indicates the label of the id column in the user dataframe, by default TRAJ_ID

    """
    shape_before_drop = move_data.shape
    idx = move_data[move_data[label_new_tid] == -1].index
    if idx.shape[0] > 0:
        logger.debug('...Drop Trajectory with a unique GPS point\n')
        ids_before_drop = move_data[label_id].unique().shape[0]
        move_data.drop(index=idx, inplace=True)
        logger.debug('...Object - before drop: {} - after drop: {}'.format(
            ids_before_drop, move_data[label_id].unique().shape[0]))
        logger.debug('...Shape - before drop: {} - after drop: {}'.format(
            shape_before_drop, move_data.shape))
    else:
        logger.debug('...No trajectories with only one point.')
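A short usage sketch for _drop_single_point, assuming the function and its logger are importable from the module above; the frame and the column names 'id' and 'tid' are made up for illustration:

import pandas as pd

# Hypothetical trajectory frame: a segment id of -1 marks a single-point trajectory
move_data = pd.DataFrame({
    'id': [1, 1, 2],      # original trajectory ids
    'tid': [10, 10, -1],  # ids of the formed segments
})
_drop_single_point(move_data, label_new_tid='tid', label_id='id')
print(move_data)  # the single-point row (tid == -1) has been dropped in place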
def deleteHeadToTrain(data: DataFrame, rows: int):
    data = data.drop(range(0, rows), axis=0)
    # collect the labels of rows that fail the thresholds, then drop them in one call
    bad_rows = []
    for x in range(len(data.index)):
        if data['HM'].values[x] < 3 or data['P1'].values[x] < 4 or data[
                'AM'].values[x] < 3 or data['P2'].values[x] < 4:
            bad_rows.append(data.index[x])
    data = data.drop(bad_rows, axis=0)
    return data
Example #6
def ListoDF(data):
    if isinstance(data, list):
        Df = DataFrame(data)  # convert the list to a DataFrame
        Df.columns = Df.iloc[0, :]  # use the first row as the column names
        Df.drop(0, axis=0, inplace=True)  # drop the first row
    else:
        Df = data
    return Df
Example #7
def _sort_yaxis_aminoacids(df_input: DataFrame, neworder_aminoacids: list,
                           old_order: list) -> DataFrame:
    # Sort the y axis in the desired order
    df_input['Aminoacid_new'] = old_order
    df_input['Aminoacid_new'] = Categorical(df_input['Aminoacid_new'],
                                            neworder_aminoacids)
    df_input.sort_values(by=['Aminoacid_new'], inplace=True)
    df_input.drop(['Aminoacid_new'], inplace=True, axis=1)
    return df_input
 def find_matches(self, df: DataFrame, currentIdx: int, indexes: list,
                  currentPattern):
     if not (str(currentPattern) in df.columns):
         df[str(currentPattern)] = 0
     for index in indexes:
         amount = df.at[index, 'totalcount']
         df.at[currentIdx, 'totalcount'] += amount
         df.at[currentIdx, str(currentPattern)] += amount
         df.drop(index, inplace=True)
     return df
def process_dataframe(df: DataFrame) -> DataFrame:
    df = df.drop(index=[0, 1, 4, len(df) - 1])

    df_member_df = df.loc[3]
    df_member_df = df_member_df.dropna()

    df_member_list = list()

    for i in list(df_member_df):
        if '\n' in i:
            temp_text = i.split('\n')
            if temp_text[0]:
                df_member_list.append(temp_text[0])

    df_column_pre = df.loc[2]

    df_column_pre = df_column_pre.drop(df_column_pre.tail(2).index)

    df_column_pre = df_column_pre.dropna()

    df_column = pd.concat([df_column_pre, pd.DataFrame(df_member_list)])

    df_column = df_column[~df_column[0].str.contains("후보자별 득표수")]

    df = df.drop(index=[2, 3])

    df_left = df[df.columns[:len(df_column)]]
    df_left = df_left.reset_index(drop=True)
    df_right = df[df.columns[-2:]]
    df_right = df_right.reset_index(drop=True)

    df_dataset = pd.merge(df_left,
                          df_right,
                          how='outer',
                          left_index=True,
                          right_index=True)

    df_column = pd.concat([df_column, pd.DataFrame(["무효 투표수", "기권수"])])

    df_dataset.columns = [
        *list(df_column.reset_index().drop(columns=['index'])[0])
    ]

    # df_dataset['취소표'] = df_dataset['무효 투표수'].add(df_dataset['기권수'])

    df_dataset = df_dataset.drop(columns=['무효 투표수', '기권수'])

    df_dataset = df_dataset.drop(index=range(5, len(df_dataset)),
                                 columns=['투표구명'])

    return df_dataset
Example #10
 def _onehotItemsGenresML(self, items: DataFrame):
     one_hot_encoding = items["Genres"].str.get_dummies(sep='|')
     one_hot_encoding.drop(one_hot_encoding.columns[0],
                           axis=1,
                           inplace=True)
     tmp = items.drop(['Genres'], axis=1, inplace=False)
     return pd.concat([tmp, one_hot_encoding], axis=1)
def analysis(results: DataFrame):
    df = results.drop(['trial_iteration', 'rsi_idx'], axis=1)
    uniq_names = df['trial_name'].nunique()
    tests_per_name = len(df) / uniq_names
    cor_df = df[df['correct'] == True].groupby('trial_name')['correct']
    pct_correct_by_type = cor_df.value_counts() / tests_per_name * 100
    std_dev_by_type = df.groupby('trial_name')['correct'].std()
    adf = pct_correct_by_type.to_frame().join(
        std_dev_by_type,
        on='trial_name',
        lsuffix="_percent",
        rsuffix="_stddev").reset_index().drop('correct',
                                              axis=1).set_index('trial_name')
    adf['p-value'] = pandas.NA

    # f-test
    unw_d = df[df['trial_name'] == "Unweighted_Disalike"]['correct']
    p1 = unw_d.to_numpy()
    for name in df['trial_name'].unique():
        if name == "Unweighted_Disalike":
            continue
        other_df = df[df['trial_name'] == name]['correct']
        p2 = other_df.to_numpy()
        f, p = ftest(p1, p2)
        adf.at[name, 'p-value'] = p

    return adf
Example #12
def create_paths_models(df):

	# Models will be indexed in 0, 64, 128...

	lines = df.line.unique()
	jump = 64

	new_df = DataFrame()

	for line in lines:
		print('line:', line)
		current_df = df[df.line == line]

		new_df = new_df.append(current_df.iloc[0])

		possible_index_paths = current_df.index_path.unique()
		total = possible_index_paths[-1] - possible_index_paths[0]
		count = 0
		for index_path in range(possible_index_paths[0], possible_index_paths[-1], jump):
			
			count += (1*jump)
			current_path = current_df[current_df.index_path == index_path]
			print(count/total *100)
			for row in current_path.iterrows():
				row = row[1]
				if not has_distance_from_coordinate(new_df[new_df.line == line], row, distance=minimum_distance):
					new_df = new_df.append(row)
					

	return new_df.drop(['index_path', 'order'], axis=1)
Example #13
def extract_parnoise():
    # Extract the data
    path = 'D:/yansixing/tmp'
    source = 'parnoise_data.csv'
    poslist, neglist = [], []
    with open(os.path.join(path, source), 'r', encoding='gbk', errors='ignore', newline='') as f:
        for line in f.readlines():
            pos = None
            if ',' in line:
                neg, pos = tuple(line.split(','))
            else:
                neg = line
            if pos is not None and len(pos.strip()) > 1:
                poslist.append(pos.strip())
            if len(neg.strip()) > 1:
                neglist.append(neg.strip())

    targets = [1] * len(poslist) + [0] * len(neglist)
    sent = poslist + neglist
    df = DataFrame({'target': targets, 'sent': sent})

    print('Data shape : ', df.shape)
    print(df.head())

    #pattern = re.compile(r'([\u4e00-\u9fa5])')
    df['sent'] = df['sent'].apply(lambda x: ''.join(w.strip() for w in re.findall(r'[\u4e00-\u9fa5]', x) if len(w.strip()) > 0))

    # Feature-extraction mode: use only one feature
    print('Get features')

    def getFeature(X):
        m = getPikcle(os.path.join(const.PKPATH, 'lm_3_paopao_jieba.pk'))
        X['sent'] = X['sent'].apply(lambda x: str(x))
        X['sent'] = X['sent'].apply(lambda x: ' '.join(w for w in jieba.cut(x)))
        X['3n_etp_n_jieba'] = X['sent'].apply(lambda x: m.entropy(ngrams(x, 3, True, True, '<s>', '</s>')) if m.entropy(ngrams(x, 3, True, True, '<s>', '</s>')) != float('inf') else -1)
        return X

    df = getFeature(df)
    print(df.head())
    df.drop(['sent'], axis=1).to_csv(os.path.join(const.DATAPATH, 'parnoise_feats.csv'))

    # Write the data out in fastText format
    print('Get fasttext')
    with open(os.path.join(const.DATAPATH, 'parnoise_fasttext.txt'), 'a', encoding='utf-8', errors='ignore') as f:
        for x, y in zip(df['target'].values.tolist(), df['sent'].values.tolist()):
            line = '{0}\t__label__{1}\n'.format(y, x)
            f.write(line)
Example #14
def splitDataRandom(data: DataFrame, split_ratio: float, drop_second: bool = False) -> Tuple[DataFrame, DataFrame]:

    rows = data.shape[0]
    pool = [x[0] for x in data.iterrows()]
    chosen = []
    for _ in range(int(rows * split_ratio)):
        i = randint(0, len(pool) - 1)
        chosen.append(pool[i])
        pool.pop(i)

    data1 = data.drop(pool)
    if drop_second:
        data2 = None
    else:
        data2 = data.drop(chosen)

    return (data1, data2)
Example #15
def chooseNAttributes(data: DataFrame, n: int, class_label: str) -> DataFrame:

    attributes = [x for x in data.columns]
    attributes.remove(class_label)

    for i in range(n):
        attributes.pop(randint(0, len(attributes) - 1))
    
    return data.drop(attributes, axis=1)
 def _prepare_dataset(self, df: DataFrame) -> DataFrame:
     print('Preparing dataset...')
     df = df[['nomenclature', 'description', 'turnover']]
     df = df.drop(df[df.turnover.isnull()].index)
     df = df.fillna('')
     df = self._extract_unique_dataset(df)
     df = self._remove_rare_targets(df)
     print('├── Complete')
     return df
Example #17
def pre_processing(df: DataFrame):
    """
    input : a data frame
    outputs: a clean data frame, plus
            dtypes.txt : a file listing the dtype of each column
            database: information.sqlite
            tables:
                 information : the clean data frame
                 before_process : the data before processing
                 missing_information : the output of the missing_data function
                 outliers : the outlier rows
                 describe : describe() of the clean data
    Description:
                drop rows with null values
                merge capital_gain and capital_loss
                drop the education column
                drop outlier rows using the IQR method
                save the results in the database
    """
    sql_manager = SqlManager("information.sqlite")
    df.to_sql(name="before_process", con=sql_manager.conn, if_exists="replace")
    missing_data_df = missing_data(df)
    missing_data_df.to_sql(name="missing_information",
                           con=sql_manager.conn,
                           if_exists="replace")
    df = df.drop(columns=[
        "status_id", "status_published", 'Column1', "Column2", "Column3",
        "Column4"
    ])
    main_df = df.dropna()
    print(main_df.shape)
    outliers_df, main_df = drop_numerical_outliers(main_df)
    main_df = main_df[main_columns]
    outliers_df.to_sql(name="outliers",
                       con=SqlManager("information.sqlite").conn,
                       if_exists="replace",
                       index=False)
    main_df.to_sql(name="after_clear",
                   con=SqlManager("information.sqlite").conn,
                   if_exists="replace",
                   index=False)
    label_encode(main_df)
    scaled_df = DataFrame(preprocessing.robust_scale(main_df),
                          columns=main_columns)
    scaled_df.to_sql(name="information",
                     con=SqlManager("information.sqlite").conn,
                     if_exists="replace",
                     index=False)
    print(main_df.shape)
    main_df.describe().to_sql(name="describe",
                              con=sql_manager.conn,
                              if_exists='replace')
    create_folder("outs")
    with open("outs\\dtypes.txt", "w") as file:
        file.write(str(main_df.dtypes))
    return main_df
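pre_processing calls drop_numerical_outliers, which is not shown. A minimal sketch of an IQR-based version, assuming it returns the outlier rows first and the cleaned frame second, as the call site expects:

import pandas as pd

def drop_numerical_outliers(df, k=1.5):
    # Flag rows where any numeric column falls outside [Q1 - k*IQR, Q3 + k*IQR]
    numeric = df.select_dtypes(include='number')
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    is_outlier = ((numeric < q1 - k * iqr) | (numeric > q3 + k * iqr)).any(axis=1)
    return df[is_outlier], df[~is_outlier]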
Example #18
def testMyMetric(real, pred):

    metrics = eval_my_metric(real, pred, debug=1)
    df = DataFrame(metrics)
    print(df)
    df = df.drop(['tp', 'fp', 'fn'])
    spiderchart.plot(df, [0.25, .5, .75])
    #plotSpiderChart(df[df.index=='precision'],[0.25,.5,.75])
    #plotSpiderChart(df[df.index=='f1'],[0.25,.5,.75])
    print(metrics)
Example #19
 def _prepare_dataset(self, df: DataFrame) -> DataFrame:
     print('Preparing dataset...')
     df = df[['object', 'financing', 'project', 'budget']]
     df = df.drop(df[df.budget.isnull()].index)
     df = df.fillna('')
     df = self._replace_year_specific_targets(df)
     df = self._extract_unique_dataset(df)
     df = self._remove_rare_targets(df)
     print('├── Complete')
     return df
Example #20
 def to_X(self, dataframe: DataFrame) -> DataFrame:
     dataframe = dataframe.drop(columns=self.params['Y_field'],
                                errors='ignore')
     dataframe = self.X_features(dataframe)
     # noinspection PyProtectedMember
     dataframe = dataframe._get_numeric_data(
     )  # BUGFIX: Linear Regression Crashes if provided with non-numeric inputs
     dataframe = dataframe.fillna(
         0)  # allow X_features() to do custom fillna() first
     return dataframe
def drop_cols(df: DataFrame,
              drop_cols: List[str] = [
                  "Id", "Credit Score", "Purpose", "Home Ownership", "Term"
              ],
              extra_drop_cols: List[str] = []) -> DataFrame:
    """
    Drop the columns specified; (default was ID and credit score but these
    can be specified manually if required)
    
    Here we also drop columns manipulated in the other functions because 
    regardless of whether or not we 'add' in the altered version, we want
    to drop the 'unclean' version
    
    extra_drop_cols is just an easy way to append columns without changing the default list
    """
    if extra_drop_cols:
        drop_cols = drop_cols + extra_drop_cols  # avoid mutating the shared default list

    df.drop(drop_cols, axis=1, inplace=True)

    return df
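A short usage sketch for drop_cols, assuming a hypothetical loans frame that contains the default columns plus two extras:

import pandas as pd

loans = pd.DataFrame({
    "Id": [1, 2],
    "Credit Score": [700, 650],
    "Purpose": ["car", "home"],
    "Home Ownership": ["rent", "own"],
    "Term": ["short", "long"],
    "Annual Income": [50000, 62000],
    "Notes": ["a", "b"],
})
cleaned = drop_cols(loans, extra_drop_cols=["Notes"])
print(cleaned.columns.tolist())  # ['Annual Income']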
Example #22
    def __init__(self, data: DataFrame, k: int, random: bool = False):

        self.k = k
        self.packs = []
        test_packs = []
        if random:
            test_packs = splitDataToPacksRandom(data, k)
        else:
            test_packs = splitDataToPacksSequencial(data, k)
        for test_pack in test_packs:
            drop_rows = [x[0] for x in test_pack.iterrows()]
            train_pack = data.drop(drop_rows)
            self.packs.append((train_pack, test_pack))
Example #23
def print_cluster_scatterplot(df: DataFrame, centroids: list):
    """Only works for TwoDimHard dataset"""

    # Cluster palette
    colors = [
        'green', 'orange', 'blue', 'purple', 'tan', 'yellowgreen', 'royalblue',
        'mediumvioletred', 'pink', 'salmon'
    ]

    # x = []
    # y = []
    # for i, centroid in enumerate(centroids):
    #     # Centroid position
    #     x.append(centroid.position[0])
    #     y.append(centroid.position[1])

    #     # Plot points in the centroid
    #     subset = df.loc[df['_cluster'] == i]
    #     plt.scatter(subset['X.1'], subset['X.2'], c=colors[i], s=5)

    # add + markers for all centroids
    # plt.scatter(x, y, c='red', marker='+', s=50)

    y = df['_cluster']
    x = df.drop(['ID', '_cluster', '_distance'], axis=1)
    x_norm = (x - x.min()) / (x.max() - x.min())

    pca = PCA(n_components=2)
    transformed = DataFrame(pca.fit_transform(x_norm))

    # lda = LDA(n_components=2)
    # transformed = DataFrame(lda.fit_transform(x_norm, y))

    for i, centroid in enumerate(centroids):
        plt.scatter(transformed[y == i][0],
                    transformed[y == i][1],
                    label='Class ' + str(i),
                    c=colors[i],
                    s=5)

    plt.legend()
    plt.show()
Example #24
def get_xy(df: DataFrame,
           target_name: str,
           standardise_data: bool = False,
           train_size: float = 1.0,
           seed: int = 123) -> Results:
    X = df.drop(target_name, axis=1).values
    y = df[target_name].values.reshape(-1, 1)
    if train_size < 1.:
        Xtr, Xte, ytr, yte = train_test_split(X,
                                              y,
                                              random_state=seed,
                                              train_size=train_size,
                                              shuffle=False)
    else:
        Xtr, ytr = X, y
        Xte, yte = None, None
    Xmean, Xstd = None, None  # only set when the data is standardised
    if standardise_data:
        Xtr, Xmean, Xstd = standardise(Xtr)
        if Xte is not None:
            Xte = standardise(Xte, Xmean, Xstd)
    return Results((Xtr, Xte, ytr, yte), (Xmean, Xstd))
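get_xy relies on a standardise helper that is not shown. A minimal sketch, assuming it returns (X, mean, std) when fitting and only the transformed array when mean and std are supplied:

import numpy as np

def standardise(X, mean=None, std=None):
    # Fit the statistics on the training data; reuse them for the test split
    if mean is None or std is None:
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        std = np.where(std == 0, 1.0, std)  # guard against zero-variance columns
        return (X - mean) / std, mean, std
    return (X - mean) / std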
Example #25
def preprocess(df: DataFrame) -> DataFrame:
    """Add extra attributes for tracking clusters and distances"""
    zeroes = np.zeros(len(df))

    df = df.assign(_cluster=Series(zeroes))
    df = df.assign(_distance=Series(zeroes))

    # Remove columns that we don't cluster on
    # This is built for the wine dataset and the TwoDimHard.
    # It'd be nice if this was more intelligent, but alas.
    for column in IGNORED_COLUMNS:
        if column in df:
            df = df.drop(column, axis=1)

    # Normalize non-ID columns
    ids = []
    for column in df.columns:
        if column != 'ID':
            ids.append(column)

    df[ids] = df[ids].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

    return df
def _kernel_data_preparation(data: DataFrame,
                             cutoff: float) -> Tuple[npt.NDArray, npt.NDArray]:
    """
    This function will copy the data, eliminate stop codon, eliminate
    values lower than -1, flatten and eliminate np.nan. Will return the
    data in that format + the adjusted kernel.
    """

    # Eliminate stop codon
    data_corrected: npt.NDArray = np.array(
        data.drop('*', errors='ignore').copy())

    # Eliminate values outside the ±cutoff range
    data_corrected = data_corrected[(data_corrected >= -cutoff)
                                    & (data_corrected <= cutoff)]

    # Get rid of np.nan values and convert matrix into 1d matrix
    data_corrected = data_corrected[np.invert(np.isnan(data_corrected))]

    # Adjust gaussian kernel
    kernel_processed_data = gaussian_kde(data_corrected)

    return data_corrected, kernel_processed_data
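A short usage sketch for _kernel_data_preparation, assuming the function above is in scope and given a small hypothetical enrichment-score frame indexed by amino acid, with a '*' row for the stop codon:

import numpy as np
import pandas as pd

# Hypothetical scores; the '*' row (stop codon) is dropped inside the function
scores = pd.DataFrame(
    [[0.1, -0.4, 1.2], [0.3, 0.8, -0.2], [3.0, -5.0, 0.0]],
    index=['A', 'C', '*'],
)
values, kernel = _kernel_data_preparation(scores, cutoff=2.0)
print(values)                          # flattened scores within [-2, 2]
print(kernel(np.linspace(-2, 2, 5)))   # the fitted KDE evaluated on a small grid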
Example #27
class PandasBackend(DataBackend):
    _data: DataFrame
    _index: PandasIndex
    _loc: _LocIndexer
    _iloc: _ILocIndexer

    def __init__(
        self,
        data: Optional[Union[Series, DataFrame, dict[str, list]]] = None,
        index: Optional[PandasIndex] = None,
    ) -> None:
        if data is None:
            self._data = DataFrame(dtype="object")
        elif type(data) is Series:
            self._data = cast(Series, data).to_frame().transpose()
        elif type(data) is DataFrame:
            self._data = DataFrame(data)
        elif type(data) is dict:
            sample_value = next(iter(data.values()))
            if not isinstance(sample_value, Iterable) or isinstance(
                    sample_value, str):
                self._data = Series(data).to_frame().transpose()
            else:
                self._data = DataFrame(data)
        else:
            raise ValueError(
                f"Received unexpected value type {type(data)}: {data}")

        if index is None:
            self._data.index.name = "index"
            self._index = PandasIndex(self._data.index, [])
        else:
            if not isinstance(index, PandasIndex):
                index = PandasIndex(index)
            self._data.index = index._data
            self._index = index
        self._loc = _LocIndexer(self)
        self._iloc = _ILocIndexer(self)

    def is_link(self) -> bool:
        return False

    def link_token(self) -> Optional[DataToken]:
        return None

    def to_pandas(self) -> DataFrame:
        return self._data

    @property
    def columns(self) -> list[str]:
        return self._data.columns.tolist()

    @property
    def values(self) -> np.ndarray:
        data_values = self._data.values
        shape = data_values.shape
        if shape[1] == 1:
            return np.squeeze(data_values, axis=1)
        elif shape[0] == 1:
            return np.squeeze(data_values, axis=0)
        else:
            return data_values

    @property
    def dtypes(self) -> dict[str, DataType]:
        return {
            col: DataType(dtype)
            for col, dtype in self._data.dtypes.items()
        }

    def cast_columns(self, column_dtypes: dict[str, type]) -> PandasBackend:
        return PandasBackend(self._data.astype(column_dtypes, errors="ignore"))

    def to_dict(self) -> dict[str, Any]:
        return self._data.to_dict("list")

    @property
    def index(self) -> Index:
        return self._index

    @property
    def index_name(self) -> Union[str, list[str]]:
        return self._data.index.name

    @property
    def loc(self: PandasBackend) -> LocIndexer[PandasBackend]:
        return self._loc

    @property
    def iloc(self: PandasBackend) -> ILocIndexer[PandasBackend]:
        return self._iloc

    def equals(self, other: PandasBackend) -> bool:
        if type(other) is not PandasBackend:
            return False
        return np.array_equal(self._data.values,
                              other._data.values) and self._index.equals(
                                  other._index)

    def __eq__(self, other) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data == other

    def __ne__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data != other

    def __gt__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data > other

    def __ge__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data >= other

    def __lt__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data < other

    def __le__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data <= other

    def __len__(self) -> int:
        return len(self._data)

    def __iter__(self) -> Generator[str, None, None]:
        return iter(self._data)

    def iterrows(self) -> Generator[tuple[int, PandasBackend], None, None]:
        for i, row in self._data.iterrows():
            yield (i, PandasBackend(row.to_frame().transpose()))

    def itertuples(self, ignore_index: bool = False):
        for values in self._data.itertuples(index=not ignore_index):
            yield values

    def __getitem__(self, item: str) -> Any:
        return PandasBackend(self._data[item].to_frame())

    def getitems(self, items: list[str]) -> PandasBackend:
        return PandasBackend(self._data[items])

    def getmask(self, mask: list[bool]) -> PandasBackend:
        return PandasBackend(self._data[mask])

    def query(self, query: "Query") -> PandasBackend:
        from tanuki.database.adapter.query.pandas_query_compiler import PandasQueryCompiler

        query_compiler = PandasQueryCompiler(self._data)
        query = query_compiler.compile(query)
        return PandasBackend(self._data[query])

    def __setitem__(self, items: str, value: Any) -> None:
        if isinstance(value, PandasBackend):
            value = value._data
        self._data[items] = value

    def get_index(self, index_alias: IndexAlias) -> Index:
        cols = [str(col) for col in index_alias.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index_alias.name
        return PandasIndex(new_data.index, cols)

    def set_index(self, index: Union[Index, IndexAlias]) -> PandasBackend:
        cols = [str(col) for col in index.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index.name
        new_index = PandasIndex(new_data.index, cols)
        return PandasBackend(new_data, new_index)

    def reset_index(self: PandasBackend) -> PandasBackend:
        new_data = self._data.reset_index(drop=True)
        new_data.index.name = "index"
        new_index = PandasIndex(new_data.index, [])
        return PandasBackend(new_data, new_index)

    def append(
        self: PandasBackend,
        new_backend: PandasBackend,
        ignore_index: bool = False,
    ) -> PandasBackend:
        return PandasBackend(
            self._data.append(new_backend._data, ignore_index=ignore_index))

    def drop_indices(self: PandasBackend, indices: list[int]) -> PandasBackend:
        return PandasBackend(self._data.drop(indices))

    @classmethod
    def concat(
        cls: type[PandasBackend],
        all_backends: list[PandasBackend],
        ignore_index: bool = False,
    ) -> PandasBackend:
        all_data = [backend._data for backend in all_backends]
        return PandasBackend(pd.concat(all_data, ignore_index=ignore_index))

    def nunique(self) -> int:
        return self._data.nunique()

    def __str__(self) -> str:
        return str(self._data)

    def __repr__(self) -> str:
        return str(self)
Example #28
         continue
     tokens = line.split(",")
     domain = tokens[0]
     label = tokens[1]
     DomainLen.append(len(domain))
     Numbers.append(NumCollect(domain))
     Entropy.append(LettersEntropy(domain))
     if label == 'notdga':
         Type.append(0)
     else:
         Type.append(1)
 traindata = {'Length': DomainLen, 'NumInDomain': Numbers, 'Entropy': Entropy, 'Type': Type}
 traindata = DataFrame(traindata)
 # Set up the training dataset
 y = traindata.Type
 x = traindata.drop('Type', axis=1)
 xtrain = x
 ytrain = y
 # Read the test.txt file
 testDomainLen = []
 testNumbers = []
 testEntropy = []
 testType = []
 testDomainName = []
 TestFile = open(r'test.txt')
 for line in TestFile:
     line = line.strip()
     if line == "":
         continue
     testDomainName.append(line)
     testDomainLen.append(len(line))
Example #29
Created on Sun May 19 20:42:32 2013

'''
from pandas.core.frame import DataFrame
from printheader import print_header

cols = ['alpha','beta','gamma','delta','epsilon']
index = ['a','b','c','d','e','f']
values = [
    [100, 110, 120, 130, 140],
    [200, 210, 220, 230, 240],
    [300, 310, 320, 330, 340],
    [400, 410, 420, 430, 440],
    [500, 510, 520, 530, 540],
    [600, 610, 620, 630, 640],
]
print_header('values:')
print(values, '\n\n')

df = DataFrame(values, index=index, columns=cols)
print_header('DataFrame df')
print(df, '\n')

df2 = df.drop(['beta','delta'], axis=1)
print_header("After dropping beta and delta:")
print(df2, '\n')

print_header("After dropping rows b, c, and e")
df3 = df.drop(['b','c','e'], axis=0)
print(df3)
Example #30
def lundong(t, temp2, temp3, temp4, mairude, maichude, zhengchangde):
    '''
    Rotation strategy.
    Takes seven arguments: all trading dates, the buy dates, the sell dates,
    the no-action dates, and the number of stocks on each of those last three
    kinds of dates.
    On buy dates, split the available cash evenly across the stocks and buy at the open price.
    On sell dates, sell all holdings at the close price.
    On no-action dates, just update the holdings value using that day's close price.

    '''

    keyongzijin = cash  # available cash starts at cash
    chigujiazhi = 0  # initial holdings value is 0
    keyongzijin_1 = []  # empty list to record the available cash after each update; same below
    chigujiazhi_1 = []
    chigushuliang = []  # empty list to store the number of shares held
    pp = 0
    kk = 0
    gg = 0
    tt = 0  # position within mairude
    cc = 0  # position within maichude
    zz = 0  # position within zhengchangde
    print('--------Rotating; initial cash: {0}---------'.format(keyongzijin))
    for i in range(len(t)):

        if (t.ix[i, 1] == 'T' and t.ix[i, 2] == 'F'):
            print('Buying on trading day {0}'.format(i))
            goumaizijin = keyongzijin / mairude[tt]  # cash allocated to each stock, split evenly

            for l in range(mairude[tt]):
                chigushuliang_1 = (goumaizijin / temp3.ix[l + pp, 2])
                chigushuliang.append(chigushuliang_1)

            keyongzijin = 0  # drops to 0 after the buy uses all the cash
            chigujiazhi = goumaizijin * mairude[tt]
            pp = pp + mairude[tt]
            tt = tt + 1
            keyongzijin_1.append(keyongzijin)  # append this step's result to the storage lists
            chigujiazhi_1.append(chigujiazhi)

        elif (t.ix[i, 1] == 'F' and t.ix[i, 2] == 'T'):
            print('Selling on trading day {0}'.format(i))
            chigujiazhi = 0
            chucunzijin = []
            for y in range(maichude[cc]):
                keyongzijin_2 = chigushuliang[y] * temp2.ix[y + kk, 3]
                chucunzijin.append(keyongzijin_2)
            kk = kk + maichude[cc]
            cc = cc + 1
            keyongzijin = sum(chucunzijin)
            keyongzijin_1.append(keyongzijin)
            chigujiazhi_1.append(chigujiazhi)

        elif (t.ix[i, 1] == 0 and t.ix[i, 2] == 0 and t.ix[i, 3] == 0):
            print('Trading day {0}: no action'.format(i))
            xianyoujiazhi = []
            for z in range(zhengchangde[zz]):
                xianyoujiazhi_1 = chigushuliang[z] * temp4.ix[z + gg, 3]
                xianyoujiazhi.append(xianyoujiazhi_1)
            gg = gg + zhengchangde[zz]
            chigujiazhi = sum(xianyoujiazhi)
            keyongzijin_1.append(keyongzijin)
            chigujiazhi_1.append(chigujiazhi)
        '''else:
            print('ok')
            keyongzijin_1.append(keyongzijin)
            chigujiazhi_1.append(chigujiazhi)'''

    c = {"可用资金": keyongzijin_1, "持股价值": chigujiazhi_1}
    data1 = DataFrame(c)
    data1.insert(0, '日期', t['日期'])
    data1['总资产'] = data1['可用资金'] + data1['持股价值']  # compute total assets
    data1 = data1.drop(len(data1) - 1)

    return data1