def fetchQuotes(sym, start=FROM_DATE, end=CURRENT_DATE):
    his = None
    data = None
    try:
        data = ystockquote.get_historical_prices(sym, start, end)
    except Exception:
        # 404 is returned when the data is not yet available
        print("Please check the dates. Data might not be available. 404 returned")

    if data:
        his = DataFrame(collections.OrderedDict(sorted(data.items()))).T
        his = his.convert_objects(convert_numeric=True)
        his.index = pd.to_datetime(his.index)
        his.insert(0, 'symbol', sym, allow_duplicates=True)
        # insert the date as a column too
        his.insert(1, 'date', his.index)
        # db dependency removed, so use the NoSQL column names
        his.columns = getColumnsNoSql('stock_quote_historical')
    daily = ystockquote.get_all(sym)
    return his, daily
def compute_commit_periods(self, ticket_frame: pd.DataFrame):
    commit_dates = ticket_frame.CommitDate
    commit_periods = self.compute_periods(commit_dates)
    commit_periods = pd.concat(
        [pd.Series(data=[pd.Timedelta(days=0)]), commit_periods]
    ).reset_index(drop=True)
    ticket_frame.insert(8, 'CommitPeriod', commit_periods.dt.days)
    return ticket_frame
def test_left_join_multi_index(self, left, right, sort):
    icols = ['1st', '2nd', '3rd']

    def bind_cols(df):
        iord = lambda a: 0 if a != a else ord(a)
        f = lambda ts: ts.map(iord) - ord('a')
        return (f(df['1st']) + f(df['3rd']) * 1e2 +
                df['2nd'].fillna(0) * 1e4)

    def run_asserts(left, right, sort):
        res = left.join(right, on=icols, how='left', sort=sort)

        assert len(left) < len(res) + 1
        assert not res['4th'].isna().any()
        assert not res['5th'].isna().any()

        tm.assert_series_equal(res['4th'], -res['5th'], check_names=False)
        result = bind_cols(res.iloc[:, :-2])
        tm.assert_series_equal(res['4th'], result, check_names=False)
        assert result.name is None

        if sort:
            tm.assert_frame_equal(
                res, res.sort_values(icols, kind='mergesort'))

        out = merge(left, right.reset_index(), on=icols,
                    sort=sort, how='left')

        res.index = np.arange(len(res))
        tm.assert_frame_equal(out, res)

    lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
    left = DataFrame(np.random.choice(lc, (5000, 2)),
                     columns=['1st', '3rd'])
    left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))

    i = np.random.permutation(len(left))
    right = left.iloc[i].copy()

    left['4th'] = bind_cols(left)
    right['5th'] = -bind_cols(right)
    right.set_index(icols, inplace=True)

    run_asserts(left, right, sort)

    # inject some nulls
    left.loc[1::23, '1st'] = np.nan
    left.loc[2::37, '2nd'] = np.nan
    left.loc[3::43, '3rd'] = np.nan
    left['4th'] = bind_cols(left)

    i = np.random.permutation(len(left))
    right = left.iloc[i, :-1]
    right['5th'] = -bind_cols(right)
    right.set_index(icols, inplace=True)

    run_asserts(left, right, sort)
def get_result(result):
    result = result.round()
    if len(result.shape) == 1:
        df = DataFrame(result, columns=[0])
    else:
        df = DataFrame(result,
                       columns=['col_' + str(i) for i in range(result.shape[1])])
    df.insert(0, 'shop_id', list(range(1, 2001)))
    df = pd.merge(df, df, on='shop_id')
    return df
def getNgrams(query, corpus, startYear, endYear, smoothing):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing)
    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    response = req.content
    res = re.findall('var data = (.*?);', response)
    data = {qry['ngram']: qry['timeseries'] for qry in literal_eval(res[0])}
    df = DataFrame(data)
    df.insert(0, 'year', range(startYear, endYear + 1))
    return req.url, params['content'], df
def getNgrams(query, corpus="eng_2012", startYear=1980, endYear=2000, smoothing=10, caseInsensitive=True): params = dict(content=query, year_start=startYear, year_end=endYear, corpus=corpora[corpus], smoothing=smoothing, case_insensitive=caseInsensitive) req = requests.get('http://books.google.com/ngrams/graph', params=params) res = re.findall('var data = (.*?);\\n', req.text) data = {qry['ngram']: qry['timeseries'] for qry in literal_eval(res[0])} df = DataFrame(data) df.insert(0, 'year', range(startYear, endYear+1)) return list(df.columns.values)[2:]
def _standardize_index(self, df_in: pd.DataFrame, symbol: str = None,
                       datatype: str = None, barsize: str = None,
                       tz: str = None):
    """Normalize input DataFrame index to MarketDataBlock standard."""
    # Add or standardize index names in the input.
    if isinstance(df_in.index, pd.MultiIndex):
        df_in.reset_index(inplace=True)

    # Rename ambiguous column names.
    df_in.columns = [
        col_rename.get(col.strip().lower(), col.strip().lower())
        for col in df_in.columns
    ]

    # Insert Symbol, DataType, Barsize columns from arguments if not
    # found in the input dataframe.
    for col in MarketDataBlock.data_index:
        if col not in df_in.columns:
            if locals().get(col.lower(), None) is None:
                raise KeyError(
                    'No {0} argument and no {0} column in the DataFrame.'
                    .format(col))
            df_in.insert(0, col, locals()[col.lower()])

    # Convert datetime strings to pandas DatetimeIndex
    df_in['TickerTime'] = pd.DatetimeIndex(
        df_in['TickerTime'].apply(pd.Timestamp))

    # Standardize BarSize strings
    df_in['BarSize'] = df_in['BarSize'].map(timedur_standardize)

    # Set index to class-defined MultiIndex
    df_in.set_index(MarketDataBlock.data_index, inplace=True)

    # Set time zone so all DatetimeIndex are tz-aware
    df_in_tz = df_in.index.levels[self.__class__.dtlevel].tz
    if (df_in_tz is None or isinstance(df_in_tz, timezone)
            or isinstance(df_in_tz, pytz._FixedOffset)):
        # Input df has naive time index, or tzinfo is not pytz.timezone()
        if tz is None:
            raise ValueError(
                'Argument tz=None, and TickerTime.tzinfo is None (naive), '
                'datetime.timezone, or pytz._FixedOffset.')
        if df_in_tz is None:
            df_in = df_in.tz_localize(tz, level=self.__class__.dtlevel)
        else:
            df_in = df_in.tz_convert(tz, level=self.__class__.dtlevel)

    return df_in
def filter_faculty(data: pd.DataFrame) -> pd.DataFrame:
    # NOTE: self._keep_zh is expected from an enclosing scope
    # (this snippet appears to be a closure inside a method).
    data['单位'] = [self._keep_zh(x) for x in data['单位'].tolist()]
    data.insert(
        1, "Faculty",
        [re.search(r'(.*学院)(.*)', x).groups()[0] for x in data['单位']])
    data.insert(
        2, "Major",
        [re.search(r'(.*学院)(.*)', x).groups()[1] for x in data['单位']])
    data['Faculty'] = [
        x.replace("国际学院国际学院", "国际学院") for x in data['Faculty']
    ]
    return data
def test_fillna_columns(self):
    df = DataFrame(np.random.randn(10, 10))
    df.values[:, ::2] = np.nan

    result = df.fillna(method="ffill", axis=1)
    expected = df.T.fillna(method="pad").T
    tm.assert_frame_equal(result, expected)

    df.insert(6, "foo", 5)
    result = df.fillna(method="ffill", axis=1)
    expected = df.astype(float).fillna(method="ffill", axis=1)
    tm.assert_frame_equal(result, expected)
def test_fillna_columns(self):
    df = DataFrame(np.random.randn(10, 10))
    df.values[:, ::2] = np.nan

    result = df.fillna(method='ffill', axis=1)
    expected = df.T.fillna(method='pad').T
    assert_frame_equal(result, expected)

    df.insert(6, 'foo', 5)
    result = df.fillna(method='ffill', axis=1)
    expected = df.astype(float).fillna(method='ffill', axis=1)
    assert_frame_equal(result, expected)
def exam_period(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    flattened_student_data.insert(
        loc=0, column='exam_period',
        value=deepcopy(flattened_student_data.index))
    non_exam_mask = (
        (flattened_student_data['exam_period'] < definitions.MIDTERM_START_DATE) |
        (flattened_student_data['exam_period'] > definitions.MIDTERM_END_DATE))
    flattened_student_data['exam_period_inferred'] = 1
    flattened_student_data.loc[non_exam_mask, 'exam_period_inferred'] = 0
    return flattened_student_data.drop(columns='exam_period')
def test_left_join_multi_index(self, left, right, sort):
    icols = ["1st", "2nd", "3rd"]

    def bind_cols(df):
        iord = lambda a: 0 if a != a else ord(a)
        f = lambda ts: ts.map(iord) - ord("a")
        return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4

    def run_asserts(left, right, sort):
        res = left.join(right, on=icols, how="left", sort=sort)

        assert len(left) < len(res) + 1
        assert not res["4th"].isna().any()
        assert not res["5th"].isna().any()

        tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
        result = bind_cols(res.iloc[:, :-2])
        tm.assert_series_equal(res["4th"], result, check_names=False)
        assert result.name is None

        if sort:
            tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))

        out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")

        res.index = np.arange(len(res))
        tm.assert_frame_equal(out, res)

    lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
    left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"])
    left.insert(1, "2nd", np.random.randint(0, 1000, len(left)))

    i = np.random.permutation(len(left))
    right = left.iloc[i].copy()

    left["4th"] = bind_cols(left)
    right["5th"] = -bind_cols(right)
    right.set_index(icols, inplace=True)

    run_asserts(left, right, sort)

    # inject some nulls
    left.loc[1::23, "1st"] = np.nan
    left.loc[2::37, "2nd"] = np.nan
    left.loc[3::43, "3rd"] = np.nan
    left["4th"] = bind_cols(left)

    i = np.random.permutation(len(left))
    right = left.iloc[i, :-1]
    right["5th"] = -bind_cols(right)
    right.set_index(icols, inplace=True)

    run_asserts(left, right, sort)
def test_mixed_type_join_with_suffix(self):
    # GH #916
    df = DataFrame(np.random.randn(20, 6),
                   columns=["a", "b", "c", "d", "e", "f"])
    df.insert(0, "id", 0)
    df.insert(5, "dt", "foo")

    grouped = df.groupby("id")
    mn = grouped.mean()
    cn = grouped.count()

    # it works!
    mn.join(cn, rsuffix="_right")
def move_columns(
    df: _pd.DataFrame,
    colname_idx: dict,
    inplace: bool = False,
) -> _pd.DataFrame:
    """
    For a given DataFrame, move the given column-name keys to the given
    index values.

    Parameters
    ----------
    df : pd.DataFrame, mandatory
    colname_idx : dict, mandatory
        e.g. { "colname1": 2, "colname10": 1 }

    Returns
    -------
    pd.DataFrame with updated columns
    """
    if not inplace:
        df = df.copy()
    if not isinstance(df, _pd.DataFrame):
        raise TypeError("df should be a pandas DataFrame")
    if not isinstance(colname_idx, dict):
        raise TypeError("colname_idx should be a dict")
    for column_name in colname_idx.keys():
        if not isinstance(column_name, str):
            raise TypeError(f"key={column_name}, should be an str type")
        if column_name not in df.columns:
            raise ValueError(
                f"key={column_name} is not a column of the given DataFrame")
        idx = colname_idx[column_name]
        if not isinstance(idx, int):
            raise TypeError(
                f"key={column_name}, val={idx} should be an integer type")
        if df.columns.get_loc(column_name) == idx:
            continue  # was a bare `next`, which is a no-op, not a loop skip
        tmp_col = df[column_name]
        df.drop(labels=[column_name], axis=1, inplace=True)
        df.insert(loc=idx, column=column_name, value=tmp_col)
    return df
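# A minimal usage sketch for move_columns above; the three-column frame and
# the target positions are illustrative only (pandas imported as _pd to match
# the function's alias).
import pandas as _pd

frame = _pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
# Move "c" to the front and "a" to position 2.
reordered = move_columns(frame, {"c": 0, "a": 2})
print(reordered.columns.tolist())  # ['c', 'b', 'a']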
def update_dataframe():
    df = DataFrame({
        'goods': ['cola', 'egg', 'cookie', 'apple', 'banana', 'milk'],
        'quantity': [4, 5, 6, 10, 11, 12],
        'color': ['B', 'Y', 'Y', 'R', 'Y', 'W'],
        'price': [13, 11, 14, 21, 20, 22]
    })
    df['total'] = df['quantity'] * df['price']
    print('add column:\r\n', df)
    df['isQualified'] = True
    print('add column with a fixed value:\r\n', df)
    df.insert(2, 'allQuantity', [100, 50, 30, 100, 50, 30])
    print('insert column:\r\n', df)
    df.loc[6] = ['shampoo', 'B', 13, 100, 50, 650, True]
    print('insert row:\r\n', df)
    df.append({'goods': 'pear', 'quantity': 30, 'price': 12},
              ignore_index=True)  # the dict values must be scalars, not lists
    print('after append (result not assigned, df unchanged):\r\n', df)
    print('append returns the new frame:\r\n',
          df.append({'goods': 'pear', 'quantity': 30, 'price': 12},
                    ignore_index=True))
    df.drop(labels='isQualified', axis=1, inplace=True)
    print('drop one column:\r\n', df)
    df.drop(['allQuantity', 'total'], axis=1, inplace=True)
    print('drop several columns:\r\n', df)
    df.drop(6, axis=0, inplace=True)
    print('drop one row:\r\n', df)
    df.drop([4, 5], axis=0, inplace=True)
    print('drop several rows:\r\n', df)
    df.loc[3] = ['orange', 'O', 5, 12]  # added if the label does not exist
    print('update a row:\r\n', df)
    df.loc[:, 'quantity'] = 10
    print('update a column:\r\n', df)
    df.loc[df.price > 15, 'price'] = 15
    print('update by condition:\r\n', df)
    print('-' * 30, 'statistics')
    print('np sum:', np.sum(df['quantity']))
    print('df sum:', df['quantity'].sum())
    print('df describe:\r\n', df[['quantity', 'price']].describe())
    print('value counts:\r\n', df['color'].value_counts())
    # convert the dtype to category
    df['color'] = df['color'].astype('category')
    print('category describe:\r\n', df['color'].describe())
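# Note: DataFrame.append, used in update_dataframe above, was deprecated in
# pandas 1.4 and removed in pandas 2.0. A sketch of the equivalent row append
# with pd.concat; the row dict mirrors the demo above.
import pandas as pd

def append_row(df: pd.DataFrame, row: dict) -> pd.DataFrame:
    # like df.append(row, ignore_index=True), concat returns a new frame
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

# e.g. df = append_row(df, {'goods': 'pear', 'quantity': 30, 'price': 12})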
def write_tfs(tfs_path: str, data_frame: DataFrame, headers_dict: dict = None,
              save_index: Union[str, bool] = False,
              colwidth: int = DEFAULT_COLUMN_WIDTH):
    """
    Writes the DataFrame into tfs_path with headers_dict as the headers
    dictionary. If you want to keep the order of the headers, use
    collections.OrderedDict.

    Args:
        tfs_path: path to the output TFS file
        data_frame: TfsDataFrame or pandas.DataFrame to save
        headers_dict: headers of the data_frame; if empty, tries to use
            data_frame.headers
        save_index: bool or string, default False. If True, saves the index of
            the data_frame to a column identifiable by INDEX_ID. If a string,
            saves the index to a column named by that string.
        colwidth: column width
    """
    _validate(data_frame, f"to be written in {tfs_path:s}")

    if save_index:
        if isinstance(save_index, str):
            # saves index into a column with the given name
            idx_name = save_index
        else:
            # saves index into a column which can be found by INDEX_ID
            try:
                idx_name = INDEX_ID + data_frame.index.name
            except TypeError:
                idx_name = INDEX_ID
        data_frame.insert(0, idx_name, data_frame.index)

    LOGGER.debug(
        f"Attempting to write file: {basename(tfs_path)} in {dirname(tfs_path)}")

    if headers_dict is None:
        # tries to get headers from a TfsDataFrame
        try:
            headers_dict = data_frame.headers
        except AttributeError:
            headers_dict = {}

    colwidth = max(MIN_COLUMN_WIDTH, colwidth)
    headers_str = _get_headers_str(headers_dict)
    colnames_str = _get_colnames_str(data_frame.columns, colwidth)
    coltypes_str = _get_coltypes_str(data_frame.dtypes, colwidth)
    data_str = _get_data_str(data_frame, colwidth)

    with open(tfs_path, "w") as tfs_data:
        tfs_data.write("\n".join(
            (headers_str, colnames_str, coltypes_str, data_str)))

    if save_index:
        # remove the inserted column again
        data_frame.drop(data_frame.columns[0], axis=1, inplace=True)
def write_df(df: pd.DataFrame, worksheet: str, loc: str = "A1",
             columns: bool = True, worksheet_loc: int = 6):
    """Write a dataframe to a sheet without changing it."""
    worksheet = get_worksheet(worksheet, worksheet_loc)
    df = df.copy()
    df.insert(0, df.index.name, df.index)
    df.replace([0, np.inf, np.nan, float("-inf")], "", inplace=True)
    values = ([df.columns.to_list()] + df.values.tolist()
              if columns else df.values.tolist())
    worksheet.update(loc, values, raw=False)
def test_mixed_type_join_with_suffix(self):
    # GH #916
    df = DataFrame(np.random.randn(20, 6),
                   columns=['a', 'b', 'c', 'd', 'e', 'f'])
    df.insert(0, 'id', 0)
    df.insert(5, 'dt', 'foo')

    grouped = df.groupby('id')
    mn = grouped.mean()
    cn = grouped.count()

    # it works!
    mn.join(cn, rsuffix='_right')
def calc_grade(df: pd.DataFrame, periods=2):
    """
    Compute the grade (slope) in percent.

    :param df: pd.DataFrame
    :param periods: int, offset used for the diff
    """
    diff_altitude = df['altitude'].diff(periods=periods)
    diff_distance = df['distance'].diff(periods=periods)
    grade = round(diff_altitude / diff_distance * 100, 2)
    grade[:periods] = 0.0  # the first rows have nothing to diff against
    df.insert(5, 'grade', grade)
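# Usage sketch for calc_grade; the GPS-like columns are made up, and the frame
# needs at least five columns so the insert at position 5 is valid.
import pandas as pd

demo = pd.DataFrame({
    'time': range(4),
    'lat': [0.0] * 4,
    'lon': [0.0] * 4,
    'altitude': [100.0, 102.0, 105.0, 109.0],  # metres
    'distance': [0.0, 50.0, 100.0, 150.0],     # metres, cumulative
})
calc_grade(demo, periods=2)
print(demo['grade'].tolist())  # [0.0, 0.0, 5.0, 7.0]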
def _read_one_data(self, symbol):
    """Read one data from the specified URL."""
    url = "https://query1.finance.yahoo.com/v8/finance/chart/{}=X".format(symbol)
    params = self._get_params(symbol)

    resp = self._get_response(url, params=params)
    jsn = json.loads(resp.text)

    data = jsn["chart"]["result"][0]
    df = DataFrame(data["indicators"]["quote"][0])
    df.insert(0, "date", to_datetime(Series(data["timestamp"]), unit="s").dt.date)
    df.columns = map(str.capitalize, df.columns)
    return df
# requires: from typing import List, Tuple
def DataFrameColumnCopy(df_from: pd.DataFrame, df_to: pd.DataFrame,
                        columns: List[str]) -> Tuple[int, int]:
    copied = uncopied = 0
    for c in columns:
        if c not in df_from.columns:
            uncopied += 1
            continue
        col = df_from[c]
        if c not in df_to.columns:
            df_to.insert(len(df_to.columns), c, col.copy())
        else:
            df_to[c] = col.copy()
        copied += 1
    return copied, uncopied
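# Usage sketch for DataFrameColumnCopy; the frames and column names are
# illustrative.
import pandas as pd

src = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
dst = pd.DataFrame({'a': [0, 0]})
copied, uncopied = DataFrameColumnCopy(src, dst, ['a', 'b', 'missing'])
print(copied, uncopied)  # 2 1: 'a' overwritten, 'b' appended, 'missing' skipped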
def coordinator(self, rasa_chatlog_df: pd.DataFrame):
    rasa_chatlog_df.insert(2, "use_case", "")
    rasa_chatlog_df.insert(3, "outcome", "")
    conversation_ids = (
        rasa_chatlog_df["conversation_id"].drop_duplicates(keep="first").to_list())
    for id in conversation_ids:
        chatlog_sub_df = rasa_chatlog_df[rasa_chatlog_df["conversation_id"] == id]
        item_index, use_case = self.set_uc1_and_uc2_for_conversations_2(chatlog_sub_df)
        if item_index and use_case:
            rasa_chatlog_df.at[item_index, "use_case"] = use_case
        item_index, outcome = self.specify_conversation_outcome_2(chatlog_sub_df)
        if item_index and outcome:
            rasa_chatlog_df.at[item_index, "outcome"] = outcome
    return rasa_chatlog_df
def indexcnName(dftopn: pd.DataFrame):
    # insert the Chinese index names
    codetop = list(dftopn.index.levels[1])
    indexList = qa.QA_fetch_index_list_adv()
    print("number of top-ranked entries: {}".format(len(codetop)))
    dftopn.reset_index(inplace=True)
    dftopn.insert(1, 'name',
                  dftopn['code'].apply(lambda x: indexList.loc[x]['name']))
    return dftopn.set_index(['date', 'code'])
def save_training_fitness_information(g_best_dict, number_tasks, name_mha,
                                      name_paras, results_folder_path):
    results_path = f'{results_folder_path}/optimize_process/{name_mha}/{name_paras}'
    Path(results_path).mkdir(parents=True, exist_ok=True)
    fitness_file_path = f'{results_path}/training_{number_tasks}_tasks.csv'

    fit_list = array([[0, 0, 0, 0]])
    for key, value in g_best_dict.items():
        value = insert(value, 0, key, axis=1)
        fit_list = concatenate((fit_list, value), axis=0)

    # name the columns directly; the original passed header= to to_csv only,
    # which left the frame with integer columns and broke fitness_df['Epoch']
    fitness_df = DataFrame(fit_list, columns=["Epoch", "Power", "Latency", "Cost"])
    fitness_df = fitness_df.iloc[1:]
    fitness_df.to_csv(fitness_file_path, index=False)

    if Config.METRICS_NEED_MIN_OBJECTIVE_VALUES:
        # keep only the rows of the last epoch (key leaks from the loop above)
        fitness_df.drop(fitness_df[fitness_df['Epoch'] != key].index, inplace=True)
    fitness_df.insert(0, 'Name Paras', name_paras)
    fitness_df.insert(0, 'Name MHA', name_mha)
    fitness_df.insert(0, 'N Tasks', number_tasks)
    fitness_df.insert(0, 'Metric', Config.METRICS)
    fitness_df.to_csv(f'{Config.RESULTS_DATA}/summary.csv', mode='a', header=False)
def convert_meas_to_mW(dataframe: pd.DataFrame, settings):
    for name, setting in settings.items():
        voltage = setting[1] if setting[1] is not None else dataframe["Voltage [V]"]
        dataframe[name] = dataframe[name] * voltage / setting[2]
        dataframe = dataframe.rename(columns={name: setting[0]})
    dataframe.insert(
        dataframe.columns.get_loc("SoC After DCDC [mW]") + 1,
        "Efficiency DCDC",
        dataframe["SoC After DCDC [mW]"] / dataframe["SoC Before DCDC [mW]"])
    dataframe.insert(
        dataframe.columns.get_loc("Latency [ms]") + 1,
        "Energy [uJ]",
        dataframe["Latency [ms]"] * dataframe["SoC Before DCDC [mW]"])
    return dataframe
def MakeExcelFile():
    # check the connection
    objCpCybos = win32com.client.Dispatch("CpUtil.CpCybos")

    # build the dataframe
    data1 = MakeDataBase(1)
    df_1st = DataFrame(data1, columns=['날짜', '시가', '고가', '저가', '종가'])
    # keep the sorted result (the original discarded the return value)
    df_1st = df_1st.sort_index(axis=0, ascending=False)

    # append the moving averages to the existing data
    MovingAverage_3Days = round(df_1st['종가'].rolling(window=3).mean())
    df_1st.insert(len(df_1st.columns), "3일이평", MovingAverage_3Days)
    MovingAverage_5Days = round(df_1st['종가'].rolling(window=5).mean())
    df_1st.insert(len(df_1st.columns), "5일이평", MovingAverage_5Days)
    MovingAverage_10Days = round(df_1st['종가'].rolling(window=10).mean())
    df_1st.insert(len(df_1st.columns), "10일이평", MovingAverage_10Days)
    MovingAverage_20Days = round(df_1st['종가'].rolling(window=20).mean())
    df_1st.insert(len(df_1st.columns), "20일이평", MovingAverage_20Days)

    # export to Excel
    df_1st.to_excel('./MyDB.xlsx', encoding='euc_KR')
    print('database build complete!')
class InsertColumns:
    def setup(self):
        self.N = 10 ** 3
        self.df = DataFrame(index=range(self.N))

    def time_insert(self):
        np.random.seed(1234)
        for i in range(100):
            self.df.insert(0, i, np.random.randn(self.N),
                           allow_duplicates=True)

    def time_assign_with_setitem(self):
        np.random.seed(1234)
        for i in range(100):
            self.df[i] = np.random.randn(self.N)
def change_stock_1(df: pd.DataFrame):
    """
    Normalize factor/stock data: convert trade_date into a datetime
    'date' column.

    :param df: the data
    :return: the new data, or 0 if there is no trade_date column
    """
    if "trade_date" in df.columns:
        date = pd.to_datetime(df["trade_date"], format='%Y%m%d')
        df.insert(df.shape[1], 'date', date)
        # the original called df.drop("trade_date"), which targets a row
        # label and discards the result; drop the column and keep it
        df = df.drop(columns="trade_date")
        return df
    else:
        print("There is no column named trade_date")
        return 0
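# Usage sketch for change_stock_1 with a tiny tushare-style frame; the
# trade_date values are illustrative.
import pandas as pd

quotes = pd.DataFrame({'trade_date': ['20230103', '20230104'],
                       'close': [10.0, 10.5]})
quotes = change_stock_1(quotes)
print(quotes.columns.tolist())  # ['close', 'date']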
def _order_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Orders the columns of the dataframe as: date, region, observations."""
    df.insert(0, _COLUMNS.DATE.value, df.pop(_COLUMNS.DATE.value))
    reg_columns = []
    obs_columns = []
    for col in df.columns[1:]:
        if col.startswith(constants.REGION_PREFIX):
            reg_columns.append(col)
        elif col.startswith(constants.OBSERVATION_PREFIX):
            obs_columns.append(col)
        else:
            raise ValueError(f"Unknown column: '{col}'")
    columns = [_COLUMNS.DATE.value] + reg_columns + obs_columns
    return df[columns]
def _build_data_frame(inst, data, picks, long_format, mindex, index,
                      default_index, col_names=None, col_kind='channel'):
    """Build DataFrame from MNE-object-derived data array."""
    # private function; pandas already checked in calling function
    from pandas import DataFrame
    from ..source_estimate import _BaseSourceEstimate

    # build DataFrame
    if col_names is None:
        col_names = [inst.ch_names[p] for p in picks]
    df = DataFrame(data, columns=col_names)
    for i, (k, v) in enumerate(mindex):
        df.insert(i, k, v)

    # build Index
    if long_format:
        df.set_index(default_index, inplace=True)
        df.columns.name = col_kind
    elif index is not None:
        df.set_index(index, inplace=True)
        if set(index) == set(default_index):
            df.columns.name = col_kind

    # long format
    if long_format:
        df = df.stack().reset_index()
        df.rename(columns={0: 'value'}, inplace=True)
        # add column for channel types (as appropriate)
        ch_map = (None if isinstance(inst, _BaseSourceEstimate) else
                  dict(zip(np.array(inst.ch_names)[picks],
                           np.array(inst.get_channel_types())[picks])))
        if ch_map is not None:
            col_index = len(df.columns) - 1
            ch_type = df['channel'].map(ch_map)
            df.insert(col_index, 'ch_type', ch_type)
        # restore index
        if index is not None:
            df.set_index(index, inplace=True)
        # convert channel/vertex/ch_type columns to factors
        to_factor = [c for c in df.columns.tolist()
                     if c not in ('time', 'value')]
        _set_pandas_dtype(df, to_factor, 'category')
    return df
def getNgrams(self, query, corpus, startYear, endYear, smoothing,
              caseInsensitive):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing,
                  case_insensitive=caseInsensitive)
    if params['case_insensitive'] is False:
        params.pop('case_insensitive')
    if '?' in params['content']:
        params['content'] = params['content'].replace('?', '*')
    if '@' in params['content']:
        params['content'] = params['content'].replace('@', '=>')

    retry_wait_time = 20
    while True:
        try:
            req = requests.get('http://books.google.com/ngrams/graph',
                               params=params)
        except Exception:
            # wait a minute for the connection to recover
            print("Connection error, waiting 60 seconds")
            sleep(60)
            continue
        if req.status_code == 200:
            res = re.findall('var data = (.*?);\\n', req.text)
            if res:
                data = {qry['ngram']: qry['timeseries']
                        for qry in literal_eval(res[0])}
                df = DataFrame(data)
                df.insert(0, 'year', list(range(startYear, endYear + 1)))
            else:
                df = DataFrame()
            break
        if req.status_code != 200:
            retry_wait_time += 1
            print("Response %s received, waiting %s seconds"
                  % (req.status_code, retry_wait_time))
            sleep(retry_wait_time)
    return req.url, params['content'], df
def _standardize_index(self, df_in: pd.DataFrame, symbol: str = None,
                       datatype: str = None, barsize: str = None,
                       tz: str = None):
    """Normalize input DataFrame index to MarketDataBlock standard."""
    # Add or standardize index names in the input.
    if isinstance(df_in.index, pd.MultiIndex):
        df_in.reset_index(inplace=True)

    # Rename ambiguous column names.
    df_in.columns = [
        col_rename.get(col.strip().lower(), col.strip().lower())
        for col in df_in.columns]

    # Insert Symbol, DataType, Barsize columns from arguments if not
    # found in the input dataframe.
    for col in MarketDataBlock.data_index:
        if col not in df_in.columns:
            if locals().get(col.lower(), None) is None:
                raise KeyError(
                    'No {0} argument and no {0} column in the DataFrame.'
                    .format(col))
            df_in.insert(0, col, locals()[col.lower()])

    # Convert datetime strings to pandas DatetimeIndex
    df_in['TickerTime'] = pd.DatetimeIndex(
        df_in['TickerTime'].apply(pd.Timestamp))

    # Standardize BarSize strings
    df_in['BarSize'] = df_in['BarSize'].map(timedur_standardize)

    # Set index to class-defined MultiIndex
    df_in.set_index(MarketDataBlock.data_index, inplace=True)

    # Set time zone so all DatetimeIndex are tz-aware
    df_in_tz = df_in.index.levels[self.__class__.dtlevel].tz
    if (df_in_tz is None or isinstance(df_in_tz, timezone)
            or isinstance(df_in_tz, pytz._FixedOffset)):
        # Input df has naive time index, or tzinfo is not pytz.timezone()
        if tz is None:
            raise ValueError(
                'Argument tz=None, and TickerTime.tzinfo is None (naive), '
                'datetime.timezone, or pytz._FixedOffset.')
        if df_in_tz is None:
            df_in = df_in.tz_localize(tz, level=self.__class__.dtlevel)
        else:
            df_in = df_in.tz_convert(tz, level=self.__class__.dtlevel)

    return df_in
def _read_one_data(self, symbol):
    """Read one data from the specified URL."""
    url = ('https://query1.finance.yahoo.com/v8/finance/chart/{}=X'
           .format(symbol))
    params = self._get_params(symbol)

    resp = self._get_response(url, params=params)
    jsn = json.loads(resp.text)

    data = jsn['chart']['result'][0]
    df = DataFrame(data['indicators']['quote'][0])
    df.insert(0, 'date',
              to_datetime(Series(data['timestamp']), unit='s').dt.date)
    df.columns = map(str.capitalize, df.columns)
    return df
def _add_last_seen_column(df: pd.DataFrame) -> pd.DataFrame:
    """Adds a last-seen-time-ago column to a dataframe of clients.

    Args:
        df: Dataframe of clients.

    Returns:
        Dataframe with a column added.
    """
    if 'last_seen_at' not in df.columns:
        return df
    seen_ago = [client_textify.last_seen(tm) for tm in df['last_seen_at']]
    df.insert(0, 'last_seen_ago', pd.Series(seen_ago))
    return df
def replace_time(data: pd.DataFrame):
    """
    Replace the complete datetime with its (day of week, hour).

    :param data:
    :return:
    """
    date = pd.to_datetime(data[const.TIME], format=const.T_FORMAT, utc=True)
    data.insert(loc=0, column='hour', value=date.dt.hour)
    # shift so that 0: Monday becomes 0: Sunday
    data.insert(loc=0, column='dayofweek', value=(date.dt.dayofweek + 1) % 7)
    data.drop(columns=[const.TIME], inplace=True)
def test_setitem_clear_caches(self):
    # see GH#304
    df = DataFrame(
        {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]},
        index=[0, 1, 2, 3])
    df.insert(2, "z", np.nan)

    # cache it
    foo = df["z"]
    df.loc[df.index[2:], "z"] = 42

    expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")

    assert df["z"] is not foo
    tm.assert_series_equal(df["z"], expected)
def add_gene_symbols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a column of gene symbols at column_index = 0.
    Assumes the index holds gene ids.
    """
    gene_ids = df.index.to_list()
    conv = GeneConverter()
    gene_symbols = conv.convert_to_symbols(gene_ids)
    df.insert(loc=0, column='symbol', value=gene_symbols)
    return df
def getNgrams(query, corpus, startYear, endYear, smoothing, caseInsensitive):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=_corpora[corpus], smoothing=smoothing,
                  case_insensitive=caseInsensitive)
    if params['case_insensitive'] is False:
        params.pop('case_insensitive')
    if '?' in params['content']:
        params['content'] = params['content'].replace('?', '*')
    if '@' in params['content']:
        params['content'] = params['content'].replace('@', '=>')
    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    res = regex.findall('var data = (.*?);\\n', req.text)
    data = {qry['ngram']: qry['timeseries'] for qry in literal_eval(res[0])}
    df = DataFrame(data)
    df.insert(0, 'year', range(startYear, endYear + 1))
    return req.url, params['content'], df
class InsertColumns:
    def setup(self):
        self.N = 10**3
        self.df = DataFrame(index=range(self.N))

    def time_insert(self):
        np.random.seed(1234)
        for i in range(100):
            self.df.insert(0, i, np.random.randn(self.N),
                           allow_duplicates=True)

    def time_assign_with_setitem(self):
        np.random.seed(1234)
        for i in range(100):
            self.df[i] = np.random.randn(self.N)
def export(params, path, paramsToGroupBySize, hasCycles):
    """Formats extracted data and exports to Data.xlsx"""
    paramToUnit, Files = extractFolder(params, path, paramsToGroupBySize,
                                       hasCycles)
    channelToFiles = groupFilesByChannel(Files)
    writer = ExcelWriter(path + 'Data.xlsx')  # needed to save multiple sheets

    # iterate through channels
    currentChannelIndex = 1
    numOfChannels = len(channelToFiles)
    for channel in channelToFiles:
        extractedValues = {p: [] for p in params}
        names = []
        cyclesColumn = []

        # obtain the list of values and names from the files in this channel
        for File in channelToFiles[channel]:
            if hasCycles:
                appendFileInfoCycles(File, params, extractedValues, names,
                                     cyclesColumn)
            else:
                appendFileInfo(File, params, extractedValues, names)

        # create the table / DataFrame
        table = {'{} ({})'.format(p, paramToUnit[p]): extractedValues[p]
                 for p in params}
        df = DataFrame(table)
        df.insert(0, 'File Name', names)
        if hasCycles:
            df.insert(1, 'Cycle', cyclesColumn)
        sheet = 'Ch. ' + channel

        # add the sheet and autofit the column dimensions
        df.to_excel(writer, sheet_name=sheet, index=False)
        writer.sheets[sheet].column_dimensions['A'].width = len(
            max(names, key=len))

        print('--Successfully extracted '
              'from {} ({} of {})'.format(sheet, currentChannelIndex,
                                          numOfChannels))
        currentChannelIndex += 1

    # export
    writer.save()
    print('')
class InsertColumns(object):
    goal_time = 0.2

    def setup(self):
        self.N = 10**3
        self.df = DataFrame(index=range(self.N))

    def time_insert(self):
        np.random.seed(1234)
        for i in range(100):
            self.df.insert(0, i, np.random.randn(self.N))

    def time_assign_with_setitem(self):
        np.random.seed(1234)
        for i in range(100):
            self.df[i] = np.random.randn(self.N)
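# A hedged timing sketch outside the asv harnesses above, comparing repeated
# insert-at-position-0 against setitem-append; the numbers will vary by
# pandas version and machine.
import timeit
import numpy as np
from pandas import DataFrame

def _bench_insert_front():
    df = DataFrame(index=range(1000))
    for i in range(100):
        df.insert(0, i, np.random.randn(1000), allow_duplicates=True)

def _bench_setitem_append():
    df = DataFrame(index=range(1000))
    for i in range(100):
        df[i] = np.random.randn(1000)

print('insert :', timeit.timeit(_bench_insert_front, number=5))
print('setitem:', timeit.timeit(_bench_setitem_append, number=5))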
def test_insert(self):
    df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                   columns=['c', 'b', 'a'])

    df.insert(0, 'foo', df['a'])
    tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
    tm.assert_series_equal(df['a'], df['foo'], check_names=False)

    df.insert(2, 'bar', df['c'])
    tm.assert_index_equal(df.columns, Index(['foo', 'c', 'bar', 'b', 'a']))
    tm.assert_almost_equal(df['c'], df['bar'], check_names=False)

    # diff dtype

    # new item
    df['x'] = df['a'].astype('float32')
    result = Series(dict(float32=1, float64=5))
    assert (df.get_dtype_counts().sort_index() == result).all()

    # replacing current (in different block)
    df['a'] = df['a'].astype('float32')
    result = Series(dict(float32=2, float64=4))
    assert (df.get_dtype_counts().sort_index() == result).all()

    df['y'] = df['a'].astype('int32')
    result = Series(dict(float32=2, float64=4, int32=1))
    assert (df.get_dtype_counts().sort_index() == result).all()

    with pytest.raises(ValueError, match='already exists'):
        df.insert(1, 'a', df['b'])

    msg = "cannot insert c, already exists"
    with pytest.raises(ValueError, match=msg):
        df.insert(1, 'c', df['b'])

    df.columns.name = 'some_name'
    # preserve columns name field
    df.insert(0, 'baz', df['c'])
    assert df.columns.name == 'some_name'

    # GH 13522
    df = DataFrame(index=['A', 'B', 'C'])
    df['X'] = df.index
    df['X'] = ['x', 'y', 'z']
    exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
    assert_frame_equal(df, exp)
def file_prep(file):
    df = DataFrame(read_csv(file, sep='\t'))
    df.drop(df[df.apply(allele_count, axis=1) != 2].index, inplace=True)
    major_freqs = df.apply(major_prop_find, axis=1)
    major_alleles = df.apply(major_find, axis=1)
    df.insert(3, 'major_freqs', major_freqs)
    df.insert(3, 'major_alleles', major_alleles)
    df = df.transpose()

    chrom, chrom_idx = np.unique(df.loc['chrom'], return_index=True)
    super_missing_df = df == '.'
    chromosome_dict = {}
    for number in np.unique(df.loc['chrom']):
        chromosome_dict[number] = df.loc['chrom'][df.loc['chrom'] == number].index
    return df, super_missing_df, chromosome_dict
def get_fingerprint_from_DataFrame(chem_smile, fpfunc):
    molsmitmp = [Chem.MolFromSmiles(x) for x in chem_smile['smiles']]
    molsmi = []
    for i, x in enumerate(molsmitmp):
        if x is not None:
            x.SetProp("_Name", chem_smile['compound'][i])
            molsmi.append(x)
    # fpfunc can take extra parameters to generate ECFP/FCFP variants
    fps = [fpfunc(x) for x in molsmi]
    fpsmat = np.matrix(fps)
    df = DataFrame(fpsmat, index=[x.GetProp("_Name") for x in molsmi])
    df.insert(0, 'chembl', df.index)
    df.insert(1, 'smiles', [Chem.MolToSmiles(x) for x in molsmi])
    return df
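# Usage sketch, assuming RDKit is installed; GetMorganFingerprintAsBitVect
# with radius 2 yields an ECFP4-style fingerprint. The two-row frame is
# made up for illustration.
from rdkit.Chem import AllChem
from pandas import DataFrame

chem = DataFrame({'compound': ['mol1', 'mol2'], 'smiles': ['CCO', 'c1ccccc1']})
fp_df = get_fingerprint_from_DataFrame(
    chem, lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024))
print(fp_df.shape)  # (2, 1026): chembl + smiles + 1024 bits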
def test_insert_column_bug_4032(self):
    # GH4032, inserting a column and renaming was causing errors
    df = DataFrame({'b': [1.1, 2.2]})
    df = df.rename(columns={})
    df.insert(0, 'a', [1, 2])

    result = df.rename(columns={})
    str(result)
    expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
    assert_frame_equal(result, expected)

    df.insert(0, 'c', [1.3, 2.3])
    result = df.rename(columns={})
    str(result)
    expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
                         columns=['c', 'a', 'b'])
    assert_frame_equal(result, expected)
def data_prep(input_file, bad_samples_file, freq_dict=None):
    '''Prepare the ibdhmm file by removing sites that are too close to each
    other and calculating the major and minor allele. If specified, freq_dict
    should be a json file that contains the frequencies, as created by
    freq_parse.py.'''
    min_snpD = 10
    tri_allele = 0
    output_file = ('.').join(input_file.split('.')[0:-2]) + '_cleaned.txt'

    bad_samples = [sample.strip() for sample in open(bad_samples_file)]
    df = DataFrame(read_csv(input_file, sep='\t'))

    # remove bad samples
    df.drop(bad_samples, inplace=True, axis=1)

    # remove non-biallelic alleles
    # df.drop(df[df.apply(allele_count, axis=1) != 2].index, inplace=True)
    # relaxing conditions because we only have 3000 SNPs to begin with

    '''# remove SNPs that are too close to one another
    df['diff'] = df.groupby('chrom')['pos'].diff()
    df.fillna('first', inplace=True)
    # BUG NOTE: MUST FIX THE DAISY CHAIN PROBLEM
    df = df.query('diff > 10 or diff == "first"')
    df.drop('diff', axis=1, inplace=True)'''

    if not freq_dict:
        # calculate the major and minor allele
        major = df.apply(major_find, axis=1)
        minor = df.apply(minor_find, axis=1)
        major_prop = df.apply(major_prop_find, axis=1)
        minor_prop = df.apply(minor_prop_find, axis=1)
    else:
        snp_dict = json.load(open(freq_dict))
        df['keys'] = df['chrom'].map(str) + ':' + df['pos'].map(str)
        major = df['keys'].apply(lambda x: snp_dict[x]['major'])
        major_prop = df['keys'].apply(lambda x: snp_dict[x]['major_freq'])
        minor = df['keys'].apply(lambda x: snp_dict[x]['minor'])
        minor_prop = df['keys'].apply(lambda x: snp_dict[x]['minor_freq'])
        df.drop('keys', inplace=True, axis=1)

    # insert these into the dataframe for future use
    df.insert(3, 'minor_prop', minor_prop)
    df.insert(3, 'minor', minor)
    df.insert(3, 'major_prop', major_prop)
    df.insert(3, 'major', major)
    df.to_csv(output_file, sep='\t', index=False)
    return df
def average_by_cell_line():
    datasets = load_datasets()
    ess_train_data = datasets['ess_train_data']
    lines_board = load_cell_lines(CELL_LINES_LEADERBOARD_PH1)
    lines_train = load_cell_lines(CELL_LINES_TRAINING_PH1)

    data = {}
    for line in lines_board.index:
        site = lines_board.at[line, 'Site_primary']
        matches = lines_train.index[lines_train['Site_primary'] == site]
        if matches.size > 0:
            data[line] = ess_train_data.loc[:, matches].mean(1).tolist()
        else:
            data[line] = ess_train_data.mean(1).tolist()

    ess_avg_data = DataFrame(data, index=ess_train_data.index,
                             columns=lines_board.index)
    ess_avg_data.insert(0, 'Description', ess_train_data.index)
    save_gct_data(ess_avg_data, 'avg_per_line.gct')
class InfoTable(DataFrameWidget):
    def __init__(self, samples=None):
        self.initVars()
        super(InfoTable, self).__init__(self.table)

    def initVars(self):
        """Initialises variables."""
        self.columns = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        self.table = DataFrame(columns=self.columns)

    ########################################################################

    def update(self):
        plateID = self.table["Plate ID"]
        plateName = self.table["Plate Name"]
        plateKea = self.table["Plate Kea"]
        well = self.table["Well"]
        self.table = self.table.drop(
            labels=["Plate ID", "Plate Name", "Plate Kea", "Well"], axis=1)
        self.table.insert(0, "Plate ID", plateID)
        self.table.insert(1, "Plate Name", plateName)
        self.table.insert(2, "Plate Kea", plateKea)
        self.table.insert(3, "Well", well)
        self.setDataFrame(self.table)

    def append(self, appendage):
        self.table = self.table.append(appendage, ignore_index=True)
        self.update()

    def editPlates(self, edits):
        self.table = self.table.set_index("Plate ID")
        edits = edits.set_index("ID")
        self.table.update(edits)
        self.table = self.table.reset_index()

    def importPlateData(self, plateData, key):
        plateData = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(plateData)
        self.table = self.table.reset_index()

    def importSampleData(self, sampleData, tableKey, importKey):
        sampleData[tableKey] = sampleData[importKey]
        sampleData = sampleData.set_index(tableKey)
        self.table = self.table.set_index(tableKey)
        self.table = self.table.join(sampleData, rsuffix="_new")
        self.table = self.table.reset_index()

    def getKeaSexTestingData(self):
        table = self.table[["Plate ID", "Well", "Sample ID", "Plant Alt Names"]]
        table = table.set_index(["Plate ID", "Well"])
        table.rename(columns={"Plant Alt Names": "Plant AltName"}, inplace=True)
        return table
def ingest(app, path=''):
    c = ConfigParser.ConfigParser()
    for f in 'ref_perf.txt', 'auto_perf.txt', 'naive_perf.txt', 'sweep_perf.txt':
        c.read(os.path.join(path, app, f))
    df = DataFrame([dict(c.items(s)) for s in c.sections()])

    # coerce types
    for col in df.columns:
        try:
            ints = df[col].astype(int)
            df[col] = ints
        except:
            try:
                floats = df[col].astype(float)
                df[col] = floats
            except:
                pass  # keep as string

    # coerce old data names if present
    df = df.rename(columns={'nthreads': 'threads'})

    app_name = app.replace('_', ' ').title()
    df.insert(0, 'app', app)
    df.insert(0, 'app_name_pretty', app_name)
    assert len(df)

    df['throughput'] = 1000.0 / df.runtime  # runs/sec
    df['speedup'] = 0.0

    # a little awkward, but DataFrame slice indexing gets confusing
    ref = df[df.version == 'ref']

    def compute_speedup(row):
        r = ref[ref.threads == row.threads].runtime.iloc[0]
        row.speedup = r / row.runtime
        return row

    df = df.apply(compute_speedup, axis=1)
    df['runtime_norm'] = df.runtime / max(df.runtime)
    df['throughput_norm'] = df.throughput / max(df.throughput)
    return df
def getNgrams(query, corpus, startYear, endYear, smoothing, caseInsensitive):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing,
                  case_insensitive=caseInsensitive)
    if params['case_insensitive'] is False:
        params.pop('case_insensitive')
    if '?' in params['content']:
        params['content'] = params['content'].replace('?', '*')
    if '@' in params['content']:
        params['content'] = params['content'].replace('@', '=>')

    # Google blocked the plain API without ssl, so build the URL by hand and
    # fetch it with urllib2 instead of requests.get.
    param_list = '&'.join(
        [(x + "=" + str('+'.join(str(params[x]).split(' '))))
         for x in params.keys()])
    url = "https://books.google.com/ngrams/graph?" + param_list
    return_html = urllib2.urlopen(url).read()

    res = re.findall('var data = (.*?);\\n', return_html)
    data = {qry['ngram']: qry['timeseries'] for qry in literal_eval(res[0])}
    df = DataFrame(data)
    df.insert(0, 'year', range(startYear, endYear + 1))
    # requests is no longer used, so return the built URL instead of req.url
    return url, params['content'], df
def getNgrams(query, corpus, startYear, endYear, smoothing, caseInsensitive):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing,
                  case_insensitive=caseInsensitive)
    if params['case_insensitive'] is False:
        params.pop('case_insensitive')
    if '?' in params['content']:
        params['content'] = params['content'].replace('?', '*')
    if '@' in params['content']:
        params['content'] = params['content'].replace('@', '=>')
    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    res = re.findall('var data = (.*?);\\n', req.text)
    if res:
        data = {qry['ngram'].strip().replace(" ", "_"): qry['timeseries']
                for qry in literal_eval(res[0])}
        df = DataFrame(data)
        df.insert(0, 'year', list(range(startYear, startYear + len(df))))
        df.set_index('year', inplace=True)
        if len(df.columns) > 1:
            df = df[list(filter(lambda x: "(All)" in x, df.columns))]
            df = df.rename(columns={
                x: x.split("_(All)")[0].strip().replace(" ", "_")
                for x in df.columns})
    else:
        df = DataFrame()
    return req.url, params['content'], df
def test_insert(self):
    df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                   columns=['c', 'b', 'a'])

    df.insert(0, 'foo', df['a'])
    self.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
    tm.assert_series_equal(df['a'], df['foo'], check_names=False)

    df.insert(2, 'bar', df['c'])
    self.assert_index_equal(df.columns,
                            Index(['foo', 'c', 'bar', 'b', 'a']))
    tm.assert_almost_equal(df['c'], df['bar'], check_names=False)

    # diff dtype

    # new item
    df['x'] = df['a'].astype('float32')
    result = Series(dict(float64=5, float32=1))
    self.assertTrue((df.get_dtype_counts() == result).all())

    # replacing current (in different block)
    df['a'] = df['a'].astype('float32')
    result = Series(dict(float64=4, float32=2))
    self.assertTrue((df.get_dtype_counts() == result).all())

    df['y'] = df['a'].astype('int32')
    result = Series(dict(float64=4, float32=2, int32=1))
    self.assertTrue((df.get_dtype_counts() == result).all())

    with assertRaisesRegexp(ValueError, 'already exists'):
        df.insert(1, 'a', df['b'])
    self.assertRaises(ValueError, df.insert, 1, 'c', df['b'])

    df.columns.name = 'some_name'
    # preserve columns name field
    df.insert(0, 'baz', df['c'])
    self.assertEqual(df.columns.name, 'some_name')

    # GH 13522
    df = DataFrame(index=['A', 'B', 'C'])
    df['X'] = df.index
    df['X'] = ['x', 'y', 'z']
    exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
    assert_frame_equal(df, exp)
def test_insert(self):
    df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                   columns=['c', 'b', 'a'])

    df.insert(0, 'foo', df['a'])
    self.assert_numpy_array_equal(df.columns, ['foo', 'c', 'b', 'a'])
    assert_almost_equal(df['a'], df['foo'])

    df.insert(2, 'bar', df['c'])
    self.assert_numpy_array_equal(df.columns,
                                  ['foo', 'c', 'bar', 'b', 'a'])
    assert_almost_equal(df['c'], df['bar'])

    # diff dtype

    # new item
    df['x'] = df['a'].astype('float32')
    result = Series(dict(float64=5, float32=1))
    self.assertTrue((df.get_dtype_counts() == result).all())

    # replacing current (in different block)
    df['a'] = df['a'].astype('float32')
    result = Series(dict(float64=4, float32=2))
    self.assertTrue((df.get_dtype_counts() == result).all())

    df['y'] = df['a'].astype('int32')
    result = Series(dict(float64=4, float32=2, int32=1))
    self.assertTrue((df.get_dtype_counts() == result).all())

    with assertRaisesRegexp(ValueError, 'already exists'):
        df.insert(1, 'a', df['b'])
    self.assertRaises(ValueError, df.insert, 1, 'c', df['b'])

    df.columns.name = 'some_name'
    # preserve columns name field
    df.insert(0, 'baz', df['c'])
    self.assertEqual(df.columns.name, 'some_name')
rf_fit = rf.fit(train_X, train_y, sample_weight=balance_weights(train_y))

# <codecell>

rf_fit

# <codecell>

rf_prob = rf_fit.score(val_X, val_y)

# <codecell>

rf_prob

# <codecell>

rf_prob = rf_fit.predict_proba(test_X)

# <codecell>

df = DataFrame(rf_prob, columns=np.unique(train_y))
df.insert(loc=0, column="id", value=test["id"])

# <codecell>

df.to_csv("RfWeightedSolution.csv", index=False)

# <codecell>
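# Note: balance_weights was removed from scikit-learn long ago; a sketch of
# the modern equivalent for the weighted fit above, assuming the same
# rf / train_X / train_y objects.
from sklearn.utils.class_weight import compute_sample_weight

rf_fit = rf.fit(train_X, train_y,
                sample_weight=compute_sample_weight('balanced', train_y))

# <codecell>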
def add_var2(self, varname, target=None, source='free'):
    """
    Add a variable in the dataframe

    Parameters
    ----------
    varname : str
        name of the variable
    target : float
        target for the margin of the variable
    source : str, default 'free'
        database source
    """
    w_init = self.weights_init * self.champm
    w = self.weights * self.champm
    inputs = self.simulation.survey
    output_table = self.simulation.output_table

    varcol = self.simulation.get_col(varname)
    entity = self.entity
    enum = inputs.column_by_name.get('qui' + self.entity).enum
    people = [x[1] for x in enum]

    if varname in inputs.column_by_name:
        value = inputs.get_value(varname, index=idx)
    elif output_table is not None and varname in output_table.column_by_name:
        value = output_table.get_value(varname, index=idx, opt=people,
                                       sum_=True)

    label = varcol.label
    # TODO: rewrite this using a pivot table
    items = [('marge', w[self.champm]),
             ('marge initiale', w_init[self.champm])]
    if varcol.__class__ in MODCOLS:
        items.append(('mod', value[self.champm]))
        df = DataFrame.from_items(items)
        res = df.groupby('mod', sort=True).sum()
    else:
        res = DataFrame(index=['total'],
                        data={'marge': (value * w).sum(),
                              'marge initiale': (value * w_init).sum()})
    res.insert(0, u"modalités", u"")
    res.insert(2, "cible", 0)
    res.insert(2, u"cible ajustée", 0)
    res.insert(4, "source", source)
    mods = res.index

    if target is not None:
        if len(mods) != len(target.keys()):
            drop_indices = [(varname, mod) for mod in target.keys()]
            if source == 'input':
                self.input_margins_df.drop(drop_indices, inplace=True)
                self.input_margins_df.index.names = ['var', 'mod']
            if source == 'output':
                self.output_margins_df.drop(drop_indices, inplace=True)
                self.output_margins_df.index.names = ['var', 'mod']
            return

    if isinstance(varcol, EnumCol):
        if varcol.enum:
            enum = varcol.enum
            res[u'modalités'] = [enum._vars[mod] for mod in mods]
            res['mod'] = mods
        else:
            res[u'modalités'] = [mod for mod in mods]
            res['mod'] = mods
    elif isinstance(varcol, BoolCol):
        res[u'modalités'] = bool(mods)
        res['mod'] = mods
    elif isinstance(varcol, IntCol):
        res[u'modalités'] = mods
        res['mod'] = mods
    elif isinstance(varcol, AgeCol):
        res[u'modalités'] = mods
        res['mod'] = mods
    else:
        res[u'modalités'] = "total"
        res['mod'] = 0

    if label is not None:
        res['variable'] = label
    else:
        res['variable'] = varname
    res['var'] = varname

    if target is not None:
        for mod, margin in target.iteritems():
            if mod == varname:  # dirty hack to deal with non-categorical data
                res['cible'][0] = margin
            else:
                res['cible'][mod] = margin

    if self.frame is None:
        self.frame = res
    else:
        self.frame = concat([self.frame, res])
    self.frame = self.frame.reset_index(drop=True)
fpr, tpr, thresholds = roc_curve(ROCtestTRG, probas[:, 1])
roc_auc = auc(fpr, tpr)
pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('SVC', roc_auc))

# RandomForestClassifier
probas = model_rfc.fit(ROCtrainTRN, ROCtrainTRG).predict_proba(ROCtestTRN)
fpr, tpr, thresholds = roc_curve(ROCtestTRG, probas[:, 1])
roc_auc = auc(fpr, tpr)
pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('RandomForest', roc_auc))

# KNeighborsClassifier
probas = model_knc.fit(ROCtrainTRN, ROCtrainTRG).predict_proba(ROCtestTRN)
fpr, tpr, thresholds = roc_curve(ROCtestTRG, probas[:, 1])
roc_auc = auc(fpr, tpr)
pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('KNeighborsClassifier', roc_auc))

# LogisticRegression
probas = model_lr.fit(ROCtrainTRN, ROCtrainTRG).predict_proba(ROCtestTRN)
fpr, tpr, thresholds = roc_curve(ROCtestTRG, probas[:, 1])
roc_auc = auc(fpr, tpr)
pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('LogisticRegression', roc_auc))

pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.legend(loc=0, fontsize='small')
pl.show()

model_rfc.fit(train, target)
result.insert(1, 'Survived', model_rfc.predict(test))
result.to_csv('Kaggle_Titanic/Result/test.csv', index=False)
class MyTuShare(object):
    # Column = Enum("Range")
    """Wrapper around the tushare earnings-forecast data."""

    def __init__(self):
        TestArr = [
            ["601366", "利群股份", "预升", "2018-04-16", "0.1700", "30.33"],
            ["000018", "神州长城", "预增", "2018-04-16", "0.0583", "140%~170%"],
            ["000626", "远大控股", "预亏", "2018-04-16", "0.1602", "0"],
            ["000430", "张家界", "预亏", "2018-04-14", "0.0330", "-241.78%~-237.06%"],
            ["300107", "张家界2", "预亏", "2018-04-19", "0.0320", "-30%"],
        ]
        if Test:
            self.forecastDF = DataFrame(
                TestArr,
                columns=["code", "name", "type", "report_date",
                         "pre_eps", "range"])
        else:
            self.forecastDF = ts.forecast_data(2018, 1)
        self.currentPrice = None
        # Sample rows of the forecast frame:
        # code    name      type  report_date  pre_eps  range
        # 601366  利群股份   预升  2018-04-16   0.1700   30.33
        # 000018  神州长城   预增  2018-04-16   0.0583   140%~170%
        # 000626  远大控股   预亏  2018-04-16   0.1602   0

    def getClosePrice(self, code):
        today = "2018-04-20"
        todayKdata = ts.get_hist_data(code, start=today)
        return todayKdata.loc[today, "close"]

    def insertClosePriceColumn(self):
        prices = []
        for code in self.forecastDF['code']:
            print(code)
            price = self.getClosePrice(code)
            prices.append(price)
        self.forecastDF.insert(self.forecastDF.columns.size, "close", prices)

    def initCurrentPrice(self):
        # get the previous day's close price
        self.currentPrice = ts.get_hist_data('300107', start="2018-04-20")
        print(self.currentPrice.loc["2018-04-20", "close"])
        # get today's ticks:
        # df = ts.get_today_ticks('300107', retry_count=20)
        # get trades with volume >= 500 lots:
        # self.currentPrice = ts.get_sina_dd('300107', date='2018-04-20', vol=500)

    def searchString(self, regExpr, line):
        patternAck = re.compile(regExpr)
        match = patternAck.search(line)
        return match

    def sort(self, columns):
        weight = 0
        for column in columns:
            if column == Column.Range:
                avgs = []
                for rowNum in self.forecastDF.index:
                    rangeStr = self.forecastDF.loc[rowNum]['range']
                    avg = 0
                    match1 = self.searchString(
                        r'(-*\d+\.*\d*)%~(-*\d+\.*\d*)%', rangeStr)
                    match2 = self.searchString(r'(-*\d+\.*\d*)%', rangeStr)
                    if match1:
                        low = float(match1.group(1))
                        high = float(match1.group(2))
                        avg = (low + high) / 2
                    elif match2:
                        avg = float(match2.group(1))
                    else:
                        avg = float(rangeStr)
                    avgs.append(avg)
                self.forecastDF.insert(self.forecastDF.columns.size,
                                       "avgRange", avgs)
                self.forecastDF = self.forecastDF.sort_values(
                    by=["avgRange"], axis=0, ascending=False)
            elif column == Column.ReportDate:
                self.forecastDF = self.forecastDF.sort_values(
                    by=["report_date"], axis=0, ascending=False)
            else:
                pass
            self.forecastDF = self.forecastDF.reindex()
            print("*" * 20 + "After Sort")
            print(self.forecastDF)

            numRow = self.forecastDF.index.size
            if "myrank" in self.forecastDF.columns.tolist():
                myrankIndex = self.forecastDF.columns.tolist().index("myrank")
                for i in range(0, numRow):
                    print("1.>>>>" + str(self.forecastDF.iloc[i, myrankIndex])
                          + ":" + str(columns[column]) + "=>" + str(i))
                    self.forecastDF.iloc[i, myrankIndex] += (
                        (numRow - i) * columns[column])
            else:
                arr = list(range(1, numRow + 1))
                arr.reverse()
                for i in range(0, len(arr)):
                    arr[i] = columns[column] * arr[i]
                self.forecastDF.insert(self.forecastDF.columns.size,
                                       "myrank", arr)
                print("*" * 20 + "add myrank")
                print(self.forecastDF)
            weight += columns[column]

        myrankIndex = self.forecastDF.columns.tolist().index("myrank")
        for i in range(0, self.forecastDF.index.size):
            self.forecastDF.iloc[i, myrankIndex] = round(
                float(self.forecastDF.iloc[i, myrankIndex]) / weight, 2)
        print("*" * 30 + "final")
        self.forecastDF = self.forecastDF.sort_values(
            by=["myrank"], axis=0, ascending=False)
        print(self.forecastDF)
def test_column_dups_operations(self):

    def check(result, expected=None):
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # assignment
    # GH 3687
    arr = np.random.randn(3, 2)
    idx = lrange(2)
    df = DataFrame(arr, columns=['A', 'A'])
    df.columns = idx
    expected = DataFrame(arr, columns=idx)
    check(df, expected)

    idx = date_range('20130101', periods=4, freq='Q-NOV')
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['a', 'a', 'a', 'a'])
    df.columns = idx
    expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                         columns=idx)
    check(df, expected)

    # insert
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    df['string'] = 'bah'
    expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                          [2, 1, 3, 5, 'bah']],
                         columns=['foo', 'bar', 'foo', 'hello', 'string'])
    check(df, expected)
    with assertRaisesRegexp(ValueError, 'Length of value'):
        df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

    # insert same dtype
    df['foo2'] = 3
    expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                          [2, 1, 3, 5, 'bah', 3]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)

    # set (non-dup)
    df['foo2'] = 4
    expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                          [2, 1, 3, 5, 'bah', 4]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)
    df['foo2'] = 3

    # delete (non dup)
    del df['bar']
    expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                          [2, 3, 5, 'bah', 3]],
                         columns=['foo', 'foo', 'hello', 'string', 'foo2'])
    check(df, expected)

    # try to delete again (its not consolidated)
    del df['hello']
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # consolidate
    df = df.consolidate()
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # insert
    df.insert(2, 'new_col', 5.)
    expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                          [2, 3, 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col', 'string', 'foo2'])
    check(df, expected)

    # insert a dup
    assertRaisesRegexp(ValueError, 'cannot insert',
                       df.insert, 2, 'new_col', 4.)
    df.insert(2, 'new_col', 4., allow_duplicates=True)
    expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                          [1, 2, 4., 5., 'bah', 3],
                          [2, 3, 4., 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col', 'new_col',
                                  'string', 'foo2'])
    check(df, expected)

    # delete (dup)
    del df['foo']
    expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                          [4., 5., 'bah', 3]],
                         columns=['new_col', 'new_col', 'string', 'foo2'])
    assert_frame_equal(df, expected)

    # dup across dtypes
    df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    check(df)

    df['foo2'] = 7.
    expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                          [2, 1, 3., 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    result = df['foo']
    expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                         columns=['foo', 'foo'])
    check(result, expected)

    # multiple replacements
    df['foo'] = 'string'
    expected = DataFrame([['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    del df['foo']
    expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                         columns=['bar', 'hello', 'foo2'])
    check(df, expected)

    # values
    df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
    result = df.values
    expected = np.array([[1, 2.5], [3, 4.5]])
    self.assertTrue((result == expected).all().all())

    # rename, GH 4403
    df4 = DataFrame(
        {'TClose': [22.02], 'RT': [0.0454], 'TExg': [0.0422]},
        index=MultiIndex.from_tuples([(600809, 20130331)],
                                     names=['STK_ID', 'RPT_Date']))
    df5 = DataFrame({'STK_ID': [600809] * 3,
                     'RPT_Date': [20120930, 20121231, 20130331],
                     'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                     'TClose': [38.05, 41.66, 30.01]},
                    index=MultiIndex.from_tuples(
                        [(600809, 20120930), (600809, 20121231),
                         (600809, 20130331)],
                        names=['STK_ID', 'RPT_Date']))
    k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
    result = k.rename(columns={'TClose_x': 'TClose',
                               'TClose_y': 'QT_Close'})
    str(result)
    result.dtypes

    expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                            u('饡驦'), 30.01]],
                          columns=['RT', 'TClose', 'TExg', 'RPT_Date',
                                   'STK_ID', 'STK_Name', 'QT_Close'])
                .set_index(['STK_ID', 'RPT_Date'], drop=False))
    assert_frame_equal(result, expected)

    # reindex is invalid!
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    self.assertRaises(ValueError, df.reindex, columns=['bar'])
    self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

    # drop
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    result = df.drop(['a'], axis=1)
    expected = DataFrame([[1], [1], [1]], columns=['bar'])
    check(result, expected)
    result = df.drop('a', axis=1)
    check(result, expected)

    # describe
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['bar', 'a', 'a'], dtype='float64')
    result = df.describe()
    s = df.iloc[:, 0].describe()
    expected = pd.concat([s, s, s], keys=df.columns, axis=1)
    check(result, expected)

    # check column dups with index equal and not equal to df's index
    df = DataFrame(np.random.randn(5, 3),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'A'])
    for index in [df.index, pd.Index(list('edcba'))]:
        this_df = df.copy()
        expected_ser = pd.Series(index.values, index=this_df.index)
        expected_df = DataFrame.from_items([('A', expected_ser),
                                            ('B', this_df['B']),
                                            ('A', expected_ser)])
        this_df['A'] = index
        check(this_df, expected_df)

    # operations
    for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
        df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
        expected = getattr(df, op)(df)
        expected.columns = ['A', 'A']
        df.columns = ['A', 'A']
        result = getattr(df, op)(df)
        check(result, expected)

    # multiple assignments that change dtypes
    # the location indexer is a slice
    # GH 6120
    df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
    expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])
    df['that'] = 1.0
    check(df, expected)

    df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
    expected = DataFrame(1, index=range(5), columns=['that', 'that'])
    df['that'] = 1
    check(df, expected)