def prep_data(data, balance=True):
    '''Prepares a machine learning dataframe from a mitdb HDFStore object.

    Args:
        data (HDFStore): mitdb HDFStore data
        balance (bool, optional): balance arrythmia/not-arrythmia classes

    Returns:
        DataFrame
    '''
    records = filter(lambda x: re.search('record', x), data.keys())
    records = [data[key] for key in records]
    data = DataFrame()
    for record in records:
        if record.arrythmia.sum() > 1:
            data = pd.concat([data, conform_data(record)])
    data.reset_index(drop=True, inplace=True)

    if balance:
        mask = data.y == 1
        size = data[mask].shape[0]
        index = np.random.choice(data[~mask].index, size)
        index = np.concatenate([index, data[mask].index])
        data = data.loc[index]  # .ix is deprecated; .loc works on the reset integer index
        data.reset_index(drop=True, inplace=True)
    return data
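# A minimal usage sketch for prep_data, assuming a hypothetical HDFStore
# laid out with one DataFrame per 'record_*' key, each carrying an
# 'arrythmia' indicator column and a binary label column 'y':
import pandas as pd

store = pd.HDFStore('mitdb.h5', mode='r')  # hypothetical store path
ml_df = prep_data(store, balance=True)
store.close()
print(ml_df.y.value_counts())  # classes should now be roughly balanced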
def get_dataframe_option1(stock_list, date, zs_amplifier=1):
    date = format_date(date, "%Y-%m-%d")
    stock_list = list(set(stock_list))
    # stock_list = ['601800', '600528']
    dframe_list = []
    for stock in stock_list:
        if stock == u'ZS000001':  # SSE Composite Index
            tmp_frame = get_minly_frame(stock, date, id_type=0)
            zs_amplifier = 1
        else:
            tmp_frame = get_minly_frame(stock, date)
        tmp_frame = tmp_frame[['bartime', 'closeprice']]
        yesterday = get_lastN_date(date, 1)
        yeframe = get_mysqlData([stock], [yesterday])
        if len(yeframe) > 0:
            pre_close = yeframe.loc[0, 'CLOSE_PRICE']
        else:
            pre_close = 10000
        # compute the percent change; the amplitude can be scaled up
        tmp_frame['closeprice'] = zs_amplifier * normalize_frame(tmp_frame['closeprice'], pre_close)
        tmp_frame.columns = ['barTime', stock]
        tmp_frame.set_index('barTime', inplace=True)
        dframe_list.append(tmp_frame)
    dframe = pd.concat(dframe_list, axis=1)
    dframe.reset_index(inplace=True)  # reset_index takes no positional index argument
    return dframe
def test_join_multi_to_multi(self, join_type):
    # GH 20475
    leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]],
                                        names=['abc', 'xy', 'num'])
    left = DataFrame({'v1': range(12)}, index=leftindex)

    rightindex = MultiIndex.from_product([list('abc'), list('xy')],
                                         names=['abc', 'xy'])
    right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                      index=rightindex)

    result = left.join(right, on=['abc', 'xy'], how=join_type)
    expected = (left.reset_index()
                    .merge(right.reset_index(),
                           on=['abc', 'xy'], how=join_type)
                    .set_index(['abc', 'xy', 'num']))
    assert_frame_equal(expected, result)

    msg = (r'len\(left_on\) must equal the number of levels in the index'
           ' of "right"')
    with pytest.raises(ValueError, match=msg):
        left.join(right, on='xy', how=join_type)

    with pytest.raises(ValueError, match=msg):
        right.join(left, on=['abc', 'xy'], how=join_type)
def clicksDataframe(clicks_data):
    clicks_dataframe = DataFrame(clicks_data,
                                 columns=['date', 'cardName', 'position',
                                          'totalClicks', 'uniqueClicks'])
    clicks_dataframe = clicks_dataframe.apply(to_numeric, errors='ignore')
    clicks_dataframe.drop('date', axis=1, inplace=True)
    clicks_dataframe = (clicks_dataframe.groupby(['cardName', 'position'])
                                        .sum()
                                        .sort_values(by='uniqueClicks', ascending=False))
    clicks_dataframe.reset_index(inplace=True)
    return clicks_dataframe
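# The groupby-sum-sort pattern above can be sanity-checked on toy input;
# this sketch assumes clicksDataframe is importable and that rows follow
# the ['date', 'cardName', 'position', 'totalClicks', 'uniqueClicks'] layout:
toy_clicks = [
    ['2020-01-01', 'intro', 1, '10', '7'],
    ['2020-01-02', 'intro', 1, '5', '4'],
    ['2020-01-01', 'stats', 2, '8', '6'],
]
df = clicksDataframe(toy_clicks)
print(df)  # one row per (cardName, position), summed and sorted by uniqueClicks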
def _count_by_entity(data, var, entity, bornes):
    '''Counts the number of `var` values between the `bornes` bounds
    within the `entity`.'''
    id = 'id' + entity
    qui = 'qui' + entity
    data.index = data[id]
    cond = (bornes[0] <= data[var]) & (data[var] <= bornes[1]) & (data[qui] > 1)
    col = DataFrame(data.loc[cond, :].groupby(id).size(),
                    index=data.index).fillna(0)
    col = col.reset_index()  # reset_index is not in-place; keep its result
    return col
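# A toy illustration of the counting logic, assuming a hypothetical 'men'
# (household) entity with 'idmen'/'quimen' columns, where quimen 0/1 are
# the head and spouse and values > 1 are other members:
from pandas import DataFrame

toy = DataFrame({
    'idmen':  [1, 1, 1, 2, 2],
    'quimen': [0, 1, 2, 0, 2],
    'age':    [40, 38, 10, 35, 8],
})
# count non-head household members aged between 0 and 17, per household
print(_count_by_entity(toy, 'age', 'men', bornes=[0, 17]))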
def nearestNeighborsSetup(filename, stateList):
    df_specimens = formatChecker(filename)
    print('Getting the weather stations')
    with open('input/acis_station_ID.pickle') as f:
        weatherStationsMetaData = cPickle.load(f)
    # weatherStationsMetaData = weatherStations(stateList)
    # weatherStationsMetaData = read_csv('weatherStation/acis_station_ID.csv')
    df_stations = DataFrame.from_dict(weatherStationsMetaData, orient='index', dtype=None)

    '''Loads the lat/long coordinates of the specimens and weather stations
    into numpy arrays. nearestNeighborsResults() will return the K nearest
    stations with the index value. The index will then be replaced by the
    UID to match the ACIS data server.'''
    # Number of points
    np1 = np.array(df_specimens['longitude']).size
    np2 = np.array(df_stations['longitude']).size
    # Search radius
    r = .25
    # Number of nearest stations returned
    k = 10

    d1 = np.empty((np1, 2))
    d2 = np.empty((np2, 2))
    d1[:, 0] = np.array(df_specimens['latitude'])
    d1[:, 1] = np.array(df_specimens['longitude'])
    d2[:, 0] = np.array(df_stations['latitude'])
    d2[:, 1] = np.array(df_stations['longitude'])

    result, distance = nearestNeighborsResults(d1.copy(), d2.copy(), r, k)

    columnindex = []
    closestStationList = [nearestNeighborsColumnString(x) for x in range(k)]
    for f in closestStationList:
        columnindex.append(f())

    # temp variable for 0-N array
    t1 = np.arange(np2)
    # temp variable for 'uid' ID
    t2 = np.array(df_stations['uid'])

    df_results = DataFrame(result, columns=columnindex)  # creates a pandas DataFrame
    uid_index = DataFrame({'0_closest_weather_station': t1, 'uid': t2})
    for index, column_name in enumerate(columnindex):
        temp = uid_index.rename(columns={'0_closest_weather_station': column_name,
                                         'uid': column_name + "s"})
        df_results = (df_results.reset_index()
                                .merge(temp, how='left', on=column_name, sort=False)
                                .sort_values('index'))  # .sort() is deprecated
        if index != 0:
            del df_results['level_0']
        del df_results[column_name]
        del df_results['index']
        df_results = df_results.reset_index()

    return concat([df_specimens, df_results], axis=1), distance, weatherStationsMetaData
class ResetIndex:

    params = [None, 'US/Eastern']
    param_names = ['tz']

    def setup(self, tz):
        idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz)
        self.df = DataFrame(np.random.randn(1000, 2), index=idx)

    def time_reset_datetimeindex(self, tz):
        self.df.reset_index()
def homePageToSubjectPageDataframe(data):
    subject_dataframe = DataFrame(data, columns=['date', 'page_title',
                                                 'views', 'uniqueViews'])
    subject_dataframe = subject_dataframe.apply(to_numeric, errors='ignore')
    subject_dataframe.drop('date', axis=1, inplace=True)
    subject_dataframe = (subject_dataframe.groupby(['page_title'])
                                          .sum()
                                          .sort_values(by='uniqueViews', ascending=False))
    subject_dataframe.reset_index(inplace=True)
    subject_dataframe['subject'] = subject_dataframe['page_title'].apply(strip_edx_page_title)
    subject_dataframe['totalViews'] = subject_dataframe['uniqueViews'].sum()
    subject_dataframe['Pct'] = subject_dataframe['uniqueViews'] / subject_dataframe['totalViews']
    subject_dataframe = subject_dataframe[subject_dataframe['Pct'] > 0.0001]
    return subject_dataframe[['subject', 'uniqueViews', 'Pct']]
def append_village_areas(divname):
    im_vil = pd.read_csv('../data/%s_village_images.csv' % divname.lower())
    shape_helper = ShapeHelper('../data/shapefiles/fixed_village_shapefiles/%s/%s.shp'
                               % (divname.lower(), divname.lower()),
                               lat_offset, lon_offset)
    areas = shape_helper.get_shape_areas('village')
    areas_df = DataFrame(areas, index=['area'])
    areas_df = areas_df.transpose()
    areas_df.reset_index(inplace=True)
    areas_df.rename(columns={'index': 'village'}, inplace=True)
    im_vil_areas = pd.merge(im_vil, areas_df, how='left')
    im_vil_areas.set_index('image', inplace=True)
    im_vil_areas.to_csv('../data/%s_village_areas_images.csv' % divname.lower())
def sql2pandas(db_url, table_name, locriterion=None):
    """Connects to the database at db_url and converts the psiturk datatable
    table_name to a pandas df. Only includes trials that meet all criterion
    functions given in locriterion (default takes all trials)."""
    from sqlalchemy import MetaData, Table, create_engine
    from json import loads
    from pandas import DataFrame, concat

    data_column_name = 'datastring'

    # boilerplate sqlalchemy setup
    engine = create_engine(db_url)
    metadata = MetaData()
    metadata.bind = engine
    table = Table(table_name, metadata, autoload=True)

    # make a query and loop through
    s = table.select()
    tablerows = s.execute()

    # convert sql rows to a list of dicts, each containing a subject's full experiment
    # fields from orig datatable that you want attached to every trial
    expFields = ['uniqueid', 'assignmentid', 'workerid', 'hitid', 'status']
    expData = []
    for row in tablerows:
        try:
            subExpData = loads(row[data_column_name])
            for field in expFields:
                subExpData[field] = row[field]
            expData.append(subExpData)
        except Exception:
            continue

    # turn from nested list to flat list of trials
    minidicts = []
    for subExpData in expData:
        for trial in subExpData['data']:
            trialdata = trial['trialdata']
            for field in expFields:
                trialdata[field] = subExpData[field]

            # check if trial is valid, if any criterion were passed
            includeThisTrial = True
            if locriterion:
                includeThisTrial = meetsCriterion(trialdata, locriterion)
            if includeThisTrial:
                minidicts.append(trialdata)

    # convert minidicts into dataframe!
    df = DataFrame(minidicts)

    # get rid of residue from minidfs
    df.reset_index(drop=True, inplace=True)
    return df
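# Hypothetical usage sketch: pull a filtered set of trials from one table.
# The database URL, table name, and predicate list are assumptions; each
# criterion is taken to be a function of a single trial dict, which is how
# meetsCriterion is assumed to apply them.
criteria = [
    lambda trial: trial.get('phase') == 'test',
    lambda trial: trial.get('rt', 0) > 0,
]
trials = sql2pandas('sqlite:///participants.db', 'assignments',
                    locriterion=criteria)
print(trials.groupby('workerid').size())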
class InfoTable(DataFrameWidget):

    def __init__(self, samples=None):
        self.initVars()
        super(InfoTable, self).__init__(self.table)

    def initVars(self):
        """Initialises variables."""
        self.columns = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        self.table = DataFrame(columns=self.columns)

    ########################################################################

    def update(self):
        plateID = self.table["Plate ID"]
        plateName = self.table["Plate Name"]
        plateKea = self.table["Plate Kea"]
        well = self.table["Well"]
        self.table = self.table.drop(
            labels=["Plate ID", "Plate Name", "Plate Kea", "Well"], axis=1)
        self.table.insert(0, "Plate ID", plateID)
        self.table.insert(1, "Plate Name", plateName)
        self.table.insert(2, "Plate Kea", plateKea)
        self.table.insert(3, "Well", well)
        self.setDataFrame(self.table)

    def append(self, appendage):
        self.table = self.table.append(appendage, ignore_index=True)
        self.update()

    def editPlates(self, edits):
        self.table = self.table.set_index("Plate ID")
        edits = edits.set_index("ID")
        self.table.update(edits)
        self.table = self.table.reset_index()

    def importPlateData(self, plateData, key):
        plateData = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(plateData)
        self.table = self.table.reset_index()

    def importSampleData(self, sampleData, tableKey, importKey):
        sampleData[tableKey] = sampleData[importKey]
        sampleData = sampleData.set_index(tableKey)
        self.table = self.table.set_index(tableKey)
        self.table = self.table.join(sampleData, rsuffix="_new")
        self.table = self.table.reset_index()

    def getKeaSexTestingData(self):
        table = self.table[["Plate ID", "Well", "Sample ID", "Plant Alt Names"]]
        table = table.set_index(["Plate ID", "Well"])
        table.rename(columns={"Plant Alt Names": "Plant AltName"}, inplace=True)
        return table
def _fill(self, df, year=None):
    """Takes the age/sex profile (per capita transfers) found in df
    to fill year `year`, or all empty years if year is None.
    This is a private method.

    Parameters
    ----------
    df : DataFrame
        a dataframe containing the profiles
    year : int, default None
        if None fill all the years, else only the given year
    """
    if not isinstance(df, DataFrame):
        df = DataFrame(df)

    for col_name in df.columns:
        if col_name not in self._types:
            self.new_type(col_name)
            typ = col_name
            tmp = df[typ]
            tmp = tmp.unstack(level="year")
            tmp = tmp.dropna(axis=1, how="all")
            self._types_years[typ] = tmp.columns
        else:
            raise Exception("column already exists")

    if year is None:
        df_insert = df.reset_index(level='year', drop=True)
        years = sorted(self.index_sets['year'])
        list_df = [df_insert] * len(years)
        df_tot = concat(list_df, keys=years, names=['year'])
        df_tot = df_tot.reorder_levels(['age', 'sex', 'year'], axis=0)
    else:
        yr = year
        df_tot = None
        df_insert = df.reset_index()
        df_insert['year'] = yr
        if df_tot is None:
            df_tot = df_insert
        else:
            # append returns a copy; the result must be kept
            df_tot = df_tot.append(df_insert, ignore_index=True)
        df_tot = df_tot.set_index(['age', 'sex', 'year'])

    self.update(df_tot)
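# The year-replication step above leans on concat-with-keys; a
# self-contained sketch of that pattern with a toy age/sex profile:
import pandas as pd

profile = pd.DataFrame(
    {'transfer': [100.0, 80.0]},
    index=pd.MultiIndex.from_tuples([(40, 0), (40, 1)], names=['age', 'sex']))
years = [2010, 2011]
df_tot = pd.concat([profile] * len(years), keys=years, names=['year'])
df_tot = df_tot.reorder_levels(['age', 'sex', 'year'], axis=0)
print(df_tot)  # the same profile, repeated once per 'year' level value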
def _decode_solutions(self, solutions):
    decoded_solutions = DataFrame(columns=["targets", "fitness"])
    index = 0
    for solution in solutions:
        combinations = self._decoder(solution.candidate, flat=True, decompose=True)
        for targets in combinations:
            if len(targets) > 0:
                decoded_solutions.loc[index] = [tuple(targets), solution.fitness]
                index += 1
    decoded_solutions.drop_duplicates(inplace=True, subset="targets")
    decoded_solutions.reset_index(inplace=True)
    return decoded_solutions
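# Storing the targets as tuples is what lets drop_duplicates(subset="targets")
# work: tuples are hashable where lists are not. A minimal illustration with
# made-up gene names:
from pandas import DataFrame

sols = DataFrame({'targets': [('geneA',), ('geneA',), ('geneA', 'geneB')],
                  'fitness': [0.9, 0.9, 0.7]})
sols.drop_duplicates(inplace=True, subset='targets')
sols.reset_index(drop=True, inplace=True)
print(sols)  # the duplicate ('geneA',) row is gone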
def get_cpu_sw_map(dfds, cap_time_usec, task_re):
    df_list = []
    dfsw_list = []
    for dfd in dfds:
        df = filter_df_core(dfd.df, task_re, True)
        # at this point we have a set of df that look like this:
        #         task_name  duration
        # 0    ASA.1.vcpu0      7954
        # 1    ASA.1.vcpu0      5475
        # 2    ASA.1.vcpu0      4151
        if df.empty:
            continue
        gb = df.groupby("task_name", as_index=False)

        # sum all duration for each task
        df = gb.aggregate(np.sum)
        if dfd.multiplier > 1.0:
            df["duration"] = (df["duration"] * dfd.multiplier).astype(int)
        df["percent"] = ((df["duration"] * 100 * 10) // cap_time_usec) / 10
        if len(dfds) > 1:
            df["task_name"] = df["task_name"] + "." + dfd.short_name
        df_list.append(df)

        # count number of rows with same task and cpu
        dfsw = DataFrame(gb.size())
        dfsw.reset_index(inplace=True)
        dfsw.rename(columns={0: "count"}, inplace=True)
        if dfd.multiplier > 1.0:
            dfsw["count"] = (dfsw["count"] * dfd.multiplier).astype(int)
        else:
            dfsw["count"] = dfsw["count"].astype(int)
        dfsw_list.append(dfsw)

    if not df_list:
        return None
    df = pandas.concat(df_list)
    df = df.drop("duration", axis=1)
    dfsw = pandas.concat(dfsw_list)
    df = pandas.merge(df, dfsw, on="task_name")
    # Result:
    #              task_name  percent  count
    # 0   ASA.01.vcpu0.1x218     72.0   1998
    # 1   ASA.01.vcpu0.2x208     61.8   2128
    # 2   ASA.02.vcpu0.2x208     58.9   2177

    # transform this into a dict where the key is the task_name and the value
    # is a list [percent, count]
    return df.set_index("task_name").T.to_dict("list")
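# The final set_index().T.to_dict('list') step is worth seeing in isolation;
# it maps each task_name to a [percent, count] pair (toy values below):
import pandas as pd

summary = pd.DataFrame({'task_name': ['ASA.01.vcpu0', 'ASA.02.vcpu0'],
                        'percent': [72.0, 58.9],
                        'count': [1998, 2177]})
print(summary.set_index('task_name').T.to_dict('list'))
# -> {'ASA.01.vcpu0': [72.0, 1998.0], 'ASA.02.vcpu0': [58.9, 2177.0]}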
def _standardize_index(
        self, df_in: pd.DataFrame, symbol: str=None, datatype: str=None,
        barsize: str=None, tz: str=None):
    """Normalize input DataFrame index to MarketDataBlock standard.
    """
    # Add or standardize index names in the input.
    if isinstance(df_in.index, pd.MultiIndex):
        df_in.reset_index(inplace=True)

    # Rename ambiguous column names.
    df_in.columns = [
        col_rename.get(col.strip().lower(), col.strip().lower())
        for col in df_in.columns]

    # Insert Symbol, DataType, Barsize columns from arguments if not
    # found in the input dataframe.
    for col in MarketDataBlock.data_index:
        if col not in df_in.columns:
            if locals().get(col.lower(), None) is None:
                raise KeyError(
                    'No {0} argument and no {0} column in the DataFrame.'
                    .format(col))
            df_in.insert(0, col, locals()[col.lower()])

    # Convert datetime strings to pandas DatetimeIndex
    df_in['TickerTime'] = pd.DatetimeIndex(
        df_in['TickerTime'].apply(pd.Timestamp))

    # Standardize BarSize strings
    df_in['BarSize'] = df_in['BarSize'].map(timedur_standardize)

    # Set index to class-defined MultiIndex
    df_in.set_index(MarketDataBlock.data_index, inplace=True)

    # Set time zone so all DatetimeIndex are tz-aware
    df_in_tz = df_in.index.levels[self.__class__.dtlevel].tz
    if df_in_tz is None or isinstance(df_in_tz, timezone) or \
            isinstance(df_in_tz, pytz._FixedOffset):
        # Input df has a naive time index, or tzinfo is not pytz.timezone()
        if tz is None:
            raise ValueError(
                'Argument tz=None, and TickerTime.tzinfo is None (naive), '
                'datetime.timezone, or pytz._FixedOffset.')
        if df_in_tz is None:
            df_in = df_in.tz_localize(tz, level=self.__class__.dtlevel)
        else:
            df_in = df_in.tz_convert(tz, level=self.__class__.dtlevel)

    return df_in
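# The level-aware timezone handling at the end can be exercised on its own;
# a sketch assuming a two-level (Symbol, TickerTime) index and made-up prices:
import pandas as pd

idx = pd.MultiIndex.from_product(
    [['AAPL'], pd.date_range('2021-01-04 09:30', periods=3, freq='min')],
    names=['Symbol', 'TickerTime'])
df = pd.DataFrame({'close': [129.4, 129.5, 129.3]}, index=idx)

df = df.tz_localize('US/Eastern', level=1)  # naive datetimes: localize
df = df.tz_convert('UTC', level=1)          # tz-aware: convert instead
print(df.index.levels[1].tz)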
def build_data(symbol_list, n=15, flag=1, blag=10):
    train = DataFrame()
    test = DataFrame()
    for i in symbol_list:
        print(i)
        try:
            path = '45-165caps/' + i + '.csv'
            data = pd.read_csv(path)
            forward = forward_lag(data, i, flag)
            back = back_lag(data, i, blag)
            today_back = prediction_back_lag(data, i, blag)
            combined = combine_lags(forward, back)
            combined = combined.loc[combined['Forward Lag 1'] < .2, :].reset_index()
            del combined['index']

            # Train ----------------------------------------------------------
            random_sample = [random.randint(0, len(combined) - 1)
                             for _ in range(n)]
            data_slice = combined.loc[random_sample, :].reset_index()
            if len(train) == 0:
                train = data_slice
            else:
                train = pd.concat([train, data_slice], axis=0)

            # Test -----------------------------------------------------------
            data_slice = DataFrame(today_back.iloc[len(today_back) - 1, :]).T
            if len(test) == 0:
                test = data_slice
            else:
                test = pd.concat([test, data_slice], axis=0)
        except Exception:
            print('\tSkipped')

    train = train.reset_index()
    del train['level_0']
    del train['index']
    test = test.reset_index()
    del test['level_0']
    del test['index']

    combined.to_csv('combined1.csv', sep=',', index=False)
    today_back.to_csv('today_back1.csv', sep=',', index=False)
    return train, test
def test_dti_reset_index_round_trip():
    dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D')
    d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti)
    d2 = d1.reset_index()
    assert d2.dtypes[0] == np.dtype('M8[ns]')
    d3 = d2.set_index('index')
    assert_frame_equal(d1, d3, check_names=False)

    # #2329
    stamp = datetime(2012, 11, 22)
    df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value'])
    df = df.set_index('Date')

    assert df.index[0] == stamp
    assert df.reset_index()['Date'][0] == stamp
def test_delevel_infer_dtype(self):
    # avoid shadowing the builtin `tuple` in the comprehension
    tuples = list(cart_product(["foo", "bar"], [10, 20], [1.0, 1.1]))
    index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
    df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"],
                   index=index)
    deleveled = df.reset_index()
    self.assert_(com.is_integer_dtype(deleveled["prm1"]))
    self.assert_(com.is_float_dtype(deleveled["prm2"]))
def test_frame_reset_index(self):
    dr = date_range('2012-06-02', periods=10, tz='US/Eastern')
    df = DataFrame(np.random.randn(len(dr)), dr)
    roundtripped = df.reset_index().set_index('index')
    xp = df.index.tz
    rs = roundtripped.index.tz
    self.assertEquals(xp, rs)
def test_dti_reset_index_round_trip(self):
    dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D')
    d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti)
    d2 = d1.reset_index()
    self.assert_(d2.dtypes[0] == np.datetime64)
    d3 = d2.set_index('index')
    assert_frame_equal(d1, d3)
def get_travel_times(df):
    df = df[df['section'] != 0]
    g = df['time'].groupby([df['veh_id'], df['section']])
    res = DataFrame([g.max() - g.min(), g.min()]).T
    res.columns = ['tt', 'time']
    res = res.reset_index()
    return res
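# A toy run of get_travel_times, assuming one row per (veh_id, section)
# detection with a numeric 'time' stamp; section 0 is filtered out:
from pandas import DataFrame

obs = DataFrame({'veh_id':  [1, 1, 1, 2, 2],
                 'section': [1, 1, 2, 1, 1],
                 'time':    [0.0, 30.0, 45.0, 10.0, 25.0]})
print(get_travel_times(obs))
# one row per (veh_id, section): tt = max(time) - min(time), time = entry time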
def test_dti_reset_index_round_trip(self):
    dti = DatetimeIndex(start="1/1/2001", end="6/1/2001", freq="D")
    d1 = DataFrame({"v": np.random.rand(len(dti))}, index=dti)
    d2 = d1.reset_index()
    self.assert_(d2.dtypes[0] == np.datetime64)
    d3 = d2.set_index("index")
    assert_frame_equal(d1, d3)
def test_drop_multiindex_not_lexsorted(self):
    # GH 11640

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    self.assertTrue(lexsorted_df.columns.is_lexsorted())

    # define the non-lexsorted version
    not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                 data=[[1, 'b1', 'c1', 3],
                                       [1, 'b2', 'c2', 4]])
    not_lexsorted_df = not_lexsorted_df.pivot_table(
        index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = not_lexsorted_df.reset_index()
    self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.drop('a', axis=1)
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.drop('a', axis=1)
    tm.assert_frame_equal(result, expected)
def test_infer_objects(self):
    # GH 11221
    df = DataFrame({'a': ['a', 1, 2, 3],
                    'b': ['b', 2.0, 3.0, 4.1],
                    'c': ['c', datetime(2016, 1, 1),
                          datetime(2016, 1, 2),
                          datetime(2016, 1, 3)],
                    'd': [1, 2, 3, 'd']},
                   columns=['a', 'b', 'c', 'd'])
    df = df.iloc[1:].infer_objects()

    assert df['a'].dtype == 'int64'
    assert df['b'].dtype == 'float64'
    assert df['c'].dtype == 'M8[ns]'
    assert df['d'].dtype == 'object'

    expected = DataFrame({'a': [1, 2, 3],
                          'b': [2.0, 3.0, 4.1],
                          'c': [datetime(2016, 1, 1),
                                datetime(2016, 1, 2),
                                datetime(2016, 1, 3)],
                          'd': [2, 3, 'd']},
                         columns=['a', 'b', 'c', 'd'])
    # reconstruct frame to verify inference is same
    tm.assert_frame_equal(df.reset_index(drop=True), expected)
def test_frame_reset_index(self):
    dr = date_range("2012-06-02", periods=10, tz=self.tzstr("US/Eastern"))
    df = DataFrame(np.random.randn(len(dr)), dr)
    roundtripped = df.reset_index().set_index("index")
    xp = df.index.tz
    rs = roundtripped.index.tz
    self.assertEqual(xp, rs)
def test_frame_reset_index(self, tz):
    dr = date_range('2012-06-02', periods=10, tz=tz)
    df = DataFrame(np.random.randn(len(dr)), dr)
    roundtripped = df.reset_index().set_index('index')
    xp = df.index.tz
    rs = roundtripped.index.tz
    assert xp == rs
def trim_index_df(df: pd.DataFrame, index_names_to_keep: list, inplace=False):
    '''Drops all index levels except for the specified index names.'''
    indexes_to_drop = list(df.index.names)
    try:
        # handles a single index name passed in place of a list
        indexes_to_drop.remove(index_names_to_keep)
    except ValueError:
        try:
            for idxn in index_names_to_keep:
                indexes_to_drop.remove(idxn)
        except ValueError:
            pass
    if inplace:
        df.reset_index(level=indexes_to_drop, drop=True, inplace=True)
    else:
        return df.reset_index(level=indexes_to_drop, drop=True)
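# Example usage, assuming a frame indexed by ('year', 'month', 'day')
# where only the 'year' level should survive:
import pandas as pd

idx = pd.MultiIndex.from_tuples([(2020, 1, 1), (2020, 1, 2), (2021, 2, 3)],
                                names=['year', 'month', 'day'])
df = pd.DataFrame({'v': [1, 2, 3]}, index=idx)

trimmed = trim_index_df(df, ['year'])  # drops the 'month' and 'day' levels
print(trimmed.index.names)             # only the 'year' level remains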
def test_set_reset_index(self):
    df = DataFrame({'A': range(10)})
    s = pd.cut(df.A, 5)
    df['B'] = s
    df = df.set_index('B')

    df = df.reset_index()
def seriesPosPrc_fitTimeFrame(dfList, PosPrc, ts, PosDirList):
    dataList = []
    for i in range(len(dfList)):
        tf = DataFrame(index=ts)
        df = dfList[i]
        tf[PosPrc] = df[PosPrc]
        tf = tf.fillna(-99999)
        tf.reset_index(inplace=True)
        tf['PosDir'] = PosDirList[i]
        for j in range(len(tf)):
            # .ix is deprecated; .loc works on the reset integer index
            if tf.loc[j, PosPrc] == -99999:
                if j == 0:
                    tf.loc[j, PosPrc] = 0
                elif tf.loc[j - 1, 'PosDir'] != 0:
                    tf.loc[j, PosPrc] = tf.loc[j - 1, PosPrc]
                else:
                    tf.loc[j, PosPrc] = 0
        dataList.append(np.asarray(tf[PosPrc]))
    return dataList
def test_join_multi_levels2(self):

    # some more advanced merges
    # GH6360
    household = DataFrame(
        {
            "household_id": [1, 2, 2, 3, 3, 3, 4],
            "asset_id": [
                "nl0000301109",
                "nl0000301109",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "nl0000289965",
                np.nan,
            ],
            "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
        },
        columns=["household_id", "asset_id", "share"],
    ).set_index(["household_id", "asset_id"])

    log_return = DataFrame({
        "asset_id": [
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "lu0197800237",
            "lu0197800237",
        ],
        "t": [233, 234, 235, 180, 181],
        "log_return": [
            0.09604978,
            -0.06524096,
            0.03532373,
            0.03025441,
            0.036997,
        ],
    }).set_index(["asset_id", "t"])

    expected = (DataFrame({
        "household_id": [2, 2, 2, 3, 3, 3, 3, 3],
        "asset_id": [
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "lu0197800237",
            "lu0197800237",
        ],
        "t": [233, 234, 235, 233, 234, 235, 180, 181],
        "share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
        "log_return": [
            0.09604978,
            -0.06524096,
            0.03532373,
            0.09604978,
            -0.06524096,
            0.03532373,
            0.03025441,
            0.036997,
        ],
    }).set_index(["household_id", "asset_id", "t"])
      .reindex(columns=["share", "log_return"]))

    # this is the equivalency
    result = merge(
        household.reset_index(),
        log_return.reset_index(),
        on=["asset_id"],
        how="inner",
    ).set_index(["household_id", "asset_id", "t"])
    tm.assert_frame_equal(result, expected)

    expected = (DataFrame({
        "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
        "asset_id": [
            "nl0000301109",
            "nl0000301109",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "gb00b03mlx29",
            "lu0197800237",
            "lu0197800237",
            "nl0000289965",
            None,
        ],
        "t": [
            None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None,
        ],
        "share": [
            1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6, 0.25, 1.0,
        ],
        "log_return": [
            None,
            None,
            0.09604978,
            -0.06524096,
            0.03532373,
            0.09604978,
            -0.06524096,
            0.03532373,
            0.03025441,
            0.036997,
            None,
            None,
        ],
    }).set_index(["household_id", "asset_id", "t"])
      .reindex(columns=["share", "log_return"]))

    result = merge(
        household.reset_index(),
        log_return.reset_index(),
        on=["asset_id"],
        how="outer",
    ).set_index(["household_id", "asset_id", "t"])
    tm.assert_frame_equal(result, expected)
def keywords_extract(input_data, d, word_num, gram=(1, 2)):
    word_graph, vocab = build_word_graph(input_data, gram)
    rank_idx = get_ranks(word_graph, d)
    keywords = [(vocab[idx], round(wght, 5))
                for idx, wght in rank_idx.items()]
    keywords = DataFrame(keywords, columns=['keyword', 'weight']).sort_values(
        by='weight', ascending=False)
    # keywords['weight'] = [round(float(v)/sum(keywords['weight']), 5)
    #                       for v in keywords['weight']]
    keywords = keywords.reset_index(drop=True).iloc[:word_num, :]
    # keywords = dict(zip(keywords.noun, keywords.weight))

    for i in range(len(keywords)):
        check = keywords['keyword'][i].split(' ')
        c = len(check)
        if c == 3:
            if check[0] == check[1] and check[1] == check[2]:
                keywords['keyword'][i] = check[0]  # was assigning the whole column
                c = 1
            elif check[0] == check[1]:
                keywords['keyword'][i] = check[0] + ' ' + check[2]
                check[1] = check[2]
                print('removed check[0]' + check[0])
                del check[2]
                c = 2
            elif check[1] == check[2]:
                keywords['keyword'][i] = check[0] + ' ' + check[1]
                print('removed check[1]' + check[1])
                del check[2]
                c = 2
            elif check[0] == check[2]:
                keywords['keyword'][i] = check[0] + ' ' + check[1]
                print('removed check[2]' + check[2])
                del check[2]
                c = 2
            else:
                if check[0] in check_values1:
                    check[0] = check_key1[check_values1.index(check[0])]
                if check[1] in check_values1:
                    check[1] = check_key1[check_values1.index(check[1])]
                if check[2] in check_values2:
                    check[2] = check_key2[check_values2.index(check[2])]
                keywords['keyword'][i] = check[0] + ' ' + check[1] + ' ' + check[2]
        if c == 2:
            if check[0] == check[1]:
                keywords['keyword'][i] = check[0]
                if keywords['keyword'][i] in check_values2:
                    keywords['keyword'][i] = check_key2[check_values2.index(check[0])]
            else:
                if check[0] in check_values1:
                    check[0] = check_key1[check_values1.index(check[0])]
                if check[1] in check_values2:
                    check[1] = check_key2[check_values2.index(check[1])]
                keywords['keyword'][i] = check[0] + ' ' + check[1]
        if c == 1:
            if keywords['keyword'][i] in check_values2:
                keywords['keyword'][i] = check_key2[check_values2.index(check[0])]

    for i in range(len(keywords)):
        check = keywords['keyword'][i].replace(" ", "")
        if check in check_values3:
            keywords['keyword'][i] = check_key3[check_values3.index(check)]

    return keywords
def call_alleles(
    alignments: pd.DataFrame,
    ref_filepath: Optional[str] = None,
    ref: Optional[str] = None,
    barcode_interval: Tuple[int, int] = (20, 34),
    cutsite_locations: List[int] = [112, 166, 220],
    cutsite_width: int = 12,
    context: bool = True,
    context_size: int = 5,
) -> pd.DataFrame:
    """Call indels from CIGAR strings.

    Given many alignments, we extract the indels by comparing the CIGAR
    strings of each alignment to the reference sequence.

    Args:
        alignments: Alignments provided in DataFrame
        ref_filepath: Filepath to the reference sequence
        ref: Nucleotide sequence of the reference
        barcode_interval: Interval in reference corresponding to the
            integration barcode
        cutsite_locations: A list of all cutsite positions in the reference
        cutsite_width: Number of nucleotides left and right of cutsite
            location that indels can appear in.
        context: Include sequence context around indels
        context_size: Number of bases to the right and left to include as
            context

    Returns:
        A DataFrame mapping each sequence alignment to the called indels.
    """
    if (ref is None) == (ref_filepath is None):
        raise PreprocessError(
            "Either `ref_filepath` or `ref` must be provided.")

    alignment_to_indel = {}
    alignment_to_intBC = {}

    if ref_filepath:
        ref = str(list(SeqIO.parse(ref_filepath, "fasta"))[0].seq)

    for _, row in tqdm(
        alignments.iterrows(),
        total=alignments.shape[0],
        desc="Parsing CIGAR strings into indels",
    ):
        intBC, indels = alignment_utilities.parse_cigar(
            row.CIGAR,
            row.Seq,
            ref,
            row.ReferenceBegin,
            row.QueryBegin,
            barcode_interval,
            cutsite_locations,
            cutsite_width,
            context=context,
            context_size=context_size,
        )

        alignment_to_indel[row.readName] = indels
        alignment_to_intBC[row.readName] = intBC

    indel_df = pd.DataFrame.from_dict(
        alignment_to_indel,
        orient="index",
        columns=[f"r{i}" for i in range(1, len(cutsite_locations) + 1)],
    )
    indel_df["allele"] = indel_df.apply(
        lambda x: "".join([str(i) for i in x.values]), axis=1)
    indel_df["intBC"] = indel_df.index.map(alignment_to_intBC)

    alignments.set_index("readName", inplace=True)
    alignments = alignments.join(indel_df)
    alignments.reset_index(inplace=True)

    # check cut-sites and raise a warning if any missing data is detected
    cutsites = utilities.get_default_cut_site_columns(alignments)
    if np.any((alignments[cutsites] == "").sum(axis=0) > 0):
        warnings.warn(
            "Detected missing data in alleles. You might"
            " consider re-running align_sequences with a"
            " lower gap-open penalty, or using a separate"
            " alignment strategy.",
            PreprocessWarning,
        )

    return alignments
class TestMultiLevel(unittest.TestCase):

    def setUp(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        a, b = self.frame[:5], self.frame[5:]

        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)

        result = a['A'].append(b['A'])
        tm.assert_series_equal(result, self.frame['A'])

    def test_reindex_level(self):
        # axis=0
        month_sums = self.ymd.sum(level='month')
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum)
        assert_frame_equal(result, expected)

        # Series
        result = month_sums['A'].reindex(self.ymd.index, level=1)
        expected = self.ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(result, expected)

        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level='month')
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(result, expected)

    def test_binops_level(self):
        def _check_op(opname):
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level='month')
            result = op(self.ymd, month_sums, level='month')
            broadcasted = self.ymd.groupby(level='month').transform(np.sum)
            expected = op(self.ymd, broadcasted)
            assert_frame_equal(result, expected)

            # Series
            op = getattr(Series, opname)
            result = op(self.ymd['A'], month_sums['A'], level='month')
            broadcasted = self.ymd['A'].groupby(level='month').transform(np.sum)
            expected = op(self.ymd['A'], broadcasted)
            assert_series_equal(result, expected)

        _check_op('sub')
        _check_op('add')
        _check_op('mul')
        _check_op('div')

    def test_pickle(self):
        import cPickle

        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)
        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T

        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        s = self.ymd['A']

        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        expected = s.reindex(s.index[42:65])
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s.reindex(s.index[49:51])
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_getitem_corner(self):
        s = self.ymd['A']

        # don't segfault, GH #495
        # out of bounds access
        self.assertRaises(IndexError, s.__getitem__, len(self.ymd))

        # generator
        result = s[(x > 0 for x in s)]
        expected = s[s > 0]
        assert_series_equal(result, expected)

    def test_series_setitem(self):
        s = self.ymd['A']

        s[2000, 3] = np.nan
        self.assert_(isnull(s.values[42:65]).all())
        self.assert_(notnull(s.values[:42]).all())
        self.assert_(notnull(s.values[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_frame_getitem_setitem_slice(self):
        # getitem
        result = self.frame.ix[:4]
        expected = self.frame[:4]
        assert_frame_equal(result, expected)

        # setitem
        cp = self.frame.copy()
        cp.ix[:4] = 0

        self.assert_((cp.values[:4] == 0).all())
        self.assert_((cp.values[4:] != 0).all())

    def test_frame_getitem_setitem_multislice(self):
        levels = [['t1', 't2'], ['a', 'b', 'c']]
        labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]]
        midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id'])
        df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx)

        result = df.ix[:, 'value']
        assert_series_equal(df['value'], result)

        result = df.ix[1:3, 'value']
        assert_series_equal(df['value'][1:3], result)

        result = df.ix[:, :]
        assert_frame_equal(df, result)

        result = df
        df.ix[:, 'value'] = 10
        result['value'] = 10
        assert_frame_equal(df, result)

        df.ix[:, :] = 10
        assert_frame_equal(df, result)

    def test_getitem_tuple_plus_slice(self):
        # GH #671
        df = DataFrame({'a': range(10),
                        'b': range(10),
                        'c': np.random.randn(10),
                        'd': np.random.randn(10)})

        idf = df.set_index(['a', 'b'])

        result = idf.ix[(0, 0), :]
        expected = idf.ix[0, 0]
        expected2 = idf.xs((0, 0))

        assert_series_equal(result, expected)
        assert_series_equal(result, expected2)

    def test_xs(self):
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_xs_level(self):
        result = self.frame.xs('two', level='second')
        expected = self.frame[self.frame.index.get_level_values(1) == 'two']
        expected.index = expected.index.droplevel(1)
        assert_frame_equal(result, expected)

        index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'),
                                        ('p', 'q', 'r')])
        df = DataFrame(np.random.randn(3, 5), index=index)
        result = df.xs('c', level=2)
        expected = df[1:2]
        expected.index = expected.index.droplevel(2)
        assert_frame_equal(result, expected)

    def test_xs_level_multiple(self):
        from pandas import read_table
        from StringIO import StringIO
        text = """A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

        df = read_table(StringIO(text), sep='\s+')

        result = df.xs(('a', 4), level=['one', 'four'])
        expected = df.xs('a').xs(4, level='four')
        assert_frame_equal(result, expected)

    def test_xs_level0(self):
        from pandas import read_table
        from StringIO import StringIO
        text = """A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

        df = read_table(StringIO(text), sep='\s+')

        result = df.xs('a', level=0)
        expected = df.xs('a')
        self.assertEqual(len(result), 2)
        assert_frame_equal(result, expected)

    def test_xs_level_series(self):
        s = self.frame['A']
        result = s[:, 'two']
        expected = self.frame.xs('two', level=1)['A']
        assert_series_equal(result, expected)

        s = self.ymd['A']
        result = s[2000, 5]
        expected = self.ymd.ix[2000, 5]['A']
        assert_series_equal(result, expected)

        # not implementing this for now
        self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4)))

        # result = s[2000, 3:4]
        # lv = s.index.get_level_values(1)
        # expected = s[(lv == 3) | (lv == 4)]
        # expected.index = expected.index.droplevel(0)
        # assert_series_equal(result, expected)

        # can do this though

    def test_get_loc_single_level(self):
        s = Series(np.random.randn(len(self.single_level)),
                   index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        df = self.frame.T

        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df['bar']
        result2 = df.ix[:, 'bar']
        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_setitem_slice_integers(self):
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        frame = DataFrame(np.random.randn(len(index), 4), index=index,
                          columns=['a', 'b', 'c', 'd'])
        res = frame.ix[1:2]
        exp = frame.reindex(frame.index[2:])
        assert_frame_equal(res, exp)

        frame.ix[1:2] = 7
        self.assert_((frame.ix[1:2] == 7).values.all())

        series = Series(np.random.randn(len(index)), index=index)
        res = series.ix[1:2]
        exp = series.reindex(series.index[2:])
        assert_series_equal(res, exp)

        series.ix[1:2] = 7
        self.assert_((series.ix[1:2] == 7).values.all())

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)

        frame = DataFrame(np.random.randn(6, 2), index=index)

        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)

        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]

        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_getitem_slice_not_sorted(self):
        df = self.frame.sortlevel(1).T

        # buglet with int typechecking
        result = df.ix[:, :np.int32(3)]
        expected = df.reindex(columns=df.columns[:3])
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_frame_setitem_ix(self):
        self.frame.ix[('bar', 'two'), 'B'] = 5
        self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5)

        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[('bar', 'two'), 1] = 7
        self.assertEquals(df.ix[('bar', 'two'), 1], 7)

    def test_fancy_slice_partial(self):
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000, 2):(2000, 4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.reset_index()['A'].sortlevel)

        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        # avoid shadowing the builtin `tuple` in the comprehension
        tuples = list(cart_product(['foo', 'bar'], [10, 20], [1.0, 1.1]))
        index = MultiIndex.from_tuples(tuples,
                                       names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled['prm1']))
        self.assert_(com.is_float_dtype(deleveled['prm2']))

    def test_reset_index_with_drop(self):
        deleveled = self.ymd.reset_index(drop=True)
        self.assertEquals(len(deleveled.columns), len(self.ymd.columns))

        deleveled = self.series.reset_index()
        self.assert_(isinstance(deleveled, DataFrame))
        self.assert_(len(deleveled.columns) ==
                     len(self.series.index.levels) + 1)

        deleveled = self.series.reset_index(drop=True)
        self.assert_(isinstance(deleveled, Series))

    def test_sortlevel_by_name(self):
        self.frame.index.names = ['first', 'second']
        result = self.frame.sortlevel(level='second')
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'
        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                expected = expected.reindex_like(result).astype('i8')
                assert_frame_equal(result, expected)

        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

        self.frame['D'] = 'foo'
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ['A', 'B', 'C'])
    def test_count_level_series(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

    def test_count_level_corner(self):
        s = self.frame['A'][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)

        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0],
                             columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        self.assert_(stacked['bar'].dtype == np.float_)

    def test_unstack_bug(self):
        df = DataFrame({'state': ['naive', 'naive', 'naive', 'activ',
                                  'activ', 'activ'],
                        'exp': ['a', 'b', 'b', 'b', 'a', 'a'],
                        'barcode': [1, 2, 3, 4, 1, 3],
                        'v': ['hi', 'hi', 'bye', 'bye', 'bye', 'peace'],
                        'extra': np.arange(6.)})
        result = df.groupby(['state', 'exp', 'barcode', 'v']).apply(len)
        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked,
                            result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, 'first')
        self.assertEquals(unstacked.columns.names, ['exp', 'second'])

        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        result = self.frame.unstack('second')
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        unstacked = self.frame.unstack('second')
        result = unstacked.stack('exp')
        expected = self.frame.unstack().stack(0)
        assert_frame_equal(result, expected)

        result = self.frame.stack('exp')
        expected = self.frame.stack()
        assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        unstacked = self.ymd.unstack(['year', 'month'])
        expected = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(unstacked, expected)
        self.assertEquals(unstacked.columns.names, expected.columns.names)

        # series
        s = self.ymd['A']
        s_unstacked = s.unstack(['year', 'month'])
        assert_frame_equal(s_unstacked, expected['A'])

        restacked = unstacked.stack(['year', 'month'])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)

        assert_frame_equal(restacked, self.ymd)
        self.assertEquals(restacked.index.names, self.ymd.index.names)

        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1)
        assert_frame_equal(unstacked, expected)

        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1)
        assert_frame_equal(unstacked, expected)

    def test_groupby_transform(self):
        s = self.frame['A']
        grouper = s.index.get_level_values(0)

        grouped = s.groupby(grouper)
        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_groupby_corner(self):
        midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']],
                          labels=[[0], [0], [0]],
                          names=['one', 'two', 'three'])
        df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'],
                       index=midx)
        # should work
        df.groupby(level='three')

    def test_join(self):
        a = self.frame.ix[:5, ['A']]
        b = self.frame.ix[2:, ['B', 'C']]

        joined = a.join(b, how='outer').reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan

        self.assert_(not np.isnan(joined.values).all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        swapped = self.frame['A'].swaplevel(0, 1)
        swapped2 = self.frame['A'].swaplevel('first', 'second')
        self.assert_(not swapped.index.equals(self.frame.index))
        assert_series_equal(swapped, swapped2)

        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel('second', 'first')
        self.assert_(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)

        ft = self.frame.T
        swapped = ft.swaplevel('first', 'second', axis=1)
        exp = self.frame.swaplevel('first', 'second').T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        panel = Panel({'ItemA': self.frame,
                       'ItemB': self.frame * 2})

        result = panel.swaplevel(0, 1, axis='major')
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(result, expected)

    def test_reorder_levels(self):
        result = self.ymd.reorder_levels(['month', 'day', 'year'])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(result, expected)

        result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
        expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(result, expected)

        result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(result, expected)

        self.assertRaises(Exception, self.ymd.index.reorder_levels,
                          [1, 2, 3])

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        x = Series(data=[1, 2, 3],
                   index=MultiIndex.from_tuples([("A", 1), ("A", 2),
                                                 ("B", 3)]))

        y = Series(data=[4, 5, 6],
                   index=MultiIndex.from_tuples([("Z", 1), ("Z", 2),
                                                 ("B", 3)]))

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        df = self.frame.T
        df['foo'].values[:] = 0
        self.assert_((df['foo'].values == 0).all())

        # but not if it's mixed-type
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        self.assert_((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]

        result = df['foo']
        result2 = df.ix[:, 'foo']
        expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        df = df.T
        result = df.xs('foo')
        result2 = df.ix['foo']
        expected = df.reindex(df.index[arrays[0] == 'foo'])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)

        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]

        result = s['qux']
        result2 = s.ix['qux']
        expected = s[arrays[0] == 'qux']
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

    def test_count(self):
        frame = self.frame.copy()
        frame.index.names = ['a', 'b']

        result = frame.count(level='b')
        expect = self.frame.count(level=1)
        assert_frame_equal(result, expect)

        result = frame.count(level='a')
        expect = self.frame.count(level=0)
        assert_frame_equal(result, expect)

        series = self.series.copy()
        series.index.names = ['a', 'b']

        result = series.count(level='b')
        expect = self.series.count(level=1)
        assert_series_equal(result, expect)

        result = series.count(level='a')
        expect = self.series.count(level=0)
        assert_series_equal(result, expect)
        self.assertRaises(Exception, series.count, 'x')
        self.assertRaises(Exception, frame.count, level='x')

    AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                     'mad', 'std', 'var']

    def test_series_group_min_max(self):
        for op, level, skipna in cart_product(self.AGG_FUNCTIONS,
                                              range(2),
                                              [False, True]):
            grouped = self.series.groupby(level=level)
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            # skipna=True
            leftside = grouped.agg(aggf)
            rightside = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(leftside, rightside)

    def test_frame_group_ops(self):
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
                                                    range(2), range(2),
                                                    [False, True]):
            if axis == 0:
                frame = self.frame
            else:
                frame = self.frame.T

            grouped = frame.groupby(level=level, axis=axis)

            aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
            leftside = grouped.agg(aggf)
            rightside = getattr(frame, op)(level=level, axis=axis,
                                           skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assert_(leftside._get_axis(axis).equals(level_index))
            self.assert_(rightside._get_axis(axis).equals(level_index))

            assert_frame_equal(leftside, rightside)

    def test_frame_series_agg_multiple_levels(self):
        result = self.ymd.sum(level=['year', 'month'])
        expected = self.ymd.groupby(level=['year', 'month']).sum()
        assert_frame_equal(result, expected)

        result = self.ymd['A'].sum(level=['year', 'month'])
        expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
        assert_series_equal(result, expected)

    def test_groupby_multilevel(self):
        result = self.ymd.groupby(level=[0, 1]).mean()

        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)

        expected = self.ymd.groupby([k1, k2]).mean()

        assert_frame_equal(result, expected)
        self.assertEquals(result.index.names, self.ymd.index.names[:2])

        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        pass

    def test_multilevel_consolidate(self):
        index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                        ('bar', 'one'), ('bar', 'two')])
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        df['Totals', ''] = df.sum(1)
        df = df.consolidate()

    def test_ix_preserve_names(self):
        result = self.ymd.ix[2000]
        result2 = self.ymd['A'].ix[2000]
        self.assertEquals(result.index.names, self.ymd.index.names[1:])
        self.assertEquals(result2.index.names, self.ymd.index.names[1:])

        result = self.ymd.ix[2000, 2]
        result2 = self.ymd['A'].ix[2000, 2]
        self.assertEquals(result.index.name, self.ymd.index.names[2])
        self.assertEquals(result2.index.name, self.ymd.index.names[2])

    def test_partial_set(self):
        # GH #397
        df = self.ymd.copy()
        exp = self.ymd.copy()
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        df['A'].ix[2000, 4] = 1
        exp['A'].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df['A'].ix[14] = 5
        self.assertEquals(df['A'][14], 5)

    def test_unstack_preserve_types(self):
        # GH #403
        self.ymd['E'] = 'foo'
        self.ymd['F'] = 2

        unstacked = self.ymd.unstack('month')
        self.assert_(unstacked['A', 1].dtype == np.float64)
        self.assert_(unstacked['E', 1].dtype == np.object_)
        self.assert_(unstacked['F', 1].dtype == np.float64)

    def test_getitem_lowerdim_corner(self):
        self.assertRaises(KeyError, self.frame.ix.__getitem__,
                          (('bar', 'three'), 'B'))
        self.assertRaises(KeyError, self.frame.ix.__setitem__,
                          (('bar', 'three'), 'B'), 0)

    #----------------------------------------------------------------------
    # AMBIGUOUS CASES!

    def test_partial_ix_missing(self):
        raise nose.SkipTest

        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]['A']
        assert_series_equal(result, expected)

        # need to put in some work here
        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    def test_fancy_2d(self):
        raise nose.SkipTest

        result = self.frame.ix['foo', 'B']
        expected = self.frame.xs('foo')['B']
        assert_series_equal(result, expected)

        ft = self.frame.T
        result = ft.ix['B', 'foo']
        expected = ft.xs('B')['foo']
        assert_series_equal(result, expected)

    #----------------------------------------------------------------------

    def test_to_html(self):
        self.ymd.columns.name = 'foo'
        self.ymd.to_html()
        self.ymd.T.to_html()

    def test_level_with_tuples(self):
        index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0),
                                    ('foo', 'qux', 0)], [0, 1]],
                           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        result = series[('foo', 'bar', 0)]
        result2 = series.ix[('foo', 'bar', 0)]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

        self.assertRaises(KeyError, series.__getitem__,
                          (('foo', 'bar', 0), 2))

        result = frame.ix[('foo', 'bar', 0)]
        result2 = frame.xs(('foo', 'bar', 0))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'),
                                    ('foo', 'qux')], [0, 1]],
                           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        result = series[('foo', 'bar')]
        result2 = series.ix[('foo', 'bar')]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

        result = frame.ix[('foo', 'bar')]
        result2 = frame.xs(('foo', 'bar'))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_int_series_slicing(self):
        s = self.ymd['A']
        result = s[5:]
        expected = s.reindex(s.index[5:])
        assert_series_equal(result, expected)

        exp = self.ymd['A'].copy()
        s[5:] = 0
        exp.values[5:] = 0
        self.assert_(np.array_equal(s.values, exp.values))

        result = self.ymd[5:]
        expected = self.ymd.reindex(s.index[5:])
        assert_frame_equal(result, expected)

    def test_mixed_depth_get(self):
        arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  ['', 'wx', 'wy', '', '', '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        result = df['a']
        expected = df['a', '', '']
        assert_series_equal(result, expected)
        self.assertEquals(result.name, 'a')

        result = df['routine1', 'result1']
        expected = df['routine1', 'result1', '']
        assert_series_equal(result, expected)
        self.assertEquals(result.name, ('routine1', 'result1'))

    def test_mixed_depth_insert(self):
        arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  ['', 'wx', 'wy', '', '', '']]
'wy', '', '', '']] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) result = df.copy() expected = df.copy() result['b'] = [1, 2, 3, 4] expected['b', '', ''] = [1, 2, 3, 4] assert_frame_equal(result, expected) def test_mixed_depth_drop(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], ['', 'OD', 'OD', 'result1', 'result2', 'result1'], ['', 'wx', 'wy', '', '', '']] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) result = df.drop('a', axis=1) expected = df.drop([('a', '', '')], axis=1) assert_frame_equal(expected, result) result = df.drop(['top'], axis=1) expected = df.drop([('top', 'OD', 'wx')], axis=1) expected = expected.drop([('top', 'OD', 'wy')], axis=1) assert_frame_equal(expected, result) def test_mixed_depth_pop(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], ['', 'OD', 'OD', 'result1', 'result2', 'result1'], ['', 'wx', 'wy', '', '', '']] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), columns=index) df1 = df.copy() df2 = df.copy() result = df1.pop('a') expected = df2.pop(('a', '', '')) assert_series_equal(expected, result) assert_frame_equal(df1, df2) self.assertEquals(result.name, 'a') expected = df1['top'] df1 = df1.drop(['top'], axis=1) result = df2.pop('top') assert_frame_equal(expected, result) assert_frame_equal(df1, df2) def test_drop_level(self): result = self.frame.drop(['bar', 'qux'], level='first') expected = self.frame.ix[[0, 1, 2, 5, 6]] assert_frame_equal(result, expected) result = self.frame.drop(['two'], level='second') expected = self.frame.ix[[0, 2, 3, 6, 7, 9]] assert_frame_equal(result, expected) result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first') expected = self.frame.ix[[0, 1, 2, 5, 6]].T assert_frame_equal(result, expected) result = self.frame.T.drop(['two'], axis=1, level='second') expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T assert_frame_equal(result, expected)
        patient_list.append(patient['diagnoseID'])
    else:
        patient_list.append(-1)
    patients_list.append(patient_list)

####################################################
#   Map strings to integers, fill missing values   #
####################################################
patient_df = DataFrame(patients_list, columns=patient_column_names)
patient_df['age'] = [DataCollector.calculate_age(datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
                     if x != "null" else np.NaN for x in patient_df['age']]
patient_df.age.replace(np.NaN, patient_df["age"].mean(), inplace=True)
patient_df['age'] = patient_df['age'].astype(int)
# Patients with an id of 10 or lower are test accounts; keep only real patients
patient_df = patient_df[patient_df.id > 10]
patient_df = patient_df[patient_df.diagnosis != -1]
patient_df = patient_df.reset_index(drop=True)
diagnose_mapping = {1: "MIGRAINE W/ AURA", 2: "MIGRAINE W/O AURA", 3: "CLUSTER", 4: "TENSION"}
diagnose_mapping_reverse = {"MIGRAINE W/ AURA": 1, "MIGRAINE W/O AURA": 2, "CLUSTER": 3, "TENSION": 4}
patient_df['sex'] = patient_df['sex'].map(lambda x: "MALE" if x else "FEMALE")
patient_df['employment'] = patient_df['employment'].map(lambda x: "EMPLOYED" if x else "UNEMPLOYED")
patient_df['diagnosis'] = patient_df["diagnosis"].map(diagnose_mapping)

###################################################
#   Plot some demographic plots                   #
###################################################
def get_distribution(values):
    distribution = {}
    for value in values:
        if value not in distribution:
            distribution[value] = 1
        else:
            # Completed from context: count repeat occurrences of each value
            distribution[value] += 1
    return distribution
def __init__(self, data: pd.DataFrame):
    QAbstractTableModel.__init__(self)
    # Flatten the (possibly Multi)Index into ordinary columns so the Qt
    # view can show index values as regular cells.
    self._data = data.reset_index()
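# A minimal sketch (my addition, not from the original source) of how such a
# model is typically completed: rowCount/columnCount/data are the methods a
# Qt view calls. PyQt5 and the class name DataFrameModel are assumptions.
from PyQt5.QtCore import QAbstractTableModel, Qt
import pandas as pd

class DataFrameModel(QAbstractTableModel):
    def __init__(self, data: pd.DataFrame):
        super().__init__()
        self._data = data.reset_index()

    def rowCount(self, parent=None):
        return len(self._data)

    def columnCount(self, parent=None):
        return self._data.shape[1]

    def data(self, index, role=Qt.DisplayRole):
        # Render every cell as text for display purposes.
        if role == Qt.DisplayRole:
            return str(self._data.iat[index.row(), index.column()])
        return None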
def assert_frame_equal(cls, left: pd.DataFrame, right: pd.DataFrame,
                       *args: Any, **kwargs: Any) -> None:
    # Compare the two frames positionally: dropping both indexes first makes
    # the check insensitive to index labels and ordering artifacts.
    left = left.reset_index(drop=True)
    right = right.reset_index(drop=True)
    tm.assert_frame_equal(left, right, *args, **kwargs)
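# Usage sketch (my addition; data is hypothetical): two frames with identical
# values but different indexes pass this helper even though pandas' strict
# comparison would fail on the index.
import pandas as pd
import pandas.testing as tm

a = pd.DataFrame({"x": [1, 2]}, index=[10, 11])
b = pd.DataFrame({"x": [1, 2]}, index=[0, 1])
# tm.assert_frame_equal(a, b) would raise; after reset_index(drop=True)
# both frames carry the same default RangeIndex and compare equal.
tm.assert_frame_equal(a.reset_index(drop=True), b.reset_index(drop=True))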
def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) names = ['first', 'second'] stacked.index.names = names deleveled = stacked.reset_index() for i, (lev, lab) in enumerate(zip(stacked.index.levels, stacked.index.labels)): values = lev.take(lab) name = names[i] tm.assert_index_equal(values, Index(deleveled[name])) stacked.index.names = [None, None] deleveled2 = stacked.reset_index() tm.assert_series_equal(deleveled['first'], deleveled2['level_0'], check_names=False) tm.assert_series_equal(deleveled['second'], deleveled2['level_1'], check_names=False) # default name assigned rdf = self.frame.reset_index() exp = pd.Series(self.frame.index.values, name='index') tm.assert_series_equal(rdf['index'], exp) # default name assigned, corner case df = self.frame.copy() df['index'] = 'foo' rdf = df.reset_index() exp = pd.Series(self.frame.index.values, name='level_0') tm.assert_series_equal(rdf['level_0'], exp) # but this is ok self.frame.index.name = 'index' deleveled = self.frame.reset_index() tm.assert_series_equal(deleveled['index'], pd.Series(self.frame.index)) tm.assert_index_equal(deleveled.index, pd.Index(np.arange(len(deleveled)))) # preserve column names self.frame.columns.name = 'columns' resetted = self.frame.reset_index() assert resetted.columns.name == 'columns' # only remove certain columns frame = self.frame.reset_index().set_index(['index', 'A', 'B']) rs = frame.reset_index(['A', 'B']) # TODO should reset_index check_names ? assert_frame_equal(rs, self.frame, check_names=False) rs = frame.reset_index(['index', 'A', 'B']) assert_frame_equal(rs, self.frame.reset_index(), check_names=False) rs = frame.reset_index(['index', 'A', 'B']) assert_frame_equal(rs, self.frame.reset_index(), check_names=False) rs = frame.reset_index('A') xp = self.frame.reset_index().set_index(['index', 'B']) assert_frame_equal(rs, xp, check_names=False) # test resetting in place df = self.frame.copy() resetted = self.frame.reset_index() df.reset_index(inplace=True) assert_frame_equal(df, resetted, check_names=False) frame = self.frame.reset_index().set_index(['index', 'A', 'B']) rs = frame.reset_index('A', drop=True) xp = self.frame.copy() del xp['A'] xp = xp.set_index(['B'], append=True) assert_frame_equal(rs, xp, check_names=False)
def fill_fields_with_data_source(
    existing_df: pd.DataFrame,
    data_source: pd.DataFrame,
    index_fields: List[str],
    columns_to_fill: List[str],
) -> pd.DataFrame:
    """Pull columns from an existing data source into an existing data frame.

    Example:
    existing_df:
    ----------------
    | date | cases |
    | 4/2  |   1   |
    | 4/3  |   2   |
    | 4/4  |   3   |
    ----------------

    data_source:
    ----------------------
    | date | current_icu |
    | 4/3  |      4      |
    | 4/5  |      5      |
    ----------------------

    index_fields: ['date']
    columns_to_fill: ['current_icu']

    output:
    ------------------------------
    | date | cases | current_icu |
    | 4/2  |   1   |     Na      |
    | 4/3  |   2   |      4      |
    | 4/4  |   3   |     Na      |
    | 4/5  |  Na   |      5      |
    ------------------------------

    Args:
        existing_df: Existing data frame
        data_source: Data used to fill existing df columns
        index_fields: List of columns to use as common index.
        columns_to_fill: List of columns to add into existing_df from data_source

    Returns: Updated dataframe with requested columns filled from data_source data.
    """
    new_data = data_source.set_index(index_fields)

    # If no data exists, return all rows from new data with just the requested columns.
    if not len(existing_df):
        for column in columns_to_fill:
            if column not in new_data.columns:
                new_data[column] = None
        return new_data[columns_to_fill].reset_index()

    existing_df = existing_df.set_index(index_fields)

    # Sort indices so that we have chunks of equal length in the
    # correct order so that we can splice in values.
    existing_df = existing_df.sort_index()
    new_data = new_data.sort_index()

    # Build series that point to rows that match in each data frame.
    existing_df_in_new_data = existing_df.index.isin(new_data.index)
    new_data_in_existing_df = new_data.index.isin(existing_df.index)

    if not sum(existing_df_in_new_data) == sum(new_data_in_existing_df):
        print(new_data.loc[new_data_in_existing_df, columns_to_fill])
        existing_in_new = sum(existing_df_in_new_data)
        new_in_existing = sum(new_data_in_existing_df)
        raise ValueError(
            f"Number of matching rows should be the same for data to replace: "
            f"{existing_in_new} -> {new_in_existing}: {columns_to_fill}"
        )

    # If a column doesn't exist in the existing data, add it; the .loc
    # assignment below would raise a KeyError otherwise.
    for column in columns_to_fill:
        if column not in existing_df.columns:
            existing_df[column] = None

    # Fill in values for rows that match in both data frames.
    existing_df.loc[existing_df_in_new_data, columns_to_fill] = new_data.loc[
        new_data_in_existing_df, columns_to_fill]

    # Get rows that do not exist in the existing data frame
    missing_new_data = new_data[~new_data_in_existing_df]
    data = pd.concat([
        existing_df.reset_index(),
        missing_new_data[columns_to_fill].reset_index(),
    ])
    return data
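# Usage sketch (my addition, mirroring the example tables in the docstring
# above; the literal frames are illustrations, not project data):
import pandas as pd

existing_df = pd.DataFrame({"date": ["4/2", "4/3", "4/4"], "cases": [1, 2, 3]})
data_source = pd.DataFrame({"date": ["4/3", "4/5"], "current_icu": [4, 5]})
combined = fill_fields_with_data_source(
    existing_df, data_source,
    index_fields=["date"], columns_to_fill=["current_icu"],
)
# combined has one row per date 4/2..4/5, with NaN where either source had
# no value, matching the output table in the docstring.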
def test_set_index2(self): df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], 'B': ['one', 'two', 'three', 'one', 'two'], 'C': ['a', 'b', 'c', 'd', 'e'], 'D': np.random.randn(5), 'E': np.random.randn(5)}) # new object, single-column result = df.set_index('C') result_nodrop = df.set_index('C', drop=False) index = Index(df['C'], name='C') expected = df.loc[:, ['A', 'B', 'D', 'E']] expected.index = index expected_nodrop = df.copy() expected_nodrop.index = index assert_frame_equal(result, expected) assert_frame_equal(result_nodrop, expected_nodrop) assert result.index.name == index.name # inplace, single df2 = df.copy() df2.set_index('C', inplace=True) assert_frame_equal(df2, expected) df3 = df.copy() df3.set_index('C', drop=False, inplace=True) assert_frame_equal(df3, expected_nodrop) # create new object, multi-column result = df.set_index(['A', 'B']) result_nodrop = df.set_index(['A', 'B'], drop=False) index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B']) expected = df.loc[:, ['C', 'D', 'E']] expected.index = index expected_nodrop = df.copy() expected_nodrop.index = index assert_frame_equal(result, expected) assert_frame_equal(result_nodrop, expected_nodrop) assert result.index.names == index.names # inplace df2 = df.copy() df2.set_index(['A', 'B'], inplace=True) assert_frame_equal(df2, expected) df3 = df.copy() df3.set_index(['A', 'B'], drop=False, inplace=True) assert_frame_equal(df3, expected_nodrop) # corner case with tm.assert_raises_regex(ValueError, 'Index has duplicate keys'): df.set_index('A', verify_integrity=True) # append result = df.set_index(['A', 'B'], append=True) xp = df.reset_index().set_index(['index', 'A', 'B']) xp.index.names = [None, 'A', 'B'] assert_frame_equal(result, xp) # append to existing multiindex rdf = df.set_index(['A'], append=True) rdf = rdf.set_index(['B', 'C'], append=True) expected = df.set_index(['A', 'B', 'C'], append=True) assert_frame_equal(rdf, expected) # Series result = df.set_index(df.C) assert result.index.name == 'C'
                          left_on='WORDS', how='left')
# Palabras=Palabras[(Palabras['IDIOMA']=='en')&(Palabras['TIPO']=='POSITIVO')][['OPERATION_NUMBER','WORDS']]  # This version yields an incomplete word cloud
Palabras = Palabras[(Palabras['TIPO'] == 'POSITIVO') |
                    (Palabras['TIPO'] == 'NEUTRO POSITIVO')][[
                        'OPERATION_NUMBER', 'WORDS', 'TIPO'
                    ]]
Palabras["WORDS2"] = Palabras["WORDS"].apply(singular)
Palabras = Palabras[["OPERATION_NUMBER", "WORDS2", "TIPO"]]
Palabras.rename(columns={'WORDS2': 'WORDS'}, inplace=True)
# Palabras=DataFrame(Palabras["PALABRAS","WORDS"].groupby([Palabras['OPERATION_NUMBER']],Palabras['WORDS','PALABRAS']).count())  # This line does not run; kept as in the EDU_IADB_cartera_digital version, which does run
# Palabras=DataFrame(Palabras["WORDS"].groupby([Palabras['OPERATION_NUMBER'],Palabras['WORDS']]).count())
Palabras = DataFrame(Palabras['WORDS'].groupby(
    [Palabras['OPERATION_NUMBER'], Palabras['WORDS'],
     Palabras['TIPO']]).count())
Palabras.rename(columns={'WORDS': 'COUNT_WORDS'}, inplace=True)
# No-op at this point: no 'PALABRAS' column exists after the rename above
Palabras.rename(columns={'PALABRAS': 'COUNT_WORDS'}, inplace=True)
Palabras.reset_index(inplace=True)

######## EXPORT FILES #############
with pd.ExcelWriter(path + "/Outputs/output.xlsx") as writer:
    Titulo.to_excel(writer, sheet_name="Operation_Name", index=False)
    Objetivo.to_excel(writer, sheet_name="Objetivo", index=False)
    Componentes1.to_excel(writer, sheet_name="Component", index=False)
    Producto1.to_excel(writer, sheet_name="Output_Name", index=False)
    Bas.to_excel(writer, sheet_name="Metadata", index=False)
    Palabras.to_excel(writer, sheet_name="palabras", index=False)
def add_difficulty(df: pd.DataFrame, config: Config): df = df.reset_index('difficulty') df = df[df['difficulty'].isin(config.training.use_difficulties)] df['difficulty'] = df['difficulty'].replace( config.dataset.difficulty_mapping) return df
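# Usage sketch (my addition; Config here is a SimpleNamespace stand-in, an
# assumption rather than the project's real Config class):
import pandas as pd
from types import SimpleNamespace

config = SimpleNamespace(
    training=SimpleNamespace(use_difficulties=['easy', 'hard']),
    dataset=SimpleNamespace(difficulty_mapping={'easy': 0, 'hard': 1}),
)
df = pd.DataFrame(
    {'score': [0.1, 0.5, 0.9]},
    index=pd.Index(['easy', 'medium', 'hard'], name='difficulty'),
)
# 'medium' is filtered out and the remaining labels are mapped to 0/1.
print(add_difficulty(df, config))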
def csv_stack(dframe: pd.DataFrame, stackmatcher: Pattern, stackseparator: str,
              newcolumn: str) -> pd.DataFrame:
    """Reshape an incoming dataframe by stacking/pivoting.

    The incoming dataframe is modified in place (its columns may be converted
    to a MultiIndex), and the reshaped result is returned as a new object.

    Args:
        dframe (pd.DataFrame): Data to reshape
        stackmatcher (Pattern): Regular expression that matches columns
            to be stacked.
        stackseparator (str): String to use for splitting column names
        newcolumn (str): Name of new column containing the latter part of
            the stacked column names.

    Returns:
        pd.DataFrame
    """
    if isinstance(stackmatcher, str):
        stackmatcher = re.compile(stackmatcher)
    if newcolumn in dframe:
        raise ValueError("Column name %s already exists in the data" % newcolumn)

    tuplecols = []
    dostack = False
    colstostack = 0
    logger.info(
        "Will stack columns matching '%s' with separator '%s'",
        stackmatcher,
        stackseparator,
    )
    logger.info("Name of new identifying column will be '%s'", newcolumn)

    nostackcolumnnames = []
    for col in dframe.columns:
        if stackmatcher.match(col):
            tuplecols.append(tuple(col.split(stackseparator)))
            colstostack = colstostack + 1
            dostack = True
        else:
            tuplecols.append(tuple([col, ""]))
            nostackcolumnnames.append(col)

    logger.info("Found %d out of %d columns to stack", colstostack,
                len(dframe.columns))

    if dostack:
        # Convert to MultiIndex columns
        dframe.columns = pd.MultiIndex.from_tuples(tuplecols,
                                                   names=["", newcolumn])

        # Stack the multiindex columns, this will add a lot of rows to
        # our ensemble, and condense the number of columns
        dframe = dframe.stack()

        # The values from non-multiindex-columns must be propagated to
        # the rows that emerged from the stacking. If you use the
        # 'all' pivottype, then you will get some NaN-values in the
        # MultiIndex columns that are intentional.
        dframe[nostackcolumnnames] = dframe[nostackcolumnnames].fillna(
            method="ffill")

        dframe = dframe.reset_index()

        # Now we have rows that do not belong to any well; delete
        # those rows.
        dframe = dframe[dframe[newcolumn] != ""]

        # And delete a byproduct of our reshaping (this is the index
        # prior to stacking)
        del dframe["level_0"]

    return dframe.reset_index(drop=True)
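# Usage sketch (my addition; the column names are hypothetical). csv_stack
# references a module-level logger, so one is configured here.
import logging
import re
import pandas as pd

logger = logging.getLogger(__name__)

wide = pd.DataFrame({
    "DATE": ["2020-01-01", "2020-02-01"],
    "WOPR:A-1": [100, 110],
    "WOPR:A-2": [200, 190],
})
tidy = csv_stack(wide, stackmatcher=re.compile(r"WOPR:"),
                 stackseparator=":", newcolumn="WELL")
# tidy has one row per (DATE, WELL) pair, with DATE, WELL and a single
# WOPR column instead of one WOPR:* column per well.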
from typing import Tuple


def add_data_to_day_price(day_price: DataFrame,
                          index_day_price: DataFrame) -> Tuple[DataFrame, DataFrame]:
    group_list = []
    grouped = day_price.groupby("ticker_id")
    for key, group in grouped:
        group = group.reset_index(drop=True)
        group["sma_10"] = sma(group["close"], 10)
        group["sma_20"] = sma(group["close"], 20)
        group["sma_tr_val_5"] = sma(group["tr_val"], 5)
        group["pct_change_1"] = pct_change(group["close"], 1)
        group["ts_max_20"] = ts_max(group["close"], 20)
        group["ibs"] = ibs(group["high"], group["low"], group["close"])
        group["increase_ratio_3"] = increase_from_lowest_price(
            group["low"], group["close"], 3)
        group["pdi_5"] = pdi(group["high"], group["low"], group["close"], 5,
                             MovingAverage.ema)
        group["pdi_5_sto"] = stochastic_fast_k(group["pdi_5"], group["pdi_5"],
                                               group["pdi_5"], 20)
        group["pdi_5_sto_pct_change_3"] = pct_change(group["pdi_5_sto"], 3)
        group["pivot_standard"] = pivot_standard(group["high"], group["low"],
                                                 group["close"])
        group_list.append(group)
    day_price = pd.concat(group_list, axis=0)
    day_price = day_price.reset_index(drop=True)

    group_list = []
    grouped = day_price.groupby("date")
    for key, group in grouped:
        group = group.reset_index(drop=True)
        group["rank_tr_val_5"] = rank(group["sma_tr_val_5"])
        group_list.append(group)
    day_price = pd.concat(group_list, axis=0)
    day_price = day_price.reset_index(drop=True)

    group_list = []
    grouped = day_price.groupby("ticker_id")
    for key, group in grouped:
        increase_condition1 = (group["pdi_5_sto_pct_change_3"].shift(1) >=
                               0.1) | (group["pdi_5_sto_pct_change_3"].shift(2) >= 0.1)
        increase_condition = increase_condition1

        decrease_condition1 = group["open"] > group["close"]
        decrease_condition2 = group["pct_change_1"] < 0
        decrease_condition3 = group["ibs"] < 0.25
        decrease_condition = (decrease_condition1
                              | decrease_condition2) & decrease_condition3

        liquidity_condition1 = group["rank_tr_val_5"] > 0.8
        liquidity_condition = liquidity_condition1

        # Result
        group[
            "#result"] = increase_condition & decrease_condition & liquidity_condition
        group["#priority"] = group["increase_ratio_3"].shift(
            1) + group["increase_ratio_3"].shift(2)
        group_list.append(group)
    day_price = pd.concat(group_list, axis=0)
    day_price = day_price.reset_index(drop=True)

    # Market Timing
    index_day_price["sma_3"] = sma(index_day_price["close"], 3)
    index_day_price["sma_5"] = sma(index_day_price["close"], 5)
    index_day_price["sma_10"] = sma(index_day_price["close"], 10)
    index_day_price["#market_timing"] = (
        (index_day_price["close"] > index_day_price["sma_3"])
        | (index_day_price["close"] > index_day_price["sma_5"])
        | (index_day_price["close"] > index_day_price["sma_10"]))
    index_day_price = index_day_price.set_index("date")

    return day_price, index_day_price
def acc_one(self, entity_id, df: pd.DataFrame, acc_df: pd.DataFrame,
            state: dict) -> (pd.DataFrame, dict):
    self.logger.info(f'acc_one:{entity_id}')
    if pd_is_not_null(acc_df):
        df = df[df.index > acc_df.index[-1]]
        if pd_is_not_null(df):
            self.logger.info(f'compute from {df.iloc[0]["timestamp"]}')
            # Start position for the iteration below
            start_index = len(acc_df)
            acc_df = pd.concat([acc_df, df])
            zen_state = state
            acc_df = acc_df.reset_index(drop=True)
        else:
            self.logger.info('no need to compute')
            return acc_df, state
    else:
        acc_df = df
        # Bottom of a bi (stroke)
        acc_df['bi_di'] = False
        # Top of a bi (stroke)
        acc_df['bi_ding'] = False
        # Value of the bi top/bottom fenxing (fractal): low for bi_di, high
        # for bi_ding, None otherwise; when plotting, connecting the non-null
        # values yields the bi
        acc_df['bi_value'] = np.NAN
        # Tentative fenxing markers; these are never rewritten
        acc_df['tmp_ding'] = False
        acc_df['tmp_di'] = False
        # Strength of the fenxing
        acc_df['fenxing_power'] = np.NAN

        acc_df['duan_state'] = 'yi'
        # Bottom of a duan (segment)
        acc_df['duan_di'] = False
        # Top of a duan (segment)
        acc_df['duan_ding'] = False
        # Value of the duan top/bottom: low for duan_di, high for duan_ding,
        # None otherwise; when plotting, connecting the non-null values
        # yields the duan
        acc_df['duan_value'] = np.NAN
        # Recorded at the end point x1 of the last duan that confirms a
        # zhongshu (pivot zone); the value is Rect(x0, y0, x1, y1)
        acc_df['zhongshu'] = np.NAN

        acc_df = acc_df.reset_index(drop=True)

        zen_state = ZenState(
            dict(fenxing_list=[],
                 direction=None,
                 can_fenxing=None,
                 can_fenxing_index=None,
                 opposite_count=0,
                 current_duan_state='yi',
                 duans=[],
                 pre_bi=None,
                 pre_duan=None))

        zen_state.fenxing_list: List[Fenxing] = []

        # Take the first 11 candlesticks, within which at most one top
        # fenxing plus one bottom fenxing can appear.
        # Note: this is just a convenient way to pin down the first fenxing;
        # once it exists, the subsequent processing is uniform.
        # start_index is where the iteration starts.
        # direction is the direction after a confirmed fenxing: down after a
        # top fenxing, up after a bottom fenxing.
        fenxing, start_index, direction = handle_first_fenxing(acc_df, step=11)
        if not fenxing:
            return None, None

        zen_state.fenxing_list.append(fenxing)
        zen_state.direction = direction

        # list of (timestamp, value)
        zen_state.duans = []

    pre_kdata = acc_df.iloc[start_index - 1]
    pre_index = start_index - 1

    tmp_direction = zen_state.direction

    for index, kdata in acc_df.iloc[start_index:].iterrows():
        # print(f'timestamp: {kdata.timestamp}')
        # Tentative direction
        tmp_direction = get_direction(kdata, pre_kdata, current=tmp_direction)

        # Handle the containment (inclusion) relation between candles
        handle_including(one_df=acc_df, index=index, kdata=kdata,
                         pre_index=pre_index, pre_kdata=pre_kdata,
                         tmp_direction=tmp_direction)

        # Depending on the direction, look for the matching fenxing and duan
        if zen_state.direction == Direction.up:
            tmp_fenxing_col = 'tmp_ding'
            fenxing_col = 'bi_ding'
        else:
            tmp_fenxing_col = 'tmp_di'
            fenxing_col = 'bi_di'

        # Same direction: the current move continues
        if tmp_direction == zen_state.direction:
            zen_state.opposite_count = 0
        # Opposite direction: look for the reversal fenxing
        else:
            zen_state.opposite_count = zen_state.opposite_count + 1
            # First bar in the opposite direction
            if zen_state.opposite_count == 1:
                acc_df.loc[pre_index, tmp_fenxing_col] = True
                acc_df.loc[pre_index, 'fenxing_power'] = fenxing_power(
                    acc_df.loc[pre_index - 1], pre_kdata, kdata,
                    fenxing=tmp_fenxing_col)

                if pd_is_not_null(zen_state.can_fenxing):
                    # Candidate bottom fenxing
                    if tmp_direction == Direction.up:
                        # Keep the lower one
                        if pre_kdata['low'] <= zen_state.can_fenxing['low']:
                            zen_state.can_fenxing = pre_kdata
                            zen_state.can_fenxing_index = pre_index
                    # Candidate top fenxing
                    else:
                        # Keep the higher one
                        if pre_kdata['high'] >= zen_state.can_fenxing['high']:
                            zen_state.can_fenxing = pre_kdata
                            zen_state.can_fenxing_index = pre_index
                else:
                    zen_state.can_fenxing = pre_kdata
                    zen_state.can_fenxing_index = pre_index

            # Fenxing confirmed
            if pd_is_not_null(zen_state.can_fenxing):
                if zen_state.opposite_count >= 4 or (
                        index - zen_state.can_fenxing_index >= 8):
                    acc_df.loc[zen_state.can_fenxing_index, fenxing_col] = True

                    # Record the value of the bi
                    if fenxing_col == 'bi_ding':
                        bi_value = acc_df.loc[zen_state.can_fenxing_index, 'high']
                    else:
                        bi_value = acc_df.loc[zen_state.can_fenxing_index, 'low']
                    acc_df.loc[zen_state.can_fenxing_index, 'bi_value'] = bi_value
                    zen_state.pre_bi = (zen_state.can_fenxing_index, bi_value)

                    zen_state.opposite_count = 0
                    zen_state.direction = zen_state.direction.opposite()
                    zen_state.can_fenxing = None

                    # Determine the first duan
                    if zen_state.fenxing_list is not None:
                        zen_state.fenxing_list.append(
                            Fenxing(state=fenxing_col,
                                    kdata=acc_df.loc[
                                        zen_state.can_fenxing_index,
                                        ['open', 'close', 'high', 'low']],
                                    index=zen_state.can_fenxing_index))

                        if len(zen_state.fenxing_list) == 4:
                            duan_state = handle_duan(
                                fenxing_list=zen_state.fenxing_list,
                                pre_duan_state=zen_state.current_duan_state)

                            change = duan_state != zen_state.current_duan_state

                            if change:
                                zen_state.current_duan_state = duan_state

                                # Fix the duan state over the covered range
                                acc_df.loc[
                                    zen_state.fenxing_list[0].index:
                                    zen_state.fenxing_list[-1].index,
                                    'duan_state'] = zen_state.current_duan_state

                                duan_index = zen_state.fenxing_list[0].index
                                if zen_state.current_duan_state == 'up':
                                    acc_df.loc[duan_index, 'duan_di'] = True
                                    duan_value = acc_df.loc[duan_index, 'low']
                                else:
                                    duan_index = zen_state.fenxing_list[0].index
                                    acc_df.loc[duan_index, 'duan_ding'] = True
                                    duan_value = acc_df.loc[duan_index, 'high']
                                # Record the value of the duan
                                acc_df.loc[duan_index, 'duan_value'] = duan_value

                                # Record the duans used to compute the zhongshu
                                zen_state.duans.append(
                                    (acc_df.loc[duan_index, 'timestamp'],
                                     duan_value))

                                # Compute the zhongshu (pivot zone)
                                if len(zen_state.duans) == 4:
                                    x1 = zen_state.duans[0][0]
                                    x2 = zen_state.duans[3][0]
                                    if zen_state.duans[0][1] < zen_state.duans[1][1]:
                                        # Downward duan
                                        overlap = intersect(
                                            (zen_state.duans[0][1],
                                             zen_state.duans[1][1]),
                                            (zen_state.duans[2][1],
                                             zen_state.duans[3][1]))
                                        if overlap:
                                            y1, y2 = overlap
                                            # Record the zhongshu
                                            acc_df.loc[duan_index, 'zhongshu'] = Rect(
                                                x0=x1, x1=x2, y0=y1, y1=y2)
                                            zen_state.duans = zen_state.duans[-1:]
                                        else:
                                            zen_state.duans = zen_state.duans[1:]
                                    else:
                                        # Upward duan
                                        overlap = intersect(
                                            (zen_state.duans[1][1],
                                             zen_state.duans[0][1]),
                                            (zen_state.duans[3][1],
                                             zen_state.duans[2][1]))
                                        if overlap:
                                            y1, y2 = overlap
                                            # Record the zhongshu
                                            acc_df.loc[duan_index, 'zhongshu'] = Rect(
                                                x0=x1, x1=x2, y0=y1, y1=y2)
                                            zen_state.duans = zen_state.duans[-1:]
                                        else:
                                            zen_state.duans = zen_state.duans[1:]

                                # Keep only the last fenxing
                                zen_state.fenxing_list = zen_state.fenxing_list[-1:]
                            else:
                                # Keep the previous state and kick out the candidate
                                acc_df.loc[
                                    zen_state.fenxing_list[0].index,
                                    'duan_state'] = zen_state.current_duan_state
                                zen_state.fenxing_list = zen_state.fenxing_list[1:]

        pre_kdata = kdata
        pre_index = index

    acc_df = acc_df.set_index('timestamp', drop=False)
    return acc_df, zen_state
def test_join_multi_levels(self): # GH 3662 # merge multi-levels household = DataFrame( { "household_id": [1, 2, 3], "male": [0, 1, 0], "wealth": [196087.3, 316478.7, 294750], }, columns=["household_id", "male", "wealth"], ).set_index("household_id") portfolio = DataFrame( { "household_id": [1, 2, 2, 3, 3, 3, 4], "asset_id": [ "nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "nl0000289965", np.nan, ], "name": [ "ABN Amro", "Robeco", "Royal Dutch Shell", "Royal Dutch Shell", "AAB Eastern Europe Equity Fund", "Postbank BioTech Fonds", np.nan, ], "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], }, columns=["household_id", "asset_id", "name", "share"], ).set_index(["household_id", "asset_id"]) result = household.join(portfolio, how="inner") expected = (DataFrame({ "male": [0, 1, 1, 0, 0, 0], "wealth": [ 196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0, ], "name": [ "ABN Amro", "Robeco", "Royal Dutch Shell", "Royal Dutch Shell", "AAB Eastern Europe Equity Fund", "Postbank BioTech Fonds", ], "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25], "household_id": [1, 2, 2, 3, 3, 3], "asset_id": [ "nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "nl0000289965", ], }).set_index(["household_id", "asset_id" ]).reindex(columns=["male", "wealth", "name", "share"])) tm.assert_frame_equal(result, expected) # equivalency result = merge( household.reset_index(), portfolio.reset_index(), on=["household_id"], how="inner", ).set_index(["household_id", "asset_id"]) tm.assert_frame_equal(result, expected) result = household.join(portfolio, how="outer") expected = concat( [ expected, (DataFrame( {"share": [1.00]}, index=MultiIndex.from_tuples( [(4, np.nan)], names=["household_id", "asset_id"]), )), ], axis=0, sort=True, ).reindex(columns=expected.columns) tm.assert_frame_equal(result, expected) # invalid cases household.index.name = "foo" with pytest.raises( ValueError, match="cannot join with no overlapping index names"): household.join(portfolio, how="inner") portfolio2 = portfolio.copy() portfolio2.index.set_names(["household_id", "foo"]) with pytest.raises(ValueError, match="columns overlap but no suffix specified"): portfolio2.join(portfolio, how="inner")
class dataNormalization:
    """Class that does the actual data processing.

    Provides higher-level filtering: screening rows, de-duplicating, and
    dropping courses that do not meet the requirements."""

    def __init__(self, dataFrame: DataFrame) -> None:
        self.oriDataFrame = dataFrame  # Keep a copy of the original data
        self.rowNum, self.colNum = dataFrame.shape
        self.tmpDataFrame = DataFrame()  # Scratch space that may be used along the way
        self.finDataFrame = DataFrame()  # The final, fully processed data to return
        self.error = False

    def __call__(self) -> DataFrame:
        self.normalization()
        return self.finDataFrame

    def normalization(self):
        """Main data-processing routine."""
        self.finDataFrame = self.oriDataFrame.copy()
        for i in self.finDataFrame.index:
            one_row = self.oriDataFrame.loc[i]
            cr, gd, sc = self.classroom_processor(
                one_row['上课教室']), self.grade_processor(
                    one_row['年级']), self.school_processor(one_row['上课院系'])
            if False in [cr, gd, sc]:
                self.finDataFrame = self.finDataFrame.drop(index=i)
            else:
                self.finDataFrame.at[i, '上课教室'] = cr
                self.finDataFrame.at[i, '年级'] = gd
                self.finDataFrame.at[i, '上课院系'] = sc
        self.finDataFrame = self.finDataFrame.reset_index(drop=True)

    def classroom_processor(self, cr):
        """Process the classroom column ('上课教室')."""
        pattern = re.compile(
            r'立人楼[A-Za-z]-?[0-9]{3}|品学楼[A-Za-z]-?[0-9]{3}-?[A-Za-z]?')
        # Collect every classroom that matches the pattern, de-duplicated
        location_set = set(pattern.findall(cr))
        # Keep the row only if there is exactly one classroom. Rationale: each
        # row corresponds to one weekly time slot of one teacher, so it can
        # have only one classroom; several classrooms mean the room changes by
        # week, but the data does not say from which week, so the row is dropped.
        if len(location_set) == 1:
            return location_set.pop()
        else:
            return False

    def grade_processor(self, gd):
        """Process the grade column ('年级')."""
        count = 0
        name = ''
        for n in GradeNum:  # Keep the row for exactly one grade; drop it for multiple grades
            if gd.find(n) != -1:
                count += 1
                name = n
        if count == 1:
            return name
        else:
            return False

    def school_processor(self, sc):
        """Process the school/department column ('上课院系')."""
        count = 0
        name = ''
        for n in SchoolName:  # Keep the row for exactly one school; drop it for multiple schools
            if sc.find(n) != -1:
                count += 1
                name = SchoolName_whole[n]
        if count == 1:
            return name
        else:
            return False
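# Usage sketch (my addition; the rows and the GradeNum / SchoolName /
# SchoolName_whole lookup tables are hypothetical stand-ins):
import pandas as pd

GradeNum = ['2021', '2022']
SchoolName = ['计算机']
SchoolName_whole = {'计算机': '计算机科学与工程学院'}

raw = pd.DataFrame([
    {'上课教室': '立人楼A101', '年级': '2021级', '上课院系': '计算机学院'},
    {'上课教室': '立人楼A101 品学楼B202', '年级': '2021级', '上课院系': '计算机学院'},
])
clean = dataNormalization(raw)()  # the second row is dropped: two classrooms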
def pivot(  # pylint: disable=too-many-arguments
    df: DataFrame,
    index: List[str],
    aggregates: Dict[str, Dict[str, Any]],
    columns: Optional[List[str]] = None,
    metric_fill_value: Optional[Any] = None,
    column_fill_value: Optional[str] = None,
    drop_missing_columns: Optional[bool] = True,
    combine_value_with_metric: bool = False,
    marginal_distributions: Optional[bool] = None,
    marginal_distribution_name: Optional[str] = None,
    flatten_columns: bool = True,
) -> DataFrame:
    """
    Perform a pivot operation on a DataFrame.

    :param df: Object on which pivot operation will be performed
    :param index: Columns to group by on the table index (=rows)
    :param aggregates: A mapping from aggregate column name to the aggregate config.
    :param columns: Columns to group by on the table columns
    :param metric_fill_value: Value to replace missing values with
    :param column_fill_value: Value to replace missing pivot columns with
    :param drop_missing_columns: Do not include columns whose entries are all missing
    :param combine_value_with_metric: Display metrics side by side within each column,
           as opposed to each column being displayed side by side for each metric.
    :param marginal_distributions: Add totals for row/column. Defaults to False
    :param marginal_distribution_name: Name of row/column with marginal distribution.
           Defaults to 'All'.
    :param flatten_columns: Convert column names to strings
    :return: A pivot table
    :raises QueryObjectValidationError: If the request is incorrect
    """
    if not index:
        raise QueryObjectValidationError(
            _("Pivot operation requires at least one index"))
    if not aggregates:
        raise QueryObjectValidationError(
            _("Pivot operation must include at least one aggregate"))

    if column_fill_value:
        df[columns] = df[columns].fillna(value=column_fill_value)

    aggregate_funcs = _get_aggregate_funcs(df, aggregates)

    # TODO (villebro): Pandas 1.0.3 doesn't yet support NamedAgg in pivot_table.
    # Remove once/if support is added.
    aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}

    df = df.pivot_table(
        values=aggfunc.keys(),
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=metric_fill_value,
        dropna=drop_missing_columns,
        margins=marginal_distributions,
        margins_name=marginal_distribution_name,
    )

    if combine_value_with_metric:
        df = df.stack(0).unstack()

    # Make index regular column
    if flatten_columns:
        df.columns = [
            _flatten_column_after_pivot(col, aggregates) for col in df.columns
        ]
    # return index as regular column
    df.reset_index(level=0, inplace=True)
    return df
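# Usage sketch (my addition): the aggregates mapping follows the shape the
# docstring describes. The columns 'year'/'country'/'metric' and the
# {'operator': 'sum'} config format are assumptions for illustration, not a
# documented API.
import pandas as pd

df = pd.DataFrame({
    "year": [2020, 2020, 2021, 2021],
    "country": ["NO", "SE", "NO", "SE"],
    "metric": [1, 2, 3, 4],
})
pivoted = pivot(
    df,
    index=["year"],
    columns=["country"],
    aggregates={"metric": {"operator": "sum"}},
)
# One row per year, one flattened column per (metric, country) pair.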
import ast
from pprint import pprint
import time

import pandas as pd
# Import DataFrame under its own name: aliasing it to `df` (as the original
# did) would be shadowed by the frame assigned below.
from pandas import DataFrame

start_t = time.time()
df = pd.read_csv("/root/NeoMeetup/csv/struttura/relations_topics.csv")
df.sort_values('urlkey', inplace=True)
#df.head()

# In[5]:

df.reset_index(inplace=True)
#df.head()

# In[6]:

df.drop('index', axis=1, inplace=True)

# In[7]:

df['topic_id'] = 0
temp = df.urlkey.at[0]
count = 0
index = 0

# In[8]:
def remove_names(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the personal-name index, leaving rows identified by a numeric
    RangeIndex instead of by name."""
    df = df.reset_index()
    df.drop(columns='Name', inplace=True)
    return df
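# Usage sketch (my addition; the data is hypothetical):
import pandas as pd

people = pd.DataFrame({'Age': [34, 29]},
                      index=pd.Index(['Alice', 'Bob'], name='Name'))
anonymised = remove_names(people)
# anonymised is indexed 0..n-1 and no longer carries the 'Name' column.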
def generate_pretreatment_variables( data: pd.DataFrame, level_index: str, pre_treatment_year: int ): """Merge descriptive statistics from pre-treatment year to all years Args: data (pd.DataFrame): data containing pre-treatment variables level_index (str): campus or district pre_treatment_year (int): pre-treatment year Returns: data: data with new columns """ data_pre = data.loc[data.year == pre_treatment_year] data_pre = data_pre.rename( columns={ "students_hisp": "pre_hisp", "students_ell": "pre_ell", "students_white": "pre_white", "students_black": "pre_black", "students_sped": "pre_sped", "students_frpl": "pre_frpl", "avescores": "pre_avescore", "students_num": "pre_num", "teachers_exp": "pre_exp", "teachers_turnover_ratio_d": "pre_turnover", "teachers_tenure_ave": "pre_tenure", "students_teacher_ratio": "pre_ratio", } ) for var in [ "pre_hisp", "pre_ell", "pre_white", "pre_black", "pre_sped", "pre_num", "pre_turnover", "pre_avescore", ]: for p in [0.25, 0.5, 0.75, 1]: num = str(int(p * 100)) newvar = var + num if p == 0.25: data_pre[newvar] = np.where( data_pre[var] <= data_pre[var].quantile(p), 1, 0 ) if p > 0.25: lp = p - 0.25 data_pre[newvar] = np.where( ( (data_pre[var] > data_pre[var].quantile(lp)) & (data_pre[var] <= data_pre[var].quantile(p)) ), 1, 0, ) variables = [level_index] variables = variables + (list(data_pre.filter(regex=("pre_")))) data_pre = data_pre[variables] data_pre_geo_vars = [ level_index, "type_urban", "type_suburban", "type_town", "type_rural", ] data_pre_geo = data[data.year == 2016][data_pre_geo_vars] data_pre = data_pre.merge( data_pre_geo, how="left", left_on=[level_index], right_on=[level_index], validate="one_to_one", ) data_pre = data_pre.rename( columns={ "type_urban": "pre_urban", "type_suburban": "pre_suburban", "type_town": "pre_town", "type_rural": "pre_rural", } ) data_pre["pre_turnover"] = data_pre.pre_turnover / 100 data = data.reset_index().merge( data_pre, left_on=level_index, right_on=level_index, how="left", validate="m:1" ) return data
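# Minimal sketch (my addition) of the quartile-dummy logic used above:
# pd.cut with the variable's quantiles yields the same four 0/1 indicators
# in one step. The column name is hypothetical.
import numpy as np
import pandas as pd

s = pd.Series(np.random.rand(100), name="pre_hisp")
quartile = pd.cut(
    s,
    bins=[-np.inf] + list(s.quantile([0.25, 0.5, 0.75])) + [np.inf],
    labels=["25", "50", "75", "100"],
)
dummies = pd.get_dummies(quartile, prefix="pre_hisp")
# dummies["pre_hisp_25"] == 1 exactly when s <= s.quantile(0.25), and so on
# for each successive quartile band.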
def test_reset_index(self, float_frame): stacked = float_frame.stack()[::2] stacked = DataFrame({"foo": stacked, "bar": stacked}) names = ["first", "second"] stacked.index.names = names deleveled = stacked.reset_index() for i, (lev, level_codes) in enumerate( zip(stacked.index.levels, stacked.index.codes)): values = lev.take(level_codes) name = names[i] tm.assert_index_equal(values, Index(deleveled[name])) stacked.index.names = [None, None] deleveled2 = stacked.reset_index() tm.assert_series_equal(deleveled["first"], deleveled2["level_0"], check_names=False) tm.assert_series_equal(deleveled["second"], deleveled2["level_1"], check_names=False) # default name assigned rdf = float_frame.reset_index() exp = Series(float_frame.index.values, name="index") tm.assert_series_equal(rdf["index"], exp) # default name assigned, corner case df = float_frame.copy() df["index"] = "foo" rdf = df.reset_index() exp = Series(float_frame.index.values, name="level_0") tm.assert_series_equal(rdf["level_0"], exp) # but this is ok float_frame.index.name = "index" deleveled = float_frame.reset_index() tm.assert_series_equal(deleveled["index"], Series(float_frame.index)) tm.assert_index_equal(deleveled.index, Index(np.arange(len(deleveled)))) # preserve column names float_frame.columns.name = "columns" resetted = float_frame.reset_index() assert resetted.columns.name == "columns" # only remove certain columns df = float_frame.reset_index().set_index(["index", "A", "B"]) rs = df.reset_index(["A", "B"]) # TODO should reset_index check_names ? tm.assert_frame_equal(rs, float_frame, check_names=False) rs = df.reset_index(["index", "A", "B"]) tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False) rs = df.reset_index(["index", "A", "B"]) tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False) rs = df.reset_index("A") xp = float_frame.reset_index().set_index(["index", "B"]) tm.assert_frame_equal(rs, xp, check_names=False) # test resetting in place df = float_frame.copy() resetted = float_frame.reset_index() return_value = df.reset_index(inplace=True) assert return_value is None tm.assert_frame_equal(df, resetted, check_names=False) df = float_frame.reset_index().set_index(["index", "A", "B"]) rs = df.reset_index("A", drop=True) xp = float_frame.copy() del xp["A"] xp = xp.set_index(["B"], append=True) tm.assert_frame_equal(rs, xp, check_names=False)
def process(data: DataFrame) -> GeneralProcessor: """ Process the data and structure them in a 2D table. Parameters ---------- data: DataFrame Original data. Returns ------- GeneralProcessor Processed and structured data. """ columns = ["value", "date", "areaCode", "areaType", "areaName", "category"] dt_final = DataFrame(columns=columns) # Because of the hierarchical nature of the original data, there is # no easy way to automate this process using a generic solution # without prolonging the execution time. The iterative method appears # to produce the optimal time. for area_type in CATEGORY_LABELS: dt_label = DataFrame(columns=columns) for area_code in data[area_type]: area_name = data[area_type][area_code]['name']['value'] df_code = DataFrame(columns=columns) for category in VALUE_COLUMNS: if category not in data[area_type][area_code]: continue df_value = json_normalize(data[area_type][area_code], [category], []) df_value["areaCode"] = area_code df_value["areaType"] = area_type df_value["areaName"] = area_name df_value["category"] = category df_code = df_code.append(df_value) dt_label = dt_label.append(df_code) dt_final = dt_final.append(dt_label) # Reset index to appear incrementally. dt_final = dt_final.reset_index()[columns] logging.info(">> Data was successfully processed.") # Convert date strings to timestamp objects (needed for sorting). dt_final[DATE_COLUMN] = to_datetime(dt_final[DATE_COLUMN]) logging.info(">> Dates were successfully converted to datetime object.") # Create a hierarchy that allows aggregation as required # in output data. dt_final = dt_final.groupby( ["areaType", "category", "date", "areaName", "areaCode"]) logging.info(">> Data has been grouped.") # Given that the aggregation grouping produces rows with unique # value, the `sum()` function will produce the original value # or `NaN`. dt_final = dt_final.sum().unstack(["areaType", "category"]) # Sort the data dt_final = dt_final.sort_values(["date", "areaName"], ascending=False).reset_index() logging.info( ">> Data was successfully sorted by date and area name - descending.") metadata = Metadata(lastUpdatedAt=data['lastUpdatedAt'], disclaimer=data['disclaimer']) logging.info(">> Metadata extracted.") daily_records = DailyRecords(areaName="United Kingdom", totalLabConfirmedCases=None, dailyLabConfirmedCases=None) if (overview := data.get("overview")) is None: logging.warning(f'Missing data - Key: overview')
def test_reset_index_datetime(self, tz_naive_fixture): # GH#3950 tz = tz_naive_fixture idx1 = pd.date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") idx2 = Index(range(5), name="idx2", dtype="int64") idx = MultiIndex.from_arrays([idx1, idx2]) df = DataFrame( { "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"] }, index=idx, ) expected = DataFrame( { "idx1": [ datetime(2011, 1, 1), datetime(2011, 1, 2), datetime(2011, 1, 3), datetime(2011, 1, 4), datetime(2011, 1, 5), ], "idx2": np.arange(5, dtype="int64"), "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"], }, columns=["idx1", "idx2", "a", "b"], ) expected["idx1"] = expected["idx1"].apply( lambda d: Timestamp(d, tz=tz)) tm.assert_frame_equal(df.reset_index(), expected) idx3 = pd.date_range("1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3") idx = MultiIndex.from_arrays([idx1, idx2, idx3]) df = DataFrame( { "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"] }, index=idx, ) expected = DataFrame( { "idx1": [ datetime(2011, 1, 1), datetime(2011, 1, 2), datetime(2011, 1, 3), datetime(2011, 1, 4), datetime(2011, 1, 5), ], "idx2": np.arange(5, dtype="int64"), "idx3": [ datetime(2012, 1, 1), datetime(2012, 2, 1), datetime(2012, 3, 1), datetime(2012, 4, 1), datetime(2012, 5, 1), ], "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"], }, columns=["idx1", "idx2", "idx3", "a", "b"], ) expected["idx1"] = expected["idx1"].apply( lambda d: Timestamp(d, tz=tz)) expected["idx3"] = expected["idx3"].apply( lambda d: Timestamp(d, tz="Europe/Paris")) tm.assert_frame_equal(df.reset_index(), expected) # GH#7793 idx = MultiIndex.from_product([["a", "b"], pd.date_range("20130101", periods=3, tz=tz)]) df = DataFrame(np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx) expected = DataFrame( { "level_0": "a a a b b b".split(), "level_1": [ datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3), ] * 2, "a": np.arange(6, dtype="int64"), }, columns=["level_0", "level_1", "a"], ) expected["level_1"] = expected["level_1"].apply( lambda d: Timestamp(d, freq="D", tz=tz)) result = df.reset_index() tm.assert_frame_equal(result, expected)
def lithotrack(df: pd.DataFrame, codecols: list, percols: list,
               dtick: bool = False, lims: list = None, codedict: dict = None,
               fontsize=8, ax=None, correlation: pd.DataFrame = None,
               grid_numbers: list = [11, 51], steps: list = None,
               corr_kw=None):
    """Plot a lithology track from coded lithology columns.

    Parameters
    ----------
    df : pd.DataFrame
        Well data indexed by depth, holding lithology codes and percentages.
    codecols : list
        Columns containing the lithology codes.
    percols : list
        Columns containing the percentage for each code column.
    dtick : bool, optional
        Show depth tick labels, by default False
    lims : list, optional
        [top, bottom] depth limits; defaults to the full depth range.
    codedict : dict, optional
        Mapping from lithology code to display name for the legend.
    fontsize : int, optional
        Tick label size, by default 8
    ax : matplotlib axis, optional
        Axis to draw on; defaults to the current axis.
    correlation : pd.DataFrame, optional
        Correlation levels to annotate, by default None
    grid_numbers : list, optional
        Number of major/minor horizontal grid lines, by default [11, 51]
    steps : list, optional
        Major/minor grid spacing in depth units; overrides grid_numbers.
    corr_kw : dict, optional
        Matplotlib line kwargs for correlation markers, by default None
    """
    lit = ax or plt.gca()

    # Avoid a mutable default argument: corr_kw is filled in per call.
    if corr_kw is None:
        corr_kw = {}
    def_corr_kw = {
        'color': 'red',
        'linestyle': '--',
        'linewidth': 2
    }
    for (k, v) in def_corr_kw.items():
        if k not in corr_kw:
            corr_kw[k] = v

    df.index.names = ['depth']
    df = df.reset_index()
    if lims is None:  # Depth limits default to the full depth range
        lims = [df['depth'].min(), df['depth'].max()]
    # Filter on the depth column: after reset_index the frame has a plain
    # RangeIndex, so the depth values live in the 'depth' column.
    df = df.loc[(df['depth'] >= lims[0]) & (df['depth'] <= lims[1]), :]

    # Create a pivot table concatenating the lithology code names
    mm = pd.DataFrame()
    for (k, v) in enumerate(codecols):
        m = df.pivot_table(index=['depth'], columns=[v], values=percols[k])
        mm = pd.concat([mm, m], axis=1)

    # Merge the repeated column names into a single dataframe
    mm = mm.fillna(0)
    lm = pd.DataFrame()
    for i in mm.columns.unique():
        if mm[i].ndim > 1:
            lm[i] = mm[i].max(axis=1)
        elif mm[i].ndim == 1:
            lm[i] = mm[i]

    try:
        lm = lm.drop(columns=[0])
    except KeyError:
        pass

    lmc = np.cumsum(lm, axis=1)
    for i, col in enumerate(lmc.columns):
        lit.fill_betweenx(lmc.index, lmc.iloc[:, i],
                          label=codedict[col], zorder=-i)

    lit.set_ylim([lims[1], lims[0]])

    # Set the vertical grid spacing
    if steps is None:
        mayor_grid = np.linspace(lims[0], lims[1], grid_numbers[0])
        minor_grid = np.linspace(lims[0], lims[1], grid_numbers[1])
    else:
        mayor_grid = np.arange(lims[0], lims[1], steps[0])
        minor_grid = np.arange(lims[0], lims[1], steps[1])

    lit.legend()
    lit.set_xlim([0, 100])
    lit.set_yticks(mayor_grid)
    lit.set_yticks(minor_grid, minor=True)

    if dtick == True:
        lit.set_yticklabels(mayor_grid)
    else:
        lit.set_yticklabels([])

    lit.set_xlabel("Lithology")
    lit.xaxis.tick_top()
    lit.xaxis.set_label_position("top")
    lit.tick_params("both", labelsize=fontsize)
def merge_preserve_left_index(left: pd.DataFrame, right: pd.DataFrame, **kwargs):
    # Stash the left index as an 'index' column, merge, then restore it;
    # pandas.merge would otherwise reset the result to a RangeIndex.
    return left.reset_index().merge(
        right, **kwargs).set_index('index').rename_axis(None)
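# Usage sketch (my addition; the frames are hypothetical). The helper keeps
# the left frame's original (unnamed) index through the merge:
import pandas as pd

left = pd.DataFrame({'key': ['a', 'b']}, index=[10, 20])
right = pd.DataFrame({'key': ['a', 'b'], 'val': [1, 2]})
merged = merge_preserve_left_index(left, right, on='key', how='left')
# merged has columns ['key', 'val'] and keeps left's index [10, 20],
# where a plain left.merge(right, on='key') would return index [0, 1].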
f_House_yn['f_marsupwt'] = family_marsupwt
f_House_yn['h_marsupwt'] = house_marsupwt
family_gestfips = CPS_dataset.groupby(['fh_seq', 'ffpos'])['gestfips'].mean()
f_House_yn['f_gestfips'] = family_gestfips
# Under 30 percent of median income
f_House_yn['under_30_inc'] = f_House_yn.apply(lambda x: income_lim_indicator30(
    x['family_net'], x['family_size'], x['f_gestfips']), axis=1)
# Under 50 percent of median income
f_House_yn['under_50_inc'] = f_House_yn.apply(lambda x: income_lim_indicator50(
    x['family_net'], x['family_size'], x['f_gestfips']), axis=1)
f_House_yn = f_House_yn.reset_index()
# f_House_yn.to_csv('use_df_both.csv')
# f_House_yn = pd.read_csv('use_df_both.csv')
f_House_yn['RfYes'] = Rf_probs[:, 1]
# CPS total benefits and Administrative total benefits
state_benefit = {}
state_recipients = {}
for fip in Admin_totals.Fips:
    this_state = (f_House_yn.f_gestfips == fip)
    CPS_totalb = (f_House_yn.fVouch_val[f_House_yn.indicator == 1] *
                  f_House_yn.f_marsupwt)[this_state].sum()  # The CPS subsidy amount is monthly
    admin_totalb = Admin_totals['housing_value'][
        Admin_totals.Fips == fip].values / 12  # to match monthly
def prophet(  # pylint: disable=too-many-arguments
    df: DataFrame,
    time_grain: str,
    periods: int,
    confidence_interval: float,
    yearly_seasonality: Optional[Union[bool, int]] = None,
    weekly_seasonality: Optional[Union[bool, int]] = None,
    daily_seasonality: Optional[Union[bool, int]] = None,
) -> DataFrame:
    """
    Add forecasts to each series in a timeseries dataframe, along with confidence
    intervals for the prediction. For each series, the operation creates three
    new columns with the column name suffixed with the following values:

    - `__yhat`: the forecast for the given date
    - `__yhat_lower`: the lower bound of the forecast for the given date
    - `__yhat_upper`: the upper bound of the forecast for the given date

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param time_grain: Time grain used to specify time period increments in prediction
    :param periods: Time periods (in units of `time_grain`) to predict into the future
    :param confidence_interval: Width of predicted confidence interval
    :param yearly_seasonality: Should yearly seasonality be applied.
           An integer value will specify Fourier order of seasonality.
    :param weekly_seasonality: Should weekly seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :param daily_seasonality: Should daily seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :return: DataFrame with forecasts, with temporal column at beginning if present
    """
    # validate inputs
    if not time_grain:
        raise QueryObjectValidationError(_("Time grain missing"))
    if time_grain not in PROPHET_TIME_GRAIN_MAP:
        raise QueryObjectValidationError(
            _(
                "Unsupported time grain: %(time_grain)s",
                time_grain=time_grain,
            ))
    freq = PROPHET_TIME_GRAIN_MAP[time_grain]
    # check type at runtime due to marshmallow schema not being able to handle
    # union types
    if not periods or periods < 0 or not isinstance(periods, int):
        raise QueryObjectValidationError(
            _("Periods must be a positive integer value"))
    if not confidence_interval or confidence_interval <= 0 or confidence_interval >= 1:
        raise QueryObjectValidationError(
            _("Confidence interval must be between 0 and 1 (exclusive)"))
    if DTTM_ALIAS not in df.columns:
        raise QueryObjectValidationError(
            _("DataFrame must include temporal column"))
    if len(df.columns) < 2:
        raise QueryObjectValidationError(
            _("DataFrame must include at least one series"))

    target_df = DataFrame()
    for column in [column for column in df.columns if column != DTTM_ALIAS]:
        fit_df = _prophet_fit_and_predict(
            df=df[[DTTM_ALIAS, column]].rename(columns={
                DTTM_ALIAS: "ds",
                column: "y"
            }),
            confidence_interval=confidence_interval,
            yearly_seasonality=_prophet_parse_seasonality(yearly_seasonality),
            weekly_seasonality=_prophet_parse_seasonality(weekly_seasonality),
            daily_seasonality=_prophet_parse_seasonality(daily_seasonality),
            periods=periods,
            freq=freq,
        )
        new_columns = [
            f"{column}__yhat",
            f"{column}__yhat_lower",
            f"{column}__yhat_upper",
            f"{column}",
        ]
        fit_df.columns = new_columns
        if target_df.empty:
            target_df = fit_df
        else:
            for new_column in new_columns:
                target_df = target_df.assign(
                    **{new_column: fit_df[new_column]})
    target_df.reset_index(level=0, inplace=True)
    return target_df.rename(columns={"ds": DTTM_ALIAS})
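# Usage sketch (my addition): the column name 'sales' and the ISO-8601 grain
# key "P1D" are assumptions for illustration; DTTM_ALIAS is whatever temporal
# alias the surrounding module defines.
import pandas as pd

df = pd.DataFrame({
    DTTM_ALIAS: pd.date_range("2021-01-01", periods=30, freq="D"),
    "sales": range(30),
})
forecast = prophet(df, time_grain="P1D", periods=7, confidence_interval=0.8)
# forecast gains sales__yhat, sales__yhat_lower and sales__yhat_upper
# columns, with 7 extra daily rows appended for the prediction horizon.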
def test_set_index_cast_datetimeindex(self): df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], 'B': np.random.randn(1000)}) idf = df.set_index('A') assert isinstance(idf.index, pd.DatetimeIndex) # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 i = (pd.DatetimeIndex( to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'], errors="raise")) .tz_localize('US/Pacific')) df = DataFrame(np.random.randn(2, 1), columns=['A']) expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')], dtype="object")) # convert index to series result = Series(i) assert_series_equal(result, expected) # assignt to frame df['B'] = i result = df['B'] assert_series_equal(result, expected, check_names=False) assert result.name == 'B' # keep the timezone result = i.to_series(keep_tz=True) assert_series_equal(result.reset_index(drop=True), expected) # convert to utc df['C'] = i.to_series().reset_index(drop=True) result = df['C'] comp = pd.DatetimeIndex(expected.values).copy() comp.tz = None tm.assert_numpy_array_equal(result.values, comp.values) # list of datetimes with a tz df['D'] = i.to_pydatetime() result = df['D'] assert_series_equal(result, expected, check_names=False) assert result.name == 'D' # GH 6785 # set the index manually import pytz df = DataFrame( [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}]) expected = df.set_index('ts') df.index = df['ts'] df.pop('ts') assert_frame_equal(df, expected) # GH 3950 # reset_index with single level for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']: idx = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz, name='idx') df = pd.DataFrame( {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) expected = pd.DataFrame({'idx': [datetime(2011, 1, 1), datetime(2011, 1, 2), datetime(2011, 1, 3), datetime(2011, 1, 4), datetime(2011, 1, 5)], 'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, columns=['idx', 'a', 'b']) expected['idx'] = expected['idx'].apply( lambda d: pd.Timestamp(d, tz=tz)) assert_frame_equal(df.reset_index(), expected)
def globalfuncion(Base, Diccionario, Variable_Analizar, listStopWords):
    '''
    Results-generation function

    Description
    ----------------------------------------------------------------------------------------------------------
    This function builds a data frame showing the project name, the column
    being analysed, and the product type into which the text was classified.

    Parameters:
    ----------------------------------------------------------------------------------------------------------
    Base (DataFrame)           --- Database holding the information to be processed.
    Diccionario (DataFrame)    --- Dictionary holding the words and the product type for each word.
    Variable_Analizar (String) --- Name of the column on which the text processing is performed.
    listStopWords (list)       --- List of stopwords

    Returns:
    -----------------------------------------------------------------------------------------------------------
    DataFrame

    Note:
    ------------------------------------------------------------------------------------------------------------
    This function depends on the following functions:
        repeticiones(....)
        corpusword(.....)
        search_tec_inno(......)
        limpieza_texto1(text)
    '''
    Idioma = 'PALABRAS'
    if (Variable_Analizar == 'OUTPUT_NAME'):
        Base_Aux = Base[{
            'OPERATION_NUMBER', 'COMPONENT_NAME', Variable_Analizar,
            'OUTPUT_DESCRIPTION'
        }]
        Base_Aux = Base_Aux[(pd.isnull(Base_Aux['OUTPUT_DESCRIPTION']) == False) |
                            (pd.isnull(Base_Aux['COMPONENT_NAME']) == False)]
        Base_Aux['OUTPUT_NAME'] = Base_Aux['OUTPUT_NAME'].fillna('')
        Base_Aux['OUTPUT_DESCRIPTION'] = Base_Aux['OUTPUT_DESCRIPTION'].fillna('')
        a = list([str(i) for i in (Base_Aux['OUTPUT_NAME'])])
        b = list([str(j) for j in (Base_Aux['OUTPUT_DESCRIPTION'])])
        c = []
        for i in range(len(a)):
            if (a[i] != '') & (b[i] != ''):
                c.append(a[i] + str(' ') + b[i])
            elif b[i] == '':
                c.append(a[i])
            else:
                c.append(b[i])
        Base_Aux['OUTPUT_NAME'] = c
        Base_Aux.drop(['OUTPUT_DESCRIPTION'], axis=1, inplace=True)
    else:
        Base_Aux = DataFrame()
        Base_Aux = Base[['OPERATION_NUMBER', Variable_Analizar]]
        Base_Aux.drop_duplicates(inplace=True)
        Base_Aux.dropna(inplace=True)
    Base_Aux[Variable_Analizar] = Base_Aux[Variable_Analizar].apply(str)
    list_of_words = Base_Aux[Variable_Analizar].apply(
        corpusword,
        args=(
            Diccionario_Total[Diccionario_Total.TOKENS == 1]['PALABRAS'],
            listStopWords,
        ))
    list_of_words2 = Base_Aux[Variable_Analizar].apply(
        limpieza_texto2, args=(Diccionario_Total, ))
    list_of_words3 = Base_Aux[Variable_Analizar].apply(searchsysteminformation)
    list_of_words = list_of_words + list_of_words2 + list_of_words3
    rep_name = repeticiones(list_of_words, Base_Aux, 'OPERATION_NUMBER')
    rep_variable = repeticiones(list_of_words, Base_Aux, Variable_Analizar)
    dframe = DataFrame()
    if (Variable_Analizar == 'OUTPUT_NAME'):
        Base_Aux['COMPONENT_NAME'] = Base_Aux['COMPONENT_NAME'].astype(str)
        rep_component = repeticiones(list_of_words, Base_Aux, 'COMPONENT_NAME')
        dframe['COMPONENT_NAME'] = rep_component
    list_of_words = list(chain(*list_of_words))
    #
    dframe['OPERATION_NUMBER'] = rep_name
    dframe[Variable_Analizar] = rep_variable
    dframe['WORDS'] = list_of_words
    #
    # Base_Aux[Variable_Analizar]=Base_Aux[Variable_Analizar].str.replace(' xxxxxx ',' Red ')
    dframe = dframe.merge(Diccionario[['TIPO', Idioma]],
                          left_on='WORDS',
                          right_on=Idioma,
                          how='left')
    if (Variable_Analizar == 'OUTPUT_NAME'):
        dframe2 = dframe[[
            'OPERATION_NUMBER', 'COMPONENT_NAME', Variable_Analizar, 'WORDS'
        ]]
        dframe2.drop_duplicates(inplace=True)
        dframe = dframe[[
            'OPERATION_NUMBER', 'COMPONENT_NAME', Variable_Analizar, 'TIPO'
        ]].drop_duplicates()
        dframe = pd.crosstab([
            dframe['OPERATION_NUMBER'], dframe['COMPONENT_NAME'],
            dframe[Variable_Analizar]
        ], columns=dframe['TIPO'])
    else:
        dframe2 = dframe[[
            'OPERATION_NUMBER', Variable_Analizar, 'WORDS', 'TIPO'
        ]]
        # dframe2.drop_duplicates(inplace=True)
        dframe = dframe[['OPERATION_NUMBER', Variable_Analizar, 'TIPO']].drop_duplicates()
        dframe = pd.crosstab(
            [dframe['OPERATION_NUMBER'], dframe[Variable_Analizar]],
            columns=dframe['TIPO'])
    # dframe.reset_index(inplace=True)
    X = set(dframe.columns)  # set of the existing columns
    # columns required in order to apply the classification conditions below
    Y = set({'NEGATIVO', 'NEUTRO', 'NEUTRO POSITIVO', 'POSITIVO'})
    b = list(Y - X)
    if len(b) > 0:
        aux = DataFrame(np.repeat(0, len(b) * dframe.shape[0]).reshape(
            (dframe.shape[0], len(b))), columns=b)
        dframe = pd.concat([dframe, aux], axis=1)
    Base_Aux.index = range(len(Base_Aux))
    dframe = Base_Aux.merge(dframe, how='left')
    dframe.fillna(np.nan, inplace=True)
    if (Variable_Analizar == 'OUTPUT_NAME'):
        dframe = dframe[[
            'OPERATION_NUMBER', 'COMPONENT_NAME', Variable_Analizar,
            'NEGATIVO', 'NEUTRO', 'NEUTRO POSITIVO', 'POSITIVO'
        ]]
        dframe = dframe.groupby([
            dframe['OPERATION_NUMBER'], dframe['COMPONENT_NAME'],
            dframe[Variable_Analizar]
        ]).sum()
    else:
        dframe = dframe[[
            'OPERATION_NUMBER', Variable_Analizar, 'NEGATIVO', 'NEUTRO',
            'NEUTRO POSITIVO', 'POSITIVO'
        ]]
        dframe = dframe.groupby(
            [dframe['OPERATION_NUMBER'], dframe[Variable_Analizar]]).sum()
    # Apply the classification conditions
    dframe['RESULT' + '_' + Variable_Analizar] = np.where(
        (dframe['NEGATIVO'] == 0) & (dframe['NEUTRO'] == 0) &
        (dframe['NEUTRO POSITIVO'] == 0) & (dframe['POSITIVO'] == 0),
        'NO DIGITAL',
        np.where(
            (dframe['NEGATIVO'] >= 1) & (dframe['NEUTRO'] == 0) &
            (dframe['NEUTRO POSITIVO'] == 0) & (dframe['POSITIVO'] == 0),
            'NO DIGITAL',
            np.where(
                (dframe['NEGATIVO'] >= 1) & (dframe['NEUTRO'] >= 1) &
                (dframe['NEUTRO POSITIVO'] == 0) & (dframe['POSITIVO'] == 0),
                'NO DIGITAL',
                np.where(
                    (dframe['NEGATIVO'] == 0) & (dframe['NEUTRO'] == 0) &
                    (dframe['NEUTRO POSITIVO'] >= 1) & (dframe['POSITIVO'] == 0),
                    'NO DIGITAL',
                    np.where(
                        (dframe['NEGATIVO'] >= 1) & (dframe['NEUTRO'] == 0) &
                        (dframe['NEUTRO POSITIVO'] >= 1) & (dframe['POSITIVO'] == 0),
                        'NO DIGITAL',
                        np.where(
                            (dframe['NEGATIVO'] == 0) & (dframe['NEUTRO'] >= 1) &
                            (dframe['NEUTRO POSITIVO'] == 0) & (dframe['POSITIVO'] == 0),
                            'SIN DEFINIR', 'DIGITAL'))))))
    dframe.drop(['NEGATIVO', 'NEUTRO', 'NEUTRO POSITIVO', 'POSITIVO'],
                axis=1, inplace=True)
    dframe.reset_index(inplace=True)
    dframe['RESULT' + '_' + Variable_Analizar] = np.where(
        (dframe[Variable_Analizar] == ' ') |
        (dframe[Variable_Analizar] == 'x') |
        (dframe[Variable_Analizar] == 'xx') |
        (dframe[Variable_Analizar] == '.') |
        (dframe[Variable_Analizar] == ',') |
        [x in str(range(20)) for x in dframe[Variable_Analizar]] |
        (dframe[Variable_Analizar].apply(type) == int) |
        (dframe[Variable_Analizar] == '*') |
        (dframe[Variable_Analizar] == '#') |
        (dframe[Variable_Analizar] == '-') |
        (dframe[Variable_Analizar] == '_') |
        (dframe[Variable_Analizar] == '- ') |
        (dframe[Variable_Analizar] == ' -') |
        (dframe[Variable_Analizar] == '. -'), np.nan,
        dframe['RESULT' + '_' + Variable_Analizar])
    dframe['RESULT_' + Variable_Analizar +
           '_TECN-INNOV'] = dframe[Variable_Analizar].apply(search_tec_inno)
    return [dframe, dframe2]
def process_matebook_data(directory, paramlist, storage_location): vidname = parse_screen_filename(directory) for filename in find_files(directory, 'track.tsv'): vidpath, flyID = parse_filename(filename) tag = vidname + "_" + flyID if not os.path.exists(storage_location + '/' + tag + '_arena.pickle'): fi = pd.read_table(filename, sep='\t', header = [0,1], skiprows=[2,3]) tempdf = DataFrame(index = fi.index) if fi['Unnamed: 8_level_0', 'isMissegmented'].mean() >= 0.2: print "arena dropped for poor quality: ", tag continue elif fi['Unnamed: 8_level_0', 'isMissegmented'].mean() == 0.0: print "arena dropped because quality = 1: ", tag continue elif len(set(fi['Unnamed: 3_level_0', 'courtship'])) <=1: print "arena dropped because courtship = nan: ", tag continue else: for j in paramlist: tempdf[j[1]] = fi[j[0],j[1]] if 'movedAbs_u' in j: tempdf[j[1]] = tempdf[j[1]] * FPS tempdf['Time'] = tempdf.index/FPS time_ID = vidpath.split('_',1)[-1].split('.',1)[0] tempdf = merge_jvision_data(tempdf.reset_index(), time_ID) tempdf.to_pickle(storage_location + '/'+ tag + '_arena.pickle') print ".....", tag, " processed to pickling." return