Example #1
def prep_data(data, balance=True):
	'''
	prepares a machine learning dataframe from mitdb HDFStore object

	Args:
		data (HDFStore): mitdb HDFStore data
		balance (bool, opt): balance arrythmia/not arrythmia classes

	Returns:
		DataFrame
	'''
	records = filter(lambda x: re.search('record', x), data.keys())
	records = [data[key] for key in records]
	
	data = DataFrame()
	for record in records:
		if record.arrythmia.sum() > 1:
			data = pd.concat([data, conform_data(record)])

	data.reset_index(drop=True, inplace=True)
	
	if balance:
		mask = data.y == 1
		size = data[mask].shape[0]
		index = np.random.choice(data[~mask].index, size)
		index = np.concatenate([index, data[mask].index])
		data = data.ix[index]
		data.reset_index(drop=True, inplace=True)
		
	return data
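
A minimal, self-contained sketch of the same down-sampling pattern on a toy frame (column names made up; modern pandas uses .loc where the snippet's removed .ix API appears):

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': range(10), 'y': [1, 0, 0, 0, 1, 0, 0, 0, 0, 1]})

mask = df.y == 1
size = df[mask].shape[0]
# sample as many negatives as there are positives, then re-label rows 0..n-1
index = np.random.choice(df[~mask].index, size, replace=False)
index = np.concatenate([index, df[mask].index])
balanced = df.loc[index].reset_index(drop=True)
print(balanced)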
Example #2
def get_dataframe_option1(stock_list, date, zs_amplifier = 1):
    dframe = DataFrame()
    date = format_date(date, "%Y-%m-%d")
    stock_list = list(set(stock_list))
    # stock_list = ['601800', '600528']
    dframe_list = []
    for stock in stock_list:
        if stock == u'ZS000001':    # SSE Composite Index (Shanghai)
            tmp_frame = get_minly_frame(stock, date, id_type=0)
            zs_amplifier = 1
        else:
            tmp_frame = get_minly_frame(stock, date)
        tmp_frame = tmp_frame[['bartime', 'closeprice']]
        yesterday = get_lastN_date(date, 1)
        yeframe = get_mysqlData([stock],[yesterday])
        if len(yeframe) > 0:
            pre_close = yeframe.loc[0,'CLOSE_PRICE']
        else:
            pre_close = 10000
        # compute the percentage change against the previous close; the amplitude can be scaled up
        tmp_frame['closeprice'] = zs_amplifier * normalize_frame(tmp_frame['closeprice'], pre_close)
        tmp_frame.columns = ['barTime', stock]
        tmp_frame.set_index('barTime', inplace=True)
        dframe_list.append(tmp_frame)
    dframe = pd.concat(dframe_list, axis=1)

    dframe.reset_index(inplace=True)  # barTime back to an ordinary column
    return dframe
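
The pattern above (one normalized close-price column per stock, aligned on the barTime index, then reset_index to recover the time column) in isolation, with made-up stock codes and values:

import pandas as pd

a = pd.DataFrame({'600000': [0.010, 0.012]},
                 index=pd.Index(['09:31', '09:32'], name='barTime'))
b = pd.DataFrame({'600016': [-0.004, 0.001]},
                 index=pd.Index(['09:31', '09:32'], name='barTime'))

wide = pd.concat([a, b], axis=1)   # columns aligned on the shared barTime index
wide.reset_index(inplace=True)     # barTime becomes an ordinary column again
print(wide)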
Example #3
    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]],
                                            names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product([list('abc'), list('xy')],
                                             names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=['abc', 'xy'], how=join_type)
        expected = (left.reset_index()
                        .merge(right.reset_index(),
                               on=['abc', 'xy'], how=join_type)
                        .set_index(['abc', 'xy', 'num'])
                    )
        assert_frame_equal(expected, result)

        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            left.join(right, on='xy', how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=['abc', 'xy'], how=join_type)
Example #4
def clicksDataframe(clicks_data):
    clicks_dataframe = DataFrame(clicks_data, columns=['date', 'cardName', 'position', 'totalClicks', 'uniqueClicks'])
    clicks_dataframe = clicks_dataframe.apply(to_numeric, errors='ignore')
    clicks_dataframe.drop('date', axis=1, inplace=True)
    clicks_dataframe = clicks_dataframe.groupby(['cardName','position']).sum().sort_values(by='uniqueClicks',ascending=0)
    clicks_dataframe.reset_index(inplace=True)

    return clicks_dataframe
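
A toy version of the groupby/sum/reset_index step above, with made-up click data:

import pandas as pd

clicks = pd.DataFrame({'cardName': ['a', 'a', 'b'],
                       'position': [1, 1, 2],
                       'totalClicks': [6, 4, 9],
                       'uniqueClicks': [5, 3, 7]})
agg = clicks.groupby(['cardName', 'position']).sum()    # rows keyed by a MultiIndex
agg = agg.sort_values(by='uniqueClicks', ascending=False)
agg.reset_index(inplace=True)                           # flatten the keys back into columns
print(agg)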
Example #5
 def _count_by_entity(data, var, entity, bornes):
     ''' Counts how many values of 'var' fall between the 'bornes' (bounds) within each 'entity'. '''
     id = 'id' + entity
     qui = 'qui' + entity
     data.index = data[id]
     cond = (bornes[0] <= data[var]) & (data[var] <= bornes[1]) & (data[qui] > 1)
     col = DataFrame(data.loc[cond, :].groupby(id).size(), index = data.index).fillna(0)
     col = col.reset_index()  # reset_index is not in-place by default, so keep the result
     return col
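
A toy run of the same count-within-bounds idea, assuming the household ('men') entity so the columns become idmen/quimen:

import pandas as pd

data = pd.DataFrame({'idmen': [1, 1, 1, 2],
                     'age': [35, 10, 12, 40],
                     'quimen': [0, 2, 3, 0]})
cond = (0 <= data['age']) & (data['age'] <= 18) & (data['quimen'] > 1)
counts = data.loc[cond].groupby('idmen').size()   # Series indexed by idmen
counts = counts.reset_index(name='n_in_range')    # back to a two-column frame
print(counts)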
Example #6
def nearestNeighborsSetup(filename, stateList):
  df_specimens = formatChecker(filename)
  print 'Getting the weather stations'
  with open('input/acis_station_ID.pickle') as f:
      weatherStationsMetaData = cPickle.load(f)

  # weatherStationsMetaData = weatherStations(stateList)
  # weatherStationsMetaData = read_csv('weatherStation/acis_station_ID.csv')
  df_stations = DataFrame.from_dict(weatherStationsMetaData, orient='index', dtype=None)
  '''Loads the lat/long coordinates of the specimens and weather stations into numpy arrays.
  nearestNeighborsResults() will return the K nearest stations with their index values.
  The index will then be replaced by the UID to match the ACIS data server.'''
  #Number of points
  np1 = np.array(df_specimens['longitude']).size
  np2 = np.array(df_stations['longitude']).size

  #Search radius
  r = .25

  #Number of nearest stations returned
  k = 10

  d1 = np.empty((np1, 2))
  d2 = np.empty((np2, 2))
  d1[:, 0] = np.array(df_specimens['latitude'])
  d1[:, 1] = np.array(df_specimens['longitude'])

  d2[:, 0] = np.array(df_stations['latitude'])
  d2[:, 1] = np.array(df_stations['longitude'])
 
  result, distance = nearestNeighborsResults(d1.copy(), d2.copy(), r, k)
  columnindex = []
  closestStationList = [nearestNeighborsColumnString(x) for x in range(k)]
  for f in closestStationList: columnindex.append(f()),
  #temp variable for 0-N array
  t1 = np.arange(np2)
  #temp variable for 'uid' ID
  t2 = np.array(df_stations['uid'])
  df_results = DataFrame(result, columns=columnindex)
  #Creates a Pandas DataFrame
  uid_index = DataFrame({'0_closest_weather_station':  t1,
    'uid': t2})

  for index, column_name in enumerate(columnindex):
    temp = uid_index.rename(columns={'0_closest_weather_station': column_name, 'uid': column_name + "s"})
    df_results = df_results.reset_index().merge(temp, how='left', on= column_name, sort=False).sort('index')
    
    if index != 0:
      del df_results['level_0']

    del df_results[column_name]

  del df_results['index']
  df_results = df_results.reset_index()
  return concat([df_specimens, df_results], axis=1), distance, weatherStationsMetaData
Example #7
class ResetIndex:

    params = [None, 'US/Eastern']
    param_names = ['tz']

    def setup(self, tz):
        idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz)
        self.df = DataFrame(np.random.randn(1000, 2), index=idx)

    def time_reset_datetimeindex(self, tz):
        self.df.reset_index()
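
Outside the asv harness, the benchmark above can be exercised by hand roughly as follows (assuming the numpy/pandas imports the benchmark module would already have):

bench = ResetIndex()
for tz in ResetIndex.params:   # None and 'US/Eastern'
    bench.setup(tz)
    bench.df.reset_index()     # the operation the benchmark times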
Example #8
def homePageToSubjectPageDataframe(data):
    subject_dataframe = DataFrame(data,columns=['date','page_title','views','uniqueViews'])
    subject_dataframe = subject_dataframe.apply(to_numeric, errors='ignore')
    subject_dataframe.drop('date', axis=1, inplace=True)
    subject_dataframe = subject_dataframe.groupby(['page_title']).sum().sort_values(by='uniqueViews',ascending=0)
    subject_dataframe.reset_index(inplace=True)
    subject_dataframe['subject'] = subject_dataframe['page_title'].apply(lambda title: strip_edx_page_title(title))
    subject_dataframe['totalViews'] = subject_dataframe['uniqueViews'].sum()
    subject_dataframe['Pct'] = (subject_dataframe['uniqueViews'] / subject_dataframe['totalViews'])
    subject_dataframe = subject_dataframe[(subject_dataframe['Pct']>0.0001)]

    return subject_dataframe[['subject','uniqueViews','Pct']]
Example #9
def append_village_areas(divname):
    im_vil = pd.read_csv('../data/%s_village_images.csv' % divname.lower())
    shape_helper = ShapeHelper('../data/shapefiles/fixed_village_shapefiles/%s/%s.shp' % (divname.lower(), divname.lower()),
                               lat_offset, lon_offset)
    areas = shape_helper.get_shape_areas('village')
    areas_df = DataFrame(areas, index=['area'])
    areas_df = areas_df.transpose()
    areas_df.reset_index(inplace=True)
    areas_df.rename(columns={'index': 'village'}, inplace=True)
    im_vil_areas = pd.merge(im_vil, areas_df, how='left')
    im_vil_areas.set_index('image', inplace=True)
    im_vil_areas.to_csv('../data/%s_village_areas_images.csv' % divname.lower())
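
The transpose/reset_index/rename step in the middle of the function, on its own, with made-up areas:

import pandas as pd

areas = {'villageA': 1.5, 'villageB': 2.25}
areas_df = pd.DataFrame(areas, index=['area']).transpose()   # village names end up in the index
areas_df.reset_index(inplace=True)                           # index -> a column literally named 'index'
areas_df.rename(columns={'index': 'village'}, inplace=True)
print(areas_df)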
Example #10
def sql2pandas(db_url, table_name, locriterion=None):
    """connects to database at db_url and converts psiturk datatable table_name
       to a pandas df.  Only includes trials that meet all criterion functions
       given in locriterion (default takes all trials)"""
    from sqlalchemy import MetaData, Table, create_engine
    from json import loads
    from pandas import DataFrame, concat

    data_column_name = 'datastring'
    # boilerplate sqlalchemy setup
    engine = create_engine(db_url)
    metadata = MetaData()
    metadata.bind = engine
    table = Table(table_name, metadata, autoload=True)
    # make a query and loop through
    s = table.select()
    tablerows = s.execute()

    # convert sql rows to lodicts, each containing a subject's full experiment
    # fields from orig datatable that you want attached to every trial
    expFields = ['uniqueid', 'assignmentid', 'workerid', 'hitid', 'status']
    expData = []
    for row in tablerows:
        try:
            subExpData = loads(row[data_column_name])
            for field in expFields:
                subExpData[field] = row[field]
            expData.append(subExpData)
        except:
            continue

    # turn from nested list to flat list of trials
    minidicts = []
    for subExpData in expData:
        for trial in subExpData['data']:
            trialdata = trial['trialdata']
            for field in expFields:
                trialdata[field] = subExpData[field]

            # check if trial valid if any criterion were passed
            includeThisTrial = True
            if locriterion:
                includeThisTrial = meetsCriterion(trialdata, locriterion)

            if includeThisTrial:
                minidicts.append(trialdata)

    # convert minidicts into dataframe!
    df = DataFrame(minidicts)
    # get rid of residue from minidfs
    df.reset_index(drop=True, inplace=True)
    return df
Example #11
class InfoTable(DataFrameWidget):
    def __init__(self, samples=None):
        self.initVars()
        super(InfoTable, self).__init__(self.table)

    def initVars(self):
        """Initialises variables."""
        self.columns = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        self.table = DataFrame(columns=self.columns)

    ########################################################################
    def update(self):
        plateID = self.table["Plate ID"]
        plateName = self.table["Plate Name"]
        plateKea = self.table["Plate Kea"]
        well = self.table["Well"]
        self.table = self.table.drop(labels=["Plate ID", "Plate Name", "Plate Kea", "Well"], axis=1)
        self.table.insert(0, "Plate ID", plateID)
        self.table.insert(1, "Plate Name", plateName)
        self.table.insert(2, "Plate Kea", plateKea)
        self.table.insert(3, "Well", well)
        self.setDataFrame(self.table)

    def append(self, appendage):
        self.table = self.table.append(appendage, ignore_index=True)
        self.update()

    def editPlates(self, edits):
        self.table = self.table.set_index("Plate ID")
        edits = edits.set_index("ID")
        self.table.update(edits)
        self.table = self.table.reset_index()

    def importPlateData(self, plateData, key):
        plateData = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(plateData)
        self.table = self.table.reset_index()

    def importSampleData(self, sampleData, tableKey, importKey):
        sampleData[tableKey] = sampleData[importKey]
        sampleData = sampleData.set_index(tableKey)
        self.table = self.table.set_index(tableKey)
        self.table = self.table.join(sampleData, rsuffix="_new")
        self.table = self.table.reset_index()

    def getKeaSexTestingData(self):
        table = self.table[["Plate ID", "Well", "Sample ID", "Plant Alt Names"]]
        table = table.set_index(["Plate ID", "Well"])
        table.rename(columns={"Plant Alt Names": "Plant AltName"}, inplace=True)
        return table
Example #12
    def _fill(self, df, year = None):
        """
        Takes age, sex profile (per capita transfers) found in df
        to fill year 'year' or all empty years if year is None
        This is a private method.
        Parameters
        ----------
        
        df : DataFrame
             a dataframe containing the profiles
        
        year : int, default None
               if None fill all the years else only the given year
        
        """        
        if not isinstance(df, DataFrame): 
            df = DataFrame(df)

        for col_name in df.columns:
            if col_name not in self._types:
                self.new_type(col_name)
                typ = col_name
                tmp = df[typ]
                tmp = tmp.unstack(level="year")
                tmp = tmp.dropna(axis=1, how="all")
                self._types_years[typ] = tmp.columns
                
            else:
                raise Exception("column already exists")
        
        if year is None:
            df_insert = df.reset_index(level='year', drop=True)
            years = sorted(self.index_sets['year'])
            list_df = [df_insert] * len(years)
            df_tot = concat(list_df, keys = years, names =['year'])
            df_tot = df_tot.reorder_levels(['age','sex','year'], axis=0)
            
        else:
            yr = year
            df_tot = None
            df_insert = df.reset_index()
            df_insert['year'] = yr
            if df_tot is None:
                df_tot = df_insert
            else:
                df_tot.append(df_insert, ignore_index=True)
                df_tot = df_tot.set_index(['age','sex','year'])
        
#         print df_tot
#         print len(df_tot)
        self.update(df_tot)
Example #13
    def _decode_solutions(self, solutions):
        decoded_solutions = DataFrame(columns=["targets", "fitness"])
        index = 0
        for solution in solutions:
            combinations = self._decoder(solution.candidate, flat=True, decompose=True)
            for targets in combinations:
                if len(targets) > 0:
                    decoded_solutions.loc[index] = [tuple(targets), solution.fitness]
                    index += 1

        decoded_solutions.drop_duplicates(inplace=True, subset="targets")
        decoded_solutions.reset_index(inplace=True)

        return decoded_solutions
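
A toy version of the de-duplication step; note that reset_index without drop=True keeps the old row labels as an extra 'index' column, which is what the snippet above does:

import pandas as pd

sol = pd.DataFrame({'targets': [('a',), ('a',), ('b',)],
                    'fitness': [0.10, 0.10, 0.30]})
sol.drop_duplicates(inplace=True, subset='targets')
sol.reset_index(inplace=True)   # pass drop=True to discard the old labels instead
print(sol)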
Example #14
def get_cpu_sw_map(dfds, cap_time_usec, task_re):
    df_list = []
    dfsw_list = []
    for dfd in dfds:
        df = filter_df_core(dfd.df, task_re, True)
        # at this point we have a set of df that look like this:
        #         task_name  duration
        # 0     ASA.1.vcpu0      7954
        # 1     ASA.1.vcpu0      5475
        # 2     ASA.1.vcpu0      4151
        if df.empty:
            continue
        gb = df.groupby("task_name", as_index=False)

        # sum all duration for each task
        df = gb.aggregate(np.sum)
        if dfd.multiplier > 1.0:
            df["duration"] = (df["duration"] * dfd.multiplier).astype(int)
        df["percent"] = ((df["duration"] * 100 * 10) // cap_time_usec) / 10
        if len(dfds) > 1:
            df["task_name"] = df["task_name"] + "." + dfd.short_name
        df_list.append(df)

        # count number of rows with same task and cpu
        dfsw = DataFrame(gb.size())
        dfsw.reset_index(inplace=True)
        dfsw.rename(columns={0: "count"}, inplace=True)

        if dfd.multiplier > 1.0:
            dfsw["count"] = (dfsw["count"] * dfd.multiplier).astype(int)
        else:
            dfsw["count"] = dfsw["count"].astype(int)
        dfsw_list.append(dfsw)

    if not df_list:
        return None

    df = pandas.concat(df_list)
    df = df.drop("duration", axis=1)
    dfsw = pandas.concat(dfsw_list)
    df = pandas.merge(df, dfsw, on="task_name")
    # Result:
    #             task_name  percent  count
    # 0  ASA.01.vcpu0.1x218     72.0  1998
    # 1  ASA.01.vcpu0.2x208     61.8  2128
    # 2  ASA.02.vcpu0.2x208     58.9  2177

    # transform this into a dict where the key is the task_name and the value
    # is a list [percent, count]
    return df.set_index("task_name").T.to_dict("list")
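
The final set_index/T/to_dict step in isolation, with made-up numbers:

import pandas as pd

df = pd.DataFrame({'task_name': ['ASA.01.vcpu0', 'ASA.02.vcpu0'],
                   'percent': [72.0, 58.9],
                   'count': [1998, 2177]})
# one dict entry per task_name, value is [percent, count]
print(df.set_index('task_name').T.to_dict('list'))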
Example #15
    def _standardize_index(
            self, df_in: pd.DataFrame, symbol: str=None, datatype: str=None,
            barsize: str=None, tz: str=None):
        """Normalize input DataFrame index to MarketDataBlock standard.
        """
        # Add or standardize index names in the input.
        if isinstance(df_in.index, pd.MultiIndex):
            df_in.reset_index(inplace=True)

        # Rename ambiguous column names.
        df_in.columns = [
            col_rename.get(col.strip().lower(), col.strip().lower())
            for col in df_in.columns]

        # Insert Symbol, DataType, Barsize columns from arguments if not
        # found in the input dataframe.
        for col in MarketDataBlock.data_index:
            if col not in df_in.columns:
                if locals().get(col.lower(), None) is None:
                    raise KeyError(
                        'No {0} argument and no {0} column in the DataFrame.'
                        .format(col))
                df_in.insert(0, col, locals()[col.lower()])

        # Convert datetime strings to pandas DatetimeIndex
        df_in['TickerTime'] = pd.DatetimeIndex(
            df_in['TickerTime'].apply(pd.Timestamp))

        # Standardize BarSize strings
        df_in['BarSize'] = df_in['BarSize'].map(timedur_standardize)

        # Set index to class-defined MultiIndex
        df_in.set_index(MarketDataBlock.data_index, inplace=True)

        # Set time zone so all DatetimeIndex are tz-aware
        df_in_tz = df_in.index.levels[self.__class__.dtlevel].tz
        if df_in_tz is None or isinstance(df_in_tz, timezone) or \
           isinstance(df_in_tz, pytz._FixedOffset):
            # Input df has naive time index, or tzinfo is not pytz.timezone()
            if tz is None:
                raise ValueError(
                    'Argument tz=None, and TickerTime.tzinfo is None(naive),'
                    'datetime.timezone, or pytz._FixedOffset.')
            if df_in_tz is None:
                df_in = df_in.tz_localize(tz, level=self.__class__.dtlevel)
            else:
                df_in = df_in.tz_convert(tz, level=self.__class__.dtlevel)

        return df_in
Example #16
File: trade.py Project: iswdp/trade
def build_data(symbol_list, n = 15, flag = 1, blag = 10):
    train = DataFrame()
    test = DataFrame()
    for i in symbol_list:
        print i

        try:
            path = '45-165caps/' + i + '.csv'
            data = pd.read_csv(path)
            forward = forward_lag(data, i, flag)
            back = back_lag(data, i, blag)
            today_back = prediction_back_lag(data, i, blag)
            combined = combine_lags(forward, back)
            combined = combined.ix[combined['Forward Lag  1'] < .2,:].reset_index()
            del combined['index']

            #Train------------------------------------------------------------------
            random_sample = []
            for j in range(n):
                random_sample.append(random.randint(0,(len(combined) - 1)))
            data_slice = combined.ix[random_sample,:].reset_index()
            if len(train) == 0:
                train = data_slice
            else:
                train = pd.concat([train, data_slice], axis = 0)

            #Test-------------------------------------------------------------------
            data_slice = DataFrame(today_back.ix[len(today_back) - 1,:]).T

            if len(test) == 0:
                test = data_slice
            else:
                test = pd.concat([test, data_slice], axis = 0)
        except:
            print '\tSkipped'
            pass

    train = train.reset_index()
    del train['level_0']
    del train['index']

    test = test.reset_index()  
    del test['level_0']
    del test['index']

    combined.to_csv('combined1.csv', sep = ',', index = False)
    today_back.to_csv('today_back1.csv', sep = ',', index = False)

    return train, test
Example #17
def test_dti_reset_index_round_trip():
    dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D')
    d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti)
    d2 = d1.reset_index()
    assert d2.dtypes[0] == np.dtype('M8[ns]')
    d3 = d2.set_index('index')
    assert_frame_equal(d1, d3, check_names=False)

    # #2329
    stamp = datetime(2012, 11, 22)
    df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value'])
    df = df.set_index('Date')

    assert df.index[0] == stamp
    assert df.reset_index()['Date'][0] == stamp
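
The DatetimeIndex(start=..., end=...) construction above is deprecated/removed in newer pandas; the same round trip can be sketched with date_range:

import numpy as np
import pandas as pd

dti = pd.date_range('2001-01-01', '2001-06-01', freq='D')
d1 = pd.DataFrame({'v': np.random.rand(len(dti))}, index=dti)
d2 = d1.reset_index()            # the index becomes a datetime64[ns] column named 'index'
d3 = d2.set_index('index')
assert d2.dtypes.iloc[0] == np.dtype('M8[ns]')
assert (d3.index == d1.index).all()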
Example #18
 def test_delevel_infer_dtype(self):
     tuples = [tuple for tuple in cart_product(["foo", "bar"], [10, 20], [1.0, 1.1])]
     index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
     df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index)
     deleveled = df.reset_index()
     self.assert_(com.is_integer_dtype(deleveled["prm1"]))
     self.assert_(com.is_float_dtype(deleveled["prm2"]))
Example #19
 def test_frame_reset_index(self):
     dr = date_range('2012-06-02', periods=10, tz='US/Eastern')
     df = DataFrame(np.random.randn(len(dr)), dr)
     roundtripped = df.reset_index().set_index('index')
     xp = df.index.tz
     rs = roundtripped.index.tz
     self.assertEquals(xp, rs)
Example #20
 def test_dti_reset_index_round_trip(self):
     dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D')
     d1 = DataFrame({'v' : np.random.rand(len(dti))}, index=dti)
     d2 = d1.reset_index()
     self.assert_(d2.dtypes[0] == np.datetime64)
     d3 = d2.set_index('index')
     assert_frame_equal(d1, d3)
Example #21
def get_travel_times(df):
    df = df[df['section'] != 0]
    g = df['time'].groupby([df['veh_id'], df['section']])
    res = DataFrame([g.max() - g.min(), g.min()]).T
    res.columns = ['tt', 'time']
    res = res.reset_index()
    return res
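
A toy run of the travel-time computation above:

import pandas as pd

df = pd.DataFrame({'veh_id': [1, 1, 2, 2],
                   'section': [1, 1, 1, 1],
                   'time': [0.0, 5.0, 2.0, 9.0]})
g = df['time'].groupby([df['veh_id'], df['section']])
res = pd.DataFrame([g.max() - g.min(), g.min()]).T
res.columns = ['tt', 'time']
res = res.reset_index()   # veh_id and section come back as ordinary columns
print(res)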
Example #22
 def test_dti_reset_index_round_trip(self):
     dti = DatetimeIndex(start="1/1/2001", end="6/1/2001", freq="D")
     d1 = DataFrame({"v": np.random.rand(len(dti))}, index=dti)
     d2 = d1.reset_index()
     self.assert_(d2.dtypes[0] == np.datetime64)
     d3 = d2.set_index("index")
     assert_frame_equal(d1, d3)
Example #23
    def test_drop_multiindex_not_lexsorted(self):
        # GH 11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples(
            [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        self.assertTrue(lexsorted_df.columns.is_lexsorted())

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                     data=[[1, 'b1', 'c1', 3],
                                           [1, 'b2', 'c2', 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(
            index='a', columns=['b', 'c'], values='d')
        not_lexsorted_df = not_lexsorted_df.reset_index()
        self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)

        tm.assert_frame_equal(result, expected)
Example #24
    def test_infer_objects(self):
        # GH 11221
        df = DataFrame({'a': ['a', 1, 2, 3],
                        'b': ['b', 2.0, 3.0, 4.1],
                        'c': ['c', datetime(2016, 1, 1),
                              datetime(2016, 1, 2),
                              datetime(2016, 1, 3)],
                        'd': [1, 2, 3, 'd']},
                       columns=['a', 'b', 'c', 'd'])
        df = df.iloc[1:].infer_objects()

        assert df['a'].dtype == 'int64'
        assert df['b'].dtype == 'float64'
        assert df['c'].dtype == 'M8[ns]'
        assert df['d'].dtype == 'object'

        expected = DataFrame({'a': [1, 2, 3],
                              'b': [2.0, 3.0, 4.1],
                              'c': [datetime(2016, 1, 1),
                                    datetime(2016, 1, 2),
                                    datetime(2016, 1, 3)],
                              'd': [2, 3, 'd']},
                             columns=['a', 'b', 'c', 'd'])
        # reconstruct frame to verify inference is same
        tm.assert_frame_equal(df.reset_index(drop=True), expected)
Example #25
 def test_frame_reset_index(self):
     dr = date_range("2012-06-02", periods=10, tz=self.tzstr("US/Eastern"))
     df = DataFrame(np.random.randn(len(dr)), dr)
     roundtripped = df.reset_index().set_index("index")
     xp = df.index.tz
     rs = roundtripped.index.tz
     self.assertEqual(xp, rs)
Example #26
 def test_frame_reset_index(self, tz):
     dr = date_range('2012-06-02', periods=10, tz=tz)
     df = DataFrame(np.random.randn(len(dr)), dr)
     roundtripped = df.reset_index().set_index('index')
     xp = df.index.tz
     rs = roundtripped.index.tz
     assert xp == rs
Example #27
def trim_index_df(df: pd.DataFrame, index_names_to_keep: list, inplace=False):
    '''Drops all indexes except for specified index names.'''
    
    indexes_to_drop = list(df.index.names)
    try:
        indexes_to_drop.remove(index_names_to_keep)
    except ValueError:
        try:
            for idxn in index_names_to_keep:
                indexes_to_drop.remove(idxn)
        except ValueError:
            pass
    
    if inplace:
        df.reset_index(level=indexes_to_drop, drop=True, inplace=True)
    else:
        return df.reset_index(level=indexes_to_drop, drop=True)
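
A quick usage sketch for the helper above, with a toy two-level frame:

import pandas as pd

df = pd.DataFrame(
    {'v': [1, 2, 3, 4]},
    index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=['grp', 'num']),
)
trimmed = trim_index_df(df, ['grp'])   # drop every index level except 'grp'
print(trimmed.index.names)             # expected: ['grp']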
Example #28
    def test_set_reset_index(self):

        df = DataFrame({'A': range(10)})
        s = pd.cut(df.A, 5)
        df['B'] = s
        df = df.set_index('B')

        df = df.reset_index()
Example #29
def seriesPosPrc_fitTimeFrame(dfList, PosPrc, ts, PosDirList):
	dataList = []
	for i in range(len(dfList)):
		tf = DataFrame(index = ts)
		df = dfList[i]
		tf[PosPrc] = df[PosPrc]
		tf = tf.fillna(-99999)
		tf.reset_index(inplace = True)
		tf['PosDir'] = PosDirList[i]
		for j in range(len(tf)):
			if tf.ix[j, PosPrc] == -99999:
				if j == 0: tf.ix[j, PosPrc] = 0
				elif tf.ix[j-1, 'PosDir'] != 0:
					tf.ix[j, PosPrc] = tf.ix[j-1, PosPrc]
				else:
					tf.ix[j, PosPrc] = 0
		dataList.append(np.asarray(tf[PosPrc]))
	return dataList	
Example #30
    def test_join_multi_levels2(self):

        # some more advanced merges
        # GH6360
        household = DataFrame(
            {
                "household_id": [1, 2, 2, 3, 3, 3, 4],
                "asset_id": [
                    "nl0000301109",
                    "nl0000301109",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                    np.nan,
                ],
                "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            },
            columns=["household_id", "asset_id", "share"],
        ).set_index(["household_id", "asset_id"])

        log_return = DataFrame({
            "asset_id": [
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "lu0197800237",
            ],
            "t": [233, 234, 235, 180, 181],
            "log_return": [
                0.09604978,
                -0.06524096,
                0.03532373,
                0.03025441,
                0.036997,
            ],
        }).set_index(["asset_id", "t"])

        expected = (DataFrame({
            "household_id": [2, 2, 2, 3, 3, 3, 3, 3],
            "asset_id": [
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "lu0197800237",
            ],
            "t": [233, 234, 235, 233, 234, 235, 180, 181],
            "share": [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
            "log_return": [
                0.09604978,
                -0.06524096,
                0.03532373,
                0.09604978,
                -0.06524096,
                0.03532373,
                0.03025441,
                0.036997,
            ],
        }).set_index(["household_id", "asset_id",
                      "t"]).reindex(columns=["share", "log_return"]))

        # this is the equivalency
        result = merge(
            household.reset_index(),
            log_return.reset_index(),
            on=["asset_id"],
            how="inner",
        ).set_index(["household_id", "asset_id", "t"])
        tm.assert_frame_equal(result, expected)

        expected = (DataFrame({
            "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
            "asset_id": [
                "nl0000301109",
                "nl0000301109",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "lu0197800237",
                "nl0000289965",
                None,
            ],
            "t": [
                None,
                None,
                233,
                234,
                235,
                233,
                234,
                235,
                180,
                181,
                None,
                None,
            ],
            "share": [
                1.0,
                0.4,
                0.6,
                0.6,
                0.6,
                0.15,
                0.15,
                0.15,
                0.6,
                0.6,
                0.25,
                1.0,
            ],
            "log_return": [
                None,
                None,
                0.09604978,
                -0.06524096,
                0.03532373,
                0.09604978,
                -0.06524096,
                0.03532373,
                0.03025441,
                0.036997,
                None,
                None,
            ],
        }).set_index(["household_id", "asset_id",
                      "t"]).reindex(columns=["share", "log_return"]))

        result = merge(
            household.reset_index(),
            log_return.reset_index(),
            on=["asset_id"],
            how="outer",
        ).set_index(["household_id", "asset_id", "t"])

        tm.assert_frame_equal(result, expected)
Example #31
def keywords_extract(input_data, d, word_num, gram=(1, 2)):
    word_graph, vocab = build_word_graph(input_data, gram)
    rank_idx = get_ranks(word_graph, d)
    keywords = [(vocab[idx], round(wght, 5)) for idx, wght in rank_idx.items()]
    keywords = DataFrame(keywords,
                         columns=['keyword',
                                  'weight']).sort_values(by='weight',
                                                         ascending=False)
    #     keywords['weight'] = [round(float(v)/sum(keywords['weight']), 5) for v in keywords['weight']]
    keywords = keywords.reset_index(drop=True).iloc[:word_num, :]
    #     keywords = dict(zip(keywords.noun, keywords.weight))

    for i, _ in enumerate(range(len(keywords))):
        check = keywords['keyword'][i].split(' ')
        c = len(check)
        if c == 3:
            if check[0] == check[1] and check[1] == check[2]:
                keywords['keyword'][i] = check[0]
                c = 1
            elif check[0] == check[1]:
                keywords['keyword'][i] = check[0] + ' ' + check[2]
                check[1] = check[2]
                print('removed check[0]' + check[0])
                del check[2]
                c = 2
            elif check[1] == check[2]:
                keywords['keyword'][i] = check[0] + ' ' + check[1]
                print('removed check[1]' + check[1])
                del check[2]
                c = 2
            elif check[0] == check[2]:
                keywords['keyword'][i] = check[0] + ' ' + check[1]
                print('removed check[2]' + check[2])
                del check[2]
                c = 2
            else:
                if check[0] in check_values1:
                    check[0] = check_key1[check_values1.index(check[0])]
                if check[1] in check_values1:
                    check[1] = check_key1[check_values1.index(check[1])]
                if check[2] in check_values2:
                    check[2] = check_key2[check_values2.index(check[2])]
                keywords['keyword'][
                    i] = check[0] + ' ' + check[1] + ' ' + check[2]
        if c == 2:
            if check[0] == check[1]:
                keywords['keyword'][i] = check[0]
                if keywords['keyword'][i] in check_values2:
                    keywords['keyword'][i] = check_key2[check_values2.index(
                        check[0])]
            else:
                if check[0] in check_values1:
                    check[0] = check_key1[check_values1.index(check[0])]
                if check[1] in check_values2:
                    check[1] = check_key2[check_values2.index(check[1])]
                keywords['keyword'][i] = check[0] + ' ' + check[1]
        if c == 1:
            if keywords['keyword'][i] in check_values2:
                keywords['keyword'][i] = check_key2[check_values2.index(
                    check[0])]

    for i, _ in enumerate(range(len(keywords))):
        check = keywords['keyword'][i].replace(" ", "")
        if check in check_values3:
            keywords['keyword'][i] = check_key3[check_values3.index(check)]

    return keywords
Example #32
def call_alleles(
    alignments: pd.DataFrame,
    ref_filepath: Optional[str] = None,
    ref: Optional[str] = None,
    barcode_interval: Tuple[int, int] = (20, 34),
    cutsite_locations: List[int] = [112, 166, 220],
    cutsite_width: int = 12,
    context: bool = True,
    context_size: int = 5,
) -> pd.DataFrame:
    """Call indels from CIGAR strings.

    Given many alignments, we extract the indels by comparing the CIGAR strings
    of each alignment to the reference sequence.

    Args:
        alignments: Alignments provided in DataFrame
        ref_filepath: Filepath to the reference sequence
        ref: Nucleotide sequence of the reference
        barcode_interval: Interval in reference corresponding to the integration
            barcode
        cutsite_locations: A list of all cutsite positions in the reference
        cutsite_width: Number of nucleotides left and right of cutsite location
            that indels can appear in.
        context: Include sequence context around indels
        context_size: Number of bases to the right and left to include as
            context

    Returns:
        A DataFrame mapping each sequence alignment to the called indels.
    """
    if (ref is None) == (ref_filepath is None):
        raise PreprocessError(
            "Either `ref_filepath` or `ref` must be provided.")

    alignment_to_indel = {}
    alignment_to_intBC = {}

    if ref_filepath:
        ref = str(list(SeqIO.parse(ref_filepath, "fasta"))[0].seq)

    for _, row in tqdm(
            alignments.iterrows(),
            total=alignments.shape[0],
            desc="Parsing CIGAR strings into indels",
    ):

        intBC, indels = alignment_utilities.parse_cigar(
            row.CIGAR,
            row.Seq,
            ref,
            row.ReferenceBegin,
            row.QueryBegin,
            barcode_interval,
            cutsite_locations,
            cutsite_width,
            context=context,
            context_size=context_size,
        )

        alignment_to_indel[row.readName] = indels
        alignment_to_intBC[row.readName] = intBC

    indel_df = pd.DataFrame.from_dict(
        alignment_to_indel,
        orient="index",
        columns=[f"r{i}" for i in range(1,
                                        len(cutsite_locations) + 1)],
    )

    indel_df["allele"] = indel_df.apply(
        lambda x: "".join([str(i) for i in x.values]), axis=1)
    indel_df["intBC"] = indel_df.index.map(alignment_to_intBC)

    alignments.set_index("readName", inplace=True)

    alignments = alignments.join(indel_df)

    alignments.reset_index(inplace=True)

    # check cut-sites and raise a warning if any missing data is detected
    cutsites = utilities.get_default_cut_site_columns(alignments)
    if np.any((alignments[cutsites] == "").sum(axis=0) > 0):
        warnings.warn(
            "Detected missing data in alleles. You might"
            " consider re-running align_sequences with a"
            " lower gap-open penalty, or using a separate"
            " alignment strategy.",
            PreprocessWarning,
        )

    return alignments
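
The set_index/join/reset_index sequence near the end of the function, sketched with made-up reads and indel calls:

import pandas as pd

alignments = pd.DataFrame({'readName': ['read1', 'read2'],
                           'CIGAR': ['150M', '75M2I73M']})
indel_df = pd.DataFrame({'r1': ['None', '2I']}, index=['read1', 'read2'])

alignments.set_index('readName', inplace=True)
alignments = alignments.join(indel_df)   # align the per-read indel calls on the index
alignments.reset_index(inplace=True)     # readName back to an ordinary column
print(alignments)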
Example #33
class TestMultiLevel(unittest.TestCase):
    def setUp(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3),
                               index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby(
            [lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [
            lev.astype('i8') for lev in self.ymd.index.levels
        ]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        a, b = self.frame[:5], self.frame[5:]

        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)

        result = a['A'].append(b['A'])
        tm.assert_series_equal(result, self.frame['A'])

    def test_reindex_level(self):
        # axis=0
        month_sums = self.ymd.sum(level='month')
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum)

        assert_frame_equal(result, expected)

        # Series
        result = month_sums['A'].reindex(self.ymd.index, level=1)
        expected = self.ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(result, expected)

        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level='month')
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(result, expected)

    def test_binops_level(self):
        def _check_op(opname):
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level='month')
            result = op(self.ymd, month_sums, level='month')
            broadcasted = self.ymd.groupby(level='month').transform(np.sum)
            expected = op(self.ymd, broadcasted)
            assert_frame_equal(result, expected)

            # Series
            op = getattr(Series, opname)
            result = op(self.ymd['A'], month_sums['A'], level='month')
            broadcasted = self.ymd['A'].groupby(level='month').transform(
                np.sum)
            expected = op(self.ymd['A'], broadcasted)
            assert_series_equal(result, expected)

        _check_op('sub')
        _check_op('add')
        _check_op('mul')
        _check_op('div')

    def test_pickle(self):
        import cPickle

        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T

        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        s = self.ymd['A']

        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        expected = s.reindex(s.index[42:65])
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s.reindex(s.index[49:51])
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_getitem_corner(self):
        s = self.ymd['A']

        # don't segfault, GH #495
        # out of bounds access
        self.assertRaises(IndexError, s.__getitem__, len(self.ymd))

        # generator
        result = s[(x > 0 for x in s)]
        expected = s[s > 0]
        assert_series_equal(result, expected)

    def test_series_setitem(self):
        s = self.ymd['A']

        s[2000, 3] = np.nan
        self.assert_(isnull(s.values[42:65]).all())
        self.assert_(notnull(s.values[:42]).all())
        self.assert_(notnull(s.values[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_frame_getitem_setitem_slice(self):
        # getitem
        result = self.frame.ix[:4]
        expected = self.frame[:4]
        assert_frame_equal(result, expected)

        # setitem
        cp = self.frame.copy()
        cp.ix[:4] = 0

        self.assert_((cp.values[:4] == 0).all())
        self.assert_((cp.values[4:] != 0).all())

    def test_frame_getitem_setitem_multislice(self):
        levels = [['t1', 't2'], ['a', 'b', 'c']]
        labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]]
        midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id'])
        df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx)

        result = df.ix[:, 'value']
        assert_series_equal(df['value'], result)

        result = df.ix[1:3, 'value']
        assert_series_equal(df['value'][1:3], result)

        result = df.ix[:, :]
        assert_frame_equal(df, result)

        result = df
        df.ix[:, 'value'] = 10
        result['value'] = 10
        assert_frame_equal(df, result)

        df.ix[:, :] = 10
        assert_frame_equal(df, result)

    def test_getitem_tuple_plus_slice(self):
        # GH #671
        df = DataFrame({
            'a': range(10),
            'b': range(10),
            'c': np.random.randn(10),
            'd': np.random.randn(10)
        })

        idf = df.set_index(['a', 'b'])

        result = idf.ix[(0, 0), :]
        expected = idf.ix[0, 0]
        expected2 = idf.xs((0, 0))

        assert_series_equal(result, expected)
        assert_series_equal(result, expected2)

    def test_xs(self):
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_xs_level(self):
        result = self.frame.xs('two', level='second')
        expected = self.frame[self.frame.index.get_level_values(1) == 'two']
        expected.index = expected.index.droplevel(1)

        assert_frame_equal(result, expected)

        index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'),
                                        ('p', 'q', 'r')])
        df = DataFrame(np.random.randn(3, 5), index=index)
        result = df.xs('c', level=2)
        expected = df[1:2]
        expected.index = expected.index.droplevel(2)
        assert_frame_equal(result, expected)

    def test_xs_level_multiple(self):
        from pandas import read_table
        from StringIO import StringIO
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_table(StringIO(text), sep='\s+')

        result = df.xs(('a', 4), level=['one', 'four'])
        expected = df.xs('a').xs(4, level='four')
        assert_frame_equal(result, expected)

    def test_xs_level0(self):
        from pandas import read_table
        from StringIO import StringIO
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_table(StringIO(text), sep='\s+')

        result = df.xs('a', level=0)
        expected = df.xs('a')
        self.assertEqual(len(result), 2)
        assert_frame_equal(result, expected)

    def test_xs_level_series(self):
        s = self.frame['A']
        result = s[:, 'two']
        expected = self.frame.xs('two', level=1)['A']
        assert_series_equal(result, expected)

        s = self.ymd['A']
        result = s[2000, 5]
        expected = self.ymd.ix[2000, 5]['A']
        assert_series_equal(result, expected)

        # not implementing this for now

        self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4)))

        # result = s[2000, 3:4]
        # lv =s.index.get_level_values(1)
        # expected = s[(lv == 3) | (lv == 4)]
        # expected.index = expected.index.droplevel(0)
        # assert_series_equal(result, expected)

        # can do this though

    def test_get_loc_single_level(self):
        s = Series(np.random.randn(len(self.single_level)),
                   index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        df = self.frame.T

        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df['bar']
        result2 = df.ix[:, 'bar']

        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_setitem_slice_integers(self):
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        frame = DataFrame(np.random.randn(len(index), 4),
                          index=index,
                          columns=['a', 'b', 'c', 'd'])
        res = frame.ix[1:2]
        exp = frame.reindex(frame.index[2:])
        assert_frame_equal(res, exp)

        frame.ix[1:2] = 7
        self.assert_((frame.ix[1:2] == 7).values.all())

        series = Series(np.random.randn(len(index)), index=index)

        res = series.ix[1:2]
        exp = series.reindex(series.index[2:])
        assert_series_equal(res, exp)

        series.ix[1:2] = 7
        self.assert_((series.ix[1:2] == 7).values.all())

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)

        frame = DataFrame(np.random.randn(6, 2), index=index)

        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)

        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]

        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_getitem_slice_not_sorted(self):
        df = self.frame.sortlevel(1).T

        # buglet with int typechecking
        result = df.ix[:, :np.int32(3)]
        expected = df.reindex(columns=df.columns[:3])
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_frame_setitem_ix(self):
        self.frame.ix[('bar', 'two'), 'B'] = 5
        self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5)

        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[('bar', 'two'), 1] = 7
        self.assertEquals(df.ix[('bar', 'two'), 1], 7)

    def test_fancy_slice_partial(self):
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000, 2):(2000, 4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception, self.frame.reset_index()['A'].sortlevel)

        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        tuples = [
            tuple
            for tuple in cart_product(['foo', 'bar'], [10, 20], [1.0, 1.1])
        ]
        index = MultiIndex.from_tuples(tuples, names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8, 3),
                       columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled['prm1']))
        self.assert_(com.is_float_dtype(deleveled['prm2']))

    def test_reset_index_with_drop(self):
        deleveled = self.ymd.reset_index(drop=True)
        self.assertEquals(len(deleveled.columns), len(self.ymd.columns))

        deleveled = self.series.reset_index()
        self.assert_(isinstance(deleveled, DataFrame))
        self.assert_(
            len(deleveled.columns) == len(self.series.index.levels) + 1)

        deleveled = self.series.reset_index(drop=True)
        self.assert_(isinstance(deleveled, Series))

    def test_sortlevel_by_name(self):
        self.frame.index.names = ['first', 'second']
        result = self.frame.sortlevel(level='second')
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'

        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                expected = expected.reindex_like(result).astype('i8')
                assert_frame_equal(result, expected)

        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

        self.frame['D'] = 'foo'
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ['A', 'B', 'C'])

    def test_count_level_series(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

    def test_count_level_corner(self):
        s = self.frame['A'][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)

        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0],
                             columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)
        assert_frame_equal(result, expected)

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        self.assert_(stacked['bar'].dtype == np.float_)

    def test_unstack_bug(self):
        df = DataFrame({
            'state': ['naive', 'naive', 'naive', 'activ', 'activ', 'activ'],
            'exp': ['a', 'b', 'b', 'b', 'a', 'a'],
            'barcode': [1, 2, 3, 4, 1, 3],
            'v': ['hi', 'hi', 'bye', 'bye', 'bye', 'peace'],
            'extra':
            np.arange(6.)
        })

        result = df.groupby(['state', 'exp', 'barcode', 'v']).apply(len)

        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked,
                            result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, 'first')
        self.assertEquals(unstacked.columns.names, ['exp', 'second'])

        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        result = self.frame.unstack('second')
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        unstacked = self.frame.unstack('second')
        result = unstacked.stack('exp')
        expected = self.frame.unstack().stack(0)
        assert_frame_equal(result, expected)

        result = self.frame.stack('exp')
        expected = self.frame.stack()
        assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        unstacked = self.ymd.unstack(['year', 'month'])
        expected = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(unstacked, expected)
        self.assertEquals(unstacked.columns.names, expected.columns.names)

        # series
        s = self.ymd['A']
        s_unstacked = s.unstack(['year', 'month'])
        assert_frame_equal(s_unstacked, expected['A'])

        restacked = unstacked.stack(['year', 'month'])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)

        assert_frame_equal(restacked, self.ymd)
        self.assertEquals(restacked.index.names, self.ymd.index.names)

        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1)
        assert_frame_equal(unstacked, expected)

        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1)
        assert_frame_equal(unstacked, expected)

    def test_groupby_transform(self):
        s = self.frame['A']
        grouper = s.index.get_level_values(0)

        grouped = s.groupby(grouper)

        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_groupby_corner(self):
        midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']],
                          labels=[[0], [0], [0]],
                          names=['one', 'two', 'three'])
        df = DataFrame([np.random.rand(4)],
                       columns=['a', 'b', 'c', 'd'],
                       index=midx)
        # should work
        df.groupby(level='three')

    def test_join(self):
        a = self.frame.ix[:5, ['A']]
        b = self.frame.ix[2:, ['B', 'C']]

        joined = a.join(b, how='outer').reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan

        self.assert_(not np.isnan(joined.values).all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        swapped = self.frame['A'].swaplevel(0, 1)
        swapped2 = self.frame['A'].swaplevel('first', 'second')
        self.assert_(not swapped.index.equals(self.frame.index))
        assert_series_equal(swapped, swapped2)

        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel('second', 'first')
        self.assert_(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)

        ft = self.frame.T
        swapped = ft.swaplevel('first', 'second', axis=1)
        exp = self.frame.swaplevel('first', 'second').T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        panel = Panel({'ItemA': self.frame, 'ItemB': self.frame * 2})

        result = panel.swaplevel(0, 1, axis='major')
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(result, expected)

    def test_reorder_levels(self):
        result = self.ymd.reorder_levels(['month', 'day', 'year'])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(result, expected)

        result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
        expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(result, expected)

        result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(result, expected)

        self.assertRaises(Exception, self.ymd.index.reorder_levels, [1, 2, 3])

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        x = Series(data=[1, 2, 3],
                   index=MultiIndex.from_tuples([("A", 1), ("A", 2),
                                                 ("B", 3)]))

        y = Series(data=[4, 5, 6],
                   index=MultiIndex.from_tuples([("Z", 1), ("Z", 2),
                                                 ("B", 3)]))

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        df = self.frame.T
        df['foo'].values[:] = 0
        self.assert_((df['foo'].values == 0).all())

        # but not if it's mixed-type
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        self.assert_((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'

        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]

        result = df['foo']
        result2 = df.ix[:, 'foo']
        expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        df = df.T
        result = df.xs('foo')
        result2 = df.ix['foo']
        expected = df.reindex(df.index[arrays[0] == 'foo'])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)

        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]

        result = s['qux']
        result2 = s.ix['qux']
        expected = s[arrays[0] == 'qux']
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

    def test_count(self):
        frame = self.frame.copy()
        frame.index.names = ['a', 'b']

        result = frame.count(level='b')
        expect = self.frame.count(level=1)
        assert_frame_equal(result, expect)

        result = frame.count(level='a')
        expect = self.frame.count(level=0)
        assert_frame_equal(result, expect)

        series = self.series.copy()
        series.index.names = ['a', 'b']

        result = series.count(level='b')
        expect = self.series.count(level=1)
        assert_series_equal(result, expect)

        result = series.count(level='a')
        expect = self.series.count(level=0)
        assert_series_equal(result, expect)

        self.assertRaises(Exception, series.count, 'x')
        self.assertRaises(Exception, frame.count, level='x')

    AGG_FUNCTIONS = [
        'sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std',
        'var'
    ]

    def test_series_group_min_max(self):
        for op, level, skipna in cart_product(self.AGG_FUNCTIONS, range(2),
                                              [False, True]):
            grouped = self.series.groupby(level=level)
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            # skipna=True
            leftside = grouped.agg(aggf)
            rightside = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(leftside, rightside)

    def test_frame_group_ops(self):
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
                                                    range(2), range(2),
                                                    [False, True]):
            if axis == 0:
                frame = self.frame
            else:
                frame = self.frame.T

            grouped = frame.groupby(level=level, axis=axis)

            aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
            leftside = grouped.agg(aggf)
            rightside = getattr(frame, op)(level=level,
                                           axis=axis,
                                           skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assert_(leftside._get_axis(axis).equals(level_index))
            self.assert_(rightside._get_axis(axis).equals(level_index))

            assert_frame_equal(leftside, rightside)

    def test_frame_series_agg_multiple_levels(self):
        result = self.ymd.sum(level=['year', 'month'])
        expected = self.ymd.groupby(level=['year', 'month']).sum()
        assert_frame_equal(result, expected)

        result = self.ymd['A'].sum(level=['year', 'month'])
        expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
        assert_series_equal(result, expected)

    def test_groupby_multilevel(self):
        result = self.ymd.groupby(level=[0, 1]).mean()

        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)

        expected = self.ymd.groupby([k1, k2]).mean()

        assert_frame_equal(result, expected)
        self.assertEquals(result.index.names, self.ymd.index.names[:2])

        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        pass

    def test_multilevel_consolidate(self):
        index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                        ('bar', 'one'), ('bar', 'two')])
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        df['Totals', ''] = df.sum(1)
        df = df.consolidate()

    def test_ix_preserve_names(self):
        result = self.ymd.ix[2000]
        result2 = self.ymd['A'].ix[2000]
        self.assertEquals(result.index.names, self.ymd.index.names[1:])
        self.assertEquals(result2.index.names, self.ymd.index.names[1:])

        result = self.ymd.ix[2000, 2]
        result2 = self.ymd['A'].ix[2000, 2]
        self.assertEquals(result.index.name, self.ymd.index.names[2])
        self.assertEquals(result2.index.name, self.ymd.index.names[2])

    def test_partial_set(self):
        # GH #397
        df = self.ymd.copy()
        exp = self.ymd.copy()
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        df['A'].ix[2000, 4] = 1
        exp['A'].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df['A'].ix[14] = 5
        self.assertEquals(df['A'][14], 5)

    def test_unstack_preserve_types(self):
        # GH #403
        self.ymd['E'] = 'foo'
        self.ymd['F'] = 2

        unstacked = self.ymd.unstack('month')
        self.assert_(unstacked['A', 1].dtype == np.float64)
        self.assert_(unstacked['E', 1].dtype == np.object_)
        self.assert_(unstacked['F', 1].dtype == np.float64)

    def test_getitem_lowerdim_corner(self):
        self.assertRaises(KeyError, self.frame.ix.__getitem__,
                          (('bar', 'three'), 'B'))

        self.assertRaises(KeyError, self.frame.ix.__setitem__,
                          (('bar', 'three'), 'B'), 0)

    #----------------------------------------------------------------------
    # AMBIGUOUS CASES!

    def test_partial_ix_missing(self):
        raise nose.SkipTest

        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]['A']
        assert_series_equal(result, expected)

        # need to put in some work here

        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    def test_fancy_2d(self):
        raise nose.SkipTest

        result = self.frame.ix['foo', 'B']
        expected = self.frame.xs('foo')['B']
        assert_series_equal(result, expected)

        ft = self.frame.T
        result = ft.ix['B', 'foo']
        expected = ft.xs('B')['foo']
        assert_series_equal(result, expected)

    #----------------------------------------------------------------------

    def test_to_html(self):
        self.ymd.columns.name = 'foo'
        self.ymd.to_html()
        self.ymd.T.to_html()

    def test_level_with_tuples(self):
        index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0),
                                    ('foo', 'qux', 0)], [0, 1]],
                           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        result = series[('foo', 'bar', 0)]
        result2 = series.ix[('foo', 'bar', 0)]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

        self.assertRaises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2))

        result = frame.ix[('foo', 'bar', 0)]
        result2 = frame.xs(('foo', 'bar', 0))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'),
                                    ('foo', 'qux')], [0, 1]],
                           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)

        result = series[('foo', 'bar')]
        result2 = series.ix[('foo', 'bar')]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

        result = frame.ix[('foo', 'bar')]
        result2 = frame.xs(('foo', 'bar'))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_int_series_slicing(self):
        s = self.ymd['A']
        result = s[5:]
        expected = s.reindex(s.index[5:])
        assert_series_equal(result, expected)

        exp = self.ymd['A'].copy()
        s[5:] = 0
        exp.values[5:] = 0
        self.assert_(np.array_equal(s.values, exp.values))

        result = self.ymd[5:]
        expected = self.ymd.reindex(s.index[5:])
        assert_frame_equal(result, expected)

    def test_mixed_depth_get(self):
        arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  ['', 'wx', 'wy', '', '', '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        result = df['a']
        expected = df['a', '', '']
        assert_series_equal(result, expected)
        self.assertEquals(result.name, 'a')

        result = df['routine1', 'result1']
        expected = df['routine1', 'result1', '']
        assert_series_equal(result, expected)
        self.assertEquals(result.name, ('routine1', 'result1'))

    def test_mixed_depth_insert(self):
        arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  ['', 'wx', 'wy', '', '', '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        result = df.copy()
        expected = df.copy()
        result['b'] = [1, 2, 3, 4]
        expected['b', '', ''] = [1, 2, 3, 4]
        assert_frame_equal(result, expected)

    def test_mixed_depth_drop(self):
        arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  ['', 'wx', 'wy', '', '', '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        result = df.drop('a', axis=1)
        expected = df.drop([('a', '', '')], axis=1)
        assert_frame_equal(expected, result)

        result = df.drop(['top'], axis=1)
        expected = df.drop([('top', 'OD', 'wx')], axis=1)
        expected = expected.drop([('top', 'OD', 'wy')], axis=1)
        assert_frame_equal(expected, result)

    def test_mixed_depth_pop(self):
        arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
                  ['', 'wx', 'wy', '', '', '']]

        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        df1 = df.copy()
        df2 = df.copy()
        result = df1.pop('a')
        expected = df2.pop(('a', '', ''))
        assert_series_equal(expected, result)
        assert_frame_equal(df1, df2)
        self.assertEquals(result.name, 'a')

        expected = df1['top']
        df1 = df1.drop(['top'], axis=1)
        result = df2.pop('top')
        assert_frame_equal(expected, result)
        assert_frame_equal(df1, df2)

    def test_drop_level(self):
        result = self.frame.drop(['bar', 'qux'], level='first')
        expected = self.frame.ix[[0, 1, 2, 5, 6]]
        assert_frame_equal(result, expected)

        result = self.frame.drop(['two'], level='second')
        expected = self.frame.ix[[0, 2, 3, 6, 7, 9]]
        assert_frame_equal(result, expected)

        result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first')
        expected = self.frame.ix[[0, 1, 2, 5, 6]].T
        assert_frame_equal(result, expected)

        result = self.frame.T.drop(['two'], axis=1, level='second')
        expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T
        assert_frame_equal(result, expected)
Example #34
0
        patient_list.append(patient['diagnoseID'])
    else:
        patient_list.append(-1)
    patients_list.append(patient_list)

####################################################
#   Map strings to integers, fill missing values   #
####################################################
patient_df = DataFrame(patients_list, columns=patient_column_names)
patient_df['age'] = [DataCollector.calculate_age(datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ")) if x != "null"
                     else np.NaN for x in patient_df['age']]
patient_df.age.replace(np.NaN, patient_df["age"].mean(), inplace=True)
patient_df['age'] = patient_df['age'].astype(int)
patient_df = patient_df[patient_df.id > 10]  # All patients with id higher than 10 are test accounts
patient_df = patient_df[patient_df.diagnosis != -1]
patient_df = patient_df.reset_index(drop=True)
diagnose_mapping = {1: "MIGRAINE W/ AURA", 2: "MIGRAINE W/O AURA", 3: "CLUSTER", 4: "TENSION"}
diagnose_mapping_reverse = {"MIGRAINE W/ AURA": 1, "MIGRAINE W/O AURA": 2, "CLUSTER": 3, "TENSION": 4}
patient_df['sex'] = patient_df['sex'].map(lambda x: "MALE" if x else "FEMALE")
patient_df['employment'] = patient_df['employment'].map(lambda x: "EMPLOYED" if x else "UNEMPLOYED")
patient_df['diagnosis'] = patient_df["diagnosis"].map(diagnose_mapping)

###################################################
#           Plot some demographic plots           #
###################################################

def get_distribution(values):
    distribution = {}
    for value in values:
        if value not in distribution:
            distribution[value] = 1
Example #35
0
 def __init__(self, data: pd.DataFrame):
     QAbstractTableModel.__init__(self)
     self._data = data.reset_index()
Example #36
0
 def assert_frame_equal(cls, left: pd.DataFrame, right: pd.DataFrame,
                        *args: Any, **kwargs: Any) -> None:
     left = left.reset_index(drop=True)
     right = right.reset_index(drop=True)
     tm.assert_frame_equal(left, right, *args, **kwargs)
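
# A minimal, self-contained sketch of the same idea, assuming nothing beyond
# pandas itself: dropping both indexes before comparing makes frames with
# identical values but different index labels compare equal. (The helper above
# is a classmethod on a test mixin and relies on its module's own `tm` import.)
import pandas as pd

a = pd.DataFrame({'x': [1, 2, 3]}, index=[0, 1, 2])
b = pd.DataFrame({'x': [1, 2, 3]}, index=[10, 20, 30])

pd.testing.assert_frame_equal(a.reset_index(drop=True),
                              b.reset_index(drop=True))  # passes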
Example #37
0
    def test_reset_index(self):
        stacked = self.frame.stack()[::2]
        stacked = DataFrame({'foo': stacked, 'bar': stacked})

        names = ['first', 'second']
        stacked.index.names = names
        deleveled = stacked.reset_index()
        for i, (lev, lab) in enumerate(zip(stacked.index.levels,
                                           stacked.index.labels)):
            values = lev.take(lab)
            name = names[i]
            tm.assert_index_equal(values, Index(deleveled[name]))

        stacked.index.names = [None, None]
        deleveled2 = stacked.reset_index()
        tm.assert_series_equal(deleveled['first'], deleveled2['level_0'],
                               check_names=False)
        tm.assert_series_equal(deleveled['second'], deleveled2['level_1'],
                               check_names=False)

        # default name assigned
        rdf = self.frame.reset_index()
        exp = pd.Series(self.frame.index.values, name='index')
        tm.assert_series_equal(rdf['index'], exp)

        # default name assigned, corner case
        df = self.frame.copy()
        df['index'] = 'foo'
        rdf = df.reset_index()
        exp = pd.Series(self.frame.index.values, name='level_0')
        tm.assert_series_equal(rdf['level_0'], exp)

        # but this is ok
        self.frame.index.name = 'index'
        deleveled = self.frame.reset_index()
        tm.assert_series_equal(deleveled['index'],
                               pd.Series(self.frame.index))
        tm.assert_index_equal(deleveled.index,
                              pd.Index(np.arange(len(deleveled))))

        # preserve column names
        self.frame.columns.name = 'columns'
        resetted = self.frame.reset_index()
        assert resetted.columns.name == 'columns'

        # only remove certain columns
        frame = self.frame.reset_index().set_index(['index', 'A', 'B'])
        rs = frame.reset_index(['A', 'B'])

        # TODO should reset_index check_names ?
        assert_frame_equal(rs, self.frame, check_names=False)

        rs = frame.reset_index(['index', 'A', 'B'])
        assert_frame_equal(rs, self.frame.reset_index(), check_names=False)

        rs = frame.reset_index(['index', 'A', 'B'])
        assert_frame_equal(rs, self.frame.reset_index(), check_names=False)

        rs = frame.reset_index('A')
        xp = self.frame.reset_index().set_index(['index', 'B'])
        assert_frame_equal(rs, xp, check_names=False)

        # test resetting in place
        df = self.frame.copy()
        resetted = self.frame.reset_index()
        df.reset_index(inplace=True)
        assert_frame_equal(df, resetted, check_names=False)

        frame = self.frame.reset_index().set_index(['index', 'A', 'B'])
        rs = frame.reset_index('A', drop=True)
        xp = self.frame.copy()
        del xp['A']
        xp = xp.set_index(['B'], append=True)
        assert_frame_equal(rs, xp, check_names=False)
Example #38
0
def fill_fields_with_data_source(
    existing_df: pd.DataFrame,
    data_source: pd.DataFrame,
    index_fields: List[str],
    columns_to_fill: List[str],
) -> pd.DataFrame:
    """Pull columns from an existing data source into an existing data frame.

    Example:

        existing_df:
        ----------------
        | date | cases |
        | 4/2  | 1     |
        | 4/3  | 2     |
        | 4/4  | 3     |
        ----------------

        data_source:
        ----------------------
        | date | current_icu |
        | 4/3  | 4           |
        | 4/5  | 5           |
        ----------------------

        index_fields: ['date']

        columns_to_fill: ['current_icu']

        output:
        ------------------------------
        | date | cases | current_icu |
        | 4/2  | 1     | Na          |
        | 4/3  | 2     | 4           |
        | 4/4  | 3     | Na          |
        | 4/5  | Na    | 5           |
        ------------------------------

    Args:
        existing_df: Existing data frame
        data_source: Data used to fill existing df columns
        index_fields: List of columns to use as common index.
        columns_to_fill: List of columns to add into existing_df from data_source

    Returns: Updated dataframe with requested columns filled from data_source data.
    """
    new_data = data_source.set_index(index_fields)

    # If no data exists, return all rows from new data with just the requested columns.
    if not len(existing_df):
        for column in columns_to_fill:
            if column not in new_data.columns:
                new_data[column] = None
        return new_data[columns_to_fill].reset_index()
    existing_df = existing_df.set_index(index_fields)

    # Sort indices so that we have chunks of equal length in the
    # correct order so that we can splice in values.
    existing_df = existing_df.sort_index()
    new_data = new_data.sort_index()

    # Build series that point to rows that match in each data frame.
    existing_df_in_new_data = existing_df.index.isin(new_data.index)
    new_data_in_existing_df = new_data.index.isin(existing_df.index)

    if not sum(existing_df_in_new_data) == sum(new_data_in_existing_df):
        print(new_data.loc[new_data_in_existing_df, columns_to_fill])
        existing_in_new = sum(existing_df_in_new_data)
        new_in_existing = sum(new_data_in_existing_df)
        raise ValueError(
            f"Number of rows should be the for data to replace: {existing_in_new} -> {new_in_existing}: {columns_to_fill}"
        )

    # If a column doesn't exist in the existing data, add it first
    # (the .loc assignment below would otherwise throw an error).
    for column in columns_to_fill:
        if column not in existing_df.columns:
            existing_df[column] = None

    # Fill in values for rows that match in both data frames.
    existing_df.loc[existing_df_in_new_data,
                    columns_to_fill] = new_data.loc[new_data_in_existing_df,
                                                    columns_to_fill]
    # Get rows that do not exist in the existing data frame
    missing_new_data = new_data[~new_data_in_existing_df]

    data = pd.concat([
        existing_df.reset_index(),
        missing_new_data[columns_to_fill].reset_index(),
    ])

    return data
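
# A hedged usage sketch of fill_fields_with_data_source, mirroring the
# docstring's example above. The frames and column names are illustrative only.
import pandas as pd

existing = pd.DataFrame({"date": ["4/2", "4/3", "4/4"], "cases": [1, 2, 3]})
source = pd.DataFrame({"date": ["4/3", "4/5"], "current_icu": [4, 5]})

filled = fill_fields_with_data_source(existing, source,
                                      index_fields=["date"],
                                      columns_to_fill=["current_icu"])
# Rows 4/2-4/4 keep their case counts, 4/3 gains current_icu=4, and 4/5 is
# appended with cases missing -- matching the "output" table in the docstring.
print(filled)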
Example #39
0
    def test_set_index2(self):
        df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                        'B': ['one', 'two', 'three', 'one', 'two'],
                        'C': ['a', 'b', 'c', 'd', 'e'],
                        'D': np.random.randn(5),
                        'E': np.random.randn(5)})

        # new object, single-column
        result = df.set_index('C')
        result_nodrop = df.set_index('C', drop=False)

        index = Index(df['C'], name='C')

        expected = df.loc[:, ['A', 'B', 'D', 'E']]
        expected.index = index

        expected_nodrop = df.copy()
        expected_nodrop.index = index

        assert_frame_equal(result, expected)
        assert_frame_equal(result_nodrop, expected_nodrop)
        assert result.index.name == index.name

        # inplace, single
        df2 = df.copy()

        df2.set_index('C', inplace=True)

        assert_frame_equal(df2, expected)

        df3 = df.copy()
        df3.set_index('C', drop=False, inplace=True)

        assert_frame_equal(df3, expected_nodrop)

        # create new object, multi-column
        result = df.set_index(['A', 'B'])
        result_nodrop = df.set_index(['A', 'B'], drop=False)

        index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B'])

        expected = df.loc[:, ['C', 'D', 'E']]
        expected.index = index

        expected_nodrop = df.copy()
        expected_nodrop.index = index

        assert_frame_equal(result, expected)
        assert_frame_equal(result_nodrop, expected_nodrop)
        assert result.index.names == index.names

        # inplace
        df2 = df.copy()
        df2.set_index(['A', 'B'], inplace=True)
        assert_frame_equal(df2, expected)

        df3 = df.copy()
        df3.set_index(['A', 'B'], drop=False, inplace=True)
        assert_frame_equal(df3, expected_nodrop)

        # corner case
        with tm.assert_raises_regex(ValueError,
                                    'Index has duplicate keys'):
            df.set_index('A', verify_integrity=True)

        # append
        result = df.set_index(['A', 'B'], append=True)
        xp = df.reset_index().set_index(['index', 'A', 'B'])
        xp.index.names = [None, 'A', 'B']
        assert_frame_equal(result, xp)

        # append to existing multiindex
        rdf = df.set_index(['A'], append=True)
        rdf = rdf.set_index(['B', 'C'], append=True)
        expected = df.set_index(['A', 'B', 'C'], append=True)
        assert_frame_equal(rdf, expected)

        # Series
        result = df.set_index(df.C)
        assert result.index.name == 'C'
Example #40
0
                          left_on='WORDS',
                          how='left')

#Palabras=Palabras[(Palabras['IDIOMA']=='en')&(Palabras['TIPO']=='POSITIVO')][['OPERATION_NUMBER','WORDS']] # This version produces an incomplete word cloud
Palabras = Palabras[(Palabras['TIPO'] == 'POSITIVO') |
                    (Palabras['TIPO'] == 'NEUTRO POSITIVO')][[
                        'OPERATION_NUMBER', 'WORDS', 'TIPO'
                    ]]

Palabras["WORDS2"] = Palabras["WORDS"].apply(singular)
Palabras = Palabras[["OPERATION_NUMBER", "WORDS2", "TIPO"]]
Palabras.rename(columns={'WORDS2': 'WORDS'}, inplace=True)

#Palabras=DataFrame(Palabras["PALABRAS","WORDS"].groupby([Palabras['OPERATION_NUMBER']],Palabras['WORDS','PALABRAS']).count()) #Esta línea no corre, lo puse como está en la versión de EDU_IADB_cartera_digital que si corre
#Palabras=DataFrame(Palabras["WORDS"].groupby([Palabras['OPERATION_NUMBER'],Palabras['WORDS']]).count())
Palabras = DataFrame(Palabras['WORDS'].groupby(
    [Palabras['OPERATION_NUMBER'], Palabras['WORDS'],
     Palabras['TIPO']]).count())
Palabras.rename(columns={'WORDS': 'COUNT_WORDS'}, inplace=True)
Palabras.rename(columns={'PALABRAS': 'COUNT_WORDS'}, inplace=True)
Palabras.reset_index(inplace=True)

######## EXPORT FILES #############
with pd.ExcelWriter(path + "/Outputs/output.xlsx") as writer:
    Titulo.to_excel(writer, sheet_name="Operation_Name", index=False)
    Objetivo.to_excel(writer, sheet_name="Objetivo", index=False)
    Componentes1.to_excel(writer, sheet_name="Component", index=False)
    Producto1.to_excel(writer, sheet_name="Output_Name", index=False)
    Bas.to_excel(writer, sheet_name="Metadata", index=False)
    Palabras.to_excel(writer, sheet_name="palabras", index=False)
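
# A minimal sketch of the groupby -> count -> reset_index pattern used above,
# with made-up data standing in for the Palabras frame; column names are kept,
# values are illustrative only.
demo = pd.DataFrame({
    'OPERATION_NUMBER': ['OP1', 'OP1', 'OP2'],
    'WORDS': ['agua', 'agua', 'salud'],
    'TIPO': ['POSITIVO', 'POSITIVO', 'NEUTRO POSITIVO'],
})
counts = DataFrame(demo['WORDS'].groupby(
    [demo['OPERATION_NUMBER'], demo['WORDS'], demo['TIPO']]).count())
counts.rename(columns={'WORDS': 'COUNT_WORDS'}, inplace=True)
counts.reset_index(inplace=True)
print(counts)  # one row per (operation, word, type) with its count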
Example #41
0
def add_difficulty(df: pd.DataFrame, config: Config):
    df = df.reset_index('difficulty')
    df = df[df['difficulty'].isin(config.training.use_difficulties)]
    df['difficulty'] = df['difficulty'].replace(
        config.dataset.difficulty_mapping)
    return df
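
# A hedged usage sketch: the real Config class is not shown in this excerpt,
# so a SimpleNamespace stands in for config.training / config.dataset, and the
# difficulty labels and mapping below are made up for illustration.
from types import SimpleNamespace
import pandas as pd

frame = pd.DataFrame(
    {"score": [0.1, 0.5, 0.9]},
    index=pd.Index(["easy", "hard", "expert"], name="difficulty"))

fake_config = SimpleNamespace(
    training=SimpleNamespace(use_difficulties=["easy", "hard"]),
    dataset=SimpleNamespace(difficulty_mapping={"easy": 0, "hard": 1}))

# "expert" rows are dropped and the remaining labels are mapped to 0/1.
print(add_difficulty(frame, fake_config))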
Example #42
0
def csv_stack(dframe: pd.DataFrame, stackmatcher: Pattern, stackseparator: str,
              newcolumn: str) -> pd.DataFrame:
    """Reshape an incoming dataframe by stacking/pivoting.

    The incoming dataframe may be partly modified in place; callers should
    always use the returned dataframe.

    Args:
        dframe (pd.DataFrame): Data to reshape
        stackmatcher (Pattern): Regular expression that matches columns
            to be stacked.
        stackseparator (str): String to use for splitting columns names
        newcolumn (str): Name of new column containing the latter part of the
            stacked column names.

    Returns:
        pd.DataFrame
    """
    if isinstance(stackmatcher, str):
        stackmatcher = re.compile(stackmatcher)
    if newcolumn in dframe:
        raise ValueError("Column name %s already exists in the data")
    tuplecols = []
    dostack = False
    colstostack = 0
    logger.info(
        "Will stack columns matching '%s' with separator '%s'",
        stackmatcher,
        stackseparator,
    )
    logger.info("Name of new identifying column will be '%s'", newcolumn)

    nostackcolumnnames = []
    for col in dframe.columns:
        if stackmatcher.match(col):
            tuplecols.append(tuple(col.split(stackseparator)))
            colstostack = colstostack + 1
            dostack = True
        else:
            tuplecols.append(tuple([col, ""]))
            nostackcolumnnames.append(col)

    logger.info("Found %d out of %d columns to stack", colstostack,
                len(dframe.columns))

    if dostack:
        # Convert to MultiIndex columns
        dframe.columns = pd.MultiIndex.from_tuples(tuplecols,
                                                   names=["", newcolumn])

        # Stack the multiindex columns, this will add a lot of rows to
        # our ensemble, and condense the number of columns
        dframe = dframe.stack()

        # The values from non-multiindex-columns must be propagated to
        # the rows that emerged from the stacking. If you use the
        # 'all' pivottype, then you will get some NaN-values in the
        # MultiIndex columns that are intentional.
        dframe[nostackcolumnnames] = dframe[nostackcolumnnames].fillna(
            method="ffill")

        dframe = dframe.reset_index()

        # Now we have rows that do not belong to any well; delete
        # those rows
        dframe = dframe[dframe[newcolumn] != ""]

        # And delete a byproduct of our reshaping (this is the index
        # prior to stacking)
        del dframe["level_0"]

    return dframe.reset_index(drop=True)
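
# A hedged usage sketch of csv_stack on a tiny made-up dataframe: columns
# matching "WOPR:" are stacked into rows and the well name goes into a new
# "WELL" column. The column names and pattern are illustrative; csv_stack
# also expects a module-level `logger`, configured here just for the sketch.
import logging
import re
import pandas as pd

logger = logging.getLogger(__name__)

frame = pd.DataFrame({
    "DATE": ["2020-01-01", "2020-02-01"],
    "WOPR:A-1": [100.0, 110.0],
    "WOPR:A-2": [200.0, 190.0],
})

stacked = csv_stack(frame, stackmatcher=re.compile(r"WOPR:"),
                    stackseparator=":", newcolumn="WELL")
print(stacked)  # one row per (DATE, WELL) pair with a single WOPR value column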
Example #43
0
def add_data_to_day_price(day_price: DataFrame,
                          index_day_price: DataFrame) -> DataFrame:
    group_list = []
    grouped = day_price.groupby("ticker_id")
    for key, group in grouped:
        group = group.reset_index(drop=True)
        group["sma_10"] = sma(group["close"], 10)
        group["sma_20"] = sma(group["close"], 20)
        group["sma_tr_val_5"] = sma(group["tr_val"], 5)
        group["pct_change_1"] = pct_change(group["close"], 1)
        group["ts_max_20"] = ts_max(group["close"], 20)
        group["ibs"] = ibs(group["high"], group["low"], group["close"])
        group["increase_ratio_3"] = increase_from_lowest_price(
            group["low"], group["close"], 3)
        group["pdi_5"] = pdi(group["high"], group["low"], group["close"], 5,
                             MovingAverage.ema)
        group["pdi_5_sto"] = stochastic_fast_k(group["pdi_5"], group["pdi_5"],
                                               group["pdi_5"], 20)
        group["pdi_5_sto_pct_change_3"] = pct_change(group["pdi_5_sto"], 3)
        group["pivot_standard"] = pivot_standard(group["high"], group["low"],
                                                 group["close"])
        group_list.append(group)
    day_price = pd.concat(group_list, axis=0)
    day_price = day_price.reset_index(drop=True)
    group_list = []
    grouped = day_price.groupby("date")
    for key, group in grouped:
        group = group.reset_index(drop=True)
        group["rank_tr_val_5"] = rank(group["sma_tr_val_5"])
        group_list.append(group)
    day_price = pd.concat(group_list, axis=0)
    day_price = day_price.reset_index(drop=True)
    group_list = []
    grouped = day_price.groupby("ticker_id")
    for key, group in grouped:
        increase_condition1 = (group["pdi_5_sto_pct_change_3"].shift(1) >=
                               0.1) | (group["pdi_5_sto_pct_change_3"].shift(2)
                                       >= 0.1)
        increase_condition = increase_condition1
        decrease_condition1 = group["open"] > group["close"]
        decrease_condition2 = group["pct_change_1"] < 0
        decrease_condition3 = group["ibs"] < 0.25
        decrease_condition = (decrease_condition1
                              | decrease_condition2) & decrease_condition3
        liquidity_condition1 = group["rank_tr_val_5"] > 0.8
        liquidity_condition = liquidity_condition1
        # Result
        group[
            "#result"] = increase_condition & decrease_condition & liquidity_condition
        group["#priority"] = group["increase_ratio_3"].shift(
            1) + group["increase_ratio_3"].shift(2)
        group_list.append(group)
    day_price = pd.concat(group_list, axis=0)
    day_price = day_price.reset_index(drop=True)
    # Market Timing
    index_day_price["sma_3"] = sma(index_day_price["close"], 3)
    index_day_price["sma_5"] = sma(index_day_price["close"], 5)
    index_day_price["sma_10"] = sma(index_day_price["close"], 10)
    index_day_price["#market_timing"] = (
        (index_day_price["close"] > index_day_price["sma_3"])
        | (index_day_price["close"] > index_day_price["sma_5"])
        | (index_day_price["close"] > index_day_price["sma_10"]))
    index_day_price = index_day_price.set_index("date")
    return day_price, index_day_price
Example #44
0
    def acc_one(self, entity_id, df: pd.DataFrame, acc_df: pd.DataFrame,
                state: dict) -> (pd.DataFrame, dict):
        self.logger.info(f'acc_one:{entity_id}')
        if pd_is_not_null(acc_df):
            df = df[df.index > acc_df.index[-1]]
            if pd_is_not_null(df):
                self.logger.info(f'compute from {df.iloc[0]["timestamp"]}')
                # position at which to start iterating
                start_index = len(acc_df)

                acc_df = pd.concat([acc_df, df])

                zen_state = state

                acc_df = acc_df.reset_index(drop=True)
            else:
                self.logger.info('no need to compute')
                return acc_df, state
        else:
            acc_df = df
            # bottom of a bi (stroke)
            acc_df['bi_di'] = False
            # top of a bi (stroke)
            acc_df['bi_ding'] = False
            # value at a bi top/bottom fenxing: the low for bi_di, the high for bi_ding, otherwise None; when plotting, connecting the non-null values gives the bi (strokes)
            acc_df['bi_value'] = np.NAN

            # temporary fenxing markers, never changed afterwards
            acc_df['tmp_ding'] = False
            acc_df['tmp_di'] = False
            # strength of the fenxing
            acc_df['fenxing_power'] = np.NAN

            acc_df['duan_state'] = 'yi'

            # bottom of a duan (segment)
            acc_df['duan_di'] = False
            # top of a duan (segment)
            acc_df['duan_ding'] = False
            # value at a duan top/bottom: the low for duan_di, the high for duan_ding, otherwise None; when plotting, connecting the non-null values gives the duan (segments)
            acc_df['duan_value'] = np.NAN

            # recorded at the end point x1 of the last duan that confirms a zhongshu (pivot zone); the value is a Rect(x0, y0, x1, y1)
            acc_df['zhongshu'] = np.NAN

            acc_df = acc_df.reset_index(drop=True)

            zen_state = ZenState(
                dict(fenxing_list=[],
                     direction=None,
                     can_fenxing=None,
                     can_fenxing_index=None,
                     opposite_count=0,
                     current_duan_state='yi',
                     duans=[],
                     pre_bi=None,
                     pre_duan=None))

            zen_state.fenxing_list: List[Fenxing] = []

            # Look at the first 11 k-lines; at most one top fenxing plus one bottom fenxing can appear there
            # Note: this is just a convenient way to pin down the first fenxing; once it exists, the later processing is uniform
            # start_index is the position at which iteration starts
            # direction is the direction after the confirmed fenxing: down after a top fenxing, up after a bottom fenxing
            fenxing, start_index, direction = handle_first_fenxing(acc_df,
                                                                   step=11)
            if not fenxing:
                return None, None

            zen_state.fenxing_list.append(fenxing)
            zen_state.direction = direction

            # list of (timestamp,value)
            zen_state.duans = []

        pre_kdata = acc_df.iloc[start_index - 1]
        pre_index = start_index - 1

        tmp_direction = zen_state.direction

        for index, kdata in acc_df.iloc[start_index:].iterrows():
            # print(f'timestamp: {kdata.timestamp}')
            # temporary direction
            tmp_direction = get_direction(kdata,
                                          pre_kdata,
                                          current=tmp_direction)

            # handle the containment (inclusion) relationship between k-lines
            handle_including(one_df=acc_df,
                             index=index,
                             kdata=kdata,
                             pre_index=pre_index,
                             pre_kdata=pre_kdata,
                             tmp_direction=tmp_direction)

            # based on the direction, look for the corresponding fenxing and duan
            if zen_state.direction == Direction.up:
                tmp_fenxing_col = 'tmp_ding'
                fenxing_col = 'bi_ding'
            else:
                tmp_fenxing_col = 'tmp_di'
                fenxing_col = 'bi_di'

            # same direction, trend still continuing
            if tmp_direction == zen_state.direction:
                zen_state.opposite_count = 0
            # direction reversed, look for the opposite fenxing
            else:
                zen_state.opposite_count = zen_state.opposite_count + 1
                # first reversal
                if zen_state.opposite_count == 1:
                    acc_df.loc[pre_index, tmp_fenxing_col] = True
                    acc_df.loc[pre_index, 'fenxing_power'] = fenxing_power(
                        acc_df.loc[pre_index - 1],
                        pre_kdata,
                        kdata,
                        fenxing=tmp_fenxing_col)

                    if pd_is_not_null(zen_state.can_fenxing):
                        # candidate bottom fenxing
                        if tmp_direction == Direction.up:
                            # take the smaller one
                            if pre_kdata['low'] <= zen_state.can_fenxing['low']:
                                zen_state.can_fenxing = pre_kdata
                                zen_state.can_fenxing_index = pre_index

                        # candidate top fenxing
                        else:
                            # take the larger one
                            if pre_kdata['high'] >= zen_state.can_fenxing[
                                    'high']:
                                zen_state.can_fenxing = pre_kdata
                                zen_state.can_fenxing_index = pre_index
                    else:
                        zen_state.can_fenxing = pre_kdata
                        zen_state.can_fenxing_index = pre_index

                # fenxing confirmed
                if pd_is_not_null(zen_state.can_fenxing):
                    if zen_state.opposite_count >= 4 or (
                            index - zen_state.can_fenxing_index >= 8):
                        acc_df.loc[zen_state.can_fenxing_index,
                                   fenxing_col] = True

                        # record the bi value
                        if fenxing_col == 'bi_ding':
                            bi_value = acc_df.loc[zen_state.can_fenxing_index,
                                                  'high']
                        else:
                            bi_value = acc_df.loc[zen_state.can_fenxing_index,
                                                  'low']
                        acc_df.loc[zen_state.can_fenxing_index,
                                   'bi_value'] = bi_value

                        zen_state.pre_bi = (zen_state.can_fenxing_index,
                                            bi_value)

                        zen_state.opposite_count = 0
                        zen_state.direction = zen_state.direction.opposite()
                        zen_state.can_fenxing = None

                        # determine the first duan
                        if zen_state.fenxing_list != None:
                            zen_state.fenxing_list.append(
                                Fenxing(state=fenxing_col,
                                        kdata=acc_df.loc[
                                            zen_state.can_fenxing_index,
                                            ['open', 'close', 'high', 'low']],
                                        index=zen_state.can_fenxing_index))

                            if len(zen_state.fenxing_list) == 4:
                                duan_state = handle_duan(
                                    fenxing_list=zen_state.fenxing_list,
                                    pre_duan_state=zen_state.current_duan_state
                                )

                                change = duan_state != zen_state.current_duan_state

                                if change:
                                    zen_state.current_duan_state = duan_state

                                    # set the duan state
                                    acc_df.loc[
                                        zen_state.fenxing_list[0].
                                        index:zen_state.fenxing_list[-1].index,
                                        'duan_state'] = zen_state.current_duan_state

                                    duan_index = zen_state.fenxing_list[
                                        0].index
                                    if zen_state.current_duan_state == 'up':
                                        acc_df.loc[duan_index,
                                                   'duan_di'] = True
                                        duan_value = acc_df.loc[duan_index,
                                                                'low']
                                    else:
                                        duan_index = zen_state.fenxing_list[
                                            0].index
                                        acc_df.loc[duan_index,
                                                   'duan_ding'] = True
                                        duan_value = acc_df.loc[duan_index,
                                                                'high']
                                    # record the duan value
                                    acc_df.loc[duan_index,
                                               'duan_value'] = duan_value

                                    # record the duan used for computing the zhongshu
                                    zen_state.duans.append(
                                        (acc_df.loc[duan_index,
                                                    'timestamp'], duan_value))

                                    # compute the zhongshu (pivot zone)
                                    if len(zen_state.duans) == 4:
                                        x1 = zen_state.duans[0][0]
                                        x2 = zen_state.duans[3][0]
                                        if zen_state.duans[0][
                                                1] < zen_state.duans[1][1]:
                                            # downward duan
                                            range = intersect(
                                                (zen_state.duans[0][1],
                                                 zen_state.duans[1][1]),
                                                (zen_state.duans[2][1],
                                                 zen_state.duans[3][1]))
                                            if range:
                                                y1, y2 = range
                                                # record the zhongshu
                                                acc_df.loc[duan_index,
                                                           'zhongshu'] = Rect(
                                                               x0=x1,
                                                               x1=x2,
                                                               y0=y1,
                                                               y1=y2)
                                                zen_state.duans = zen_state.duans[
                                                    -1:]
                                            else:
                                                zen_state.duans = zen_state.duans[
                                                    1:]
                                        else:
                                            # upward duan
                                            range = intersect(
                                                (zen_state.duans[1][1],
                                                 zen_state.duans[0][1]),
                                                (zen_state.duans[3][1],
                                                 zen_state.duans[2][1]))
                                            if range:
                                                y1, y2 = range
                                                # record the zhongshu
                                                acc_df.loc[duan_index,
                                                           'zhongshu'] = Rect(
                                                               x0=x1,
                                                               x1=x2,
                                                               y0=y1,
                                                               y1=y2)
                                                zen_state.duans = zen_state.duans[
                                                    -1:]
                                            else:
                                                zen_state.duans = zen_state.duans[
                                                    1:]

                                    # keep only the last fenxing
                                    zen_state.fenxing_list = zen_state.fenxing_list[
                                        -1:]
                                else:
                                    # keep the previous state and drop the first candidate
                                    acc_df.loc[
                                        zen_state.fenxing_list[0].index,
                                        'duan_state'] = zen_state.current_duan_state
                                    zen_state.fenxing_list = zen_state.fenxing_list[
                                        1:]

            pre_kdata = kdata
            pre_index = index

        acc_df = acc_df.set_index('timestamp', drop=False)
        return acc_df, zen_state
Example #45
0
    def test_join_multi_levels(self):

        # GH 3662
        # merge multi-levels
        household = DataFrame(
            {
                "household_id": [1, 2, 3],
                "male": [0, 1, 0],
                "wealth": [196087.3, 316478.7, 294750],
            },
            columns=["household_id", "male", "wealth"],
        ).set_index("household_id")
        portfolio = DataFrame(
            {
                "household_id": [1, 2, 2, 3, 3, 3, 4],
                "asset_id": [
                    "nl0000301109",
                    "nl0000289783",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                    np.nan,
                ],
                "name": [
                    "ABN Amro",
                    "Robeco",
                    "Royal Dutch Shell",
                    "Royal Dutch Shell",
                    "AAB Eastern Europe Equity Fund",
                    "Postbank BioTech Fonds",
                    np.nan,
                ],
                "share": [1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            },
            columns=["household_id", "asset_id", "name", "share"],
        ).set_index(["household_id", "asset_id"])
        result = household.join(portfolio, how="inner")
        expected = (DataFrame({
            "male": [0, 1, 1, 0, 0, 0],
            "wealth": [
                196087.3,
                316478.7,
                316478.7,
                294750.0,
                294750.0,
                294750.0,
            ],
            "name": [
                "ABN Amro",
                "Robeco",
                "Royal Dutch Shell",
                "Royal Dutch Shell",
                "AAB Eastern Europe Equity Fund",
                "Postbank BioTech Fonds",
            ],
            "share": [1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
            "household_id": [1, 2, 2, 3, 3, 3],
            "asset_id": [
                "nl0000301109",
                "nl0000289783",
                "gb00b03mlx29",
                "gb00b03mlx29",
                "lu0197800237",
                "nl0000289965",
            ],
        }).set_index(["household_id", "asset_id"
                      ]).reindex(columns=["male", "wealth", "name", "share"]))
        tm.assert_frame_equal(result, expected)

        # equivalency
        result = merge(
            household.reset_index(),
            portfolio.reset_index(),
            on=["household_id"],
            how="inner",
        ).set_index(["household_id", "asset_id"])
        tm.assert_frame_equal(result, expected)

        result = household.join(portfolio, how="outer")
        expected = concat(
            [
                expected,
                (DataFrame(
                    {"share": [1.00]},
                    index=MultiIndex.from_tuples(
                        [(4, np.nan)], names=["household_id", "asset_id"]),
                )),
            ],
            axis=0,
            sort=True,
        ).reindex(columns=expected.columns)
        tm.assert_frame_equal(result, expected)

        # invalid cases
        household.index.name = "foo"

        with pytest.raises(
                ValueError,
                match="cannot join with no overlapping index names"):
            household.join(portfolio, how="inner")

        portfolio2 = portfolio.copy()
        portfolio2.index = portfolio2.index.set_names(["household_id", "foo"])

        with pytest.raises(ValueError,
                           match="columns overlap but no suffix specified"):
            portfolio2.join(portfolio, how="inner")
class dataNormalization:
    """正式处理数据的类\n
    提供对数据的筛选、去重、去除不符合要求的课程等高级筛选功能"""
    def __init__(self, dataFrame: DataFrame) -> None:
        self.oriDataFrame = dataFrame  # 保留一份原始数据
        self.rowNum, self.colNum = dataFrame.shape
        self.tmpDataFrame = DataFrame()  # 期间可能用到的临时数据存放点
        self.finDataFrame = DataFrame()  # 最终处理完返回的数据
        self.error = False

    def __call__(self) -> DataFrame:
        self.normalization()
        return self.finDataFrame

    def normalization(self):
        """数据处理程序"""
        self.finDataFrame = self.oriDataFrame.copy()
        for i in self.finDataFrame.index:
            one_row = self.oriDataFrame.loc[i]
            cr, gd, sc = self.classroom_processor(
                one_row['上课教室']), self.grade_processor(
                    one_row['年级']), self.school_processor(one_row['上课院系'])
            if False in [cr, gd, sc]:
                self.finDataFrame = self.finDataFrame.drop(index=i)
            else:
                self.finDataFrame.at[i, '上课教室'] = cr
                self.finDataFrame.at[i, '年级'] = gd
                self.finDataFrame.at[i, '上课院系'] = sc
        self.finDataFrame = self.finDataFrame.reset_index(drop=True)

    def classroom_processor(self, cr):
        """对上课教室列的处理"""
        pattern = re.compile(
            r'立人楼[A-Za-z]-?[0-9]{3}|品学楼[A-Za-z]-?[0-9]{3}-?[A-Za-z]?')
        location_set = set(pattern.findall(cr))  # collect every classroom that matches the pattern, de-duplicated
        # Keep the row only if exactly one classroom is found. Each row is one
        # weekly time slot of one teacher, so it can have only one classroom;
        # several matches mean the room changes by week, but the data does not
        # say from which week, so such rows are dropped.
        if len(location_set) == 1:
            return location_set.pop()
        else:
            return False

    def grade_processor(self, gd):
        """对年级列的处理"""
        count = 0
        name = ''
        for n in GradeNum:  # keep the row only if exactly one grade is found; drop rows with several grades
            if gd.find(n) != -1:
                count += 1
                name = n
        if count == 1:
            return name
        else:
            return False

    def school_processor(self, sc):
        """对学院列的处理"""
        count = 0
        name = ''
        for n in SchoolName:  # keep the row only if exactly one school is found; drop rows with several schools
            if sc.find(n) != -1:
                count += 1
                name = SchoolName_whole[n]
        if count == 1:
            return name
        else:
            return False
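The per-column processors above all apply the same rule: keep a row only when the regex or keyword lookup yields exactly one distinct match. A minimal, self-contained sketch of that single-match idea on hypothetical data (this is not the class itself, just the same filtering pattern):

import re
import pandas as pd

# Hypothetical data: keep a row only when the cell contains exactly one distinct classroom.
pattern = re.compile(r'立人楼[A-Za-z]-?[0-9]{3}|品学楼[A-Za-z]-?[0-9]{3}-?[A-Za-z]?')
rows = pd.DataFrame({'上课教室': ['立人楼A-101', '立人楼A-101 品学楼B202', '品学楼B-202 品学楼B-202']})

def single_room(cell):
    rooms = set(pattern.findall(cell))  # all matching classrooms, de-duplicated
    return rooms.pop() if len(rooms) == 1 else None

rows['上课教室'] = rows['上课教室'].map(single_room)
rows = rows.dropna(subset=['上课教室']).reset_index(drop=True)
print(rows)  # only the rows with a single unambiguous classroom remain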
def pivot(  # pylint: disable=too-many-arguments
    df: DataFrame,
    index: List[str],
    aggregates: Dict[str, Dict[str, Any]],
    columns: Optional[List[str]] = None,
    metric_fill_value: Optional[Any] = None,
    column_fill_value: Optional[str] = None,
    drop_missing_columns: Optional[bool] = True,
    combine_value_with_metric: bool = False,
    marginal_distributions: Optional[bool] = None,
    marginal_distribution_name: Optional[str] = None,
    flatten_columns: bool = True,
) -> DataFrame:
    """
    Perform a pivot operation on a DataFrame.

    :param df: Object on which pivot operation will be performed
    :param index: Columns to group by on the table index (=rows)
    :param columns: Columns to group by on the table columns
    :param metric_fill_value: Value to replace missing values with
    :param column_fill_value: Value to replace missing pivot columns with
    :param drop_missing_columns: Do not include columns whose entries are all missing
    :param combine_value_with_metric: Display metrics side by side within each column,
           as opposed to each column being displayed side by side for each metric.
    :param aggregates: A mapping from aggregate column name to the aggregate
           config.
    :param marginal_distributions: Add totals for row/column. Default to False
    :param marginal_distribution_name: Name of row/column with marginal distribution.
           Default to 'All'.
    :param flatten_columns: Convert column names to strings
    :return: A pivot table
    :raises QueryObjectValidationError: If the request is incorrect
    """
    if not index:
        raise QueryObjectValidationError(
            _("Pivot operation requires at least one index"))
    if not aggregates:
        raise QueryObjectValidationError(
            _("Pivot operation must include at least one aggregate"))

    if column_fill_value:
        df[columns] = df[columns].fillna(value=column_fill_value)

    aggregate_funcs = _get_aggregate_funcs(df, aggregates)

    # TODO (villebro): Pandas 1.0.3 doesn't yet support NamedAgg in pivot_table.
    #  Remove once/if support is added.
    aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}

    df = df.pivot_table(
        values=aggfunc.keys(),
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=metric_fill_value,
        dropna=drop_missing_columns,
        margins=marginal_distributions,
        margins_name=marginal_distribution_name,
    )

    if combine_value_with_metric:
        df = df.stack(0).unstack()

    # Flatten the MultiIndex column names into plain strings
    if flatten_columns:
        df.columns = [
            _flatten_column_after_pivot(col, aggregates) for col in df.columns
        ]
    # return index as regular column
    df.reset_index(level=0, inplace=True)
    return df
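The helper above is essentially a wrapper around pandas' own pivot_table, followed by flattening the resulting MultiIndex columns and restoring the index as a regular column. A minimal standalone sketch of that pattern with made-up data, leaving out the Superset-specific validation and aggregate handling:

import pandas as pd

df = pd.DataFrame({
    "region": ["north", "north", "south", "south"],
    "product": ["a", "b", "a", "b"],
    "sales": [10, 20, 30, 40],
})

pivoted = df.pivot_table(
    values=["sales"],
    index=["region"],
    columns=["product"],
    aggfunc={"sales": "sum"},
    fill_value=0,
    dropna=True,
)

# Flatten the (metric, column value) MultiIndex into plain strings,
# then turn the row index back into a regular column.
pivoted.columns = ["_".join(map(str, col)) for col in pivoted.columns]
pivoted.reset_index(level=0, inplace=True)
print(pivoted)  # columns: region, sales_a, sales_b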
Example #48
0
import ast
from pprint import pprint
import time
import pandas as pd

start_t = time.time()

df = pd.read_csv("/root/NeoMeetup/csv/struttura/relations_topics.csv")

df.sort_values('urlkey', inplace=True)
#df.head()

# In[5]:

df.reset_index(inplace=True)
#df.head()

# In[6]:

df.drop('index', axis=1, inplace=True)

# In[7]:

df['topic_id'] = 0

temp = df.urlkey.at[0]
count = 0
index = 0

# In[8]:
Example #49
0
def remove_names(df: pd.DataFrame) -> pd.DataFrame:
    """Convert personal names to numerical values."""
    df = df.reset_index()
    df.drop(columns='Name', inplace=True)
    return df
Example #50
0
def generate_pretreatment_variables(
    data: pd.DataFrame, level_index: str, pre_treatment_year: int
):
    """Merge descriptive statistics from pre-treatment year to all years

    Args:
        data (pd.DataFrame): data containing pre-treatment variables
        level_index (str): campus or district
        pre_treatment_year (int): pre-treatment year

    Returns:
        data: data with new columns
    """

    data_pre = data.loc[data.year == pre_treatment_year]
    data_pre = data_pre.rename(
        columns={
            "students_hisp": "pre_hisp",
            "students_ell": "pre_ell",
            "students_white": "pre_white",
            "students_black": "pre_black",
            "students_sped": "pre_sped",
            "students_frpl": "pre_frpl",
            "avescores": "pre_avescore",
            "students_num": "pre_num",
            "teachers_exp": "pre_exp",
            "teachers_turnover_ratio_d": "pre_turnover",
            "teachers_tenure_ave": "pre_tenure",
            "students_teacher_ratio": "pre_ratio",
        }
    )
    for var in [
        "pre_hisp",
        "pre_ell",
        "pre_white",
        "pre_black",
        "pre_sped",
        "pre_num",
        "pre_turnover",
        "pre_avescore",
    ]:
        for p in [0.25, 0.5, 0.75, 1]:
            num = str(int(p * 100))
            newvar = var + num
            if p == 0.25:
                data_pre[newvar] = np.where(
                    data_pre[var] <= data_pre[var].quantile(p), 1, 0
                )
            if p > 0.25:
                lp = p - 0.25
                data_pre[newvar] = np.where(
                    (
                        (data_pre[var] > data_pre[var].quantile(lp))
                        & (data_pre[var] <= data_pre[var].quantile(p))
                    ),
                    1,
                    0,
                )
    variables = [level_index]
    variables = variables + (list(data_pre.filter(regex=("pre_"))))
    data_pre = data_pre[variables]
    data_pre_geo_vars = [
        level_index,
        "type_urban",
        "type_suburban",
        "type_town",
        "type_rural",
    ]
    data_pre_geo = data[data.year == 2016][data_pre_geo_vars]
    data_pre = data_pre.merge(
        data_pre_geo,
        how="left",
        left_on=[level_index],
        right_on=[level_index],
        validate="one_to_one",
    )
    data_pre = data_pre.rename(
        columns={
            "type_urban": "pre_urban",
            "type_suburban": "pre_suburban",
            "type_town": "pre_town",
            "type_rural": "pre_rural",
        }
    )
    data_pre["pre_turnover"] = data_pre.pre_turnover / 100
    data = data.reset_index().merge(
        data_pre, left_on=level_index, right_on=level_index, how="left", validate="m:1"
    )

    return data
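The quantile loop above produces one indicator column per quartile bucket: values up to the 25th percentile get a 1 in the first dummy, and values between consecutive quantiles get a 1 in the following ones. A small self-contained sketch of that pattern on a hypothetical column:

import numpy as np
import pandas as pd

data_pre = pd.DataFrame({"pre_avescore": [55, 60, 70, 80, 90, 95]})

var = "pre_avescore"
lower = None
for p in [0.25, 0.5, 0.75, 1]:
    newvar = var + str(int(p * 100))  # pre_avescore25, pre_avescore50, ...
    upper = data_pre[var].quantile(p)
    if lower is None:
        data_pre[newvar] = np.where(data_pre[var] <= upper, 1, 0)
    else:
        data_pre[newvar] = np.where(
            (data_pre[var] > lower) & (data_pre[var] <= upper), 1, 0)
    lower = upper
print(data_pre)  # each row has exactly one of the four dummies set to 1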
Example #51
0
    def test_reset_index(self, float_frame):
        stacked = float_frame.stack()[::2]
        stacked = DataFrame({"foo": stacked, "bar": stacked})

        names = ["first", "second"]
        stacked.index.names = names
        deleveled = stacked.reset_index()
        for i, (lev, level_codes) in enumerate(
                zip(stacked.index.levels, stacked.index.codes)):
            values = lev.take(level_codes)
            name = names[i]
            tm.assert_index_equal(values, Index(deleveled[name]))

        stacked.index.names = [None, None]
        deleveled2 = stacked.reset_index()
        tm.assert_series_equal(deleveled["first"],
                               deleveled2["level_0"],
                               check_names=False)
        tm.assert_series_equal(deleveled["second"],
                               deleveled2["level_1"],
                               check_names=False)

        # default name assigned
        rdf = float_frame.reset_index()
        exp = Series(float_frame.index.values, name="index")
        tm.assert_series_equal(rdf["index"], exp)

        # default name assigned, corner case
        df = float_frame.copy()
        df["index"] = "foo"
        rdf = df.reset_index()
        exp = Series(float_frame.index.values, name="level_0")
        tm.assert_series_equal(rdf["level_0"], exp)

        # but this is ok
        float_frame.index.name = "index"
        deleveled = float_frame.reset_index()
        tm.assert_series_equal(deleveled["index"], Series(float_frame.index))
        tm.assert_index_equal(deleveled.index,
                              Index(np.arange(len(deleveled))))

        # preserve column names
        float_frame.columns.name = "columns"
        resetted = float_frame.reset_index()
        assert resetted.columns.name == "columns"

        # only remove certain columns
        df = float_frame.reset_index().set_index(["index", "A", "B"])
        rs = df.reset_index(["A", "B"])

        # TODO should reset_index check_names ?
        tm.assert_frame_equal(rs, float_frame, check_names=False)

        rs = df.reset_index(["index", "A", "B"])
        tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False)

        rs = df.reset_index(["index", "A", "B"])
        tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False)

        rs = df.reset_index("A")
        xp = float_frame.reset_index().set_index(["index", "B"])
        tm.assert_frame_equal(rs, xp, check_names=False)

        # test resetting in place
        df = float_frame.copy()
        resetted = float_frame.reset_index()
        return_value = df.reset_index(inplace=True)
        assert return_value is None
        tm.assert_frame_equal(df, resetted, check_names=False)

        df = float_frame.reset_index().set_index(["index", "A", "B"])
        rs = df.reset_index("A", drop=True)
        xp = float_frame.copy()
        del xp["A"]
        xp = xp.set_index(["B"], append=True)
        tm.assert_frame_equal(rs, xp, check_names=False)
def process(data: DataFrame) -> GeneralProcessor:
    """
    Process the data and structure them in a 2D table.

    Parameters
    ----------
    data: DataFrame
        Original data.

    Returns
    -------
    GeneralProcessor
        Processed and structured data.
    """
    columns = ["value", "date", "areaCode", "areaType", "areaName", "category"]

    dt_final = DataFrame(columns=columns)

    # Because of the hierarchical nature of the original data, there is
    # no easy way to automate this process using a generic solution
    # without prolonging the execution time. The iterative method appears
    # to produce the optimal time.
    for area_type in CATEGORY_LABELS:
        dt_label = DataFrame(columns=columns)

        for area_code in data[area_type]:
            area_name = data[area_type][area_code]['name']['value']
            df_code = DataFrame(columns=columns)

            for category in VALUE_COLUMNS:
                if category not in data[area_type][area_code]:
                    continue

                df_value = json_normalize(data[area_type][area_code],
                                          [category], [])

                df_value["areaCode"] = area_code
                df_value["areaType"] = area_type
                df_value["areaName"] = area_name
                df_value["category"] = category

                df_code = df_code.append(df_value)

            dt_label = dt_label.append(df_code)

        dt_final = dt_final.append(dt_label)

    # Reset index to appear incrementally.
    dt_final = dt_final.reset_index()[columns]
    logging.info(">> Data was successfully processed.")

    # Convert date strings to timestamp objects (needed for sorting).
    dt_final[DATE_COLUMN] = to_datetime(dt_final[DATE_COLUMN])
    logging.info(">> Dates were successfully converted to datetime object.")

    # Create a hierarchy that allows aggregation as required
    # in output data.
    dt_final = dt_final.groupby(
        ["areaType", "category", "date", "areaName", "areaCode"])
    logging.info(">> Data has been grouped.")

    # Given that the aggregation grouping produces rows with unique
    # value, the `sum()` function will produce the original value
    # or `NaN`.
    dt_final = dt_final.sum().unstack(["areaType", "category"])

    # Sort the data
    dt_final = dt_final.sort_values(["date", "areaName"],
                                    ascending=False).reset_index()
    logging.info(
        ">> Data was successfully sorted by date and area name - descending.")

    metadata = Metadata(lastUpdatedAt=data['lastUpdatedAt'],
                        disclaimer=data['disclaimer'])
    logging.info(">> Metadata extracted.")

    daily_records = DailyRecords(areaName="United Kingdom",
                                 totalLabConfirmedCases=None,
                                 dailyLabConfirmedCases=None)

    if (overview := data.get("overview")) is None:
            logging.warning('Missing data - Key: overview')
Example #53
0
    def test_reset_index_datetime(self, tz_naive_fixture):
        # GH#3950
        tz = tz_naive_fixture
        idx1 = pd.date_range("1/1/2011",
                             periods=5,
                             freq="D",
                             tz=tz,
                             name="idx1")
        idx2 = Index(range(5), name="idx2", dtype="int64")
        idx = MultiIndex.from_arrays([idx1, idx2])
        df = DataFrame(
            {
                "a": np.arange(5, dtype="int64"),
                "b": ["A", "B", "C", "D", "E"]
            },
            index=idx,
        )

        expected = DataFrame(
            {
                "idx1": [
                    datetime(2011, 1, 1),
                    datetime(2011, 1, 2),
                    datetime(2011, 1, 3),
                    datetime(2011, 1, 4),
                    datetime(2011, 1, 5),
                ],
                "idx2":
                np.arange(5, dtype="int64"),
                "a":
                np.arange(5, dtype="int64"),
                "b": ["A", "B", "C", "D", "E"],
            },
            columns=["idx1", "idx2", "a", "b"],
        )
        expected["idx1"] = expected["idx1"].apply(
            lambda d: Timestamp(d, tz=tz))

        tm.assert_frame_equal(df.reset_index(), expected)

        idx3 = pd.date_range("1/1/2012",
                             periods=5,
                             freq="MS",
                             tz="Europe/Paris",
                             name="idx3")
        idx = MultiIndex.from_arrays([idx1, idx2, idx3])
        df = DataFrame(
            {
                "a": np.arange(5, dtype="int64"),
                "b": ["A", "B", "C", "D", "E"]
            },
            index=idx,
        )

        expected = DataFrame(
            {
                "idx1": [
                    datetime(2011, 1, 1),
                    datetime(2011, 1, 2),
                    datetime(2011, 1, 3),
                    datetime(2011, 1, 4),
                    datetime(2011, 1, 5),
                ],
                "idx2":
                np.arange(5, dtype="int64"),
                "idx3": [
                    datetime(2012, 1, 1),
                    datetime(2012, 2, 1),
                    datetime(2012, 3, 1),
                    datetime(2012, 4, 1),
                    datetime(2012, 5, 1),
                ],
                "a":
                np.arange(5, dtype="int64"),
                "b": ["A", "B", "C", "D", "E"],
            },
            columns=["idx1", "idx2", "idx3", "a", "b"],
        )
        expected["idx1"] = expected["idx1"].apply(
            lambda d: Timestamp(d, tz=tz))
        expected["idx3"] = expected["idx3"].apply(
            lambda d: Timestamp(d, tz="Europe/Paris"))
        tm.assert_frame_equal(df.reset_index(), expected)

        # GH#7793
        idx = MultiIndex.from_product([["a", "b"],
                                       pd.date_range("20130101",
                                                     periods=3,
                                                     tz=tz)])
        df = DataFrame(np.arange(6, dtype="int64").reshape(6, 1),
                       columns=["a"],
                       index=idx)

        expected = DataFrame(
            {
                "level_0":
                "a a a b b b".split(),
                "level_1": [
                    datetime(2013, 1, 1),
                    datetime(2013, 1, 2),
                    datetime(2013, 1, 3),
                ] * 2,
                "a":
                np.arange(6, dtype="int64"),
            },
            columns=["level_0", "level_1", "a"],
        )
        expected["level_1"] = expected["level_1"].apply(
            lambda d: Timestamp(d, freq="D", tz=tz))
        result = df.reset_index()
        tm.assert_frame_equal(result, expected)
Example #54
0
def lithotrack(df:pd.DataFrame,
               codecols:list,
               percols:list, 
               dtick:bool=False, 
               lims:list=None, 
               codedict: dict=None, 
               fontsize=8, 
               ax=None,
               correlation: pd.DataFrame = None,
               grid_numbers : list = [11,51],
               steps: list  = None,
               corr_kw={}):
    """lithotrack [summary]

    Parameters
    ----------
    df : pd.DataFrame
        [description]
    codecols : list
        [description]
    percols : list
        [description]
    dtick : bool, optional
        [description], by default False
    lims : list, optional
        [description], by default None
    codedict : dict, optional
        [description], by default None
    fontsize : int, optional
        [description], by default 8
    ax : [type], optional
        [description], by default None
    correlation : pd.DataFrame, optional
        [description], by default None
    grid_numbers : list, optional
        [description], by default [11,51]
    steps : list, optional
        [description], by default None
    corr_kw : dict, optional
        [description], by default {}
    """
    lit=ax or plt.gca()
    
    def_corr_kw = {
    'color': 'red',
    'linestyle':'--',
    'linewidth': 2
    }    
    for (k,v) in def_corr_kw.items():
        if k not in corr_kw:
            corr_kw[k]=v
    
    df.index.names=['depth']
    df=df.reset_index()
    if lims is None:  # default to the full depth range before slicing
        lims=[df.index.min(),df.index.max()]
    df=df.loc[(df.index>=lims[0])&(df.index<=lims[1]),:]
    #Create a pivot table concatenating the lithology code names
    mm=pd.DataFrame()
    for (k,v) in enumerate(codecols):    
        m=df.pivot_table(index=['depth'],columns=[v],values=percols[k])
        mm=pd.concat([mm,m],axis=1)

    #Merge in a single dataframe the repeated colnames
    mm=mm.fillna(0)
    lm=pd.DataFrame()
    for i in mm.columns.unique():
        if mm[i].ndim>1:             
            lm[i]=mm[i].max(axis=1)
        elif mm[i].ndim==1:
            lm[i]=mm[i]
    try:
        lm=lm.drop(columns=[0])
    except KeyError:  # column 0 only exists when some rows had no lithology code
        pass
    lmc=np.cumsum(lm,axis=1)
    
    for i, col in enumerate(lmc.columns):
        lit.fill_betweenx(lmc.index, lmc.iloc[:,i], label=codedict[col], zorder=-i)
        

    lit.set_ylim([lims[1],lims[0]])
        
    #Set the vertical grid spacing
    if steps is None:
        mayor_grid = np.linspace(lims[0],lims[1],grid_numbers[0])
        minor_grid = np.linspace(lims[0],lims[1],grid_numbers[1])
    else:
        mayor_grid = np.arange(lims[0],lims[1],steps[0])
        minor_grid = np.arange(lims[0],lims[1],steps[1])
        
    lit.legend()
    lit.set_xlim([0,100])
    lit.set_yticks(mayor_grid)
    lit.set_yticks(minor_grid,minor=True)  
    if dtick==True:
        lit.set_yticklabels(mayor_grid)
    else:
        lit.set_yticklabels([])
    lit.set_xlabel("Lithology")
    lit.xaxis.tick_top()
    lit.xaxis.set_label_position("top")
    lit.tick_params("both",labelsize=fontsize)    
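The drawing step relies on cumulative percentages: np.cumsum across the lithology columns lets fill_betweenx stack one filled band per facies from left (0) to right (100). A minimal sketch of just that plotting idea with hypothetical data:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Hypothetical lithology percentages per depth.
lm = pd.DataFrame({"sand": [60, 40, 20], "shale": [40, 60, 80]},
                  index=[1000.0, 1010.0, 1020.0])
lmc = np.cumsum(lm, axis=1)  # cumulative composition per depth

fig, ax = plt.subplots()
for i, col in enumerate(lmc.columns):
    ax.fill_betweenx(lmc.index, lmc.iloc[:, i], label=col, zorder=-i)
ax.set_xlim(0, 100)
ax.set_ylim(1020, 1000)   # depth increases downwards
ax.legend()
plt.show()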
Example #55
0
def merge_preserve_left_index(left: pd.DataFrame, right: pd.DataFrame,
                              **kwargs):
    return left.reset_index().merge(
        right, **kwargs).set_index('index').rename_axis(None)
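A quick illustration of why the helper exists: a plain merge replaces the left frame's index with a fresh RangeIndex, while the reset/merge/set_index round trip keeps it. The call uses the function defined above; the sample data is hypothetical:

import pandas as pd

left = pd.DataFrame({"key": ["a", "b"], "x": [1, 2]}, index=["row1", "row2"])
right = pd.DataFrame({"key": ["a", "b"], "y": [10, 20]})

plain = left.merge(right, on="key")                       # index becomes 0, 1
kept = merge_preserve_left_index(left, right, on="key")   # index stays row1, row2
print(plain.index.tolist(), kept.index.tolist())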
Example #56
0
f_House_yn['f_marsupwt'] = family_marsupwt
f_House_yn['h_marsupwt'] = house_marsupwt

family_gestfips = CPS_dataset.groupby(['fh_seq', 'ffpos'])['gestfips'].mean()
f_House_yn['f_gestfips'] = family_gestfips

# Under 30 percent of median income
f_House_yn['under_30_inc'] = f_House_yn.apply(lambda x: income_lim_indicator30(
    x['family_net'], x['family_size'], x['f_gestfips']),
                                              axis=1)

# Under 50 percent of median income
f_House_yn['under_50_inc'] = f_House_yn.apply(lambda x: income_lim_indicator50(
    x['family_net'], x['family_size'], x['f_gestfips']),
                                              axis=1)
f_House_yn = f_House_yn.reset_index()
# f_House_yn.to_csv('use_df_both.csv')
# f_House_yn = pd.read_csv('use_df_both.csv')
f_House_yn['RfYes'] = Rf_probs[:, 1]

# CPS total benefits and Administrative total benefits
state_benefit = {}
state_recipients = {}

for fip in Admin_totals.Fips:
    this_state = (f_House_yn.f_gestfips == fip)
    CPS_totalb = (f_House_yn.fVouch_val[f_House_yn.indicator == 1] *
                  f_House_yn.f_marsupwt
                  )[this_state].sum()  # The CPS subsidy amount is monthly
    admin_totalb = Admin_totals['housing_value'][
        Admin_totals.Fips == fip].values / 12  # to match monthly
def prophet(  # pylint: disable=too-many-arguments
    df: DataFrame,
    time_grain: str,
    periods: int,
    confidence_interval: float,
    yearly_seasonality: Optional[Union[bool, int]] = None,
    weekly_seasonality: Optional[Union[bool, int]] = None,
    daily_seasonality: Optional[Union[bool, int]] = None,
) -> DataFrame:
    """
    Add forecasts to each series in a timeseries dataframe, along with confidence
    intervals for the prediction. For each series, the operation creates three
    new columns with the column name suffixed with the following values:

    - `__yhat`: the forecast for the given date
    - `__yhat_lower`: the lower bound of the forecast for the given date
    - `__yhat_upper`: the upper bound of the forecast for the given date

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param time_grain: Time grain used to specify time period increments in prediction
    :param periods: Time periods (in units of `time_grain`) to predict into the future
    :param confidence_interval: Width of predicted confidence interval
    :param yearly_seasonality: Should yearly seasonality be applied.
           An integer value will specify Fourier order of seasonality.
    :param weekly_seasonality: Should weekly seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :param daily_seasonality: Should daily seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :return: DataFrame with forecasts, with temporal column at beginning if present
    """
    # validate inputs
    if not time_grain:
        raise QueryObjectValidationError(_("Time grain missing"))
    if time_grain not in PROPHET_TIME_GRAIN_MAP:
        raise QueryObjectValidationError(
            _(
                "Unsupported time grain: %(time_grain)s",
                time_grain=time_grain,
            ))
    freq = PROPHET_TIME_GRAIN_MAP[time_grain]
    # check type at runtime due to marshmallow schema not being able to handle
    # union types
    if not isinstance(periods, int) or periods <= 0:
        raise QueryObjectValidationError(
            _("Periods must be a positive integer value"))
    if not confidence_interval or confidence_interval <= 0 or confidence_interval >= 1:
        raise QueryObjectValidationError(
            _("Confidence interval must be between 0 and 1 (exclusive)"))
    if DTTM_ALIAS not in df.columns:
        raise QueryObjectValidationError(
            _("DataFrame must include temporal column"))
    if len(df.columns) < 2:
        raise QueryObjectValidationError(
            _("DataFrame include at least one series"))

    target_df = DataFrame()
    for column in [column for column in df.columns if column != DTTM_ALIAS]:
        fit_df = _prophet_fit_and_predict(
            df=df[[DTTM_ALIAS, column]].rename(columns={
                DTTM_ALIAS: "ds",
                column: "y"
            }),
            confidence_interval=confidence_interval,
            yearly_seasonality=_prophet_parse_seasonality(yearly_seasonality),
            weekly_seasonality=_prophet_parse_seasonality(weekly_seasonality),
            daily_seasonality=_prophet_parse_seasonality(daily_seasonality),
            periods=periods,
            freq=freq,
        )
        new_columns = [
            f"{column}__yhat",
            f"{column}__yhat_lower",
            f"{column}__yhat_upper",
            f"{column}",
        ]
        fit_df.columns = new_columns
        if target_df.empty:
            target_df = fit_df
        else:
            for new_column in new_columns:
                target_df = target_df.assign(
                    **{new_column: fit_df[new_column]})
    target_df.reset_index(level=0, inplace=True)
    return target_df.rename(columns={"ds": DTTM_ALIAS})
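The per-series loop delegates the actual forecasting to an internal _prophet_fit_and_predict helper. A rough sketch of what such a fit-and-predict step typically looks like, assuming the prophet package is available; the function name and the join back onto the observations are illustrative assumptions, not the Superset implementation:

import pandas as pd
from prophet import Prophet  # assumption: the `prophet` package is installed

def fit_and_predict(df: pd.DataFrame, periods: int, freq: str,
                    confidence_interval: float) -> pd.DataFrame:
    # df is expected to follow the Prophet column convention: "ds" (dates) and "y" (values)
    model = Prophet(interval_width=confidence_interval)
    model.fit(df)
    future = model.make_future_dataframe(periods=periods, freq=freq)
    forecast = model.predict(future)[["ds", "yhat", "yhat_lower", "yhat_upper"]]
    # join the observed values back so each series yields the four columns
    # that the caller renames to "<column>__yhat*"
    return forecast.set_index("ds").join(df.set_index("ds")).reset_index()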
Example #58
0
    def test_set_index_cast_datetimeindex(self):
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00',
                         '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values).copy()
        comp.tz = None
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)

        # GH 3950
        # reset_index with single level
        for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
            idx = pd.date_range('1/1/2011', periods=5,
                                freq='D', tz=tz, name='idx')
            df = pd.DataFrame(
                {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

            expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                             datetime(2011, 1, 2),
                                             datetime(2011, 1, 3),
                                             datetime(2011, 1, 4),
                                             datetime(2011, 1, 5)],
                                     'a': range(5),
                                     'b': ['A', 'B', 'C', 'D', 'E']},
                                    columns=['idx', 'a', 'b'])
            expected['idx'] = expected['idx'].apply(
                lambda d: pd.Timestamp(d, tz=tz))
            assert_frame_equal(df.reset_index(), expected)
Example #59
0
def globalfuncion(Base, Diccionario, Variable_Analizar, listStopWords):
    ''' Result-generation function

    Description
    ----------------------------------------------------------------------------------------------------------
    This function builds a DataFrame showing the project name, the column being analysed and
    the product type to which the text was classified.

    Parameters:
    ----------------------------------------------------------------------------------------------------------

        Base (DataFrame)             ---   Database containing the information to be processed.
        Diccionario (DataFrame)      ---   Dictionary with the words and the product type for each word.
        Variable_Analizar (String)   ---   Name of the column on which the text processing is performed.
        listStopWords (list)         ---   List of stopwords.

    Returns:
    -----------------------------------------------------------------------------------------------------------
        DataFrame


    Note:
    ------------------------------------------------------------------------------------------------------------

    This function depends on the following functions:
        repeticiones(....)
        corpusword(.....)
        search_tec_inno(......)
        limpieza_texto1(text)


    '''

    Idioma = 'PALABRAS'

    if (Variable_Analizar == 'OUTPUT_NAME'):
        Base_Aux = Base[[
            'OPERATION_NUMBER', 'COMPONENT_NAME', Variable_Analizar,
            'OUTPUT_DESCRIPTION'
        ]]
        Base_Aux = Base_Aux[(
            pd.isnull(Base_Aux['OUTPUT_DESCRIPTION']) == False) |
                            (pd.isnull(Base_Aux['COMPONENT_NAME']) == False)]
        Base_Aux['OUTPUT_NAME'] = Base_Aux['OUTPUT_NAME'].fillna('')
        Base_Aux['OUTPUT_DESCRIPTION'] = Base_Aux['OUTPUT_DESCRIPTION'].fillna(
            '')
        a = list([str(i) for i in (Base_Aux['OUTPUT_NAME'])])
        b = list([str(j) for j in (Base_Aux['OUTPUT_DESCRIPTION'])])
        c = []
        for i in range(len(a)):
            if (a[i] != '') & (b[i] != ''):
                c.append(a[i] + str(' ') + b[i])
            elif b[i] == '':
                c.append(a[i])
            else:
                c.append(b[i])

        Base_Aux['OUTPUT_NAME'] = c
        Base_Aux.drop(['OUTPUT_DESCRIPTION'], axis=1, inplace=True)

    else:
        Base_Aux = DataFrame()
        Base_Aux = Base[['OPERATION_NUMBER', Variable_Analizar]]
        Base_Aux.drop_duplicates(inplace=True)
        Base_Aux.dropna(inplace=True)
        Base_Aux[Variable_Analizar] = Base_Aux[Variable_Analizar].apply(str)

    list_of_words = Base_Aux[Variable_Analizar].apply(
        corpusword,
        args=(
            Diccionario_Total[Diccionario_Total.TOKENS == 1]['PALABRAS'],
            listStopWords,
        ))
    list_of_words2 = Base_Aux[Variable_Analizar].apply(
        limpieza_texto2, args=(Diccionario_Total, ))
    list_of_words3 = Base_Aux[Variable_Analizar].apply(searchsysteminformation)
    list_of_words = list_of_words + list_of_words2 + list_of_words3
    rep_name = repeticiones(list_of_words, Base_Aux, 'OPERATION_NUMBER')
    rep_variable = repeticiones(list_of_words, Base_Aux, Variable_Analizar)

    dframe = DataFrame()

    if (Variable_Analizar == 'OUTPUT_NAME'):
        Base_Aux['COMPONENT_NAME'] = Base_Aux['COMPONENT_NAME'].astype(str)
        rep_component = repeticiones(list_of_words, Base_Aux, 'COMPONENT_NAME')
        dframe['COMPONENT_NAME'] = rep_component

    list_of_words = list(chain(*list_of_words))

    #

    dframe['OPERATION_NUMBER'] = rep_name
    dframe[Variable_Analizar] = rep_variable
    dframe['WORDS'] = list_of_words
    #

    #    Base_Aux[Variable_Analizar]=Base_Aux[Variable_Analizar].str.replace(' xxxxxx ',' Red ')
    dframe = dframe.merge(Diccionario[['TIPO', Idioma]],
                          left_on='WORDS',
                          right_on=Idioma,
                          how='left')

    if (Variable_Analizar == 'OUTPUT_NAME'):
        dframe2 = dframe[[
            'OPERATION_NUMBER', 'COMPONENT_NAME', Variable_Analizar, 'WORDS'
        ]]
        dframe2.drop_duplicates(inplace=True)
        dframe = dframe[[
            'OPERATION_NUMBER', 'COMPONENT_NAME', Variable_Analizar, 'TIPO'
        ]].drop_duplicates()
        dframe = pd.crosstab([
            dframe['OPERATION_NUMBER'], dframe['COMPONENT_NAME'],
            dframe[Variable_Analizar]
        ],
                             columns=dframe['TIPO'])

    else:
        dframe2 = dframe[[
            'OPERATION_NUMBER', Variable_Analizar, 'WORDS', 'TIPO'
        ]]
        #        dframe2.drop_duplicates(inplace=True)
        dframe = dframe[['OPERATION_NUMBER', Variable_Analizar,
                         'TIPO']].drop_duplicates()
        dframe = pd.crosstab(
            [dframe['OPERATION_NUMBER'], dframe[Variable_Analizar]],
            columns=dframe['TIPO'])

#

    dframe.reset_index(inplace=True)
    X = set(dframe.columns)  # columns currently present
    Y = {'NEGATIVO', 'NEUTRO', 'NEUTRO POSITIVO',
         'POSITIVO'}  # columns required to apply the classification rule
    b = list(Y - X)

    if len(b) > 0:
        aux = DataFrame(np.repeat(0,
                                  len(b) * dframe.shape[0]).reshape(
                                      (dframe.shape[0], len(b))),
                        columns=b)
        dframe = pd.concat([dframe, aux], axis=1)
    Base_Aux.index = range(len(Base_Aux))
    dframe = Base_Aux.merge(dframe, how='left')
    dframe.fillna(np.nan, inplace=True)

    if (Variable_Analizar == 'OUTPUT_NAME'):
        dframe = dframe[[
            'OPERATION_NUMBER', 'COMPONENT_NAME', Variable_Analizar,
            'NEGATIVO', 'NEUTRO', 'NEUTRO POSITIVO', 'POSITIVO'
        ]]
        dframe = dframe.groupby([
            dframe['OPERATION_NUMBER'], dframe['COMPONENT_NAME'],
            dframe[Variable_Analizar]
        ]).sum()
    else:
        dframe = dframe[[
            'OPERATION_NUMBER', Variable_Analizar, 'NEGATIVO', 'NEUTRO',
            'NEUTRO POSITIVO', 'POSITIVO'
        ]]
        dframe = dframe.groupby(
            [dframe['OPERATION_NUMBER'], dframe[Variable_Analizar]]).sum()

# Apply the classification conditions
    dframe['RESULT' + '_' + Variable_Analizar] = np.where(
        (dframe['NEGATIVO'] == 0) & (dframe['NEUTRO'] == 0) &
        (dframe['NEUTRO POSITIVO'] == 0) & (dframe['POSITIVO'] == 0),
        'NO DIGITAL',
        np.where(
            (dframe['NEGATIVO'] >= 1) & (dframe['NEUTRO'] == 0) &
            (dframe['NEUTRO POSITIVO'] == 0) & (dframe['POSITIVO'] == 0),
            'NO DIGITAL',
            np.where(
                (dframe['NEGATIVO'] >= 1) & (dframe['NEUTRO'] >= 1) &
                (dframe['NEUTRO POSITIVO'] == 0) & (dframe['POSITIVO'] == 0),
                'NO DIGITAL',
                np.where(
                    (dframe['NEGATIVO'] == 0) & (dframe['NEUTRO'] == 0) &
                    (dframe['NEUTRO POSITIVO'] >= 1) &
                    (dframe['POSITIVO'] == 0), 'NO DIGITAL',
                    np.where(
                        (dframe['NEGATIVO'] >= 1) & (dframe['NEUTRO'] == 0) &
                        (dframe['NEUTRO POSITIVO'] >= 1) &
                        (dframe['POSITIVO'] == 0), 'NO DIGITAL',
                        np.where(
                            (dframe['NEGATIVO'] == 0) & (dframe['NEUTRO'] >= 1)
                            & (dframe['NEUTRO POSITIVO'] == 0) &
                            (dframe['POSITIVO'] == 0), 'SIN DEFINIR',
                            'DIGITAL'))))))
    dframe.drop(['NEGATIVO', 'NEUTRO', 'NEUTRO POSITIVO', 'POSITIVO'],
                axis=1,
                inplace=True)
    dframe.reset_index(inplace=True)
    dframe['RESULT' + '_' + Variable_Analizar] = np.where(
        (dframe[Variable_Analizar] == ' ') | (dframe[Variable_Analizar] == 'x')
        | (dframe[Variable_Analizar] == 'xx') |
        (dframe[Variable_Analizar] == '.')
        | (dframe[Variable_Analizar] == ',')
        | [str(x).strip() in {str(n) for n in range(20)} for x in dframe[Variable_Analizar]] |
        (dframe[Variable_Analizar].apply(type) == int) |
        (dframe[Variable_Analizar] == '*')
        | (dframe[Variable_Analizar] == '#') |
        (dframe[Variable_Analizar] == '-') | (dframe[Variable_Analizar] == '_')
        | (dframe[Variable_Analizar] == '- ')
        | (dframe[Variable_Analizar] == ' -') |
        (dframe[Variable_Analizar] == '. -'), np.nan,
        dframe['RESULT' + '_' + Variable_Analizar])

    dframe['RESULT_' + Variable_Analizar +
           '_TECN-INNOV'] = dframe[Variable_Analizar].apply(search_tec_inno)

    return [dframe, dframe2]
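The final classification boils down to counting dictionary hits per category with pd.crosstab and then applying a rule over those counts with nested np.where. A self-contained sketch of that pattern with hypothetical data and a deliberately simplified decision rule (the real rule above is more detailed):

import numpy as np
import pandas as pd

hits = pd.DataFrame({
    "OPERATION_NUMBER": [1, 1, 2, 3],
    "TIPO": ["POSITIVO", "NEUTRO", "NEGATIVO", "NEUTRO"],
})

counts = pd.crosstab(hits["OPERATION_NUMBER"], columns=hits["TIPO"])
# Make sure every expected category column exists, even if it never occurred.
for col in ["NEGATIVO", "NEUTRO", "NEUTRO POSITIVO", "POSITIVO"]:
    if col not in counts.columns:
        counts[col] = 0

# Simplified rule, for illustration only: any positive hit wins, a lone
# neutral hit is undecided, everything else is non-digital.
counts["RESULT"] = np.where(
    counts["POSITIVO"] >= 1, "DIGITAL",
    np.where(counts["NEUTRO"] >= 1, "SIN DEFINIR", "NO DIGITAL"))
print(counts.reset_index())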
Example #60
-1
def process_matebook_data(directory, paramlist, storage_location):
    vidname = parse_screen_filename(directory)
    for filename in find_files(directory, 'track.tsv'):
        vidpath, flyID = parse_filename(filename)
        tag = vidname + "_" + flyID
        if not os.path.exists(storage_location + '/' + tag + '_arena.pickle'):
            fi = pd.read_table(filename, sep='\t', header = [0,1], skiprows=[2,3])
            tempdf = DataFrame(index = fi.index)
            if fi['Unnamed: 8_level_0', 'isMissegmented'].mean() >= 0.2:
                print "arena dropped for poor quality: ", tag
                continue
            elif fi['Unnamed: 8_level_0', 'isMissegmented'].mean() == 0.0:
                print "arena dropped because quality = 1: ", tag
                continue
            elif len(set(fi['Unnamed: 3_level_0', 'courtship'])) <=1:
                print "arena dropped because courtship = nan: ", tag
                continue
            else:
                for j in paramlist:
                    tempdf[j[1]] = fi[j[0],j[1]]
                    if 'movedAbs_u' in j:
                        tempdf[j[1]] = tempdf[j[1]] * FPS
            tempdf['Time'] = tempdf.index/FPS
            time_ID = vidpath.split('_',1)[-1].split('.',1)[0]
            tempdf = merge_jvision_data(tempdf.reset_index(), time_ID)
            tempdf.to_pickle(storage_location + '/'+ tag + '_arena.pickle')
            print ".....", tag, " processed to pickling."
    return