def test_string_no_dates(self):
    """Round-trip a frame of strings and floats through the Stata format."""
    frame = DataFrame({
        's1': Series(['a', 'A longer string']),
        's2': Series([1.0, 2.0], dtype=np.float64),
    })
    frame.index.name = 'index'
    with tm.ensure_clean() as path:
        frame.to_stata(path)
        reread = self.read_dta(path)
        # the index comes back as an ordinary column; restore it to compare
        tm.assert_frame_equal(reread.set_index('index'), frame)
def test_nan_to_missing_value(self):
    """NaN values must survive a Stata round trip as missing values."""
    half_nan32 = Series(np.arange(4.0), dtype=np.float32)
    half_nan64 = Series(np.arange(4.0), dtype=np.float64)
    half_nan32[::2] = np.nan   # NaN at even positions
    half_nan64[1::2] = np.nan  # NaN at odd positions
    frame = DataFrame({'s1': half_nan32, 's2': half_nan64})
    frame.index.name = 'index'
    with tm.ensure_clean() as path:
        frame.to_stata(path)
        reread = self.read_dta(path).set_index('index')
        tm.assert_frame_equal(reread, frame)
def test_read_write_dta13(self):
    """int64 has no native Stata type and must round-trip as float64;
    int16/int32 columns are preserved as-is."""
    s1 = Series(2**9, dtype=np.int16)
    s2 = Series(2**17, dtype=np.int32)
    s3 = Series(2**33, dtype=np.int64)
    original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3})
    original.index.name = 'index'
    # BUG FIX: the original code aliased `formatted = original`, so the
    # astype below also rewrote `original` in place *before* to_stata ran --
    # the writer then saw float64 directly and the int64 -> float64
    # write-conversion was never actually exercised. Use a copy.
    formatted = original.copy()
    formatted['int64'] = formatted['int64'].astype(np.float64)
    with tm.ensure_clean() as path:
        original.to_stata(path)
        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(written_and_read_again.set_index('index'),
                              formatted)
def readFromFileMl1m(behavioursFile: str):
    """Read a behaviours TSV file and return a frame whose behaviour column
    is converted from its string encoding to a list of booleans per row.

    :param behavioursFile: path to a tab-separated file with header row
    :return: DataFrame with userID / movieID / repetition columns plus the
             converted behaviour column
    """
    behavioursDF: DataFrame = pd.read_csv(behavioursFile, sep='\t', header=0,
                                          encoding="ISO-8859-1")
    behavioursDF.columns = [
        Behaviours.COL_USERID, Behaviours.COL_MOVIEID,
        Behaviours.COL_REPETITION, Behaviours.COL_BEHAVIOUR
    ]
    # BUG FIX: removed a premature `return behavioursDF` that made all of the
    # conversion code below unreachable dead code (the sibling ml-1m reader
    # returns the *converted* frame, which is clearly the intent here too).
    behaviour: List[List[bool]] = []
    for indexI, rowI in behavioursDF.iterrows():
        behaviourI: List[bool] = Behaviours.__convertToListOfBoolean(
            str(rowI[Behaviours.COL_BEHAVIOUR]))
        behaviour.append(behaviourI)
    behavioursConvertedDF: DataFrame = pd.concat(
        [
            behavioursDF[Behaviours.COL_USERID],
            behavioursDF[Behaviours.COL_MOVIEID],
            behavioursDF[Behaviours.COL_REPETITION],
            Series(behaviour)
        ],
        axis=1,
        keys=[
            Behaviours.COL_USERID, Behaviours.COL_MOVIEID,
            Behaviours.COL_REPETITION, Behaviours.COL_BEHAVIOUR
        ])
    return behavioursConvertedDF
def test_read_dta4(self):
    """All four dta file-format versions should parse to the same frame of
    Categorical columns."""
    rows = [["one", "ten", "one", "one", "one"],
            ["two", "nine", "two", "two", "two"],
            ["three", "eight", "three", "three", "three"],
            ["four", "seven", 4, "four", "four"],
            ["five", "six", 5, np.nan, "five"],
            ["six", "five", 6, np.nan, "six"],
            ["seven", "four", 7, np.nan, "seven"],
            ["eight", "three", 8, np.nan, "eight"],
            ["nine", "two", 9, np.nan, "nine"],
            ["ten", "one", "ten", np.nan, "ten"]]
    col_names = ['fully_labeled', 'fully_labeled2', 'incompletely_labeled',
                 'labeled_with_missings', 'float_labelled']
    expected = DataFrame.from_records(rows, columns=col_names)
    # every column in the fixture is value-labelled, so each one is read
    # back as a categorical
    expected = pd.concat(
        [Series(pd.Categorical(values))
         for _, values in compat.iteritems(expected)],
        axis=1)
    for fixture in (self.dta4_113, self.dta4_114,
                    self.dta4_115, self.dta4_117):
        tm.assert_frame_equal(self.read_dta(fixture), expected)
def test_bool_uint(self):
    """bool/unsigned columns are written as the smallest signed Stata type
    that can hold them (uint32 near its max falls back to float64)."""
    # BUG FIX: np.bool (a deprecated alias of builtin bool, removed in
    # NumPy >= 1.24) replaced by np.bool_; the resulting dtype is identical.
    original = DataFrame({
        's0': Series([0, 1, True], dtype=np.bool_),
        's1': Series([0, 1, 100], dtype=np.uint8),
        's2': Series([0, 1, 255], dtype=np.uint8),
        's3': Series([0, 1, 2**15 - 100], dtype=np.uint16),
        's4': Series([0, 1, 2**16 - 1], dtype=np.uint16),
        's5': Series([0, 1, 2**31 - 100], dtype=np.uint32),
        's6': Series([0, 1, 2**32 - 1], dtype=np.uint32),
    })
    original.index.name = 'index'
    expected = original.copy()
    # one target dtype per column, in column order
    expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32,
                      np.int32, np.float64)
    for col, target in zip(expected.columns, expected_types):
        expected[col] = expected[col].astype(target)
    with tm.ensure_clean() as path:
        original.to_stata(path)
        reread = self.read_dta(path).set_index('index')
        tm.assert_frame_equal(reread, expected)
def test_excessively_long_string(self):
    """Strings beyond Stata's 244-character limit must raise ValueError."""
    columns = {'s' + str(width): Series(['a' * width, 'b' * width,
                                         'c' * width])
               for width in (1, 244, 500)}
    frame = DataFrame(columns)
    with tm.assertRaises(ValueError):
        with tm.ensure_clean() as path:
            frame.to_stata(path)
def readFromFileMl1m():
    """Read the ml-1m behaviours.dat file and convert every behaviour column
    from its string encoding to a list of booleans per row.

    :return: DataFrame with userID / movieID / repetition plus the five
             converted behaviour columns
    """
    # path kept relative to the working directory, as before
    behavioursFile: str = os.path.join("..", "datasets", "ml-1m",
                                       "behaviours.dat")
    keyCols: List[str] = [Behaviours.COL_USERID, Behaviours.COL_MOVIEID,
                          Behaviours.COL_REPETITION]
    behaviourCols: List[str] = [Behaviours.COL_STATIC08,
                                Behaviours.COL_STATIC06,
                                Behaviours.COL_STATIC04,
                                Behaviours.COL_STATIC02,
                                Behaviours.COL_LINEAR0109]
    behavioursDF: DataFrame = pd.read_csv(behavioursFile, sep='\t', header=0,
                                          encoding="ISO-8859-1")
    behavioursDF.columns = keyCols + behaviourCols
    # REFACTOR: the original repeated the convert-and-append stanza five
    # times (once per behaviour column); a single loop removes the
    # duplication while producing the same per-column lists in row order.
    converted: dict = {}
    for col in behaviourCols:
        converted[col] = Series([
            Behaviours.__convertToListOfBoolean(str(value))
            for value in behavioursDF[col]
        ])
    behavioursConvertedDF: DataFrame = pd.concat(
        [behavioursDF[c] for c in keyCols] +
        [converted[c] for c in behaviourCols],
        axis=1,
        keys=keyCols + behaviourCols)
    return behavioursConvertedDF
def test_minimal_size_col(self):
    """Each string column gets the smallest Stata format wide enough for
    its longest value."""
    frame = DataFrame({
        's' + str(width): Series(['a' * width, 'b' * width, 'c' * width])
        for width in (1, 100, 244)
    })
    with tm.ensure_clean() as path:
        frame.to_stata(path, write_index=False)
        reader = StataReader(path)
        # column name encodes the width ('s100'); format reads e.g. '%100s'
        for name, fmt in zip(reader.varlist, reader.fmtlist):
            self.assertTrue(int(name[1:]) == int(fmt[1:-1]))
def test_read_dta10(self):
    """Round-trip mixed dtypes, with the datetime column written using the
    Stata 'tc' (milliseconds) date format."""
    row = [["string", "object", 1, 1.1, np.datetime64('2003-12-25')]]
    names = ['string', 'object', 'integer', 'float', 'datetime']
    original = DataFrame(data=row, columns=names)
    original["object"] = Series(original["object"], dtype=object)
    original.index.name = 'index'
    with ensure_clean(self.dta10) as path:
        # NOTE(review): the third positional argument's meaning depends on
        # the pinned pandas to_stata signature -- confirm before changing
        original.to_stata(path, {'datetime': 'tc'}, False)
        reread = self.read_dta(path)
        tm.assert_frame_equal(reread.set_index('index'), original)
def test_read_write_dta10(self):
    """Round-trip mixed dtypes; the int32 index widens to int64 on read, so
    the index type check is relaxed."""
    row = [["string", "object", 1, 1.1, np.datetime64('2003-12-25')]]
    names = ['string', 'object', 'integer', 'floating', 'datetime']
    original = DataFrame(data=row, columns=names)
    original["object"] = Series(original["object"], dtype=object)
    original.index.name = 'index'
    original.index = original.index.astype(np.int32)
    original['integer'] = original['integer'].astype(np.int32)
    with tm.ensure_clean() as path:
        original.to_stata(path, {'datetime': 'tc'})
        reread = self.read_dta(path)
        # original.index is np.int32, the read-back index is np.int64
        tm.assert_frame_equal(reread.set_index('index'), original,
                              check_index_type=False)
def test_read_write_dta10(self):
    """Round-trip mixed dtypes through Stata; skipped on big-endian hosts,
    where the writer is a known failure."""
    if not is_little_endian():
        raise nose.SkipTest("known failure of test_write_dta10 on "
                            "non-little endian")
    row = [["string", "object", 1, 1.1, np.datetime64('2003-12-25')]]
    names = ['string', 'object', 'integer', 'float', 'datetime']
    original = DataFrame(data=row, columns=names)
    original["object"] = Series(original["object"], dtype=object)
    original.index.name = 'index'
    with tm.ensure_clean() as path:
        original.to_stata(path, {'datetime': 'tc'}, False)
        reread = self.read_dta(path)
        tm.assert_frame_equal(reread.set_index('index'), original)
def countAggrBanditsResponsibility(methodsResult: List[tuple],
                                   modelDF: DataFrame):
    """Score each recommended item by its method's responsibility weight
    r/n, then L2-normalise the scores.

    :param methodsResult: (itemId, methodId) pairs
    :param modelDF: model frame indexed by methodId with 'r' and 'n' columns
    :return: list of (itemId, normalisedScore) tuples
    """
    #print(methodsResult)
    itemsIDs: List[int] = []
    scores: List[float] = []
    for itemIdI, methodIdI in methodsResult:
        # responsibility weight of the method that produced this item
        # NOTE(review): assumes modelDF.loc[methodIdI, 'n'] != 0 -- confirm
        # the caller guarantees every method has been played at least once
        wIJ: float = modelDF.loc[methodIdI, 'r'] / modelDF.loc[methodIdI, 'n']
        itemsIDs.append(itemIdI)
        scores.append(wIJ)
    resultSer: Series = Series(scores, index=itemsIDs)
    finalScores = normalize(np.expand_dims(resultSer.values, axis=0))[0, :]
    # BUG FIX: the original returned a bare zip object despite the
    # List[tuple] annotation; a zip is single-use, so a caller iterating it
    # twice would silently see an empty sequence. Materialise it.
    return list(zip(resultSer.index, finalScores.tolist()))
def test_large_value_conversion(self):
    """Values near a type's maximum force an upcast on write; int64 near its
    max also triggers a precision-loss warning (float64 cannot hold it
    exactly)."""
    original = DataFrame({
        's0': Series([1, 99], dtype=np.int8),
        's1': Series([1, 127], dtype=np.int8),
        's2': Series([1, 2**15 - 1], dtype=np.int16),
        's3': Series([1, 2**63 - 1], dtype=np.int64),
    })
    original.index.name = 'index'
    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(PossiblePrecisionLoss):
            original.to_stata(path)
        reread = self.read_dta(path)
        expected = original.copy()
        expected['s1'] = Series(expected['s1'], dtype=np.int16)
        expected['s2'] = Series(expected['s2'], dtype=np.int32)
        expected['s3'] = Series(expected['s3'], dtype=np.float64)
        tm.assert_frame_equal(reread.set_index('index'), expected)
#!/usr/bin/env python
"""
Demonstrate pandas Series creation, label indexing, reductions, and
scalar broadcasting.

@author: jstrick
Created on Sat May 18 10:46:25 2013
"""
import numpy as np
# BUG FIX: import Series from the public pandas namespace; pandas.core.frame
# is a private module and no longer re-exports Series in current releases.
from pandas import Series

NUM_VALUES = 10

# index labels 'a'..'j'
index = [chr(i) for i in range(97, 97 + NUM_VALUES)]
print(index)

# same values, with explicit labels vs the default integer index
s1 = Series(np.linspace(1, 5, NUM_VALUES), index=index)
s2 = Series(np.linspace(1, 5, NUM_VALUES))
print(s1, "\n")
print(s2, "\n")

# fancy indexing with a list of labels (order is preserved)
print(s1[['h', 'b']], "\n")
print(s1[['a', 'b', 'c']], "\n")

# reductions and cumulative operations
print(s1.sum(), s1.mean(), s1.min(), s1.max(), "\n")
print(s1.cumsum(), s1.cumprod(), s1.std(), "\n")

# `in` tests membership in the *index*, not the values
print('a' in s1)
print('m' in s1)

# scalar arithmetic broadcasts over the whole Series
s3 = s1 * 10
print(s3, "\n")
#!/usr/bin/env python
"""
Demonstrate three ways to construct a pandas Series: from a plain list,
from a list plus an explicit index, and from a dict (keys become the index).

@author: jstrick
Created on Sat May 18 16:20:49 2013
"""
# BUG FIX: import Series from the public pandas namespace; pandas.core.frame
# is a private module and no longer re-exports Series in current releases.
from pandas import Series

# create from list (gets a default integer index)
s1 = Series([5, 10, 15])
print(s1, "\n")
print("s1[0]:", s1[0], "\n")
print('-' * 60)

# create from list with index
s2 = Series([5, 10, 15], ['a', 'b', 'c'])
print(s2, "\n")
print("s2['a']:", s2['a'])
print('-' * 60)

# create from dictionary (keys are indices)
s3 = Series({'b': 10, 'a': 5, 'c': 15})
print(s3, "\n")
print("s3.sum(), s3.mean():", s3.sum(), s3.mean())
print('-' * 60)
# NOTE(review): fragment -- index1..index4 and print_header are defined
# earlier in the file (outside this view), and the final DataFrame call
# below is cut off mid-expression; code kept verbatim.

# `&` on Index objects is the same as intersection()
print(index2 & index3)
print(index2.intersection(index3))
print()

print_header("index2 | index3", 70)
# these are the same
print(index2 | index3)
print(index2.union(index3))
print()

print_header("index1.difference(index3)", 70)
print(index1.difference(index3))
print()

print_header("Series([10,20,30], index=index1)", 70)
series1 = Series([10, 20, 30], index=index1)
print(series1)
print()

print_header(
    "DataFrame([(1,2,3),(4,5,6),(7,8,9)], index=index1, columns=index4)", 70)
dataframe1 = DataFrame([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                       index=index1, columns=index4)
print(dataframe1)
print()

print_header(
    "DataFrame([(1,2,3),(4,5,6),(7,8,9)], index=index4, columns=index1)", 70)
# NOTE(review): statement truncated here in the source
dataframe2 = DataFrame([(1, 2, 3), (4, 5, 6), (7, 8, 9)], index=index4,
def generateGraphData(self, stats=None):
    """Aggregate zero-rating traffic counts and publish CSV datasets to
    wiki 'RawData:*' pages.

    Either loads the combined TSV from disk (when ``stats`` is None) or
    builds a DataFrame from the in-memory ``stats`` rows, then uploads one
    overall 'AllEnabled' table plus, per carrier (xcs), a daily
    zero/non-zero breakdown and a top-5-languages table.

    NOTE(review): written against Python 2 (StringIO.StringIO) and a
    pre-0.17 pandas (``Series.order``); code left byte-identical.
    """
    safePrint('Generating data files to %s' % self.pathGraphs)
    wiki = self.getWiki()
    if stats is None:
        # re-read the previously combined TSV from disk
        allData = read_table(self.combinedFile, sep='\t')
    else:
        allData = DataFrame(stats, columns=columnHdrResult)
    # filter type==DATA and site==wikipedia
    allData = allData[(allData['type'] == 'DATA') &
                      (allData['site'] == 'wikipedia')]
    # filter out last date -- presumably a partial day; confirm
    lastDate = allData.date.max()
    df = allData[allData.date < lastDate]
    # overall table: rows where zero-rating is enabled and active
    allEnabled = df[(df.ison == 'on') & (df.iszero == 'yes')]
    s = StringIO.StringIO()
    pivot_table(allEnabled, 'count', ['date', 'xcs'],
                aggfunc=np.sum).to_csv(s, header=True)
    result = s.getvalue()
    wiki(
        'edit',
        title='RawData:AllEnabled',
        summary='refreshing data',
        text=result,
        token=wiki.token()
    )
    # one page per carrier code
    for xcs in list(df.xcs.unique()):
        xcsDf = df[df.xcs == xcs]
        # create an artificial yes/opera value
        opera = xcsDf[(xcsDf.via == 'OPERA') & (xcsDf.iszero == 'yes')]
        # NOTE(review): assignment into a boolean-indexed slice -- relies on
        # legacy pandas copy semantics (SettingWithCopy territory); verify
        # these labels actually land in opera/yes/no before upgrading pandas
        opera['str'] = 'zero-opera'
        yes = xcsDf[xcsDf.iszero == 'yes']
        yes['str'] = 'zero-all'
        no = xcsDf[xcsDf.iszero == 'no']
        no['str'] = 'non-zero'
        # note: 'zero-opera' rows are also counted inside 'zero-all'
        combined = opera.append(yes).append(no)
        s = StringIO.StringIO()
        pivot_table(combined, 'count', ['date', 'str'],
                    aggfunc=np.sum).to_csv(s, header=False)
        result = 'date,iszero,count\n' + s.getvalue()
        wiki(
            'edit',
            title='RawData:' + xcs,
            summary='refreshing data',
            text=result,
            token=wiki.token()
        )
        # per-language totals: top five plus an aggregated 'other' bucket
        byLang = pivot_table(xcsDf, 'count', ['lang'],
                             aggfunc=np.sum).order('count', ascending=False)
        top = byLang.head(5)
        other = byLang.sum() - top.sum()
        s = StringIO.StringIO()
        Series.to_csv(top, s)
        result = 'lang,count\n' + s.getvalue() + ('other,%d\n' % other)
        wiki(
            'edit',
            title='RawData:' + xcs + '-langTotal',
            summary='refreshing data',
            text=result,
            token=wiki.token()
        )
# NOTE(review): fragment -- these three assignments look like the interior of
# an enum-style class whose header is above this view; kept verbatim.
names = "names"
indices = "indices"
range = "range"  # NOTE(review): shadows builtin `range`; fine inside an enum body, confirm context

# Indices=NewType("indices",List[int])
# Names=NewType("names",List[str])
# Start=NewType("start",PositiveInt)
# End=NewType("end",PositiveInt)
# Range=NewType("range",Tuple[Start,End])


class SelectRow(BasicOperator):
    # Declarative port/config spec; BasicOperator presumably interprets these
    # class annotations (pydantic-style validation) -- confirm against its
    # implementation.
    data: DFPort
    bool_array: series_port(optional=False)
    mode: ModeEnum


if __name__ == "__main__":
    # smoke run: construct an operator with a (9, 9) config -- appears to be
    # deliberately invalid so the ValidationError message is printed; verify
    data = DataFrame(np.arange(15).reshape(3, 5),
                     index=['one', 'two', 'three'],
                     columns=['a', 'b', 'c', 'd', 'e'])
    bool_array = Series([True, False, True, False])
    try:
        SelectRow(data=data, bool_array=bool_array, mode="indices",
                  config=(9, 9))
    except ValidationError as e:
        print(str(e))