def test_query_multiple_time_series(self):
    """Query three series at once and compare each returned frame."""
    stat_columns = ['mean', 'min', 'max', 'stddev']
    # One (series name, statistics row) pair per series; every row is
    # stamped at time 0 in the mocked response.
    series_rows = [
        ('series1', [323048, 323048, 323048, 0]),
        ('series2', [-2.8233, -2.8503, -2.7832, 0.0173]),
        ('series3', [-0.01220, -0.01220, -0.01220, 0]),
    ]

    # Payload handed to the mocked 'get' session.
    data = []
    for series_name, stats in series_rows:
        data.append({
            "name": series_name,
            "columns": ["time"] + stat_columns,
            "points": [[0] + stats],
        })

    # Expected result: one single-row frame per series, indexed by time.
    dataframes = {}
    for series_name, stats in series_rows:
        dataframes[series_name] = pd.DataFrame(
            data=[stats],
            index=pd.to_datetime([0], unit='s', utc=True),
            columns=stat_columns,
        )

    statement = (
        "select mean(value), min(value), max(value), stddev(value) "
        "from series1, series2, series3"
    )
    with _mocked_session('get', 200, data):
        client = DataFrameClient('host', 8086, 'username', 'password', 'db')
        result = client.query(statement)
        assert dataframes.keys() == result.keys()
        for series_name in dataframes:
            assert_frame_equal(dataframes[series_name], result[series_name])
def test_query_into_dataframe(self):
    """Query a single series and compare it with the expected frame."""
    # Mocked response: points are listed newest first.
    points = [[3600, 16, 2], [3600, 15, 1], [0, 14, 2], [0, 13, 1]]
    data = [{
        "name": "foo",
        "columns": ["time", "sequence_number", "column_one"],
        "points": points,
    }]

    # dataframe sorted ascending by time first, then sequence_number
    expected_index = pd.to_datetime([0, 0, 3600, 3600], unit='s', utc=True)
    expected_rows = [[13, 1], [14, 2], [15, 1], [16, 2]]
    dataframe = pd.DataFrame(
        data=expected_rows,
        index=expected_index,
        columns=['sequence_number', 'column_one'],
    )

    with _mocked_session('get', 200, data):
        client = DataFrameClient('host', 8086, 'username', 'password', 'db')
        result = client.query('select column_one from foo;')
        assert_frame_equal(dataframe, result)
def test_query_into_dataframe(self):
    """Check the frame built from a one-series query response."""
    value_columns = ['sequence_number', 'column_one']
    response_series = {
        "name": "foo",
        "columns": ["time"] + value_columns,
        "points": [
            [3600, 16, 2],
            [3600, 15, 1],
            [0, 14, 2],
            [0, 13, 1],
        ],
    }
    data = [response_series]

    # dataframe sorted ascending by time first, then sequence_number
    timestamps = pd.to_datetime([0, 0, 3600, 3600], unit='s', utc=True)
    dataframe = pd.DataFrame(data=[[13, 1], [14, 2], [15, 1], [16, 2]],
                             index=timestamps,
                             columns=value_columns)

    with _mocked_session('get', 200, data):
        cli = DataFrameClient('host', 8086, 'username', 'password', 'db')
        returned = cli.query('select column_one from foo;')
        assert_frame_equal(dataframe, returned)
def test_query_multiple_time_series(self):
    """Verify that a multi-series query yields one frame per series."""
    stats_header = ['mean', 'min', 'max', 'stddev']
    response_header = ["time"] + stats_header

    def expected_frame(stats):
        # Single-row frame stamped at time 0, matching the mocked points.
        return pd.DataFrame(data=[stats],
                            index=pd.to_datetime([0], unit='s', utc=True),
                            columns=stats_header)

    series1_stats = [323048, 323048, 323048, 0]
    series2_stats = [-2.8233, -2.8503, -2.7832, 0.0173]
    series3_stats = [-0.01220, -0.01220, -0.01220, 0]

    data = [
        {"name": "series1", "columns": list(response_header),
         "points": [[0] + series1_stats]},
        {"name": "series2", "columns": list(response_header),
         "points": [[0] + series2_stats]},
        {"name": "series3", "columns": list(response_header),
         "points": [[0] + series3_stats]},
    ]
    dataframes = {
        'series1': expected_frame(series1_stats),
        'series2': expected_frame(series2_stats),
        'series3': expected_frame(series3_stats),
    }

    with _mocked_session('get', 200, data):
        cli = DataFrameClient('host', 8086, 'username', 'password', 'db')
        result = cli.query(
            "select mean(value), min(value), max(value), stddev(value) "
            "from series1, series2, series3"
        )
        self.assertEqual(dataframes.keys(), result.keys())
        for name in dataframes:
            assert_frame_equal(dataframes[name], result[name])
class InfluxDB(object):
    """Connect to influxdb and pull/write data.

    Besides plain series queries, the class evaluates small arithmetic
    expressions over series (``+ - * / ^`` and parentheses) and the
    functions ``lag``, ``mlag``, ``avg`` and ``anticum`` via
    :meth:`interpret`.
    """

    def __init__(self, db_name=None):
        """Create the client and optionally select a database.

        db_name: str or None
            database to switch to; when None no database is selected
        """
        self.url = 'localhost'
        self.port = 8086
        self.user = '******'
        self.password = '******'
        self.db_list = ['FRED', 'Quandl', 'Econ', 'ChinaData']
        self.db = DataFrameClient(self.url, self.port,
                                  self.user, self.password)
        # Always define the attribute so reading it never raises
        # AttributeError when no database was requested.
        self.db_name = db_name
        if db_name is not None:
            self.db.switch_database(db_name)

    def _search_db(self, series_name):
        """Search the db name for a series name.

        Returns the first entry of ``self.db_list`` whose series list
        contains ``series_name``, or None when no database has it.
        """
        for db in self.db_list:
            temp_db = DataFrameClient(self.url, self.port,
                                      self.user, self.password, db)
            if series_name in temp_db.get_list_series():
                return db
        return None

    def query(self, series_name, db_name=None):
        """Query a particular series.

        ------
        series_name: str
            name of the series, e.g. "CPI_US"
        db_name: str or None
            database holding the series; when None the databases in
            ``self.db_list`` are searched for it
        ------
        return a pandas DataFrame with NaN representing missing values
        ------
        """
        if db_name is None:
            db_name = self._search_db(series_name)
        self.db.switch_database(db_name)
        results = self.db.query('SELECT * FROM %s' % series_name)
        # NOTE(review): str.contains treats '.' as a regex by default, so
        # this presumably only detects whether any value is a string at
        # all -- confirm. A lone '.' value is then replaced by None.
        if results['value'].str.contains('.').isnull().sum() != len(results):
            results.loc[results['value'] == '.', 'value'] = None
        return results.astype(float)

    def _is_num(self, s):
        """Determine if a string is a number (i.e. float(s) succeeds)."""
        try:
            float(s)
            return True
        except ValueError:
            return False

    def _include(self, expression_list, c):
        """Check whether expression_list includes basic operator c.

        ------
        c: str
            the basic operator or parentheses c
        ------
        Returns the index of the first occurrence, or -1 when absent.
        Only string items are compared, so evaluated operands (numbers,
        data frames) are never tested with ``==``.
        """
        for index, item in enumerate(expression_list):
            if isinstance(item, str) and item == c:
                return index
        return -1

    def _get_index(self, expression_list, op_1, op_2):
        """Get the index of first occurrence of op_1 OR op_2.

        Assumes expression_list includes op_1 OR op_2; returns -1 when
        neither is present.
        """
        index_1 = self._include(expression_list, op_1)
        index_2 = self._include(expression_list, op_2)
        if min(index_1, index_2) == -1:
            # At most one operator is present: take whichever was found.
            return max(index_1, index_2)
        return min(index_1, index_2)

    def _close_parentheses(self, expression):
        """Find the closing parentheses to the first opening (.

        ------
        expression: str
            str contains an opening (
        ------
        Returns the index of the matching ')' or -1 when there is none.
        """
        layer = 0
        for i, char in enumerate(expression):
            if char == '(':
                layer = layer + 1
            elif char == ')':
                layer = layer - 1
                if layer == 0:
                    return i
        return -1

    def _find_call(self, expression, operators, functions):
        """Return the start index of the leftmost function call, or -1.

        A call is a function name immediately followed by '(' that sits
        at the start of the expression or directly after an operator.
        Choosing the leftmost call guarantees the outermost call of a
        nested expression such as ``avg(lag(x,1),M)`` is picked, and the
        '(' requirement keeps series names that merely start with a
        function name (``avg_cpi``) from being mistaken for calls.
        """
        leftmost = -1
        for func in functions:
            needle = func + '('
            start = expression.find(needle)
            while start != -1:
                if start == 0 or expression[start - 1] in operators:
                    if leftmost == -1 or start < leftmost:
                        leftmost = start
                    break
                # e.g. the 'lag(' inside 'mlag(': keep looking further on.
                start = expression.find(needle, start + 1)
        return leftmost

    def _break_expression(self, expression, operators, functions):
        """Break the expression into logical components.

        Spaces are removed first. Function calls are kept whole (nested
        arguments included); everything else is split into operators and
        the operand strings between them. Empty strings may appear
        around a function call; they are dropped later by
        ``_convert_expressions``.

        Raises ValueError when a function call is never closed.
        """
        expression = expression.replace(' ', '')
        # empty string
        if len(expression) == 0:
            return [expression]

        # interpret functions
        func_start = self._find_call(expression, operators, functions)
        if func_start != -1:
            offset = self._close_parentheses(expression[func_start:])
            if offset == -1:
                raise ValueError('unmatched parentheses, check the expression')
            func_end = func_start + offset
            head = self._break_expression(expression[:func_start],
                                          operators, functions)
            tail = self._break_expression(expression[func_end + 1:],
                                          operators, functions)
            return head + [expression[func_start:func_end + 1]] + tail

        # then deal with the time series, operators and numbers
        results = []
        current_expression = ''
        for char in expression:
            if char in operators:
                results.append(current_expression)
                results.append(char)
                current_expression = ''
            else:
                current_expression = current_expression + char
        results.append(current_expression)
        return [item for item in results if item != '']

    def _split_call(self, func):
        """Split ``name(series,arg)`` into ``(series, arg)`` strings.

        The split is made at the last ',' and the last ')', so the
        series part may itself contain commas (a nested call).
        Raises ValueError when the call has no ',' or no ')'.
        """
        index_start = func.index('(')
        index_mid = func.rindex(',')
        index_end = func.rindex(')')
        return func[index_start + 1:index_mid], func[index_mid + 1:index_end]

    def _load_series(self, series_expression):
        """Return the data for a function argument.

        The argument is first tried as a plain series name; when that
        query fails it is interpreted as an expression instead.
        """
        try:
            return self.query(series_expression)
        except Exception:
            # Not a plain series (or the query failed): evaluate it as a
            # nested expression. KeyboardInterrupt/SystemExit propagate.
            return self.interpret(series_expression)

    def _eval_func(self, func):
        """Evaluate the function.

        func: str
            a call such as ``lag(CPI_US,1)``

        Raises ValueError for an unknown function name or an unsupported
        ``anticum`` frequency.
        """
        function = func[:func.index('(')]
        if function not in ('lag', 'mlag', 'avg', 'anticum'):
            message = '%s not defined' % func
            raise ValueError(message)

        series_expression, argument = self._split_call(func)
        series = self._load_series(series_expression)

        if function == 'lag':
            # lag the time series by a number of periods.
            # lag(*series*,i) where i is number of period
            periods = float(argument)
            return series.shift(periods)

        if function == 'mlag':
            # shift the time stamp by a number of months.
            # mlag(*series*,i) where i is number of months
            periods = float(argument)
            return series.tshift(periods, freq='M').tshift(1, freq='D')

        if function == 'avg':
            # taking the average of time series
            freq = argument
            series = series.resample(freq, how='mean')
            if 'M' in freq:
                return series.tshift(-1, freq='M').tshift(1, freq='D')
            return series

        # anticum: reverse the cumulative common to China stats
        freq = argument
        if freq == 'M':
            freq_m = 1
        elif freq == 'Q':
            freq_m = 3
        else:
            # Previously freq_m stayed unbound and the loop below failed
            # with an UnboundLocalError.
            raise ValueError(
                "anticum frequency must be 'M' or 'Q', got %r" % freq)
        new_series = {}
        for stamp, row in series.iterrows():
            if stamp.month != freq_m:
                # Subtract the value recorded freq_m months earlier.
                prev_date = stamp + DateOffset(months=-1 * freq_m)
                prev_date = prev_date.to_period('M').to_timestamp('M')
                monthly = row['value'] - series.loc[prev_date]
                new_series[stamp] = monthly['value']
            else:
                # The row whose month equals freq_m is kept unchanged.
                new_series[stamp] = row['value']
        return DataFrame({'value': Series(new_series)})

    def _convert_expressions(self, expression, operators):
        """Convert expressions to data series after they are broken down.

        Empty strings are removed from ``expression`` in place. Function
        calls are evaluated, operators are kept as strings, numeric
        strings become floats and anything else is queried as a series.
        """
        expression[:] = [item for item in expression if item != '']
        converted_results = []
        for item in expression:
            if '(' in item and ')' in item:
                converted_results.append(self._eval_func(item))
            elif item in operators:
                converted_results.append(item)
            elif self._is_num(item):
                converted_results.append(float(item))
            else:
                converted_results.append(self.query(item))
        return converted_results

    def _calculate(self, expression_list):
        """Calculate a list of expression elements.

        Operators are applied by precedence ('^', then '*' '/', then
        '+' '-'), taking the first occurrence within a precedence level.
        Returns a one-element list holding the result.

        Raises ValueError when no operator is found in a list that is
        not a single element, or when an operator lacks an operand on
        either side.
        """
        e_list = expression_list
        if len(e_list) == 1:
            return e_list

        if self._include(e_list, '^') != -1:
            position = self._include(e_list, '^')
        elif (self._include(e_list, '*') != -1
                or self._include(e_list, '/') != -1):
            position = self._get_index(e_list, '*', '/')
        elif (self._include(e_list, '+') != -1
                or self._include(e_list, '-') != -1):
            position = self._get_index(e_list, '+', '-')
        else:
            raise ValueError('cannot recognize operators in the expression')

        # An operator at either end would otherwise wrap around to
        # e_list[-1] (wrong result / endless recursion) or overrun the
        # list.
        if position == 0 or position == len(e_list) - 1:
            raise ValueError(
                'operator %s is missing an operand' % e_list[position])

        left = e_list[position - 1]
        operator = e_list[position]
        right = e_list[position + 1]
        if operator == '^':
            eval_result = pow(left, right)
        elif operator == '*':
            eval_result = left * right
        elif operator == '/':
            eval_result = left / right
        elif operator == '+':
            eval_result = left + right
        else:
            eval_result = left - right
        return self._calculate(
            e_list[:position - 1] + [eval_result] + e_list[position + 2:])

    def _parentheses(self, expression_list):
        """Iterate through parentheses.

        Always interprets the first closing parentheses ) together with
        the nearest opening one before it, replaces that group with its
        value and repeats until no parentheses remain.

        Raises ValueError when the parentheses are unmatched.
        """
        e_list = expression_list
        has_open = self._include(e_list, '(') != -1
        has_close = self._include(e_list, ')') != -1
        if has_open and has_close:
            close_index = self._include(e_list, ')')
            # Search backwards from the ')' for its opening partner.
            sub = e_list[:close_index][::-1]
            open_index = close_index - self._include(sub, '(') - 1
            eval_result = self._calculate(e_list[(open_index + 1):close_index])
            new_list = (e_list[:open_index] + eval_result
                        + e_list[close_index + 1:])
            return self._parentheses(new_list)
        if has_open or has_close:
            raise ValueError('unmatched parentheses, check the expression')
        return self._calculate(e_list)

    def interpret(self, expression):
        """Interpret an expression.

        expression: str
            e.g. "CPI_US/lag(CPI_US,12)-1"

        Returns the evaluated result with rows whose 'value' is null
        removed.
        """
        operators = ['^', '+', '-', '*', '/', '(', ')']
        functions = ['lag', 'mlag', 'avg', 'anticum']
        broken_expression = self._break_expression(expression, operators,
                                                   functions)
        interp_expression = self._convert_expressions(broken_expression,
                                                      operators)
        results = self._parentheses(interp_expression)[0]
        results = results.loc[~isnull(results['value'])]
        return results
def test_query_with_empty_result(self):
    """Query against an empty mocked payload and expect an empty list."""
    empty_payload = []
    with _mocked_session('get', 200, empty_payload):
        client = DataFrameClient('host', 8086, 'username', 'password', 'db')
        outcome = client.query('select column_one from foo;')
        self.assertEqual(outcome, [])
def test_query_with_empty_result(self):
    """An empty mocked payload must come back as an empty list."""
    statement = 'select column_one from foo;'
    with _mocked_session('get', 200, []):
        client = DataFrameClient('host', 8086, 'username', 'password', 'db')
        returned = client.query(statement)
        assert returned == []
def test_query_with_empty_result(self):
    """Check that querying an empty mocked payload returns ``[]``."""
    connection_args = ('host', 8086, 'username', 'password', 'db')
    with _mocked_session('get', 200, []):
        cli = DataFrameClient(*connection_args)
        rows = cli.query('select column_one from foo;')
        self.assertEqual(rows, [])