def get_density(self, x):
    """
    Return the density at a given point.

    The density is derived in two steps: first the point of the
    distribution closest to ``x`` (by Euclidean distance) is located;
    then that point's probability mass is divided by the volume of the
    n-dimensional ball around it, whose radius is half the minimum
    distance between the points of the distribution.

    :param x: the point
    :return: the density at the point
    """
    if isinstance(x, float):
        x = np.array([x])

    # Scan for the nearest stored point (strict '<' keeps the first of ties).
    best_point = None
    best_dist = np.inf
    for candidate in self._points:
        dist = MathUtils.get_distance(candidate, x)
        if dist < best_dist:
            best_point, best_dist = candidate, dist

    radius = self._min_distance / 2
    # Outside every ball (or empty distribution): density is zero.
    if not best_dist < radius:
        return 0
    return self._points[best_point] / MathUtils.get_volume(radius, self.get_dimensions())
def __init__(self):
    # Project source directory under the user's home.
    self.root_path = join(expanduser("~"), "PycharmProjects",
                          "python-vistalytics", "source")
    # Row keys skipped when processing report sheets.
    self.ignore_keys = ['Margins % of Sales', 'Profitability',
                        'Cash Flow Ratios', 'Balance Sheet Items (in %)',
                        'Liquidity/Financial Health', 'Efficiency']
    # Sheet names used when reading/writing workbooks.
    self.sheet_list = ['Income_Statement', 'Balance_Sheet',
                       'Cash_Flow_Statement', 'Key_Ratios']
    # Shared utility helpers.
    self.math_utils = MathUtils()
    self.file_utils = FileUtils()
def __init__(self):
    # Report tree structure; keep a direct handle on its tree.
    self.tree_str = TreeStructure()
    self.tree = self.tree_str.tree
    # Default report until the user makes a choice.
    self.report_choice = con.INCOME_STATEMENTS
    # Shared utility helpers.
    self.file_utils = FileUtils()
    self.math_utils = MathUtils()
    # Project source directory under the user's home.
    self.root_path = join(expanduser('~'), 'PycharmProjects',
                          'python-vistalytics', 'source')
def test_maths(self):
    """get_volume must match the closed-form n-ball volume for n = 1..4."""
    tol = 0.001
    # 1-D: length of the interval, 2r.
    assert MathUtils.get_volume(2.0, 1) == pytest.approx(4.0, abs=tol)
    # 2-D: pi * r^2.
    assert MathUtils.get_volume(2.0, 2) == pytest.approx(math.pi * 4, abs=tol)
    # 3-D: 4/3 * pi * r^3.
    assert MathUtils.get_volume(2.0, 3) == pytest.approx(4.0 / 3.0 * math.pi * 8, abs=tol)
    # 4-D: pi^2 / 2 * r^4.
    assert MathUtils.get_volume(3.0, 4) == pytest.approx(math.pow(math.pi, 2) / 2 * 81, abs=tol)
class QuarterlyAverageChangeAnalysis(object):
    """Computes year-over-year change for quarterly report CSV files.

    Reads every CSV under ``<root>/in/quarter`` and writes one
    ``*_Quarterly_Report.csv`` per input under ``<root>/out``.
    """

    def __init__(self):
        self.root_path = join(expanduser('~'), "PycharmProjects",
                              "python-vistalytics", "source")
        self.math_utils = MathUtils()
        self.file_utils = FileUtils()

    def run(self):
        path = join(self.root_path, "in", 'quarter')
        file_list = self.file_utils.get_files(path)
        for f in file_list:
            key_list = []
            change = []
            percent_change = []
            data = self.file_utils.read_csv(path, f)
            if 'TTM' in data:
                # FIX: drop('TTM', 1) used the positional axis argument,
                # which is removed in pandas 2.x; be explicit.
                data = data.drop(columns='TTM')
            for key, row in data.iterrows():
                # Prepare columns: compare the quarter four periods back
                # against the latest quarter.
                key_list.append(key)
                n = len(row)
                # FIX: integer lookups on a labelled Series (row[n - 5])
                # relied on the removed positional fallback; use .iloc.
                change.append(
                    self.math_utils.get_change(row.iloc[n - 5], row.iloc[n - 1]))
                percent_change.append(
                    self.math_utils.percentage_change(row.iloc[n - 5], row.iloc[n - 1]))
            # Write csv.
            columns = ['Keys', 'Year Over Year Change', 'Year Over Year Change (%)']
            output_data = pd.DataFrame(
                {'Keys': key_list,
                 'Year Over Year Change': change,
                 'Year Over Year Change (%)': percent_change},
                columns=columns)
            output_dir = join(self.root_path, 'out',
                              splitext(f)[0].replace(" ", "_") + "_Quarterly_Report.csv")
            self.file_utils.write_csv(output_dir, output_data)
def __init__(self, points=None):
    """
    Create a new discrete density function from a set of points.

    :param points: a mapping of point -> probability mass
    :raises NotImplementedError: if points is not a dict
    """
    # Guard clause: only dict-backed construction is supported.
    if not isinstance(points, dict):
        raise NotImplementedError()
    # Copy of the {point: probability} mapping backing the density.
    self._points = dict(points)
    # Minimum Euclidean distance between stored points; half of it is
    # the radius of the normalisation ball.
    keys_as_arrays = np.array([np.array(key) for key in points.keys()])
    self._min_distance = MathUtils.get_min_euclidian_distance(keys_as_arrays)
    # The volume employed for the normalisation.
    self._volume = MathUtils.get_volume(self._min_distance / 2.,
                                        self.get_dimensions())
def _get_node_list(self, sub_tree):
    """Collect ids of sub-tree nodes whose slope matches the sub-tree root's.

    As a side effect, the TAGGED flag of every corresponding node in the
    main tree is cleared.

    :param sub_tree: the tree to scan
    :return: identifiers of the non-root nodes whose SLOPE_VALUE compares
        equal (per MathUtils.compare) to the sub-tree root's SLOPE_VALUE
    """
    node_id_list = []
    sub_tree_root_node = sub_tree.get_node(sub_tree.root)
    # FIX: the original constructed a fresh MathUtils for every node
    # inside the loop; build it once.
    math_utils = MathUtils()
    for node in sub_tree.all_nodes():
        self.tree.get_node(node.identifier).data[con.TAGGED] = False
        if not node.is_root() and math_utils.compare(
                node.data[con.SLOPE_VALUE],
                sub_tree_root_node.data[con.SLOPE_VALUE]):
            node_id_list.append(node.identifier)
    return node_id_list
class YearlyAverageChangeAnalysis(object):
    """Computes 3- and 5-year average/percentage change for annual CSVs.

    Reads each CSV under ``<root>/in/annual`` and writes one
    ``*_Annual_Report.csv`` per input under ``<root>/out``.
    """

    def __init__(self):
        self.root_path = join(expanduser("~"), "PycharmProjects",
                              "python-vistalytics", "source")
        self.file_utils = FileUtils()
        self.math_utils = MathUtils()

    def run(self):
        src_dir_path = join(self.root_path, "in", "annual")
        file_list = self.file_utils.get_files(src_dir_path)
        for f in file_list:
            key_list = []
            avg_change_3_years_list = []
            percentage_change_3_years_list = []
            avg_change_5_years_list = []
            percentage_change_5_years_list = []
            data = self.file_utils.read_csv(src_dir_path, f)
            if 'TTM' in data:
                # FIX: drop('TTM', 1) used the positional axis argument,
                # which is removed in pandas 2.x; be explicit.
                data = data.drop(columns='TTM')
            for key, row in data.iterrows():
                # Prepare columns.
                n = len(row)
                key_list.append(key)
                # FIX: integer lookups on a labelled Series (row[n - 3])
                # relied on the removed positional fallback; use .iloc.
                # (Integer *slices* were already positional.)
                avg_change_3_years_list.append(
                    self.math_utils.average_change(row.iloc[n - 3:]))
                percentage_change_3_years_list.append(
                    self.math_utils.percentage_change(row.iloc[n - 3], row.iloc[n - 1]))
                avg_change_5_years_list.append(
                    self.math_utils.average_change(row.iloc[n - 5:]))
                percentage_change_5_years_list.append(
                    self.math_utils.percentage_change(row.iloc[n - 5], row.iloc[n - 1]))
            # Write to csv.
            columns = ['Keys',
                       'Average Change Over Last 3 Years',
                       'Average Change Over Last 3 Years (%)',
                       'Average Change Over Last 5 Years',
                       'Average Change Over Last 5 Years (%)']
            output_data_frame = pd.DataFrame(
                {'Keys': key_list,
                 'Average Change Over Last 3 Years': avg_change_3_years_list,
                 'Average Change Over Last 3 Years (%)': percentage_change_3_years_list,
                 'Average Change Over Last 5 Years': avg_change_5_years_list,
                 'Average Change Over Last 5 Years (%)': percentage_change_5_years_list},
                columns=columns)
            output_dir_path = join(self.root_path, "out",
                                   splitext(f)[0].replace(" ", "_") + "_Annual_Report.csv")
            self.file_utils.write_csv(output_dir_path, output_data_frame)
def _filter_nodes(self):
    """Tag nodes whose slope differs from their (untagged) parent's.

    A non-root node gets TAGGED when its SLOPE_VALUE does not compare
    equal to its parent's AND the parent is not itself tagged; every
    other non-root node gets TAGGED reset to False. Root nodes are left
    untouched.

    NOTE(review): the parent's TAGGED flag is read while this walk is
    mutating it, so the result depends on all_nodes() traversal order —
    preserved as-is.

    :return: identifiers of the nodes tagged by this pass
    """
    node_id_list = []
    # FIX: the original constructed a fresh MathUtils for every node
    # inside the loop; build it once.
    math_utils = MathUtils()
    for node in self.tree.all_nodes():
        if node.is_root():
            continue
        parent_node = self.tree.parent(node.identifier)
        slopes_differ = not math_utils.compare(node.data[con.SLOPE_VALUE],
                                               parent_node.data[con.SLOPE_VALUE])
        if slopes_differ and not parent_node.data[con.TAGGED]:
            node.data[con.TAGGED] = True
            node_id_list.append(node.identifier)
        else:
            node.data[con.TAGGED] = False
    return node_id_list
def _process_tree(self):
    """Populate per-node statistics (average change, slope, constant).

    If the root node has no data yet (indexing it raises TypeError), it
    is seeded with the element-wise sum of its first three children's
    VALUES and marked un-tagged.
    """
    root_node = self.tree.get_node(self.tree.root)
    try:
        # FIX: the original printed the root values as an existence
        # probe, leaking debug output; index without printing instead.
        root_node.data[con.VALUES]
    except TypeError:
        children = self.tree.children(root_node.identifier)
        root_node.data = {
            con.VALUES: [x + y + z for x, y, z in zip(
                children[0].data[con.VALUES],
                children[1].data[con.VALUES],
                children[2].data[con.VALUES])],
            con.TAGGED: False
        }
    # FIX: MathUtils was re-instantiated several times per node; build
    # it once, and hoist the loop-invariant root values.
    math_utils = MathUtils()
    root_values = root_node.data[con.VALUES]
    for node in self.tree.all_nodes():
        node.data[con.AVERAGE_CHANGE] = math_utils.average_change(
            node.data[con.VALUES])
        slope, const = math_utils.get_linear_function_properties(
            node.data[con.VALUES])
        node.data[con.AVERAGE_PERCENTAGE_CHANGE] = math_utils.average_change(
            math_utils.merge_list_percentage(node.data[con.VALUES], root_values))
        node.data[con.SLOPE_VALUE] = slope
        node.data[con.CONSTANT_VALUE] = const
def get_cdf(self, x):
    """
    Returns the cumulative probability distribution for the KDE.

    The CDF is estimated as the fraction of sample points that are
    lower than x (per MathUtils.is_lower).

    :param x: the point
    :return: the cumulative probability from 0 to x
    :raises ValueError: if x's dimensionality does not match the KDE's
    """
    if isinstance(x, float):
        x = np.array([x])
    if len(x) != self.get_dimensions():
        # FIX: the original message used x.length, which does not exist
        # on numpy arrays and raised AttributeError instead of the
        # intended ValueError; also match the sibling get_cdf's format.
        raise ValueError("Illegal dimensionality: %d != %d"
                         % (len(x), self.get_dimensions()))
    nb_lower_points = 0
    for data_idx in range(len(self._points)):
        if MathUtils.is_lower(self._points[data_idx, :], x):
            nb_lower_points += 1
    return nb_lower_points / len(self._points)
def get_cdf(self, x):
    """
    Returns the cumulative distribution for the distribution, obtained
    by summing the mass of every point with a value lower than x.

    :param x: the point
    :return: the cumulative density function up to the point
    :raises ValueError: if x's dimensionality does not match
    """
    if isinstance(x, float):
        x = np.array([x])
    dims = self.get_dimensions()
    if len(x) != dims:
        raise ValueError("Illegal dimensionality: %d != %d" % (len(x), dims))
    # Accumulate the probability mass of all points below x.
    return sum((prob for point, prob in self._points.items()
                if MathUtils.is_lower(np.array(point), x)), 0.)
def get_prob(self, value):
    """
    Returns the probability P(val).

    For continuous distributions, when the exact value is absent from
    the table, the probability of the closest stored value is returned
    (absolute difference for scalars, Euclidean distance for arrays).

    :param value: the value
    :return: the associated probability, if one exists.
    """
    if value in self._table:
        return self._table[value]

    def closest_key(distance_fn, skip_none=False):
        # Linear scan for the table key minimising distance_fn.
        best, best_dist = None, math.inf
        for candidate in self._table.keys():
            if skip_none and isinstance(candidate, NoneVal):
                continue
            dist = distance_fn(candidate)
            if dist < best_dist:
                best, best_dist = candidate, dist
        return best

    if isinstance(value, DoubleVal) and self._is_continuous():
        # Continuous scalar: search for the closest element.
        target = value.get_double()
        return self.get_prob(closest_key(lambda v: abs(v.get_double() - target)))
    if isinstance(value, ArrayVal) and self._is_continuous():
        # Continuous vector: search for the closest element, skipping NoneVal.
        target = value.get_array()
        return self.get_prob(closest_key(
            lambda v: MathUtils.get_distance(v.get_array(), target),
            skip_none=True))
    return 0.
def __init__(self):
    # Project source directory under the user's home.
    self.root_path = join(expanduser('~'), "PycharmProjects",
                          "python-vistalytics", "source")
    # Shared utility helpers.
    self.math_utils = MathUtils()
    self.file_utils = FileUtils()
class ReportOutlierDetection(object):
    """Flags outlier values in combined Excel report workbooks.

    For every workbook under ``<root>/in/combined``, each of the four
    sheets is scanned row by row: the latest value of a row counts as an
    outlier when it deviates from the row's mean by more than three
    standard deviations. Deviation, mean and 3/5-year change columns are
    appended and the sheet is written to ``<root>/out``.
    """

    def __init__(self):
        self.root_path = join(expanduser("~"), "PycharmProjects",
                              "python-vistalytics", "source")
        # Row keys skipped when processing report sheets.
        self.ignore_keys = ['Margins % of Sales', 'Profitability',
                            'Cash Flow Ratios', 'Balance Sheet Items (in %)',
                            'Liquidity/Financial Health', 'Efficiency']
        self.sheet_list = ['Income_Statement', 'Balance_Sheet',
                           'Cash_Flow_Statement', 'Key_Ratios']
        self.math_utils = MathUtils()
        self.file_utils = FileUtils()

    def _process_data(self, values, v):
        """Return (std_dev, mean) when v is a >3-sigma outlier of values, else (0, 0)."""
        std_dev = self.math_utils.get_std_dev(values)
        mean_val = self.math_utils.get_mean(values)
        if 0 < 3 * std_dev < abs(v - mean_val):
            return std_dev, mean_val
        return 0, 0

    def run(self):
        src_dir_path = join(self.root_path, "in", "combined")
        file_list = self.file_utils.get_files(src_dir_path)
        for f in file_list:
            data = self.file_utils.read_excel(src_dir_path, f, [0, 1, 2, 3])
            for j in range(0, 4):
                avg_change_3_years_list = []
                percentage_change_3_years_list = []
                avg_change_5_years_list = []
                percentage_change_5_years_list = []
                dev_list = []
                mean_list = []
                temp = data.get(j)
                if 'TTM' in temp:
                    # FIX: drop('TTM', 1) used the positional axis
                    # argument, removed in pandas 2.x; be explicit.
                    temp = temp.drop(columns='TTM')
                temp.fillna(0, inplace=True)
                # NOTE(review): '(%)' is interpreted as a regex matching a
                # bare '%', not the literal '(%)' — preserved; verify intent.
                temp.replace('(%)', '', inplace=True, regex=True)
                for i, v in temp.iterrows():
                    size_val = len(v)
                    # FIX: `size_val is not 0` compared an int with `is`,
                    # which only works via CPython small-int interning;
                    # use an ordinary inequality.
                    if not pd.isnull(i) and i not in self.ignore_keys and size_val != 0:
                        v = [float(x) for x in v]
                        # Get mean and deviation: test the latest value
                        # against the stats of all but the last two.
                        x, y = self._process_data(v[0:size_val - 2], v[size_val - 1])
                        dev_list.append(x)
                        mean_list.append(y)
                        # Get average and percentage for 3 and 5 years.
                        # FIX: `x is 0` → `x == 0`; _process_data returns
                        # int 0 or a positive float, so equality is
                        # behaviour-preserving.
                        val1 = self.math_utils.percentage_change(
                            0 if x == 0 else v[size_val - 3], v[size_val - 1])
                        val2 = self.math_utils.percentage_change(
                            0 if x == 0 else v[size_val - 5], v[size_val - 1])
                        avg_change_3_years_list.append(
                            0 if abs(val1) < 5
                            else self.math_utils.average_change(v[len(v) - 3:]))
                        percentage_change_3_years_list.append(val1)
                        avg_change_5_years_list.append(
                            0 if abs(val2) < 5
                            else self.math_utils.average_change(v[size_val - 5:]))
                        percentage_change_5_years_list.append(val2)
                        # NOTE(review): this prints for every processed
                        # data row, not only confirmed outliers — TODO
                        # confirm whether it should be conditional.
                        print("################################")
                        print("For Index: " + i + " Last change " +
                              str(v[size_val - 1]) + " is outlier.")
                    else:
                        avg_change_3_years_list.append(0)
                        percentage_change_3_years_list.append(0)
                        avg_change_5_years_list.append(0)
                        percentage_change_5_years_list.append(0)
                        dev_list.append(0)
                        mean_list.append(0)
                temp.insert(len(temp.columns), 'Deviation', dev_list)
                temp.insert(len(temp.columns), 'Mean', mean_list)
                temp.insert(len(temp.columns), 'Average Change Over Last 3 Years',
                            avg_change_3_years_list)
                temp.insert(len(temp.columns), 'Average Change Over Last 3 Years (%)',
                            percentage_change_3_years_list)
                temp.insert(len(temp.columns), 'Average Change Over Last 5 Years',
                            avg_change_5_years_list)
                temp.insert(len(temp.columns), 'Average Change Over Last 5 Years (%)',
                            percentage_change_5_years_list)
                out_src_dir = join(self.root_path, 'out',
                                   splitext(f)[0].replace(" ", "_") + "_Combined_Report.xlsx")
                self.file_utils.write_excel(out_src_dir, temp, self.sheet_list[j])
class ReportChangeComparision:
    """Compares change trends of report line items against their totals.

    Builds the report tree chosen by the user, loads the matching CSV
    data into the tree nodes, computes per-node change statistics and
    prints a textual comparison of each item against the root index.
    """

    def __init__(self):
        self.tree_str = TreeStructure()
        self.tree = self.tree_str.tree
        self.report_choice = con.INCOME_STATEMENTS
        self.file_utils = FileUtils()
        self.math_utils = MathUtils()
        self.root_path = join(expanduser('~'), 'PycharmProjects',
                              'python-vistalytics', 'source')

    def _read_csv(self, resource_dir):
        """Read every CSV under <root>/in/csv/<resource_dir> into a list."""
        src_dir_path = join(self.root_path, 'in', 'csv', resource_dir)
        return [self.file_utils.read_csv(src_dir_path, f)
                for f in self.file_utils.get_files(src_dir_path)]

    def _process_data(self, data_list):
        """Attach each data row's values to the matching tree node."""
        for data in data_list:
            if con.TTM in data.columns:
                # FIX: drop(con.TTM, 1) used the positional axis
                # argument, removed in pandas 2.x; be explicit.
                data.drop(columns=con.TTM, inplace=True)
            for i, v in data.iterrows():
                # FIX: call the `in` operator, not __contains__ directly.
                if self.tree_str.get_string(i) in self.tree:
                    node = self.tree.get_node(self.tree_str.get_string(i))
                    node.data = {con.VALUES: v.tolist()}

    def _process_tree(self):
        """Populate per-node statistics (average change, slope, constant).

        If the root has no data yet (indexing raises TypeError), seed it
        with the element-wise sum of its first three children's values.
        """
        root_node = self.tree.get_node(self.tree.root)
        try:
            root_node.data[con.VALUES]
        except TypeError:
            children = self.tree.children(root_node.identifier)
            root_node.data = {
                con.VALUES: [x + y + z for x, y, z in zip(
                    children[0].data[con.VALUES],
                    children[1].data[con.VALUES],
                    children[2].data[con.VALUES])]
            }
        for node in self.tree.all_nodes():
            node.data[con.AVERAGE_CHANGE] = self.math_utils.average_change(
                node.data[con.VALUES])
            slope, const = self.math_utils.get_linear_function_properties(
                node.data[con.VALUES])
            node.data[con.AVERAGE_PERCENTAGE_CHANGE] = self.math_utils.average_change(
                self.math_utils.merge_list_percentage(
                    node.data[con.VALUES], root_node.data[con.VALUES]))
            node.data[con.SLOPE_VALUE] = slope
            node.data[con.CONSTANT_VALUE] = const

    def _print_text(self):
        """Print the textual comparison of every node against the root."""
        root_node = self.tree.get_node(self.tree.root)
        print('##################################################')
        print('For time period 2012-2016 analysed: \n \n')
        # NOTE(review): root_str is assigned in the root branch and read
        # in the non-root branch; this relies on all_nodes() yielding the
        # root first — preserved as-is, confirm traversal order.
        for node in self.tree.all_nodes():
            indicator_str1 = con.RAISED if node.data[
                con.AVERAGE_CHANGE] > 0 else con.DROPPED
            if node.is_root():
                print('Index ' + node.identifier.upper() + ' has ' +
                      indicator_str1 + ' by ' +
                      str(node.data[con.AVERAGE_CHANGE]) + '\n \n')
                root_str = {
                    con.KEY: node.identifier.upper(),
                    con.AVERAGE_CHANGE: node.data[con.AVERAGE_CHANGE],
                    con.INDICATOR: indicator_str1
                }
            else:
                indicator_str2 = con.INCREASED if node.data[
                    con.AVERAGE_PERCENTAGE_CHANGE] > 0 else con.REDUCED
                print('Index ' + node.identifier.upper() + ' has ' +
                      indicator_str1 + ' by ' + str(node.data[con.AVERAGE_CHANGE]))
                print('While total ' + str(root_str.get(con.KEY)) + ' has ' +
                      str(root_str.get(con.INDICATOR)) + ' ' +
                      node.identifier.upper() + ' has ' + indicator_str2 +
                      ' by ' + str(node.data[con.AVERAGE_PERCENTAGE_CHANGE]) + '%')
                # Trend comparison with the root node.
                indicator_str3 = 'in ' + con.OPPOSITE_DIRECTION \
                    if node.data[con.SLOPE_VALUE] < 0 \
                    else con.RAPIDLY \
                    if node.data[con.SLOPE_VALUE] > root_node.data[con.SLOPE_VALUE] \
                    else con.SLOWLY
                print('Index ' + node.identifier.upper() + ' is growing ' +
                      indicator_str3 + ' with rate ' +
                      str(node.data[con.SLOPE_VALUE]) +
                      ' as compared to base index ' +
                      root_node.identifier.upper() +
                      ' where growth rate is ' +
                      str(root_node.data[con.SLOPE_VALUE]))
                print('\n \n')

    def run(self):
        print(
            "Choices :\n\n\tDefault: Income Statement\n\t1: Balance Statement\n\t2: Cash Flow Statement"
        )
        c = input()
        try:
            self.report_choice = self.tree_str.report_choices[int(c)]
            # FIX: `is` compared the choice constants by identity, which
            # is unreliable for strings; compare by equality.
            if self.report_choice == con.BALANCE_STATEMENTS:
                self.tree_str.get_balance_tree()
            else:
                self.tree_str.get_cash_flow_tree()
        except (KeyError, ValueError):
            # FIX: the two byte-identical handlers are merged into one.
            self.report_choice = con.INCOME_STATEMENTS
            self.tree_str.get_income_tree()
        self.tree = self.tree_str.tree
        print(self.report_choice)
        print(self.tree)
        data_list = self._read_csv(self.report_choice)
        self._process_data(data_list)
        self._process_tree()
        self._print_text()