def parse(self, pdf_path, data_date=None):
    """Parse tabular data from the PDF file at *pdf_path*.

    :param pdf_path: path to the PDF document, opened with fitz.
    :param data_date: optional explicit date forwarded to find_data_date.
    :return: a ParsingResult carrying unification info and/or warnings,
        or None when nothing noteworthy was collected.
    :raises ParseError: when no page yielded any data.
    :raises UniqueError: when conflicting rows were found and there is no
        unification info to report them through.
    """
    self.pdf_path = pdf_path
    self.doc = fitz.Document(pdf_path)
    self.date = self.find_data_date(data_date)

    # Process every page; keep only pages that produced a dataframe.
    extracted = []
    for page_index, page in enumerate(self.doc.pages()):
        frame = self.process_page(page, page_index)
        if frame is not None:
            extracted.append(frame)

    if not extracted:
        raise ParseError(self.pdf_path,
                         'No data found. Probably problems with encoding.')

    has_conflicts = bool(self.overlapping_info) and bool(
        self.overlapping_info['values'])

    if self.unification_info:
        if has_conflicts:
            return ParsingResult(unification_info=self.unification_info,
                                 overlapping_info=self.overlapping_info)
        return ParsingResult(unification_info=self.unification_info)

    if has_conflicts:
        raise UniqueError(self.overlapping_info)
    if self.warnings:
        return ParsingResult(warnings=self.warnings)
    return None
def parse(self, pdf_path, data_date=None):
    """Parse tabular data from the PDF yearbook at *pdf_path*.

    Sets up skip/stop section markers for the detected data year, then
    processes every page of the document.

    :param pdf_path: path to the PDF document, opened with fitz.
    :param data_date: optional explicit data date; when omitted the date
        is derived from the document itself.
    :return: a ParsingResult carrying unification info and/or warnings,
        or None when nothing noteworthy was collected.
    :raises ParseError: when no page yielded any data.
    :raises UniqueError: when conflicting rows were found and there is no
        unification info to report them through.
    """
    self.pdf_path = pdf_path
    self.doc = fitz.Document(pdf_path)

    if data_date:
        # Caller supplied the date explicitly.
        self.date = data_date
        self.data_year = data_date.year
        self.yearbook_year = data_date.year + 1
    else:
        # Derive data_year / yearbook_year from the document, then pin
        # the date to the last day of that year.
        self.find_data_date()
        self.date = date(int(self.data_year), month=12, day=31)

    # Section headers that must be ignored while scanning pages.
    self.skip = [
        f'Spółki według wartości rynkowej (na koniec {self.data_year} r.)',
        f'Spółki według wartości rynkowej (na koniec {self.data_year} r.) (cd.)',
        f'Spółki o największej wartości rynkowej na koniec {self.data_year} r.'
    ]
    # Markers that terminate parsing of a table.
    self.stop_parsing = [
        'Razem spółki zagraniczne',
        'Spółki zagraniczne razem:',
        f'{self.yearbook_year} Rocznik Giełdowy'
    ]

    # Process every page; keep only pages that produced a dataframe.
    collected = []
    for page_no, page in enumerate(self.doc.pages()):
        frame = self.process_page(page, page_no)
        if frame is not None:
            collected.append(frame)

    if not collected:
        raise ParseError(self.pdf_path,
                         'No data found. Probably problems with encoding.')

    has_conflicts = bool(self.overlapping_info) and bool(
        self.overlapping_info['values'])

    if self.unification_info:
        if has_conflicts:
            return ParsingResult(unification_info=self.unification_info,
                                 overlapping_info=self.overlapping_info)
        return ParsingResult(unification_info=self.unification_info)

    if has_conflicts:
        raise UniqueError(self.overlapping_info)
    if self.warnings:
        return ParsingResult(warnings=self.warnings)
    return None
def parse(self, path, data_date=None):
    """Parse the data sheets of the Excel workbook at *path*.

    :param path: path to the workbook, opened lazily with xlrd.
    :param data_date: optional explicit date forwarded to
        get_date_and_sheet_names.
    :return: a ParsingResult when unification info was collected,
        otherwise None.
    :raises ParseError: when no sheet yielded any data.
    :raises UniqueError: when conflicting rows were found and there is no
        unification info to report them through.
    """
    self.path = path
    self.workbook = xlrd.open_workbook(self.path, on_demand=True)
    self.date, sheet_names = self.get_date_and_sheet_names(data_date)

    # Parse each sheet, discarding empty results.
    parsed_sheets = []
    for name in sheet_names:
        sheet_data = self.parse_sheet(name)
        if sheet_data:
            parsed_sheets.append(sheet_data)

    if not parsed_sheets:
        raise ParseError(self.path, 'No data found.')

    has_conflicts = bool(self.overlapping_info) and bool(
        self.overlapping_info['values'])

    if self.unification_info:
        if has_conflicts:
            return ParsingResult(unification_info=self.unification_info,
                                 overlapping_info=self.overlapping_info)
        return ParsingResult(unification_info=self.unification_info)

    if has_conflicts:
        raise UniqueError(self.overlapping_info)
    return None
def download_all_companies(self, user_date):
    """Download daily quotes for all companies on *user_date* from Stooq.

    Scrapes two paginated table sets (prices and change columns), joins
    them on the ticker symbol, and inserts the rows into the database.

    :param user_date: date object selecting the trading day.
    :return: a ParsingResult when some companies need manual unification,
        otherwise falls through (implicitly returns None).
    :raises ParseError: when either table set yields no rows, or the
        Volume/Turnover columns contain malformed values.
    :raises UniqueError: when conflicting quotes already exist in the DB
        and there is no unification info to attach them to.
    """
    day, month, year = user_date.day, user_date.month, user_date.year
    interval_id = get_interval_id_from_shortcut('d')
    overlapping_stock = {}
    i = 1
    frames = []
    frames_change = []
    found = False
    # First pass: paginated price tables. Stop when a page contributes
    # no new frame or cannot be parsed as HTML tables.
    while True:
        url = self._all_companies_date_ulr_base.format(number=i, day=day,
                                                       month=month,
                                                       year=year)
        site_html = requests.get(url).content.decode("utf-8")
        try:
            df_list = pd.read_html(site_html)
        except ValueError:
            break
        except lxml.etree.ParserError:
            break
        if len(df_list) == 0:
            break
        for df in df_list:
            # Keep only real quote tables; the filter regex rejects
            # navigation/summary pseudo-tables by their Symbol values.
            if 'Symbol' in df.columns and 'Name' in df.columns and 'Last' in df.columns:
                if not df.empty and not df.Symbol.apply(lambda x: bool(
                        self._tables_filter.match(str(x)))).any():
                    frames.append(df)
                    found = True
        if not found:
            break
        i += 1
        found = False
    i = 1
    found = False
    # Second pass: same pagination for the "change" tables.
    # NOTE(review): unlike the first loop this response is not decoded
    # as UTF-8 before read_html — confirm whether that is intentional.
    while True:
        url_change = self._all_companies_date_ulr_change.format(
            number=i, day=day, month=month, year=year)
        site_html_change = requests.get(url_change).content
        try:
            df_list_change = pd.read_html(site_html_change)
        except ValueError:
            break
        except lxml.etree.ParserError:
            break
        if len(df_list_change) == 0:
            break
        for df in df_list_change:
            if 'Symbol' in df.columns and 'Name' in df.columns and 'Change' in df.columns:
                if not df.empty and not df.Symbol.apply(lambda x: bool(
                        self._tables_filter.match(str(x)))).any():
                    frames_change.append(df)
                    found = True
        if not found:
            break
        i += 1
        found = False
    # frames/frames_change are always lists; the None checks are
    # defensive only.
    if frames is None or len(frames) == 0:
        raise ParseError(url, "No stock quotes found for given date")
    if frames_change is None or len(frames_change) == 0:
        raise ParseError(url_change, "No stock quotes found for given date")
    result = pd.concat(frames)
    result_change = pd.concat(frames_change)
    # Join the percentage-change column ('Change.1') onto the price rows
    # by ticker, then normalize NaN to None for the DB layer.
    result_change = result_change[['Symbol', 'Change.1']]
    result = result.join(result_change.set_index('Symbol'), on='Symbol')
    result = result.where(result.notnull(), None)
    try:
        # Expand k/M/B suffixes into plain numbers.
        result['Volume'] = result['Volume'].apply(
            lambda x: _convert_kmb(x))
        result['Turnover'] = result['Turnover'].apply(
            lambda x: _convert_kmb(x))
    except ValueError:
        raise ParseError(url, 'Wrong data in Volume/Turnover column')
    unification_info = []
    for index, row in result.iterrows():
        parsed_data = date(year, month, day)
        ticker = row['Symbol'].upper()
        company = Company(name=row['Name'], ticker=ticker)
        company_id, possible_companies = get_company(company)
        if company_id is None and not possible_companies:
            # Unknown and unambiguous: register the company now.
            company_id = insert_company(company)
        if row['Last'] is None:
            # No closing price — skip the row entirely.
            continue
        stock_quotes = [
            company_id, str(parsed_data), row['Last'], row['Change.1'],
            row['Open'], row['High'], row['Low'], row['Volume'],
            row['Turnover'], interval_id
        ]
        if possible_companies:
            # Ambiguous match: defer the insert to manual unification.
            unification_info.append(
                StooqUnificationInfo(company=company,
                                     possible_matches=possible_companies,
                                     data=[stock_quotes]))
        else:
            stock_quotes_to_insert = (company_id, parsed_data,
                                      float_or_none(row['Last']),
                                      float_or_none(row['Change.1']),
                                      float_or_none(row['Open']),
                                      float_or_none(row['High']),
                                      float_or_none(row['Low']),
                                      int_or_none(row['Volume']),
                                      int_or_none(row['Turnover']),
                                      interval_id)
            try:
                insert_stock_quotes(stock_quotes_to_insert)
            except IntegrityError:
                # Key collision: record it as overlapping data unless it
                # is byte-identical to what is already stored.
                if not exactly_same_stock_quote(stock_quotes_to_insert):
                    if not overlapping_stock:
                        self._init_overlapping_info(overlapping_stock)
                    overlapping_stock["values"].append(stock_quotes)
    if unification_info:
        if overlapping_stock:
            result = ParsingResult(unification_info=unification_info,
                                   overlapping_info=overlapping_stock)
        else:
            result = ParsingResult(unification_info=unification_info)
        return result
    if overlapping_stock:
        raise UniqueError(overlapping_stock)
def download_company(self, company, start_date, end_date, interval='d'):
    """Download historical quotes for one company from Stooq.

    The source tables carry no turnover data, so turnover is always
    stored as None.

    :param company: ticker symbol (case-insensitive).
    :param start_date: first date of the requested range (date-like).
    :param end_date: last date of the requested range (date-like).
    :param interval: interval shortcut, e.g. 'd' for daily.
    :return: ParsingResult with unification info when the company match
        is ambiguous, otherwise None.
    :raises ParseError: when the company header cannot be read, no
        quotes are found, the Volume column is malformed, or a date
        cannot be parsed.
    :raises UniqueError: when conflicting quotes already exist in the DB.
    """
    start_day, start_month, start_year = start_date.day, start_date.month, start_date.year
    end_day, end_month, end_year = end_date.day, end_date.month, end_date.year
    i = 1
    frames = []
    found = False
    interval_id = get_interval_id_from_shortcut(interval)
    ticker = company.upper()
    overlapping_stock = {}
    # First request only to read the company's full name off the page.
    url = self._company_url_base.format(number=1, company=ticker,
                                        day1=start_day, month1=start_month,
                                        year1=start_year, day2=end_day,
                                        month2=end_month, year2=end_year,
                                        interval=interval)
    site_html = requests.get(url).content.decode("utf-8")
    # Raw string so '\(' is a regex-escaped literal parenthesis (the
    # original non-raw string triggered an invalid-escape warning).
    name_match = re.search(r'Historical data: (.*) \(', str(site_html))
    if name_match is None:
        # Fail with a ParseError instead of an AttributeError when the
        # page lacks the expected "Historical data: NAME (" header.
        raise ParseError(url, 'Company name not found on the page')
    company_name = name_match.group(1)
    company = Company(name=company_name, ticker=ticker)
    company_id, possible_companies = get_company(company)
    if company_id is None and not possible_companies:
        # Unknown and unambiguous: register the company now.
        company_id = insert_company(company)
    unification_info = StooqUnificationInfo(
        company=company, possible_matches=possible_companies, data=[])
    # Page through the paginated quote tables until a page adds nothing.
    while True:
        url = self._company_url_base.format(number=i, company=ticker,
                                            day1=start_day,
                                            month1=start_month,
                                            year1=start_year,
                                            day2=end_day,
                                            month2=end_month,
                                            year2=end_year,
                                            interval=interval)
        site_html = requests.get(url).content.decode("utf-8")
        try:
            df_list = pd.read_html(site_html)
        except ValueError:
            break
        except lxml.etree.ParserError:
            break
        if len(df_list) == 0:
            break
        for df in df_list:
            # Keep only well-formed quote tables with no missing dates.
            if 'Date' in df.columns and 'Close' in df.columns:
                if not df.empty and not df.Date.isnull().any():
                    frames.append(df)
                    found = True
        if not found:
            break
        i += 1
        found = False
    if not frames:
        raise ParseError(url, "No stock quotes found for given date")
    result = pd.concat(frames)
    # Stooq lists newest first; reverse to chronological order, then
    # normalize NaN to None for the DB layer.
    result = result[::-1]
    result = result.where(result.notnull(), None)
    try:
        # Expand k/M/B suffixes into plain numbers.
        result['Volume'] = result['Volume'].apply(_convert_kmb)
    except ValueError:
        raise ParseError(url, 'Wrong data in Volume column')
    for index, row in result.iterrows():
        # Skip filler rows (no ordinal) and rows without a close price.
        if pd.isnull(row['No.']):
            continue
        if row['Close'] is None:
            continue
        try:
            parsed_date = _parse_date(row['Date'])
        except (ValueError, TypeError):
            raise ParseError(url, 'Wrong date format')
        stock_quotes = [
            company_id, str(parsed_date), row['Close'], row['Change.1'],
            row['Open'], row['High'], row['Low'], row['Volume'], None,
            interval_id
        ]
        if possible_companies:
            # Ambiguous match: defer the insert to manual unification.
            unification_info.add_data(stock_quotes)
        else:
            stock_quotes_to_insert = (company_id, parsed_date,
                                      float_or_none(row['Close']),
                                      float_or_none(row['Change.1']),
                                      float_or_none(row['Open']),
                                      float_or_none(row['High']),
                                      float_or_none(row['Low']),
                                      int_or_none(row['Volume']), None,
                                      interval_id)
            try:
                insert_stock_quotes(stock_quotes_to_insert)
            except IntegrityError:
                # Key collision: record it as overlapping data unless it
                # is identical to what is already stored.
                if not exactly_same_stock_quote(stock_quotes_to_insert):
                    if not overlapping_stock:
                        self._init_overlapping_info(overlapping_stock)
                    overlapping_stock["values"].append(stock_quotes)
    if unification_info.data:
        return ParsingResult(unification_info=[unification_info])
    if overlapping_stock:
        raise UniqueError(overlapping_stock)
    return None
def parse(self, path, end_date=None):
    """Parse the 'kap' capitalization sheet of the workbook at *path*.

    Supports two layouts: (1) ISIN in column B and company name in
    column C with capitalization in column E, or (2) name directly in
    column B with capitalization in column D.

    :param path: path to the xls/xlsx workbook.
    :param end_date: optional date; resolved through self.get_date.
    :return: a ParsingResult when unification info was collected,
        otherwise None.
    :raises ParseError: when neither expected header layout is found.
    :raises UniqueError: when overlapping values were detected and there
        is no unification info to report them through.
    """
    self.path = path
    self.workbook = xlrd.open_workbook(path, on_demand=True)
    excel_sheet = self.workbook.sheet_by_name('kap')

    start_row = 8
    headers_check_row = 4
    isin_column = 1
    name_column = 2
    capitalization_column = 4
    million = 1e6
    unification_info = []
    overlapping_info = {}
    end_date = self.get_date(end_date)

    header_b = excel_sheet.cell(headers_check_row, isin_column).value.lower()

    if "isin" in header_b and "nazwa" in excel_sheet.cell(
            headers_check_row, name_column).value.lower():
        # Layout 1: ISIN + name + capitalization (in millions).
        for row in range(start_row, excel_sheet.nrows):
            isin = excel_sheet.cell(row, isin_column).value
            name = excel_sheet.cell(row, name_column).value
            value = excel_sheet.cell(row, capitalization_column).value * million
            save_value_to_database(name, isin, value, end_date,
                                   overlapping_info, unification_info,
                                   self.save, self.override)
    elif "nazwa" in header_b:
        # Layout 2: the name sits where the ISIN normally is, and the
        # capitalization column shifts one to the left; no ISIN at all.
        for row in range(start_row, excel_sheet.nrows):
            name = excel_sheet.cell(row, 1).value
            value = excel_sheet.cell(row, 3).value * million
            save_value_to_database(name, None, value, end_date,
                                   overlapping_info, unification_info,
                                   self.save, self.override)
    else:
        raise ParseError(
            path,
            '1: "ISIN" should be in B5 cell and "Nazwa" should be in C5 cell or 2: "Nazwa" '
            'should be in B5 cell')

    if unification_info:
        if overlapping_info and overlapping_info['values']:
            return ParsingResult(unification_info=unification_info,
                                 overlapping_info=overlapping_info)
        return ParsingResult(unification_info=unification_info)
    if overlapping_info and overlapping_info['values']:
        raise UniqueError(overlapping_info)
    return None
def import_notoria(request):
    """Django view: import Notoria Excel files (single file or directory).

    Handles the POST of NotoriaImportForm, parses the chosen balance
    sheet / financial ratios / DuPont sheets, and renders either the
    overlap-resolution popup, the unification form, or a plain success
    page.
    """

    def render_overlapping_data_popup(chosen_sheet, sheet_shortcut,
                                      get_existing_data_func, request):
        # Runs the parser for each chosen sheet. On the first UniqueError
        # it enriches each overlapping record with the rows that already
        # exist in the DB and returns (error, None); remaining sheets are
        # NOT parsed in that case.
        # NOTE(review): relies on `file_path`, `override`, `save` from the
        # enclosing scope; `res` is unbound if chosen_sheet is empty —
        # callers guard with a truthiness check first.
        for sheet in chosen_sheet:
            try:
                res = excel_parser.functions[sheet_shortcut](file_path,
                                                             sheet,
                                                             override=override,
                                                             save=save)
            except UniqueError as e:
                for data in e.overlapping_data:
                    existing = get_existing_data_func(data)
                    data["exists"] = list(map(lambda x: list(x), existing))
                return e, None
        # No conflicts: return the last sheet's parse result.
        return [], res

    try:
        if request.method == 'POST':
            form = NotoriaImportForm(request.POST)
            if form.is_valid():
                file_path = request.POST.get('file_path', None)
                chosen_sheets_bs = form.cleaned_data.get('chosen_sheets_bs')
                chosen_sheets_fr = form.cleaned_data.get('chosen_sheets_fr')
                chosen_sheets_dp = form.cleaned_data.get('chosen_sheets_dp')
                directory_import = form.cleaned_data.get('directory_import')
                override = False
                save = False
                files_paths = []
                if directory_import:
                    if os.path.isdir(file_path):
                        override_save = form.cleaned_data.get('override_save')
                        # Collect Excel files; the trailing `break` means
                        # only the top-level directory is scanned (no
                        # recursion into subdirectories).
                        for root, dirs, files in os.walk(file_path):
                            for file in files:
                                if is_excel_file(file):
                                    absolute_path = os.path.join(root, file)
                                    files_paths.append(absolute_path)
                                else:
                                    # Any non-Excel file aborts the import.
                                    messages.error(
                                        request,
                                        "Directory must have only excel files from notoria."
                                    )
                                    return render(
                                        request, 'import/notoria.html',
                                        {'form': NotoriaImportForm()})
                            break
                        # 'o' = override existing rows, 's' = save new only.
                        if override_save == 'o':
                            override = True
                            save = False
                        elif override_save == 's':
                            save = True
                            override = False
                    else:
                        messages.error(
                            request,
                            "Pass proper path to directory with Notoria excel files, e.g '/home/notoria"
                        )
                        return render(request, 'import/notoria.html',
                                      {'form': NotoriaImportForm()})
                else:
                    # Single-file import: accept only .xls / .xlsx.
                    extension = os.path.splitext(file_path)[1]
                    if extension == '.xls' or extension == '.xlsx':
                        files_paths = [file_path]
                    else:
                        messages.error(
                            request,
                            "Pass proper path to Notoria excel files, e.g '/home/AGORA.xlsx'"
                        )
                        return render(request, 'import/notoria.html',
                                      {'form': NotoriaImportForm()})
                error_bs = []
                error_fr = []
                error_dp = []
                overlap_bs = []
                overlap_fr = []
                overlap_dp = []
                result_bs = None
                result_fr = None
                result_dp = None
                try:
                    # Parse each file for every selected sheet group.
                    # NOTE(review): errors/results are overwritten per
                    # file, so only the last file's outcome survives.
                    for fp in files_paths:
                        file_path = fp.__str__()
                        if chosen_sheets_bs:
                            error_bs, result_bs = render_overlapping_data_popup(
                                chosen_sheets_bs, 'bs',
                                get_existing_data_balance_sheet, request)
                            if error_bs:
                                overlap_bs = error_bs.overlapping_data
                        if chosen_sheets_fr:
                            error_fr, result_fr = render_overlapping_data_popup(
                                chosen_sheets_fr, 'fr',
                                get_existing_financial_ratios_for_parsed_file,
                                request)
                            if error_fr:
                                overlap_fr = error_fr.overlapping_data
                        if chosen_sheets_dp:
                            error_dp, result_dp = render_overlapping_data_popup(
                                chosen_sheets_dp, 'dp',
                                get_existing_dupont_indicators_for_parsed_file,
                                request)
                            if error_dp:
                                overlap_dp = error_dp.overlapping_data
                except ParseError as e:
                    messages.error(request, e)
                    return render(request, 'import/notoria.html',
                                  {'form': NotoriaImportForm()})
                except Exception as e:
                    print(e)
                    messages.error(
                        request, "Error occurred while parsing. " +
                        type(e).__name__ + ": " + str(e))
                    return render(request, 'import/notoria.html',
                                  {'form': NotoriaImportForm()})
                if error_bs or error_fr or error_dp:
                    # At least one sheet group has conflicts: render the
                    # overlap-resolution popup data.
                    messages.success(request, "Parsed notoria successfully.")
                    return render(
                        request, 'import/notoria.html', {
                            'form': form,
                            "error_bs": error_bs,
                            "error_fr": error_fr,
                            "error_dp": error_dp,
                            "overlap_bs": json.dumps(overlap_bs),
                            "overlap_fr": json.dumps(overlap_fr),
                            "overlap_dp": json.dumps(overlap_dp)
                        })
                result = ParsingResult.combine_notoria_results(
                    result_bs, result_dp, result_fr)
                if result is not None:
                    # Some companies need manual unification.
                    messages.success(request, "Parsed notoria successfully")
                    return render(
                        request, 'import/notoria.html', {
                            'form': NotoriaImportForm(),
                            'unification_form': UnificationForm(
                                unification=result.unification_info),
                            'unification': result.to_json(),
                            'overlapping_data': json.dumps({})
                        })
                messages.success(request, "Parsed notoria successfully")
                return render(request, 'import/notoria.html',
                              {'form': NotoriaImportForm()})
            else:
                # Invalid form: surface every field error as a message.
                for field in form:
                    for err in field.errors:
                        messages.error(request, field.label + ": " + err)
                return render(request, 'import/notoria.html',
                              {'form': NotoriaImportForm()})
        # GET (or any non-POST): render an empty form.
        return render(request, 'import/notoria.html',
                      {'form': NotoriaImportForm()})
    except:
        # NOTE(review): bare except swallows everything (including
        # KeyboardInterrupt/SystemExit) without logging — should be
        # narrowed to `except Exception` and logged.
        return render(request, 'error.html')
def parse_ratios(self, path, sheet_name, ratio_name, table_name,
                 override=False, save=False):
    """Parse a ratios section from a Notoria sheet into *table_name*.

    Scans the sheet for the row whose column C equals *ratio_name*
    (search starts at row 200, or 225 for DuPont), then walks each
    period column, collecting one DB row per period.

    :param path: path to the workbook file.
    :param sheet_name: must be one of self.available_sheets (QS/YS).
    :param ratio_name: section header to locate, e.g. 'DuPont indicators'.
    :param table_name: target DB table ('FinancialRatios' or
        'DuPontIndicators'); also selects the duplicate-check function.
    :param override: replace existing rows instead of inserting.
    :param save: insert rows, ignoring duplicates.
    :return: ParsingResult when unification data was collected, else None.
    :raises ParseError: for an unknown sheet name.
    :raises UniqueError: on conflicting rows during a non-directory import.
    """
    # Maps the target table to its "is this row already stored
    # identically?" predicate, used on IntegrityError.
    function_mapping = {
        'FinancialRatios': exactly_same_financial_ratios,
        'DuPontIndicators': exactly_same_dupont_indicators
    }
    if sheet_name not in self.available_sheets:
        raise ParseError(path, "Available sheet names: QS, YS")
    excel_sheet = get_sheet(path, sheet_name)
    # Directory imports never raise UniqueError; conflicts are either
    # overridden or ignored depending on the flags.
    is_directory_import = override or save
    company_id, unification_info = self.get_company_id_balance_sheet(
        path, is_directory_import)
    # Section headers sit at known approximate rows in Notoria exports.
    curr_row = 200
    if ratio_name == 'DuPont indicators':
        curr_row = 225
    curr_column = 2
    ratios = [company_id]
    overlapping_ratios = {}
    while curr_row < excel_sheet.nrows:
        if excel_sheet.cell(curr_row, curr_column).value == ratio_name:
            # Found the section: attribute labels stay in this column,
            # period dates are on the next row, values start two below.
            attributes_column = curr_column
            curr_column += 1
            dates_row = curr_row + 1
            curr_row += 2
            attributes = ['CompanyID', 'Period start', 'Period end']
            while curr_column < excel_sheet.ncols:
                date_value = excel_sheet.cell(dates_row, curr_column).value
                if not date_value:
                    # Empty period column — skip it.
                    curr_column += 1
                    continue
                period_start, period_end = get_start_end_date(date_value)
                ratios += [period_start, period_end]
                # Collect every labelled value until the blank row that
                # terminates the section.
                while excel_sheet.cell(curr_row,
                                       attributes_column).value != '':
                    attribute = excel_sheet.cell(curr_row,
                                                 attributes_column).value
                    curr_value = excel_sheet.cell(curr_row,
                                                  curr_column).value
                    attributes.append(attribute)
                    insert_float_value(ratios, curr_value)
                    curr_row += 1
                if unification_info is not None:
                    # Ambiguous company: stage the row for manual
                    # unification instead of writing to the DB.
                    unification_info.add_data(table_name=table_name,
                                              columns=attributes,
                                              data=ratios)
                else:
                    if override:
                        common.DAL.db_queries_insert.replace_values(
                            table_name=table_name,
                            columns=attributes,
                            values=ratios)
                    elif save:
                        common.DAL.db_queries_insert.insert_values(
                            table_name=table_name,
                            columns=attributes,
                            values=ratios)
                    else:
                        try:
                            common.DAL.db_queries_insert.insert_values_without_ignore(
                                table_name=table_name,
                                columns=attributes,
                                values=ratios)
                        except IntegrityError:
                            # Record a conflict unless the stored row is
                            # identical.
                            if not function_mapping[table_name](attributes,
                                                                ratios):
                                if not overlapping_ratios:
                                    init_overlapping_info(
                                        overlapping_ratios, table_name,
                                        attributes)
                                overlapping_ratios["values"].append(ratios)
                # Reset accumulators and rewind the row cursor for the
                # next period column.
                attributes = ['CompanyID', 'Period start', 'Period end']
                ratios = [company_id]
                curr_column += 1
                curr_row = dates_row + 1
            break
        curr_row += 1
    if unification_info is not None and unification_info.data:
        return ParsingResult([unification_info])
    if overlapping_ratios and not is_directory_import:
        raise UniqueError(overlapping_ratios)
    return None
def parse_balance_sheet(self, path, sheet_name, override=False, save=False):
    """Parse the 'Balance sheet' section of a Notoria sheet into the DB.

    Locates the section header, then walks each period column, splitting
    every labelled row into four accumulators (detailed assets, asset
    categories, detailed equity/liabilities, equity/liability
    categories) and writing each group to its own table.

    :param path: path to the workbook file.
    :param sheet_name: must be one of self.available_sheets (QS/YS).
    :param override: replace existing rows instead of inserting.
    :param save: insert rows, ignoring duplicates.
    :return: ParsingResult when unification data was collected, else None.
    :raises ParseError: for an unknown sheet name.
    :raises UniqueError: on conflicting rows during a non-directory import.
    """
    if sheet_name not in self.available_sheets:
        raise ParseError(path, "Available sheet names: QS, YS")
    excel_sheet = get_sheet(path, sheet_name)
    # Directory imports never raise UniqueError; conflicts are either
    # overridden or ignored depending on the flags.
    is_directory_import = override or save
    company_id, unification_info = self.get_company_id_balance_sheet(
        path, is_directory_import)
    curr_row = 0
    curr_column = 2
    # Each accumulator starts with the company id; a date and the
    # period's values are appended per processed column.
    assets = [company_id]
    assets_categories = [company_id]
    equity_liabilities = [company_id]
    equity_liabilities_categories = [company_id]
    overlapping_assets = {}
    overlapping_assets_categories = {}
    overlapping_equity_liabilities = {}
    overlapping_equity_liabilities_categories = {}
    while curr_row < excel_sheet.nrows:
        if excel_sheet.cell(curr_row,
                            curr_column).value == 'Balance sheet':
            # Section found: labels stay in this column, period dates on
            # the next row, a checksum/sum row below that, data two
            # rows further down.
            attributes_column = curr_column
            curr_column += 1
            dates_row = curr_row + 1
            sum_row = dates_row + 1
            curr_row += 3
            assets_attributes = ['CompanyID', 'Date']
            equity_liabilities_categories_attributes = [
                'CompanyID', 'Date'
            ]
            assets_categories_attributes = ['CompanyID', 'Date']
            equity_liabilities_attributes = ['CompanyID', 'Date']
            different_assets_exist = False
            while curr_column < excel_sheet.ncols:
                # check if data for that period exists
                if not excel_sheet.cell(sum_row, curr_column).value:
                    curr_column += 1
                    continue
                # add date to list
                date_value = excel_sheet.cell(dates_row,
                                              curr_column).value
                assets.append(date_value)
                assets_categories.append(date_value)
                equity_liabilities.append(date_value)
                equity_liabilities_categories.append(date_value)
                # iterate from the first element until assets end
                while excel_sheet.cell(curr_row,
                                       attributes_column).value != '':
                    attribute = excel_sheet.cell(curr_row,
                                                 attributes_column).value
                    curr_value = excel_sheet.cell(curr_row,
                                                  curr_column).value
                    if attribute in self.assets_categories:
                        assets_categories_attributes.append(attribute)
                        insert_float_value(assets_categories, curr_value)
                    elif attribute in self.detailed_assets:
                        assets_attributes.append(attribute)
                        insert_float_value(assets, curr_value)
                    else:
                        # Unknown label: remember to pad the missing
                        # known columns afterwards.
                        different_assets_exist = True
                    curr_row += 1
                if different_assets_exist:
                    # Pad every known-but-absent attribute with an empty
                    # value so all rows share the same column set.
                    for a in self.detailed_assets:
                        if a not in assets_attributes:
                            assets_attributes.append(a)
                            insert_float_value(assets, '')
                    for ac in self.assets_categories:
                        if ac not in assets_categories_attributes:
                            assets_categories_attributes.append(ac)
                            insert_float_value(assets_categories, '')
                # omit headers and iterate until equities and liabilities end
                curr_row += 2
                different_eq_exist = False
                while excel_sheet.cell(
                        curr_row,
                        attributes_column).value != 'Date of publication':
                    attribute = excel_sheet.cell(curr_row,
                                                 attributes_column).value
                    curr_value = excel_sheet.cell(curr_row,
                                                  curr_column).value
                    if attribute in self.equity_liabilities_categories:
                        equity_liabilities_categories_attributes.append(
                            attribute)
                        insert_float_value(equity_liabilities_categories,
                                           curr_value)
                    elif attribute in self.detailed_equity_liabilities:
                        equity_liabilities_attributes.append(attribute)
                        insert_float_value(equity_liabilities, curr_value)
                    else:
                        different_eq_exist = True
                    curr_row += 1
                if different_eq_exist:
                    # Same padding for equity/liability columns.
                    for e in self.detailed_equity_liabilities:
                        if e not in equity_liabilities_attributes:
                            equity_liabilities_attributes.append(e)
                            insert_float_value(equity_liabilities, '')
                    for eqc in self.equity_liabilities_categories:
                        if eqc not in equity_liabilities_categories_attributes:
                            equity_liabilities_categories_attributes.append(
                                eqc)
                            insert_float_value(
                                equity_liabilities_categories, '')
                if unification_info is not None:
                    # Ambiguous company: stage all four groups for
                    # manual unification instead of writing to the DB.
                    data_to_insert = [
                        ("Assets", assets_attributes, assets),
                        ("EquityLiabilities",
                         equity_liabilities_attributes,
                         equity_liabilities),
                        ("AssetsCategories",
                         assets_categories_attributes,
                         assets_categories),
                        ("EquityLiabilitiesCategories",
                         equity_liabilities_categories_attributes,
                         equity_liabilities_categories),
                    ]
                    for data in data_to_insert:
                        unification_info.add_data(table_name=data[0],
                                                  columns=data[1],
                                                  data=data[2])
                else:
                    if override:
                        common.DAL.db_queries_insert.replace_values(
                            table_name="Assets",
                            columns=assets_attributes,
                            values=assets)
                        common.DAL.db_queries_insert.replace_values(
                            table_name="EquityLiabilities",
                            columns=equity_liabilities_attributes,
                            values=equity_liabilities)
                        common.DAL.db_queries_insert.replace_values(
                            table_name="AssetsCategories",
                            columns=assets_categories_attributes,
                            values=assets_categories)
                        common.DAL.db_queries_insert.replace_values(
                            table_name="EquityLiabilitiesCategories",
                            columns=
                            equity_liabilities_categories_attributes,
                            values=equity_liabilities_categories)
                    elif save:
                        common.DAL.db_queries_insert.insert_values(
                            table_name="Assets",
                            columns=assets_attributes,
                            values=assets)
                        common.DAL.db_queries_insert.insert_values(
                            table_name="EquityLiabilities",
                            columns=equity_liabilities_attributes,
                            values=equity_liabilities)
                        common.DAL.db_queries_insert.insert_values(
                            table_name="AssetsCategories",
                            columns=assets_categories_attributes,
                            values=assets_categories)
                        common.DAL.db_queries_insert.insert_values(
                            table_name="EquityLiabilitiesCategories",
                            columns=
                            equity_liabilities_categories_attributes,
                            values=equity_liabilities_categories)
                    else:
                        # Strict insert: an IntegrityError means a row
                        # with this key exists; record a conflict unless
                        # the stored row is identical.
                        try:
                            common.DAL.db_queries_insert.insert_values_without_ignore(
                                table_name="Assets",
                                columns=assets_attributes,
                                values=assets)
                        except IntegrityError:
                            if not exactly_same_assets(
                                    assets_attributes, assets):
                                if not overlapping_assets:
                                    init_overlapping_info(
                                        overlapping_assets, "Assets",
                                        assets_attributes)
                                overlapping_assets["values"].append(assets)
                        try:
                            common.DAL.db_queries_insert.insert_values_without_ignore(
                                table_name="EquityLiabilities",
                                columns=equity_liabilities_attributes,
                                values=equity_liabilities)
                        except IntegrityError:
                            if not exactly_same_equity_liabilities(
                                    equity_liabilities_attributes,
                                    equity_liabilities):
                                if not overlapping_equity_liabilities:
                                    init_overlapping_info(
                                        overlapping_equity_liabilities,
                                        "EquityLiabilities",
                                        equity_liabilities_attributes)
                                overlapping_equity_liabilities[
                                    "values"].append(equity_liabilities)
                        try:
                            common.DAL.db_queries_insert.insert_values_without_ignore(
                                table_name="AssetsCategories",
                                columns=assets_categories_attributes,
                                values=assets_categories)
                        except IntegrityError:
                            if not exactly_same_assets_categories(
                                    assets_categories_attributes,
                                    assets_categories):
                                if not overlapping_assets_categories:
                                    init_overlapping_info(
                                        overlapping_assets_categories,
                                        "AssetsCategories",
                                        assets_categories_attributes)
                                overlapping_assets_categories[
                                    "values"].append(assets_categories)
                        try:
                            common.DAL.db_queries_insert.insert_values_without_ignore(
                                table_name="EquityLiabilitiesCategories",
                                columns=
                                equity_liabilities_categories_attributes,
                                values=equity_liabilities_categories)
                        except IntegrityError:
                            if not exactly_same_equity_liabilities_categories(
                                    equity_liabilities_categories_attributes,
                                    equity_liabilities_categories):
                                if not overlapping_equity_liabilities_categories:
                                    init_overlapping_info(
                                        overlapping_equity_liabilities_categories,
                                        "EquityLiabilitiesCategories",
                                        equity_liabilities_categories_attributes
                                    )
                                overlapping_equity_liabilities_categories[
                                    "values"].append(
                                        equity_liabilities_categories)
                # Reset accumulators and rewind the row cursor for the
                # next period column.
                assets_attributes = ['CompanyID', 'Date']
                assets_categories_attributes = ['CompanyID', 'Date']
                equity_liabilities_attributes = ['CompanyID', 'Date']
                equity_liabilities_categories_attributes = [
                    'CompanyID', 'Date'
                ]
                assets = [company_id]
                equity_liabilities = [company_id]
                assets_categories = [company_id]
                equity_liabilities_categories = [company_id]
                curr_column += 1
                curr_row = sum_row + 1
            break
        curr_row += 1
    # Collect whichever conflict buckets were populated.
    overlapping_data = []
    if overlapping_assets:
        overlapping_data.append(overlapping_assets)
    if overlapping_assets_categories:
        overlapping_data.append(overlapping_assets_categories)
    if overlapping_equity_liabilities:
        overlapping_data.append(overlapping_equity_liabilities)
    if overlapping_equity_liabilities_categories:
        overlapping_data.append(overlapping_equity_liabilities_categories)
    if unification_info is not None and unification_info.data:
        return ParsingResult([unification_info])
    if overlapping_data and not is_directory_import:
        raise UniqueError(*overlapping_data)
    return None