def get_all_rics_data(self): writer = pd.ExcelWriter('00 BAP FY18-1-2-3 Numbers.xlsx') for ric in self.rics: print(ric.upper()) self.Q1CompanyData_sheet = self.read_file_source(self.path_quarter_one, ric, WorkSheet.bap_company_old.value) self.Q2CompanyData_sheet = self.read_file_source(self.path_quarter_two, ric, WorkSheet.bap_company_old.value) self.Q3CompanyData_sheet = self.read_file_source(self.path_quarter_three, ric, WorkSheet.bap_company.value) data_source = Common.set_datasource(ric) batch1 = db.pandas_read(self.batch.format(self.year, self.Q1, data_source, SourceSystemType.RICCD_bap.value))['BatchID'] batch2 = db.pandas_read(self.batch.format(self.year, self.Q2, data_source, SourceSystemType.RICCD_bap.value))['BatchID'] batch3 = db.pandas_read(self.batch.format(self.year, self.Q3, data_source, SourceSystemType.RICCD_bap.value))['BatchID'] self.Q1CompanyData = db.pandas_read(self.selectQ1.format(str(batch1[0]) + ' ORDER BY CompanyName')) self.Q2CompanyData = db.pandas_read(self.select.format('Config.CompanyDataRaw', str(batch2[0]) + ' ORDER BY CompanyName')) self.Q3CompanyData = db.pandas_read(self.select.format('BAP.QuarterlyCompanyData', str(batch3[0]) + ' ORDER BY [Company Name]')) self.Q1CompanyData_fact_ric = db.pandas_read(self.select.format('Reporting.FactRICCompanyData', batch1[0])) self.Q2CompanyData_fact_ric = db.pandas_read(self.select.format('Reporting.FactRICCompanyData', batch2[0])) self.Q3CompanyData_fact_ric = db.pandas_read(self.select.format('Reporting.FactRICCompanyData', batch3[0])) self.Q1CompanyData_rollup = db.pandas_read(SQL.sql_rollup_select.value.format(self.year, 1, data_source)) self.Q2CompanyData_rollup = db.pandas_read(SQL.sql_rollup_select.value.format(self.year, 2, data_source)) self.Q3CompanyData_rollup = db.pandas_read(SQL.sql_rollup_select.value.format(self.year, 3, data_source)) df_ric = self.bap_summary() if df_ric is not None: df_ric.to_excel(writer, ric.upper(), index=False) df_ric = None writer.save()
def combine_bap_missing_source_file(self, current_path=''): clms = [ 'CompanyName', 'Website', 'AnnualRevenue', 'NumberOfEmployees', 'FundingToDate', 'DataSource', 'Fiscal_Quarter', 'FiscalYear' ] if current_path != '': current_path = os.path.join(os.path.expanduser("~"), current_path) os.chdir(current_path) self.source_file = os.listdir(current_path) file_list = self.get_source_file() q_company = [] i = 0 for fl in file_list: try: i += 1 ds = COM.set_datasource(str(fl)) if ds is not None: com = pd.read_excel(fl, WS.bap_company.value) print('{}.[{}] {} -->{}'.format(i, ds, fl, len(com.columns))) # print(com.head()) com.insert(5, 'DataSource', ds) com.columns = clms q_company.append(com) else: print('\tMissing - {}'.format(fl)) except Exception as ex: print(ex) print('\n\n') bap_company = pd.concat(q_company) return bap_company
def read_source_file(self, ftype, data_source, combine_for: Combine, file_name='', current_path=''): com_a = None if current_path != '': current_path = os.path.join(os.path.expanduser("~"), current_path) os.chdir(current_path) self.source_file = os.listdir(current_path) file_list = self.get_source_file() if data_source == DS.BAP: if ftype == FT.SPREAD_SHEET.value: l_company, l_company_annual, l_program, l_program_youth = self.bap_dataframes( ) i = 0 for fl in file_list: try: i += 1 ds = COM.set_datasource(str(fl)) print('{}. {}'.format(i, fl)) prg = pd.read_excel(fl, WS.bap_program.value) prg.drop(prg.columns[[13]], inplace=True, axis=1) prg.columns = self.program_columns prg_youth = pd.read_excel(fl, WS.bap_program_youth.value) prg_youth.drop(prg_youth.columns[2], inplace=True, axis=1) prg_youth.columns = self.program_youth_columns com = pd.read_excel(fl, WS.bap_company.value) com.columns = self.quarterly_company_columns if ds in [DST.COMMUNI_TECH.value, DST.HAL_TECH.value ] and self.quarter - 1 == 3: com_a = pd.read_excel(fl, WS.bap_company_annual.value) com_a.columns = self.annual_company_columns l_company_annual.append(com_a) if combine_for == Combine.FOR_ETL: FileService.data_system_source( prg, prg_youth, com, com_a, os.getcwd(), str(fl), ds) l_program.append(prg) l_program_youth.append(prg_youth) l_company.append(com) except Exception as ex: print(ex) bap_program = pd.concat(l_program) bap_program_youth = pd.concat(l_program_youth) bap_company = pd.concat(l_company) if self.quarter - 1 == 4: #combine_for == Combine.FOR_ETL or combine_for == Combine.FOR_QA: bap_company_annual = pd.concat(l_company_annual) return bap_program, bap_program_youth, bap_company, bap_company_annual else: return bap_program, bap_program_youth, bap_company, None elif data_source == DS.CBINSIGHT: if ftype == FT.CSV: if file_name != '': return pd.read_csv(file_name) else: for fl in file_list: self.data_list.append(pd.read_csv(fl)) return self.data_list elif data_source == DS.OSVP: print('') print('') elif data_source == DS.OTHER: if ftype == FT.SPREAD_SHEET: target_list = [] self.target_list_dataframe() j = 0 for f in file_list: j += 1 print('{}. {}'.format(j, f)) tl = pd.read_excel(f, WS.target_list.value) tl.insert(5, 'Venture_basic_name', None) datasource = COM.set_datasource(f) tl.insert(0, 'DataSource', datasource) tl.insert(0, 'Worksheet', str(WS.target_list.value)) tl.insert(0, 'FileName', str(f)) tl.insert(0, 'Path', self.path) tl.insert(0, 'CompanyID', None) tl.insert(0, 'BatchID', 0) tl['OSVP_Status'] = None tl['CII'] = None tl['Year'] = self.year tl['Status'] = 1 # tl.columns = self.tl_columns # tl['Date_founded'] = tl['Date_founded'][:10] # tl['Date_of_incorporation'] = tl['Date_of_incorporation'][:10] target_list.append(tl) print('{} - {}'.format(len(tl.columns), tl.columns)) df_tl = pd.concat(target_list) return df_tl elif data_source == DS.IAF: if ftype == FT.SPREAD_SHEET: iaf_list = [] k = 0 for ia in file_list: k += 1 print('{}. {}'.format(k, ia)) iaf = pd.read_excel(ia, sheet_name=None) for i in range(len(list(iaf.items()))): if i == 0: df_summary = list(iaf.items())[i][1] elif i > 0: df = list(iaf.items())[i][1][:-8].T[1:] # print(df[1][0]) # print(list(iaf.items())[1][1]['General Info - Firm / Entrepreneur']) iaf_list.append(df) df_iaf = pd.concat(iaf_list) df_iaf = df_iaf.where(pd.notnull(df_iaf), None) df_summary = df_summary.where(pd.notnull(df_summary), None) return df_iaf, df_summary