def update_cb_basic_company(self): df = db.pandas_read(sql.sql_cb_basic_company.value) for _, r in df.iterrows(): basic_name = CM.get_basic_name(r['name']) sql_update = sql.sql_cb_basic_company_update.value.format(basic_name, CM.sql_compliant(r['org_uuid'])) print(sql_update) db.execute(sql_update)
def schema_to_dfs(self, schema, which_tables="all"): """ Takes name of schema and returns dict of dataframes, one entry for each table in schema. Key = table name Value = dataframe Dataframes will contain results from SELECT * FROM TABLE. :param schema: :return dict: """ select_all = "SELECT * FROM " sql = str( "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '" + str(schema) + "'") tables = DAL.pandas_read(sql) if which_tables == "all": table_names = tables["TABLE_NAME"] else: table_names = which_tables table_dict = {} sql_statements = [] for table in table_names: statement = str(select_all + schema + "." + table) sql_statements.append(statement) table_dict[table] = 0 for statement in sql_statements: df = DAL.pandas_read(statement) table_name = statement.split(".")[1] for key in table_dict.keys(): if key.lower() == table_name.lower(): table_dict[key] = df return table_dict
def push_entity_to_db(self, json, org_uuid, sql_insert, uuid, i=0, fk_uuid='org_uuid', columns=[]): try: json_properties = None if CBDict.properties.value in json.keys(): json_properties = json[CBDict.properties.value] elif json[CBDict.cardinality.value] == 'OneToOne': json_properties = json[CBDict.item.value][CBDict.properties.value] elif json[CBDict.cardinality.value] == 'OneToMany': json_properties = json[CBDict.items.value][i][CBDict.properties.value] if 'uuid' not in json_properties.keys(): json_properties['uuid'] = uuid if fk_uuid not in json_properties.keys(): json_properties[fk_uuid] = org_uuid # print(list(json_properties.keys())) df_properties = pd.DataFrame([json_properties], columns=json_properties.keys()) if len(columns) > 0: df_properties = df_properties[columns] values = CM.df_list(df_properties) val = [] for l, j in enumerate(values[0]): if isinstance(values[0][l], list): val.append(' , '.join(str(x) for x in values[0][l])) elif isinstance(values[0][l], str): val.append(self.common.sql_compliant(values[0][l])) else: val.append(values[0][l]) db.bulk_insert(sql_insert, [val]) except Exception as ex: print(ex)
def update_tdw_basic_company(self): df = db.pandas_read(sql.sql_tdw_basic_company.value) for _, r in df.iterrows(): basic_name = CM.get_basic_name(r.legal_name) sql_update = sql.sql_tdw_basic_company_update.value.format(basic_name, CM.sql_compliant(r.legal_name)) print(sql_update) db.execute(sql_update)
def __init__(self): self.db = DB() self.training_file = 'company_traning.json' self.setting_file = 'company_settings' self.result_set = None self.start_time = time.time()
def __init__(self): self.source_table = 'MDC_DEV.dbo.SourceTable' self.valid = validate() self.data = db.pandas_read( "SELECT * FROM MDC_DEV.dbo.ProcessedVenture").to_dict('index') self.source = db.pandas_read( 'SELECT * FROM MDC_DEV.dbo.SourceTable WHERE ID < 0').to_dict( 'index')
def transfer_fact_ric_aggregation(): date_id = COM.get_dateid(datevalue=None) metric_prg = [130, 132, 133, 129, 134, 63, 77, 60, 68, 67, 135, 136, 137] metric_prg_youth = [134, 138] df_program = db.pandas_read(sql.sql_company_aggregate_program.value.format(2018, 4))#(BapQuarterly.year, BapQuarterly.quarter)) df_program_youth = db.pandas_read(sql.sql_company_aggregate_program_youth.value.format(2018, 4))#(BapQuarterly.year, BapQuarterly.quarter)) values = [] for _, row in df_program.iterrows(): i = 7 while i < 20: m = i - 7 val = [] val.append(int(row['DataSource'])) # DataSource val.append(int(date_id)) # RICDateID val.append(int(metric_prg[m])) # MetricID val.append(int(row['BatchID'])) # BatchID if str(row[i]) in ['no data', 'n\\a', '-', 'n/a', 'nan']: val.append(-1.0) print(row[i]) else: val.append(round(float(row[i]), 2)) # AggregateNumber val.append(str(datetime.datetime.today())[:23]) # ModifiedDate val.append(str(datetime.datetime.today())[:23]) # CreatedDate val.append(row['Youth']) # Youth values.append(val) i = i + 1 # db.execute(sql.sql_bap_fra_insert.value.format(tuple(val))) for _, row in df_program_youth.iterrows(): j = 7 while j < 9: m = j - 7 val = [] val.append(int(row['DataSource'])) # DataSource val.append(int(date_id)) # RICDateID val.append(int(metric_prg_youth[m])) # MetricID val.append(int(row['BatchID'])) # BatchID if str(row[j]) in ['no data', 'n\\a', '-', 'n/a', 'nan']: val.append(-1.0) print(row[j]) else: val.append(round(float(row[j]), 2)) # AggregateNumber val.append(str(datetime.datetime.today())[:23]) # ModifiedDate val.append(str(datetime.datetime.today())[:23]) # CreatedDate val.append(row['Youth']) # Youth values.append(val) j = j + 1 # db.execute(sql.sql_bap_fra_insert.value.format(tuple(val))) for val in range(len(values)): print('{}. {}'.format(val,values[val])) # print('{}. {}'.format(val,values[val][1])) db.bulk_insert(sql.sql_bap_fact_ric_aggregation_insert.value, values)
def push_bap_missing_data_to_temp_table(): current_path = os.path.join(os.path.expanduser("~"), '/Users/mnadew/Box Sync/Workbench/BAP/BAP_FY18/FY18_Q3/for ETL/Missing data Reports') os.chdir(current_path) df = pd.read_excel('00 BAP Missing data Combined.xlsx', 'BAP Missing data') df['CompanyID'] = 0 new_col = ['CompanyID','CompanyName','BasicName','Website','AnnualRevenue','NumberOfEmployees','FundingToDate','DataSource'] dfs = df[new_col] sql = 'INSERT INTO BAP.BAP_FY18Q3_Missing_Data VALUES (?, ?, ?, ?, ?, ?, ?, ?)' values = COM.df_list(dfs) db.bulk_insert(sql, values)
def delete_old_ans(): # delete old ans using answer ids # store old ans in xl file old_ans_sql = CM.get_config('config.ini', 'secondary_etl', 'old_ans') old_ans_df = DB.pandas_read(old_ans_sql) DBInteractions.store_df(old_ans_df, '_OLD_PIPE_ANS') # run sql to delete old ans del_old_ans_sql = CM.get_config('config.ini', 'secondary_etl', 'del_old_ans') DB.execute(del_old_ans_sql)
def update(self, table_name, source_id_col, company_id_col): etl = common.df_list( db.pandas_read('SELECT ' + source_id_col + ',' + company_id_col + ' FROM ' + table_name)) for index1, val1 in enumerate(self.source_table): for index2, val2 in enumerate(etl): if val1[0] == str(val2[0]): db.execute('UPDATE ' + table_name + ' SET ' + company_id_col + ' = ' + str(val1[1]) + ' WHERE ' + source_id_col + ' = ' + str(val2[0])) break
def create_bap_batch(): batch = BatchService() program = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.ric_program.value,BapQuarterly.year,BapQuarterly.quarter)) program_youth = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.ric_program_youth.value, BapQuarterly.year,BapQuarterly.quarter)) company = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.venture_data.value, BapQuarterly.year, BapQuarterly.quarter)) comapny_annual = db.pandas_read(sql.sql_annual_bap_distinct_batch.value.format(tbl.venture_annual.value,BapQuarterly.year)) # batch.create_bap_batch(program, BapQuarterly.year, BapQuarterly.quarter, tbl.ric_program.value, WS.bap_program.value, ss.RICPD_bap.value) # batch.create_bap_batch(program_youth, BapQuarterly.year, BapQuarterly.quarter, tbl.ric_program_youth.value, WS.bap_program_youth.value, ss.RICPDY_bap.value) batch.create_bap_batch(company, BapQuarterly.year, BapQuarterly.quarter, tbl.venture_data.value, WS.bap_company.value, ss.RICCD_bap.value) if BapQuarterly.quarter == 3: batch.create_bap_batch(comapny_annual, BapQuarterly.year, BapQuarterly.quarter, tbl.venture_annual.value, WS.bap_company_annual.value, ss.RICACD_bap.value)
def transfer_fact_ric_company_data(): df = db.pandas_read(sql.sql_bap_fact_ric_data_fyq4.value) df_frc = BapQuarterly.get_proper_values(df) # BapQuarterly.update_month_year(df_frc) # df_frc['IntakeDate'] = pd.to_datetime(df_frc['IntakeDate']) df_frc['Age'] = None # df_frc['Date of Incorporation'] = pd.to_datetime(df_frc['Date of Incorporation']) # df_ric = df_frc.drop(columns=['ID', 'Incorporate year (YYYY)', 'Incorporation month (MM)']) # BapQuarterly.file.save_as_csv(df_frc, '00 FactRICCompany.xlsx', os.getcwd(), 'FactRICCompany') values_list = COM.df_list(df_frc) db.bulk_insert(sql.sql_bap_fact_ric_company_insert.value, values_list)
def nomatch_create_new(self): """Add non-duplicate ventures that are new companies (-ve ID) as new ventures to the venture table """ new_ventures = common.df_list( db.pandas_read( "SELECT * FROM MDC_DEV.dbo.ProcessedVenture AS a WHERE a.ID NOT IN " "(SELECT ID FROM MDC_DEV.dbo.EntityMap) AND a.ID < 0 ")) sql = 'INSERT INTO MDC_DEV.dbo.Venture VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)' db.bulk_insert(sql, new_ventures) # Update ID to match Venture Table in the given source table if self.source_table is not None: sql = 'UPDATE ' + self.source_table + ' SET ID = b.ID FROM ' + self.source_table + ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name' db.execute(sql)
def move_annual_company_data(self): i, j = 0, 0 dfac = db.pandas_read('SELECT ID, BatchID, CompanyID,[Company Name] FROM BAP.AnnualCompanyData') dfdc = db.pandas_read('SELECT CompanyID, CompanyName FROM Reporting.DimCompany') dfac['BasicName'] = dfac.apply(lambda dfs: CM.get_basic_name(dfs['Company Name']), axis=1) dfdc['BasicName'] = dfdc.apply(lambda dfs: CM.get_basic_name(dfs.CompanyName), axis=1) for i, c in dfac.iterrows(): dfc = dfdc[dfdc['BasicName'] == c.BasicName] val = dict() if len(dfc) > 0: i+=1 db.execute(sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID)) print(sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID)) else: j+=1 print(sql.sql_dim_company_insert.value) new_com_id = self.batch.get_table_seed('MaRSDataCatalyst.Reporting.DimCompany', 'CompanyID') + 1 val['CompanyID'] = new_com_id val['Company Name'] = c['Company Name'] val['Description'] = None val['Phone'] = None val['Phone2'] = None val['Fax'] = None val['Email'] = None val['Website'] = None val['CompanyType'] = None val['BatchID'] = c.BatchID val['ModifiedDate'] = str(dt.datetime.utcnow())[:-3] val['CreatedDate'] = str(dt.datetime.utcnow())[:-3] df = pd.DataFrame([val], columns=val.keys()) values = CM.df_list(df) db.bulk_insert(sql.sql_dim_company_insert.value, values) db.execute(sql.sql_annual_comapny_data_update.value.format(new_com_id, c.ID)) print('{} exists and {} doesn not exist'.format(i, j))
def __init__(self): self.MDCReport = common.df_list( db.pandas_read( 'SELECT RICCompanyDataID, CompanyID,DataSource,BatchID,DateID,AdvisoryServicesHours,' 'VolunteerMentorHours, AnnualRevenue, NumberEmployees,FundingToDate, FundingCurrentQuarter, ' 'HighPotential,SocialEnterprise ' 'FROM MDCReport.BAPQ.FactRICCompany')) self.MaRSDataCatalyst = common.df_list( db.pandas_read( 'SELECT RICCompanyDataID, CompanyID,DataSourceID,BatchID,DateID,AdvisoryServicesHours,' 'VolunteerMentorHours, AnnualRevenue, NumberEmployees,FundingToDate, FundingCurrentQuarter, ' 'HighPotential,SocialEnterprise FROM MaRSDataCatalyst.Reporting.FactRICCompanyData' )) self.records = []
def _main_(): print("Getting SQL query") sql = CM.get_config("config_sql.ini", "ann_survey_18", "caprevjob_by_ric") print("SQL: {}".format(sql)) print("Executing SQL to get dataframe of results") all_results = DB.pandas_read(sql) print("Creating column names") all_results['ConcatQ'] = all_results[['Cap/Rev/Emp', 'Question']].apply(lambda x: ' - '.join(x), axis=1) print("Splitting dataframe into one per RIC") split_frames = partition_by(all_results, "RIC_Program") print("Getting write path") user_path = os.path.expanduser("~") path = user_path + "/Box Sync/Workbench/BAP/Annual Survey FY2018/Results by RIC/" print("Path: {}".format(path)) print("Writing files to disc:") for ric in split_frames.keys(): x = split_frames[ric] x['rid_cid'] = x['resp_id'].astype(str) + '_' + x['Company_ID'].astype(str) x = spread(x, 'rid_cid', 'ConcatQ', 'Answer') x['rid_cid'] = x.index x['_resp_id'], x['_Company_ID'] = x['rid_cid'].str.split('_', 1).str x = x.apply(pd.to_numeric, errors='ignore') cols = x.columns.tolist() cols = cols[-2:] + cols[:-2] x = x[cols] for i in range(len(cols)): if str(cols[i])[0] == '_': cols[i] = cols[i][1:] x.columns = cols x = x.drop('rid_cid', axis=1) filename = "{} Survey Results".format(ric) write_to_xl(x, filename, path, 'Results') print("Wrote {} to path: {}".format(filename, path))
def get_table_seed(self, table, id_column): seed = 0 sql_dc = sql.sql_get_max_id.value.format(id_column, table) df = db.pandas_read(sql_dc) if len(df) > 0: seed = df.values[0][0] return seed
def split_venture_former_name(self): df = db.pandas_read('SELECT ID, CompanyName, [Former / Alternate Names] FROM MDCRaw.BAP.VentureQuarterlyData WHERE CompanyName LIKE \'%(%\' AND FiscalYear = 2019') for _, row in df.iterrows(): split = CM.venture_name_with_bracket_split(row['CompanyName']) # print('Current: {}\nName: {}\nAlternate: {}'.format(row['CompanyName'],split[0], split[1].replace('(','').replace(')',''))) # print('-' * 100) update = '''UPDATE MDCRaw.BAP.VentureQuarterlyData SET CompanyName = \'{}\' , [Former / Alternate Names] = \'{}\' WHERE ID = {} -- {}''' print(update.format(split[0], split[1].replace('(','').replace(')','').replace('formerly',''),row['ID'],row['CompanyName']))
def save_orgs_relationship(self, api_url): url = api_url + self.api_org_token print(url) orgs = CM.get_api_data(url) if orgs.ok: self.org_uuid = orgs.json()[CBDict.data.value][CBDict.uuid.value] df = self.db.pandas_read(self.enum.SQL.sql_org_detail_exists.value.format(self.org_uuid)) if len(df) == 0:#self.db.entity_exists('MDCRaw.CRUNCHBASE.Organization', 'org_uuid', self.org_uuid): self.save_organization_detail(self.org_uuid, orgs.json()[CBDict.data.value][CBDict.properties.value]) rs_json = orgs.json()[CBDict.data.value][CBDict.relationships.value] self.save_funding_rounds(rs_json['funding_rounds'], self.org_uuid) self.save_relational_entity(rs_json[self.enum.CBDict.headquarters.value], self.org_uuid, self.enum.SQL.sql_offices_exists.value, self.enum.SQL.sql_offices_insert.value, self.office_col) self.save_relational_entity(rs_json['categories'], self.org_uuid, self.enum.SQL.sql_org_category_exists.value, self.enum.SQL.sql_org_category_insert.value, self.category_col) # save all the related entities # self.save_teams(rs_json['featured_team'], self.org_uuid, TeamStatus.Featured.value) # self.save_teams(rs_json['current_team'], self.org_uuid, TeamStatus.Current.value) # self.save_teams(rs_json['past_team'], self.org_uuid, TeamStatus.Past.value) # self.save_teams(rs_json['board_members_and_advisors'], self.org_uuid, TeamStatus.Board.value) # self.save_investments_invested_in(rs_json['investments']) # self.save_relational_entity(rs_json['sub_organizations'], self.org_uuid, self.sql_sub_organization_insert) # if rs_json[self.enum.CBDict.offices.value][self.enum.CBDict.items] is not None: # self.save_relational_entity(rs_json[self.enum.CBDict.offices.value], self.org_uuid, self.enum.SQL.sql_offices_exists, self.sql_offices_insert, self.office_col) # self.save_relational_entity(rs_json['founders'], self.org_uuid, self.sql_founders_insert) # self.save_relational_entity(rs_json['acquisitions'], self.org_uuid, self.sql_acquisition_insert) # self.save_relational_entity(rs_json['acquired_by'], self.org_uuid, self.sql_acquired_insert) # self.save_relational_entity(rs_json['ipo'], self.org_uuid, self.sql_ipo_insert) # self.save_relational_entity(rs_json['funds'], self.org_uuid, self.sql_funds_insert) # self.save_relational_entity(rs_json['websites'], self.org_uuid, self.sql_websites_insert) # self.save_relational_entity(rs_json['images'], self.org_uuid, self.sql_image_insert) # self.save_relational_entity(rs_json['news'], self.org_uuid, self.sql_news_insert) db.execute(self.orgs_summary_update.format(self.org_uuid)) else: print('organization already exists.')
def df_to_db(self, df, sql_config_header, remove_single_quotes=True, return_vals=False, clean_numeric_cols=False): df_headers, df_qmarks, df_vals = self.get_sql_params( df, remove_single_quotes=remove_single_quotes) df_header_str = self.get_header_str(df_headers) df_sql = CM.get_config("config.ini", "sql_queries", sql_config_header) df_sql = df_sql.replace("WHAT_HEADERS", df_header_str).replace("WHAT_VALUES", df_qmarks) if clean_numeric_cols: for lst in df_vals: for i in range(len(lst)): element = lst[i] try: if str(element).lower() == "nan" or str( element) == "0000-00-00 00:00:00" or str( element) == '': new_val = None if type(lst) == tuple: lst = self.replace_tuple_val_at_index( lst, i, new_val) if np.dtype(element) == 'int64': new_val = int(str(lst[i])) if type(lst) == tuple: lst = self.replace_tuple_val_at_index( lst, i, new_val) except AttributeError: continue except TypeError: continue except ValueError: continue if len(df_vals) == 1 and type(df_vals[0]) == tuple: df_vals[0] = lst DB.bulk_insert(df_sql, df_vals) if return_vals: return df_vals
def update_raw_company(self): dfnew, dfdc, _ = self.get_company() raw_company = self.generate_basic_name(dfnew) dim_company = self.generate_basic_name(dfdc) if len(dfdc) > 0 else None for index, com in raw_company.iterrows(): try: company_name = com['BasicName'] print(company_name) if len(dim_company[dim_company.BasicName == company_name].CompanyID.values) > 0: companyid = dim_company[dim_company.BasicName == company_name].CompanyID.values[0] if companyid > 0: sql_update = sql.sql_update.value.format('BAP.QuarterlyCompanyData', 'CompanyID', companyid, 'ID', com.ID) print(sql_update) db.execute(sql_update) else: print('{} >> {}'.format(com.ID, com.CompanyName)) except Exception as ex: print('UPDATE ISSUE: {}'.format(ex))
def insert_dim_company_source(self, new_company): try: date_time = str(dt.datetime.utcnow())[:-3] self.dim_company_source_id = self.get_table_seed('Reporting.DimCompanySource', 'SourceCompanyID') + 1 dc = dict() dc['aSourceID'] = self.dim_company_source_id dc['bCompanyID'] = self.dim_company_id dc['cName'] = new_company['Name'] dc['dSCC'] = None dc['eDataSource'] = new_company['DataSource'] dc['eBatchID'] = new_company['BatchID'] dc['fCT'] = None dc['gModified'] = date_time dc['hCreated'] = date_time df = pd.DataFrame.from_dict([dc], orient='columns') values = CM.df_list(df) db.bulk_insert(sql.sql_dim_company_source_insert.value, values) except Exception as ex: print(ex)
def check_qs_exist(self, survey_id): sql = CM.get_config("config.ini", "sql_queries", "check_questions_exist") sql = sql.replace("WHAT_SURVEY_ID", str(survey_id)) check = DB.pandas_read(sql) if check.iloc[0][0]: return True else: return False
def preprocess(): # # CLEANING # Website Cleaning # Where all the fields are 'n/a' 'NA' etc need to change them to null db.execute( "UPDATE MDC_DEV.dbo.ProcessedVenture " "SET Website = NULL WHERE Website ='na' OR Website = 'n/a' OR Website = '-' " "OR Website = 'http://' OR Website = 'no website' OR Website = '--' " "OR Website = 'http://N/A' OR Website = 'no data' OR Website = 'http://www.facebook.com' " "OR Website = '0' OR Website = 'http://www.nowebsite.co'" "OR Website = 'http://N/A - Development stage.' OR Website = ' -'" "OR Website = 'http://NOT ON WEB' OR Website = 'http://none'" "OR Website = 'http://coming soon' OR Website = 'http://not.yet' " "OR Website = 'http://no website' OR Website = 'http://not yet' " "OR Website = 'none' OR Website = 'http://NA'OR Website = 'tbd' " "OR Website = 'https' OR Website = 'http://www.nowebsite.com' " "OR Website = 'http://nowebsite.com' OR Website = 'http://Nowebsiteyet' " "OR Website = 'Coming soon' OR Website = 'not set up yet' OR Website = 'http://www.yahoo.com' " "OR Website = 'http://under construction' OR Website = 'http://www.nwebsite.com' " "OR Website = 'http://www.google.com' OR Website = 'http://www.google.ca' OR Website = 'youtube.com'" ) # # Phone Cleaning db.execute( "UPDATE MDC_DEV.dbo.ProcessedVenture SET Phone = NULL WHERE LEN(Phone)<10 " ) # # Name Cleaning db.execute( "DELETE FROM MDC_DEV.dbo.ProcessedVenture WHERE Name LIKE '%communitech%' OR Name LIKE '%company:%' " "OR Name LIKE '%nwoic%' OR Name LIKE '%riccentre%' OR Name LIKE '%sparkcentre%'OR Name LIKE '%venture%' " "OR Name LIKE '%Wetch_%' OR Name LIKE '%testBAP%' OR Name LIKE '%SSMIC%' OR Name LIKE '%Techalliance%' " "OR Name LIKE '%RICC_%' OR Name LIKE '%_anon_%' OR Name LIKE '%InnovationFactory_%' " "OR Name LIKE '%InvestOttawa_%' OR Name LIKE '%Québec inc%' OR Name LIKE '%Haltech_%' " "OR Name LIKE '%InnovNiag_survey%' OR Name LIKE '%NOIC%'")
def write_results(clustered_dupes_list): """ Load finalized clusters into EntityMap table :param clustered_dupes_list: list of tuples returned from clustering :return: number of duplicate sets """ # We now have a sequence of tuples of company IDs that dedupe believes # all refer to the same entity. We write this out onto an entity map table db.execute("DELETE FROM MDC_DEV.dbo.EntityMap") print('Populating EntityMap table.') sql = 'INSERT INTO MDC_DEV.dbo.EntityMap (ID, CanonID, ClusterScore) VALUES (?,?,?)' values = [] cluster_id = 1 for cluster, scores in clustered_dupes_list: for id, score in zip(cluster, scores): values.append([int(id), int(cluster_id), float(score)]) cluster_id = cluster_id + 1 db.bulk_insert(sql, values) db.execute("UPDATE MDC_DEV.dbo.EntityMap SET Name = p.Name, BatchID = p.BatchID, " "Description = p.Description, Website = p.Website, Email = p.Email, " "Phone = p.Phone, Address = p.Address " "FROM MDC_DEV.dbo.EntityMap AS a INNER JOIN MDC_DEV.dbo.ProcessedVenture AS p " "ON a.ID = p.ID") return len(clustered_dupes_list)
def remove_false_positives(): """Consult MatchingFalsePositives and remove known false positive clusters from EntityMap""" db.execute("SELECT * FROM MDC_DEV.dbo.EntityMap ORDER BY CanonID") entity_map = db.pandas_read("SELECT * FROM MDC_DEV.dbo.EntityMap").set_index('ID').to_dict('index') clusters = [] for index1, val1 in entity_map.items(): for index2, val2 in entity_map.items(): if val1['CanonID'] == val2['CanonID'] and index1 != index2: clusters.append([index1, index2]) break fal_pos = db.pandas_read("SELECT * FROM MDC_DEV.dbo.MatchingFalsePositives").to_dict('index') remove = [] for cluster in clusters: for i, v in fal_pos.items(): if cluster[0] == v['ID'] and cluster[1] == v['FalseID']: remove.append([cluster[0]]) remove.append([cluster[1]]) break else: continue sql = 'DELETE FROM MDC_DEV.dbo.EntityMap WHERE ID = (?)' db.bulk_insert(sql, remove)
def block(self, selection): """ :param deduper: deduper object created in training :param selection: sql statement selecting all relevant columns to use in deduplication :return: None """ # If dedupe learned a Index Predicate, we have to take a pass # through the data and create indices. for field in self.deduper.blocker.index_fields: df = db.pandas_read("SELECT DISTINCT {field} FROM MDC_DEV.dbo.ProcessedVenture " "WHERE {field} IS NOT NULL".format(field=field)) dataset = [tuple(x) for x in df.values] field_data = set(row[0] for row in dataset) self.deduper.blocker.index(field_data, field) # Now we are ready to write our blocking map table by creating a # generator that yields unique `(BlockKey, ID)` tuples. db.execute("DELETE FROM MDC_DEV.dbo.BlockingMap") df = db.pandas_read(selection).set_index('ID').to_dict('index') b_data = self.deduper.blocker(df) sql = 'INSERT INTO MDC_DEV.dbo.BlockingMap VALUES (?,?)' print('Populating BlockingMap... ') # Chunk the blocked data into groups of 30,000 blocks to be inserted in the BlockingMap size = 30000 main_list = list(b_data) b_data = None chunks = [main_list[x:x + size] for x in range(0, len(main_list), size)] main_list = None for chunk in chunks: db.bulk_insert(sql, chunk) self.deduper.blocker.resetIndices()
def get_campaigns(self, api_token, survey_id, session_variables, surveys_df): if survey_id == 'w': while type(survey_id) != int: try: survey_id = int( input( "Enter ID of survey that you would like to retrieve campaign data for: " )) if self.return_to_main(survey_id) == 1: return survey_id = self.validate_survey_id( survey_id, session_variables, api_token, surveys_df) survey_id = self.validate_survey_id( survey_id, session_variables, api_token, surveys_df) except ValueError: continue campaigns_df = sg_campaign.sg_campaigns_df(survey_id, api_token) print(campaigns_df) campaigns_df["id"] = campaigns_df["id"].apply(pd.to_numeric, errors='ignore') # remove campaigns from df that are already in DB c_sql = CM.get_config("config.ini", "sql_queries", "campaigns_for_survey") c_sql = c_sql.replace("WHAT_SURVEY_ID", str(survey_id)) db_cmpgns = DB.pandas_read(c_sql) if db_cmpgns is not None: db_cmpgns = db_cmpgns.apply(pd.to_numeric, errors='ignore') cmpgns_not_in_db = pd.merge(campaigns_df, db_cmpgns, how='left', indicator=True, on="id") cmpgns_not_in_db2 = cmpgns_not_in_db[cmpgns_not_in_db['_merge'] == 'left_only'].drop("_merge", axis=1) # cmpgns_not_in_db2 = cmpgns_not_in_db2.apply(pd.to_numeric, errors='ignore') # insert campaigns into DB if len(cmpgns_not_in_db2) > 0: insert_cmpgns_sql = "insert_campaigns" self.df_to_db(cmpgns_not_in_db2, insert_cmpgns_sql, remove_single_quotes=False, clean_numeric_cols=True) return campaigns_df
def insert_dim_company(self, new_company): try: self.dim_company_id = self.get_table_seed('Reporting.DimCompany', 'CompanyID') + 1 date_time = str(dt.datetime.utcnow())[:-3] dc = dict() dc['aCompanyID'] = self.dim_company_id dc['bName'] = new_company['Name'] dc['cDescription'] = None dc['dPhone'] = None dc['ePhone2'] = None dc['fFax'] = None dc['gEmail'] = None dc['hWebsite'] = new_company['Website'] dc['iCompanyType'] = None dc['jBatchID'] = new_company['BatchID'] dc['kModifiedDate'] = date_time dc['lCreatedDate'] = date_time df = pd.DataFrame.from_dict([dc], orient='columns') values = CM.df_list(df) db.bulk_insert(sql.sql_dim_company_insert.value, values) except Exception as es: print(es)
def get_load_order(self, schema): """ Takes name of schema, returns DataFrame with load order in first column. :param schema: :return DataFrame: """ schema_str = "'" + str(schema) + "'" sql_str = CM.get_config("config.ini", "dependency_query", "load_order_query") sql_str = sql_str.replace("WHAT_SCHEMA", schema_str) load_order = DAL.pandas_read(sql_str) return load_order