Example #1
	def move_annual_company_data(self):
		found, missing = 0, 0
		dfac = db.pandas_read('SELECT ID, BatchID, CompanyID, [Company Name] FROM BAP.AnnualCompanyData')
		dfdc = db.pandas_read('SELECT CompanyID, CompanyName FROM Reporting.DimCompany')
		dfac['BasicName'] = dfac.apply(lambda dfs: CM.get_basic_name(dfs['Company Name']), axis=1)
		dfdc['BasicName'] = dfdc.apply(lambda dfs: CM.get_basic_name(dfs.CompanyName), axis=1)
		for _, c in dfac.iterrows():
			dfc = dfdc[dfdc['BasicName'] == c.BasicName]
			val = dict()
			if len(dfc) > 0:
				found += 1
				db.execute(sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID))
				print(sql.sql_annual_comapny_data_update.value.format(dfc.CompanyID.values[0], c.ID))
			else:
				missing += 1
				print(sql.sql_dim_company_insert.value)
				new_com_id = self.batch.get_table_seed('MaRSDataCatalyst.Reporting.DimCompany', 'CompanyID') + 1
				val['CompanyID'] = new_com_id
				val['Company Name'] = c['Company Name']
				val['Description'] = None
				val['Phone'] = None
				val['Phone2'] = None
				val['Fax'] = None
				val['Email'] = None
				val['Website'] = None
				val['CompanyType'] = None
				val['BatchID'] = c.BatchID
				val['ModifiedDate'] = str(dt.datetime.utcnow())[:-3]
				val['CreatedDate'] = str(dt.datetime.utcnow())[:-3]
				df = pd.DataFrame([val], columns=val.keys())
				values = CM.df_list(df)
				db.bulk_insert(sql.sql_dim_company_insert.value, values)
				db.execute(sql.sql_annual_comapny_data_update.value.format(new_com_id, c.ID))
		print('{} exist and {} do not exist'.format(found, missing))
Example #2
    def schema_to_dfs(self, schema, which_tables="all"):
        """
        Takes the name of a schema and returns a dict of dataframes, one entry for each table in the schema.
        Key = table name
        Value = dataframe holding the results of SELECT * FROM that table.
        :param schema: schema name
        :param which_tables: "all" (default) or an iterable of table names to load
        :return dict:
        """
        select_all = "SELECT * FROM "
        sql = ("SELECT * FROM INFORMATION_SCHEMA.TABLES "
               "WHERE TABLE_SCHEMA = '{}'".format(schema))
        tables = DAL.pandas_read(sql)

        if which_tables == "all":
            table_names = tables["TABLE_NAME"]
        else:
            table_names = which_tables

        table_dict = {}
        sql_statements = []
        for table in table_names:
            sql_statements.append(select_all + schema + "." + table)
            table_dict[table] = None  # placeholder until the query below fills it

        for statement in sql_statements:
            df = DAL.pandas_read(statement)
            table_name = statement.split(".")[1]
            for key in table_dict.keys():
                if key.lower() == table_name.lower():
                    table_dict[key] = df

        return table_dict
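
A minimal usage sketch; `SchemaLoader` is a hypothetical name for the class this method belongs to, which the snippet does not show:

loader = SchemaLoader()  # hypothetical instance

# Every table in the Reporting schema...
frames = loader.schema_to_dfs("Reporting")

# ...or only a chosen subset.
frames = loader.schema_to_dfs("Reporting", which_tables=["DimCompany"])
print(frames["DimCompany"].head())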
Example #3
    def block(self, selection):
        """
        Build the blocking map. Uses self.deduper, the deduper object created in training.
        :param selection: sql statement selecting all relevant columns to use in deduplication
        :return: None
        """
        # If dedupe learned an Index Predicate, we have to take a pass
        # through the data and create indices.
        for field in self.deduper.blocker.index_fields:
            df = db.pandas_read("SELECT DISTINCT {field} FROM MDC_DEV.dbo.ProcessedVenture "
                                "WHERE {field} IS NOT NULL".format(field=field))
            dataset = [tuple(x) for x in df.values]
            field_data = set(row[0] for row in dataset)
            self.deduper.blocker.index(field_data, field)

        # Now we are ready to write our blocking map table by creating a
        # generator that yields unique `(BlockKey, ID)` tuples.
        db.execute("DELETE FROM MDC_DEV.dbo.BlockingMap")

        df = db.pandas_read(selection).set_index('ID').to_dict('index')
        b_data = self.deduper.blocker(df)
        sql = 'INSERT INTO MDC_DEV.dbo.BlockingMap VALUES (?,?)'

        print('Populating BlockingMap... ')
        # Chunk the blocked data into groups of 30,000 blocks to be inserted in the BlockingMap
        size = 30000
        main_list = list(b_data)
        b_data = None  # release the generator's backing data
        chunks = [main_list[x:x + size] for x in range(0, len(main_list), size)]
        main_list = None  # free the unchunked list before the inserts
        for chunk in chunks:
            db.bulk_insert(sql, chunk)

        self.deduper.blocker.resetIndices()
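
block() presupposes that self.deduper already exists; deduper_setup (Example #23) creates it. A rough sketch of the call order, with a hypothetical instance name and illustrative columns:

matcher = VentureMatcher()  # hypothetical instance of the class these methods belong to
selection = 'SELECT ID, Name, Website FROM MDC_DEV.dbo.ProcessedVenture'  # illustrative
matcher.deduper_setup('dedupe_settings', 'dedupe_training.json',
                      [['Name', 'String', False], ['Website', 'String', True]],
                      selection, sample=15000)
matcher.block(selection)
clusters = matcher.clustering()  # see Example #25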
Example #4
    def remove_false_positives():
        """Consult MatchingFalsePositives and remove known false positive clusters from EntityMap"""

        entity_map = db.pandas_read("SELECT * FROM MDC_DEV.dbo.EntityMap ORDER BY CanonID").set_index('ID').to_dict('index')
        clusters = []
        for index1, val1 in entity_map.items():
            for index2, val2 in entity_map.items():
                if val1['CanonID'] == val2['CanonID'] and index1 != index2:
                    clusters.append([index1, index2])
                    break

        fal_pos = db.pandas_read("SELECT * FROM MDC_DEV.dbo.MatchingFalsePositives").to_dict('index')
        remove = []
        for cluster in clusters:
            for i, v in fal_pos.items():
                if cluster[0] == v['ID'] and cluster[1] == v['FalseID']:
                    remove.append([cluster[0]])
                    remove.append([cluster[1]])
                    break

        sql = 'DELETE FROM MDC_DEV.dbo.EntityMap WHERE ID = (?)'
        db.bulk_insert(sql, remove)  # runs the parameterized DELETE once per collected ID
Example #5
    def __init__(self):
        self.source_table = 'MDC_DEV.dbo.SourceTable'
        self.valid = validate()
        self.data = db.pandas_read(
            "SELECT * FROM MDC_DEV.dbo.ProcessedVenture").to_dict('index')
        self.source = db.pandas_read(
            'SELECT * FROM MDC_DEV.dbo.SourceTable WHERE ID < 0').to_dict('index')
Example #6
	def transfer_fact_ric_aggregation():
		date_id = COM.get_dateid(datevalue=None)
		metric_prg = [130, 132, 133, 129, 134, 63, 77, 60, 68, 67, 135, 136, 137]
		metric_prg_youth = [134, 138]
		
		df_program = db.pandas_read(sql.sql_company_aggregate_program.value.format(2018, 4))#(BapQuarterly.year, BapQuarterly.quarter))
		df_program_youth = db.pandas_read(sql.sql_company_aggregate_program_youth.value.format(2018, 4))#(BapQuarterly.year, BapQuarterly.quarter))
		
		values = []
		
		for _, row in df_program.iterrows():
			for i in range(7, 20):
				m = i - 7
				val = []
				val.append(int(row['DataSource']))  # DataSource
				val.append(int(date_id))  # RICDateID
				val.append(int(metric_prg[m]))  # MetricID
				val.append(int(row['BatchID']))  # BatchID

				if str(row[i]) in ['no data', 'n\\a', '-', 'n/a', 'nan']:
					val.append(-1.0)
					print(row[i])
				else:
					val.append(round(float(row[i]), 2))  # AggregateNumber
				val.append(str(datetime.datetime.today())[:23])  # ModifiedDate
				val.append(str(datetime.datetime.today())[:23])  # CreatedDate
				val.append(row['Youth'])  # Youth
				values.append(val)
				# db.execute(sql.sql_bap_fra_insert.value.format(tuple(val)))
		
		for _, row in df_program_youth.iterrows():
			for j in range(7, 9):
				m = j - 7
				val = []
				val.append(int(row['DataSource']))  # DataSource
				val.append(int(date_id))  # RICDateID
				val.append(int(metric_prg_youth[m]))  # MetricID
				val.append(int(row['BatchID']))  # BatchID
				if str(row[j]) in ['no data', 'n\\a', '-', 'n/a', 'nan']:
					val.append(-1.0)
					print(row[j])
				else:
					val.append(round(float(row[j]), 2))  # AggregateNumber
				val.append(str(datetime.datetime.today())[:23])  # ModifiedDate
				val.append(str(datetime.datetime.today())[:23])  # CreatedDate
				val.append(row['Youth'])  # Youth

				values.append(val)
				# db.execute(sql.sql_bap_fra_insert.value.format(tuple(val)))
		for idx, val in enumerate(values):
			print('{}. {}'.format(idx, val))
			# print('{}. {}'.format(idx, val[1]))
		db.bulk_insert(sql.sql_bap_fact_ric_aggregation_insert.value, values)
Example #7
	def create_bap_batch():
		batch = BatchService()
		program = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.ric_program.value, BapQuarterly.year, BapQuarterly.quarter))
		program_youth = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.ric_program_youth.value, BapQuarterly.year, BapQuarterly.quarter))
		company = db.pandas_read(sql.sql_bap_distinct_batch.value.format(tbl.venture_data.value, BapQuarterly.year, BapQuarterly.quarter))
		company_annual = db.pandas_read(sql.sql_annual_bap_distinct_batch.value.format(tbl.venture_annual.value, BapQuarterly.year))

		# batch.create_bap_batch(program, BapQuarterly.year, BapQuarterly.quarter, tbl.ric_program.value, WS.bap_program.value, ss.RICPD_bap.value)
		# batch.create_bap_batch(program_youth, BapQuarterly.year, BapQuarterly.quarter, tbl.ric_program_youth.value, WS.bap_program_youth.value, ss.RICPDY_bap.value)
		batch.create_bap_batch(company, BapQuarterly.year, BapQuarterly.quarter, tbl.venture_data.value, WS.bap_company.value, ss.RICCD_bap.value)
		if BapQuarterly.quarter == 3:
			batch.create_bap_batch(company_annual, BapQuarterly.year, BapQuarterly.quarter, tbl.venture_annual.value, WS.bap_company_annual.value, ss.RICACD_bap.value)
Example #8
    def __init__(self):
        self.MDCReport = common.df_list(
            db.pandas_read(
                'SELECT RICCompanyDataID, CompanyID, DataSource, BatchID, DateID, AdvisoryServicesHours, '
                'VolunteerMentorHours, AnnualRevenue, NumberEmployees, FundingToDate, FundingCurrentQuarter, '
                'HighPotential, SocialEnterprise '
                'FROM MDCReport.BAPQ.FactRICCompany'))
        self.MaRSDataCatalyst = common.df_list(
            db.pandas_read(
                'SELECT RICCompanyDataID, CompanyID, DataSourceID, BatchID, DateID, AdvisoryServicesHours, '
                'VolunteerMentorHours, AnnualRevenue, NumberEmployees, FundingToDate, FundingCurrentQuarter, '
                'HighPotential, SocialEnterprise FROM MaRSDataCatalyst.Reporting.FactRICCompanyData'))
        self.records = []
Example #9
	def get_table_seed(self, table, id_column):
		"""Return the current maximum value of id_column in table, or 0 if the table is empty."""
		seed = 0
		sql_dc = sql.sql_get_max_id.value.format(id_column, table)
		df = db.pandas_read(sql_dc)
		if len(df) > 0:
			seed = df.values[0][0]
		return seed
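
Example #1 shows the intended use: the seed feeds the next surrogate key. In short:

# As in Example #1 (`batch` is an instance of the class that owns get_table_seed):
new_com_id = batch.get_table_seed('MaRSDataCatalyst.Reporting.DimCompany', 'CompanyID') + 1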
Example #10
def _main_():
    print("Getting SQL query")
    sql = CM.get_config("config_sql.ini", "ann_survey_18", "caprevjob_by_ric")
    print("SQL: {}".format(sql))
    print("Executing SQL to get dataframe of results")
    all_results = DB.pandas_read(sql)

    print("Creating column names")
    all_results['ConcatQ'] = all_results[['Cap/Rev/Emp', 'Question']].apply(lambda x: ' - '.join(x), axis=1)
    print("Splitting dataframe into one per RIC")
    split_frames = partition_by(all_results, "RIC_Program")
    print("Getting write path")
    user_path = os.path.expanduser("~")
    path = user_path + "/Box Sync/Workbench/BAP/Annual Survey FY2018/Results by RIC/"
    print("Path: {}".format(path))

    print("Writing files to disk:")
    for ric in split_frames.keys():
        x = split_frames[ric]
        x['rid_cid'] = x['resp_id'].astype(str) + '_' + x['Company_ID'].astype(str)
        x = spread(x, 'rid_cid', 'ConcatQ', 'Answer')
        x['rid_cid'] = x.index
        x[['_resp_id', '_Company_ID']] = x['rid_cid'].str.split('_', n=1, expand=True)
        x = x.apply(pd.to_numeric, errors='ignore')
        cols = x.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        x = x[cols]
        cols = [c[1:] if str(c).startswith('_') else c for c in cols]
        x.columns = cols
        x = x.drop('rid_cid', axis=1)
        filename = "{} Survey Results".format(ric)
        write_to_xl(x, filename, path, 'Results')
        print("Wrote {} to path: {}".format(filename, path))
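
spread() and write_to_xl() are project helpers not shown in these snippets. For spread(), a plausible minimal sketch, assuming it mirrors tidyr's spread (long-to-wide pivot); the real helper may differ:

import pandas as pd

def spread(df, index_col, key_col, value_col):
    # Sketch only: one column per distinct key value, indexed by index_col.
    # The real helper may resolve duplicate (index, key) pairs differently.
    return df.pivot_table(index=index_col, columns=key_col, values=value_col, aggfunc='first')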
Example #11
	def update_tdw_basic_company(self):
		df = db.pandas_read(sql.sql_tdw_basic_company.value)
		for _, r in df.iterrows():
			basic_name = CM.get_basic_name(r.legal_name)
			sql_update = sql.sql_tdw_basic_company_update.value.format(basic_name, CM.sql_compliant(r.legal_name))
			print(sql_update)
			db.execute(sql_update)
Example #12
	def update_cb_basic_company(self):
		df = db.pandas_read(sql.sql_cb_basic_company.value)
		for _, r in df.iterrows():
			basic_name = CM.get_basic_name(r['name'])
			sql_update = sql.sql_cb_basic_company_update.value.format(basic_name, CM.sql_compliant(r['org_uuid']))
			print(sql_update)
			db.execute(sql_update)
Example #13
	def split_venture_former_name(self):
		df = db.pandas_read('SELECT ID, CompanyName, [Former / Alternate Names] FROM MDCRaw.BAP.VentureQuarterlyData WHERE CompanyName LIKE \'%(%\' AND FiscalYear = 2019')
		for _, row in df.iterrows():
			split = CM.venture_name_with_bracket_split(row['CompanyName'])
			# print('Current: {}\nName: {}\nAlternate: {}'.format(row['CompanyName'],split[0], split[1].replace('(','').replace(')','')))
			# print('-' * 100)
			# The UPDATE is only printed here for review; it is not executed.
			update = '''UPDATE MDCRaw.BAP.VentureQuarterlyData SET CompanyName = \'{}\' , [Former / Alternate Names] = \'{}\' WHERE ID = {} -- {}'''
			print(update.format(split[0], split[1].replace('(','').replace(')','').replace('formerly',''),row['ID'],row['CompanyName']))
Example #14
    def delete_old_ans():
        # delete old ans using answer ids
        #   store old ans in xl file
        old_ans_sql = CM.get_config('config.ini', 'secondary_etl', 'old_ans')
        old_ans_df = DB.pandas_read(old_ans_sql)
        DBInteractions.store_df(old_ans_df, '_OLD_PIPE_ANS')
        #   run sql to delete old ans
        del_old_ans_sql = CM.get_config('config.ini', 'secondary_etl',
                                        'del_old_ans')
        DB.execute(del_old_ans_sql)
Example #15
    def check_qs_exist(self, survey_id):

        sql = CM.get_config("config.ini", "sql_queries",
                            "check_questions_exist")
        sql = sql.replace("WHAT_SURVEY_ID", str(survey_id))
        check = DB.pandas_read(sql)

        return bool(check.iloc[0][0])
Example #16
	def transfer_fact_ric_company_data():
		df = db.pandas_read(sql.sql_bap_fact_ric_data_fyq4.value)
		df_frc = BapQuarterly.get_proper_values(df)
		# BapQuarterly.update_month_year(df_frc)
		# df_frc['IntakeDate'] = pd.to_datetime(df_frc['IntakeDate'])
		df_frc['Age'] = None
		# df_frc['Date of Incorporation'] = pd.to_datetime(df_frc['Date of Incorporation'])
		# df_ric = df_frc.drop(columns=['ID', 'Incorporate year (YYYY)', 'Incorporation month (MM)'])
		# BapQuarterly.file.save_as_csv(df_frc, '00 FactRICCompany.xlsx', os.getcwd(), 'FactRICCompany')
		values_list = COM.df_list(df_frc)

		db.bulk_insert(sql.sql_bap_fact_ric_company_insert.value, values_list)
Example #17
    def update(self, table_name, source_id_col, company_id_col):
        etl = common.df_list(
            db.pandas_read('SELECT ' + source_id_col + ',' + company_id_col +
                           ' FROM ' + table_name))
        for index1, val1 in enumerate(self.source_table):
            for index2, val2 in enumerate(etl):
                if val1[0] == str(val2[0]):
                    db.execute('UPDATE ' + table_name + ' SET ' +
                               company_id_col + ' = ' + str(val1[1]) +
                               ' WHERE ' + source_id_col + ' = ' +
                               str(val2[0]))
                    break
Example #18
    def get_campaigns(self, api_token, survey_id, session_variables,
                      surveys_df):

        if survey_id == 'w':
            while type(survey_id) != int:
                try:
                    survey_id = int(
                        input(
                            "Enter ID of survey that you would like to retrieve campaign data for: "
                        ))
                    if self.return_to_main(survey_id) == 1:
                        return
                    survey_id = self.validate_survey_id(
                        survey_id, session_variables, api_token, surveys_df)
                except ValueError:
                    continue

        campaigns_df = sg_campaign.sg_campaigns_df(survey_id, api_token)
        print(campaigns_df)
        campaigns_df["id"] = campaigns_df["id"].apply(pd.to_numeric,
                                                      errors='ignore')

        # remove campaigns from df that are already in DB
        c_sql = CM.get_config("config.ini", "sql_queries",
                              "campaigns_for_survey")
        c_sql = c_sql.replace("WHAT_SURVEY_ID", str(survey_id))
        db_cmpgns = DB.pandas_read(c_sql)
        if db_cmpgns is not None:
            db_cmpgns = db_cmpgns.apply(pd.to_numeric, errors='ignore')

        cmpgns_not_in_db = pd.merge(campaigns_df,
                                    db_cmpgns,
                                    how='left',
                                    indicator=True,
                                    on="id")
        cmpgns_not_in_db2 = cmpgns_not_in_db[cmpgns_not_in_db['_merge'] ==
                                             'left_only'].drop("_merge",
                                                               axis=1)
        # cmpgns_not_in_db2 = cmpgns_not_in_db2.apply(pd.to_numeric, errors='ignore')

        # insert campaigns into DB
        if len(cmpgns_not_in_db2) > 0:
            insert_cmpgns_sql = "insert_campaigns"

            self.df_to_db(cmpgns_not_in_db2,
                          insert_cmpgns_sql,
                          remove_single_quotes=False,
                          clean_numeric_cols=True)

        return campaigns_df
Example #19
    def nomatch_create_new(self):
        """Add non-duplicate ventures that are new companies (negative ID) as new ventures to the Venture table"""
        new_ventures = common.df_list(
            db.pandas_read(
                "SELECT * FROM MDC_DEV.dbo.ProcessedVenture AS a WHERE a.ID NOT IN "
                "(SELECT ID FROM MDC_DEV.dbo.EntityMap) AND a.ID < 0 "))
        sql = 'INSERT INTO MDC_DEV.dbo.Venture VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
        db.bulk_insert(sql, new_ventures)

        # Update ID to match Venture Table in the given source table
        if self.source_table is not None:
            sql = 'UPDATE ' + self.source_table + ' SET ID = b.ID FROM ' + self.source_table + ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name'
            db.execute(sql)
Example #20
def partition_by(df, col_name):
    """ Splits df into multiple dfs, one per distinct RIC_Program value;
    the distinct values are fetched from the DB and matched against col_name.
    # df, str -> dict
    """
    sql = CM.get_config("config_sql.ini", "ann_survey_18", "distinct_RICs")
    split_by = DB.pandas_read(sql)
    split_by = split_by['RIC_Program'].tolist()
    frame_dict = {elem: '' for elem in split_by}

    for key in frame_dict.keys():
        query = '{} == \"{}\"'.format(str(col_name), str(key))
        frame_dict[key] = df.query(query)

    return frame_dict
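
Example #10 shows the call in context; in short:

# One dataframe per RIC program, as used in Example #10.
split_frames = partition_by(all_results, "RIC_Program")
for ric, frame in split_frames.items():
    print(ric, len(frame))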
Example #21
    def remove_discovered_matches():
        """Consult DuplicateVenture and remove already-discovered duplicate clusters from EntityMap"""
        entity_map = db.pandas_read("SELECT * FROM MDC_DEV.dbo.EntityMap ORDER BY CanonID").set_index('ID').to_dict('index')
        clusters = []
        for index1, val1 in entity_map.items():
            for index2, val2 in entity_map.items():
                if val1['CanonID'] == val2['CanonID'] and index1 != index2:
                    clusters.append([index1, index2])
                    break

        d_matches = db.pandas_read("SELECT * FROM MDC_DEV.dbo.DuplicateVenture").to_dict('index')
        remove = []
        for cluster in clusters:
            for i, v in d_matches.items():
                if (cluster[0] == v['CompanyID'] and cluster[1] == v['DuplicateCompanyID']) or \
                        (cluster[1] == v['CompanyID'] and cluster[0] == v['DuplicateCompanyID']):
                    remove.append([cluster[0]])
                    remove.append([cluster[1]])
                    break
        sql = 'DELETE FROM MDC_DEV.dbo.EntityMap WHERE ID = (?)'
        db.bulk_insert(sql, remove)
Example #22
    def get_load_order(self, schema):
        """
        Takes name of schema, returns DataFrame
        with load order in first column.
        :param schema:
        :return DataFrame:
        """

        schema_str = "'" + str(schema) + "'"
        sql_str = CM.get_config("config.ini", "dependency_query",
                                "load_order_query")
        sql_str = sql_str.replace("WHAT_SCHEMA", schema_str)
        load_order = DAL.pandas_read(sql_str)
        return load_order
Example #23
    def deduper_setup(self, settings_file, training_file, field_list, selection, sample):
        """
        Trains a new deduper when no settings file exists; otherwise loads the saved settings.
        :param settings_file: settings file name
        :param training_file: training file name
        :param field_list: list of lists (field (string), comparator (string), has missing? (bool))
        :param selection: sql statement selecting all relevant columns to use in deduplication
        :param sample: sample size of data to be used for training
        :return: None (sets self.deduper)
        """
        if os.path.exists(settings_file):
            print('Reading from ', settings_file)
            with open(settings_file, 'rb') as sf:
                self.deduper = dedupe.StaticDedupe(sf, num_cores=4)
        else:
            # Define the fields dedupe will pay attention to
            fields = []
            for field in field_list:
                fields.append({'field': field[0], 'type': field[1], 'has missing': field[2]})

            # Create a new deduper object and pass our data model to it.
            self.deduper = dedupe.Dedupe(fields, num_cores=4)

            data = db.pandas_read(selection).to_dict('index')

            print('Collecting sample data for active learning... this may take a while.')
            self.deduper.sample(data, sample)

            if os.path.exists(training_file):
                print('Reading labeled examples from ', training_file)
                with open(training_file) as tf:
                    self.deduper.readTraining(tf)

            print('Starting active labeling...')
            dedupe.convenience.consoleLabel(self.deduper)

            # When finished, save our labeled, training pairs to disk
            with open(training_file, 'w') as tf:
                self.deduper.writeTraining(tf)

            # `recall` is the proportion of true dupe pairs that the learned
            # rules must cover. You may want to reduce this if you are making
            # too many blocks and too many comparisons.
            self.deduper.train(recall=0.90)

            with open(settings_file, 'wb') as sf:
                self.deduper.writeSettings(sf)

            self.deduper.cleanupTraining()
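
An illustration of the field_list shape this method consumes; the field names echo the venture queries above, while the comparator choices are assumptions ('String' and 'Exact' are standard dedupe field types):

# Illustrative only: (field, comparator, has-missing?) triples.
field_list = [
    ['Name', 'String', False],
    ['Website', 'String', True],
    ['Phone', 'Exact', True],
]
# This is the dict form the method builds internally from each triple:
fields = [{'field': f[0], 'type': f[1], 'has missing': f[2]} for f in field_list]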
Example #24
    def get_dependencies(self, schema):
        """
        Takes the name of a schema and queries the DB for table dependencies.
        The returned df has 2 cols:
            Col 1. Table with a FK
            Col 2. Table referenced by the FK in Col 1.
        :param schema:
        :return dataframe:
        """

        schema_str = "'" + str(schema) + "'"
        sql_str = CM.get_config("config.ini", "dependency_query", "query")
        dependency_sql = sql_str + schema_str
        dependencies = DAL.pandas_read(dependency_sql)
        dependencies.columns = ["FKTable", "ReferencedTable"]

        return dependencies
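
get_load_order (Example #22) delegates the ordering to a SQL query. Purely as an illustration, a comparable order can be derived from this dependencies frame with a small topological sort; this is a sketch, not the project's method:

from collections import defaultdict

def load_order_from_dependencies(dependencies, all_tables):
    # Sketch: order tables so every ReferencedTable loads before its FKTable.
    refs = defaultdict(set)  # table -> tables it references
    for _, row in dependencies.iterrows():
        refs[row['FKTable']].add(row['ReferencedTable'])
    ordered, placed = [], set()
    while len(ordered) < len(all_tables):
        progressed = False
        for t in all_tables:
            if t not in placed and refs[t] <= placed:
                ordered.append(t)
                placed.add(t)
                progressed = True
        if not progressed:
            raise ValueError('circular dependency among tables')
    return ordered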
Example #25
    def clustering(self):
        """
        Cluster potential record matches using self.deduper.
        :return: list of clustered tuples ((ID1, ID2, ...), (confidence_score1, confidence_score2, ...))
        """
        entity_dict = db.pandas_read("SELECT b.ID, b.Name, b.AlternateName, b.BatchID, b.DateFounded, "
                                     "b.DateOfIncorporation, b.Description, b.Website, b.Email, b.Phone, "
                                     "b.Address, a.BlockKey, a.SmallerKeys FROM MDC_DEV.dbo.SmallerCoverage "
                                     "AS a INNER JOIN MDC_DEV.dbo.ProcessedVenture AS b "
                                     "ON a.ID = b.ID ORDER BY a.BlockKey").to_dict('index')

        print('Clustering...')
        clustered_dupes = self.deduper.matchBlocks(self.candidates_gen(entity_dict),
                                                   threshold=0.5)

        # matchBlocks returns a generator. Turn it into a list
        return list(clustered_dupes)
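
candidates_gen is not shown in these snippets. A plausible sketch, following the convention in dedupe's database examples of turning BlockKey-ordered rows into blocks for matchBlocks; the record layout and the comma-separated SmallerKeys format are assumptions:

from itertools import groupby

def candidates_gen(self, result_set):
    # Sketch: yield one block per BlockKey; each entry is (ID, record, smaller_ids).
    rows = sorted(result_set.items(), key=lambda kv: kv[1]['BlockKey'])
    for _, group in groupby(rows, key=lambda kv: kv[1]['BlockKey']):
        block = []
        for record_id, row in group:
            smaller = row.get('SmallerKeys') or ''
            smaller_ids = set(smaller.split(',')) if smaller else set()
            block.append((record_id, row, smaller_ids))
        yield block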
Example #26
    def bap_dataframes(self):
        df = DB.pandas_read(sql.sql_bap_schema_tabel_cols.value)
        if df is not None:
            l_program = []
            l_program_youth = []
            l_company = []
            l_company_annual = []
            self.program_columns = list(
                df[df['TABLE_NAME'] == 'RICProgram']['COLUMN_NAME'][7:])
            self.program_youth_columns = list(
                df[df['TABLE_NAME'] == 'RICProgramYouth']['COLUMN_NAME'][7:])
            self.quarterly_company_columns = list(
                df[df['TABLE_NAME'] == 'VentureQuarterlyData']['COLUMN_NAME'][10:])
            self.annual_company_columns = list(
                df[df['TABLE_NAME'] == 'AnnualCompanyData']['COLUMN_NAME'][8:])
            return l_company, l_company_annual, l_program, l_program_youth
        else:
            return None, None, None, None
Example #27
    def write_survey_entries(self, api_token):

        year, quarter = CM.fiscal_year_quarter()

        api_surveys_df = self.get_surveys(api_token, prin=False)
        api_surveys_df = api_surveys_df.apply(pd.to_numeric, errors='ignore')

        db_surveys_sql = CM.get_config("config.ini", "sql_queries", "surveys")
        db_surveys_df = DB.pandas_read(db_surveys_sql)
        db_surveys_df = db_surveys_df.apply(pd.to_numeric, errors='ignore')

        surveys_not_in_db = pd.merge(api_surveys_df,
                                     db_surveys_df[['id']],
                                     how='outer',
                                     indicator=True,
                                     on="id")
        surveys_not_in_db2 = surveys_not_in_db[surveys_not_in_db['_merge'] ==
                                               'left_only'].drop("_merge",
                                                                 axis=1)

        # write surveys_not_in_db2 to db, one at a time so BatchService can be executed for each one
        for index in range(len(surveys_not_in_db2)):
            row = surveys_not_in_db2.iloc[index][:]
            df = pd.DataFrame([list(row.values)],
                              columns=list(surveys_not_in_db2))

            batch = BatchService()
            x = batch.create_new_batch(datasource=-1,
                                       systemsource=50,
                                       year=year,
                                       quarter=quarter)
            batch_id = x.iloc[-1][0]

            # add batchID to end of df
            df['BatchID'] = int(batch_id)

            self.df_to_db(df, "insert_survey_entry")

Example #28
    def fp_create_new(self):

        new_ventures = common.df_list(
            db.pandas_read(
                "SELECT * FROM MDC_DEV.dbo.ProcessedVenture AS a WHERE a.ID IN "
                "(SELECT ID FROM MDC_DEV.dbo.MatchingFalsePositives) AND a.ID < 0"
            ))
        sql = 'INSERT INTO MDC_DEV.dbo.Venture VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
        db.bulk_insert(sql, new_ventures)

        # Update MatchingFalsePositives with each new venture's new ID
        db.execute(
            "UPDATE MDC_DEV.dbo.MatchingFalsePositives SET ID = a.ID "
            "FROM MDC_DEV.dbo.MatchingFalsePositives AS m INNER JOIN MDC_DEV.dbo.Venture AS a ON m.Name = a.Name"
        )
        db.execute(
            "UPDATE MDC_DEV.dbo.MatchingFalsePositives SET FalseID = a.ID "
            "FROM MDC_DEV.dbo.MatchingFalsePositives AS m INNER JOIN MDC_DEV.dbo.Venture AS a ON m.FalseName = a.Name"
        )

        # Update source table with the new ID
        if self.source_table is not None:
            sql = 'UPDATE ' + self.source_table + ' SET ID = b.ID FROM ' + self.source_table + ' AS a INNER JOIN MDC_DEV.dbo.Venture AS b ON a.Name = b.Name'
            db.execute(sql)
Example #29
    def del_survey_components(self, survey_id):

        del_sql = CM.get_config("config.ini", "sql_queries",
                                "del_all_for_survey")
        del_sql = del_sql.replace("WHAT_SURVEY", str(survey_id))
        DB.execute(del_sql)
        print("\nDeletion attempt was made. Survey components check:")

        comps_dict = {
            "questions": "select_questions",
            "options": "select_options",
            "answers": "select_answers",
            "responses": "select_responses",
            "emails": "select_emails",
            "campaigns": "select_campaigns"
        }

        for component, sql in comps_dict.items():
            sql = CM.get_config("config.ini", "sql_queries",
                                sql).replace("WHAT_SURVEY", str(survey_id))
            df = DB.pandas_read(sql)
            print("\nCount of {}: {}".format(component, len(df)))

        return
Example #30
	def get_existing_company(self):
		df_company = db.pandas_read(self.sql_dim_company.format('Reporting.DimCompany'))
		df_company_source = db.pandas_read(self.sql_dim_company_source.format('Reporting.DimCompanySource'))
		return df_company, df_company_source