def run(self, conn_id='postgres_bills3'):
    """ Returns bill text for train and val datasets """
    train = self.inputs["train"].read()
    val = self.inputs["val"].read()
    train_ids = [str(x) for x in set(train["bill_id"].values)]
    val_ids = [str(x) for x in set(val["bill_id"].values)]
    del train
    del val
    train_sql_query = """
        select bill_id, doc
        from ml_policy_class.bill_texts
        where bill_id in ({}) and type_id = 1
    """.format(", ".join(train_ids))
    val_sql_query = """
        select bill_id, doc
        from ml_policy_class.bill_texts
        where bill_id in ({}) and type_id = 1
    """.format(", ".join(val_ids))
    pg_hook = PostgresHook(postgres_conn_id=conn_id)
    txt_train = pg_hook.get_pandas_df(train_sql_query)
    txt_val = pg_hook.get_pandas_df(val_sql_query)
    self.outputs["txt_train"].write(txt_train)
    self.outputs["txt_val"].write(txt_val)
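# Hedged sketch (not part of the original task): instead of formatting the id lists
# into the SQL text, psycopg2 can bind a Python list with "= ANY(%s)", which avoids
# manual quoting. Table, column and hook names reuse the ones above; this assumes
# bill_id is an integer column.
train_sql_query = """
    select bill_id, doc
    from ml_policy_class.bill_texts
    where bill_id = ANY(%s) and type_id = 1
"""
txt_train = pg_hook.get_pandas_df(train_sql_query,
                                  parameters=([int(x) for x in train_ids],))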
def get_error_dict(redshift_conn_id):
    get_table_name = """
        SELECT DISTINCT perm.name, stl.tbl AS id
        FROM stl_load_errors stl
        LEFT JOIN STV_TBL_PERM perm ON stl.tbl = perm.id
        WHERE perm.name != 'None'
          AND stl.session = (SELECT session FROM stl_load_errors ORDER BY session DESC LIMIT 1)
    """
    redshift = PostgresHook(redshift_conn_id)
    # gets names and table IDs of tables in stl_load_errors table in current redshift session
    table_df = redshift.get_pandas_df(get_table_name)
    print('table_df: ', table_df)
    # creates dictionary of table names and IDs to loop over
    stl_table_dict = dict(zip(table_df['name'].apply(lambda name: name.strip()).values,
                              table_df['id'].values))
    print('Staging tables within stl_load_errors table: ', list(stl_table_dict.keys()))
    return stl_table_dict
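# Hedged usage sketch: get_error_dict pairs naturally with create_stl_table (defined
# further down in this collection). The error-table naming convention below is an
# illustrative assumption, not part of the original code.
def build_error_tables(redshift_conn_id):
    stl_table_dict = get_error_dict(redshift_conn_id)
    error_tables = []
    for table_name, table_id in stl_table_dict.items():
        error_table_name = f"{table_name}_stl_errors"
        error_tables.append(
            create_stl_table(redshift_conn_id, table_name, error_table_name, table_id))
    return error_tables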
def partitions(self):
    schema, table = request.args.get("table").split('.')
    sql = """
        SELECT
            a."PART_NAME",
            a."CREATE_TIME",
            c."LOCATION",
            c."IS_COMPRESSED",
            c."INPUT_FORMAT",
            c."OUTPUT_FORMAT"
        FROM "PARTITIONS" a
        JOIN "TBLS" b ON a."TBL_ID" = b."TBL_ID"
        JOIN "DBS" d ON b."DB_ID" = d."DB_ID"
        JOIN "SDS" c ON a."SD_ID" = c."SD_ID"
        WHERE b."TBL_NAME" like '{table}'
          AND d."NAME" like '{schema}'
        ORDER BY "PART_NAME" DESC
    """.format(**locals())
    h = PostgresHook(METASTORE_POSTGRE_CONN_ID)
    df = h.get_pandas_df(sql)
    return df.to_html(
        classes="table table-striped table-bordered table-hover",
        index=False,
        na_rep='',
    )
def _load_from_database(**context):
    params = context['params']
    postgres_conn_id = params['postgres_conn_id']
    pg_hook = PostgresHook(postgres_conn_id=postgres_conn_id)
    conn = pg_hook.get_conn()
    cur = conn.cursor()
    raw_query = """SELECT * from repositories
                   WHERE processed = %s AND locked = %s
                   LIMIT 1"""
    query = cur.mogrify(raw_query)
    repo_df = pg_hook.get_pandas_df(query, parameters=[False, False])
    if repo_df is None or len(repo_df) != 1:
        log.info("Could not load a valid repository from the database")
        raise AirflowSkipException(
            "Could not load a valid repository from the database")
    repo = repo_df.iloc[0, :]
    repo['repo_id'] = int(repo['repo_id'])
    log.info(
        f'Loaded repository {repo["url"]} with ID {repo["repo_id"]}. '
        f'(stars={repo["stars"]}, size={repo["disk_usage"]})')
    cur = conn.cursor()
    cur.execute("""UPDATE repositories SET locked = TRUE WHERE repo_id = %s""",
                [repo['repo_id']])
    conn.commit()
    log.info(
        f'Acquired lock for repository {repo["url"]} with ID {repo["repo_id"]}')
    task_instance = context['task_instance']
    task_instance.xcom_push('target_repository', repo)
    return True
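# Hedged note as a sketch: cur.mogrify() above is given no parameters, so it performs
# no substitution (and under Python 3 it returns bytes); the %s placeholders are only
# bound when get_pandas_df() forwards `parameters` to the driver. Passing the raw
# query directly is equivalent, assuming the psycopg2 %s paramstyle:
repo_df = pg_hook.get_pandas_df(
    "SELECT * FROM repositories WHERE processed = %s AND locked = %s LIMIT 1",
    parameters=[False, False],
)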
def SOHDailyToS3():
    import airflow.hooks.S3_hook
    from airflow.hooks.postgres_hook import PostgresHook
    import pandas as pd
    from io import StringIO
    import datetime as dt

    postgres_hook = PostgresHook(postgres_conn_id='redshift')
    s3_hook = airflow.hooks.S3_hook.S3Hook('s3connection')
    bucket = 'btq-bi'
    key = 'soh_' + str(dt.datetime.now().strftime('%d%m%y')) + '.csv'
    query = """select sku,config_sku,brand,gender,category1,category2,category3,category4,category_manager,bar_code,boutiqaat_exclusive,
        vendor_item_no, vendor_no,contract_type,payment_term_code,country_code,enable_date,last_selling_price,special_price,
        last_item_cost,last_item_cost_currency,shipping_cost_per_unit,first_grn_date,last_grn_date,total_grn_qty,total_grn_value,
        total_sellable_qty,toal_nav_non_sellable,soh,crs_available,nav2crs_total,full_pending_open_po_qty,partially_pending_open_po_qty,
        partial_pending_open_po_total_qty,partial_pending_open_po_received_qty,sku_avg_cost_2020,
        COALESCE(stock_refreshed_datetime,(select distinct stock_refreshed_datetime from analytics.soh_report where stock_refreshed_datetime is not null limit 1)) as stock_refreshed_datetime,
        report_time::date
        from analytics.soh_report sr;"""
    df_ = postgres_hook.get_pandas_df(query)
    csv_buf = StringIO()
    df_.to_csv(csv_buf, header=True, index=False)
    csv_buf.seek(0)
    filename = csv_buf.getvalue()
    s3_hook.load_string(filename, key, bucket, replace=True)
    return True
def monitor_redshift_table(**op_kwarg):
    """Redshift table monitor collects the following metrics:
    - record count
    - duplicate records
    - Null/NaN record counts in each column
    - mean, median, min, max, std of each numeric column
    """
    hook = PostgresHook(REDSHIFT_CONNECTION_ID)
    data = hook.get_pandas_df(SELECT_DATA, parameters=[REDSHIFT_MONITOR_TABLE_LIMIT])
    log_dataframe(
        "{}".format(REDSHIFT_TABLE),
        data,
        with_histograms=True,
        with_stats=True,
        with_schema=True,
    )
    log_metric("record count", data.shape[0])
    log_metric("Duplicate records", data.shape[0] - data.drop_duplicates().shape[0])
    for column in data.columns:
        log_metric("{} null record count".format(column), int(data[column].isna().sum()))
        if issubdtype(data[column].dtype, number):
            log_metric("{} mean".format(column), round(data[column].mean(), 2))
            log_metric("{} median".format(column), data[column].median())
            log_metric("{} min".format(column), data[column].min())
            log_metric("{} max".format(column), data[column].max())
            log_metric("{} std".format(column), round(data[column].std(), 2))
def execute(self, context):
    query = """
        SELECT playername as name, team as team, LEFT(value,3)::int AS value, points_total
        FROM api.solver_data
        WHERE season = '2018-2019' AND points_total is not null
        ORDER BY points_total
    """
    pg = PostgresHook(postgres_conn_id=self.pg_conn_id)
    df = pg.get_pandas_df(query)
    players = df['name']
    teams = df['team']
    df = df.drop(['name'], axis=1)
    df = df.drop(['team'], axis=1)
    print(df)
    km = KMeans(n_clusters=6, init='k-means++', max_iter=300, n_init=10, random_state=0)
    y_means = km.fit_predict(df)
    df['clusters'] = y_means
    df["name"] = players
    df["team"] = teams
    df.to_csv(self.target_file, sep='|', index=False, header=False)
def monitor_redshift_db(**op_kwarg):
    """Redshift database monitor collects the following metrics:
    - Number of tables in database
    - Shape of each table in the database
    - Min, max, mean, median number of rows across all tables
    - Min, max, mean, median number of columns across all tables
    - Total number of rows and columns
    - Largest tables by row and column
    - Disk capacity, Free space on disk, Used space on disk (in GB)
    - Disk percent usage
    """
    hook = PostgresHook(REDSHIFT_CONN_ID)
    num_redshift_tables = hook.get_first(COUNT_TABLES, parameters=[TARGET_SCHEMA])[0]
    log_metric("table count", num_redshift_tables)

    table_row_counts = hook.get_records(COUNT_TABLE_ROWS, parameters=[TARGET_SCHEMA])
    num_rows_per_table = {}
    for tablename, row_count in table_row_counts:
        num_rows_per_table[tablename] = int(round(row_count))
    row_counts = list(num_rows_per_table.values())
    log_metric("Max table row count", max(row_counts))
    log_metric("Min table row count", min(row_counts))
    log_metric("Mean table row count", round(mean(row_counts), 2))
    log_metric("Median table row count", median(row_counts))

    tables = hook.get_pandas_df(DESCRIBE_TABLES, parameters=[TARGET_SCHEMA])
    table_shapes = DataFrame()
    table_shapes["columns"] = tables.groupby("tablename").nunique("column")["column"]
    table_shapes["tablename"] = tables["tablename"].unique()
    table_shapes["rows"] = (
        table_shapes["tablename"].map(num_rows_per_table).fillna(0).astype(int)
    )
    for _, row in table_shapes.iterrows():
        log_metric("{} shape".format(row["tablename"]), (row["columns"], row["rows"]))
    log_metric("Max table column count", table_shapes["columns"].max())
    log_metric("Min table column count", table_shapes["columns"].min())
    log_metric("Mean table column count", round(table_shapes["columns"].mean(), 2))
    log_metric("Median table column count", table_shapes["columns"].median())
    log_metric("Total columns", table_shapes["columns"].sum())
    log_metric("Total rows", table_shapes["rows"].sum())

    max_row_table = table_shapes[table_shapes["rows"] == table_shapes["rows"].max()]
    max_col_table = table_shapes[
        table_shapes["columns"] == table_shapes["columns"].max()
    ]
    log_metric("Largest table (by row count)", max_row_table["tablename"][0])
    log_metric("Largest table (by col count)", max_col_table["tablename"][0])

    disk_stats = hook.get_records(DISK_USAGE).pop()
    disk_capacity, disk_used, disk_free = disk_stats
    log_metric("Disk capacity (GB)", disk_capacity)
    log_metric("Disk used (GB)", disk_used)
    log_metric("Disk free (GB)", disk_free)
    log_metric("Percent Disk usage", round((disk_used / disk_capacity) * 100, 2))
def execute(self, context):
    connection_info = BaseHook.get_connection(self.redshift_connection_id)
    self.log.info(
        'LoadDimensionOperator.execute: redshift_connection_id={}'.format(
            connection_info))
    pg_hook = PostgresHook(self.redshift_connection_id)
    df = pg_hook.get_pandas_df(self.sql)
    print(df)
    return df
class RunDataCheckOperator(BaseOperator):
    """
    Extension of Postgres Operator to do checks on data
    Checks should return no rows if passing
    """
    template_fields = ('sql', )
    template_ext = ('.sql', )
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self, sql,
                 postgres_conn_id='postgres_default', autocommit=False,
                 parameters=None, database=None, check_name=None,
                 raise_error=False, raise_warning=True,
                 *args, **kwargs):
        super(RunDataCheckOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database
        self.check_name = check_name
        self.raise_error = raise_error
        self.raise_warning = raise_warning

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        df = self.hook.get_pandas_df(self.sql, parameters=self.parameters)
        df_string = df.to_string(index=False, header=False)
        for output in self.hook.conn.notices:
            self.log.info(output)
        msg = None
        if len(df) > 0:
            logging.info(
                "Something is wrong with the data, checks return zero rows if everything is ok"
            )
            if self.raise_error:
                raise RuntimeError(
                    f"Check *{self.check_name}* has failed for the following dates:\n```\n{df_string}\n```"
                )
            elif self.raise_warning:
                msg = f"\n@here - :red-cross: Check *{self.check_name}* has failed for the following dates: :red-cross:\n```\n{df_string}\n```"
            else:
                msg = f"\n*{self.check_name}*\n```\n{df_string}\n```"
        else:
            msg = f"\nCheck *{self.check_name}* passed :tick:"
        return msg
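# Hedged usage sketch for RunDataCheckOperator: the DAG, connection id and .sql file
# below are illustrative assumptions, not part of the original code. Because
# template_ext includes '.sql', the sql argument may point at a templated file.
check_orders = RunDataCheckOperator(
    task_id="check_orders_not_empty",
    sql="checks/orders_not_empty.sql",
    postgres_conn_id="analytics_db",
    check_name="orders_not_empty",
    raise_error=False,
    raise_warning=True,
    dag=dag,
)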
def _load_from_database(**context):
    params = context['params']
    postgres_conn_id = params['postgres_conn_id']
    pg_hook = PostgresHook(postgres_conn_id=postgres_conn_id)
    table_name = 'repositories'
    constraint_column = 'processed'
    query = f"""SELECT * from {table_name} WHERE {constraint_column} = %s LIMIT 20"""
    repos = pg_hook.get_pandas_df(query, parameters=[False])
    task_instance = context['task_instance']
    task_instance.xcom_push('repositories', repos)
    return True
def run(self, conn_id='postgres_bills3'):
    """
    Fetches data from the Postgres schema defined as an Airflow hook.

    :param conn_id: schema to fetch from
    :return: None; the dataframe containing the query results is written to
        the "dataframe" output
    """
    sql_loc = self.inputs["sql_loc"]
    pg_hook = PostgresHook(postgres_conn_id=conn_id)
    with open(sql_loc, "r") as sql_file:
        data_out = pg_hook.get_pandas_df(sql_file.read())
    self.outputs["dataframe"].write(data_out)
def execute(self, context):
    source_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    destination_hook = S3Hook(s3_conn_id=self.s3_conn_id)
    df = source_hook.get_pandas_df("SELECT * FROM {}".format(self.table))
    # not suitable for large files
    destination_hook.load_string(
        df.to_csv(None, index=False),
        key=self.s3_key,
        bucket_name=self.s3_bucket,
        replace=True,
    )
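# Hedged alternative sketch for the "not suitable for large files" case above: stream
# the table in chunks with pandas.read_sql and upload one CSV part per chunk. The
# helper name, chunk size and key layout are illustrative assumptions.
import pandas as pd

def export_table_in_chunks(source_hook, destination_hook, table,
                           s3_bucket, s3_key, chunksize=50000):
    conn = source_hook.get_conn()
    # pandas.read_sql returns an iterator of DataFrames when chunksize is given
    for i, chunk in enumerate(pd.read_sql(f"SELECT * FROM {table}", conn, chunksize=chunksize)):
        destination_hook.load_string(
            chunk.to_csv(index=False),
            key=f"{s3_key}.part{i:05d}.csv",
            bucket_name=s3_bucket,
            replace=True,
        )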
def execute(self, context):
    self.log.info('Running data quality checks')
    redshift_conn = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    test_pairs = zip(self.sql_queries, self.test_results)
    for query, test_fn in test_pairs:
        self.log.debug(f'Run data quality query: {query}')
        result = redshift_conn.get_pandas_df(query)
        self.log.debug(f'Result: {result}')
        if test_fn(result):
            self.log.info('Data quality check passed.')
        else:
            self.log.info('Data quality check failed.')
            raise AssertionError('Data quality check failed.')
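# Hedged usage sketch: sql_queries and test_results are treated above as parallel
# lists, each test being a callable applied to the query result DataFrame. The
# operator class name and DAG wiring below are illustrative assumptions.
data_quality_check = DataQualityOperator(
    task_id="check_songplays_not_empty",
    redshift_conn_id="redshift",
    sql_queries=["SELECT COUNT(*) AS cnt FROM songplays"],
    test_results=[lambda df: df.loc[0, "cnt"] > 0],
    dag=dag,
)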
def _load_from_database(**context):
    params = context['params']
    postgres_conn_id = params['postgres_conn_id']
    pg_hook = PostgresHook(postgres_conn_id=postgres_conn_id)
    cur = pg_hook.get_cursor()
    table_name = 'repositories'
    constraint_col0 = 'processed'
    constraint_col1 = 'contains_logging'
    raw_query = f"""SELECT * from {table_name}
                    WHERE {constraint_col0} = %s AND {constraint_col1} = %s
                    LIMIT 20"""
    # mogrify() is given no values here, so the %s placeholders are left for
    # the `parameters` argument below to bind
    query = cur.mogrify(raw_query)
    repos = pg_hook.get_pandas_df(query, parameters=[True, True])
    task_instance = context['task_instance']
    task_instance.xcom_push('target_repositories', repos)
    return True
def execute(self, context):
    hook = PostgresHook(postgres_conn_id=self.conn_id)
    df = hook.get_pandas_df(sql='select * from atp_matches_log;')
    df_load, key = self.dim_mapping[self.table](df)
    prim_key = ', '.join(key)
    print(df.shape)
    print(df.columns)
    df_load = df_load.where((pd.notnull(df_load)), None)
    schema = ','.join(df_load.columns)
    for i, row in df_load.iterrows():
        insert_update_query = self.insert_update_pd(row, self.table, schema, prim_key)
        try:
            hook.run(insert_update_query)
        except Exception as e:
            print(e)
    self.log.info(f"{self.table} loaded successfully")
def index(self):
    sql = """
        SELECT
            a."NAME" as db,
            "DB_LOCATION_URI" as location,
            count(1) as object_count,
            a."DESC" as description
        FROM "DBS" a
        JOIN "TBLS" b ON a."DB_ID" = b."DB_ID"
        GROUP BY a."NAME", "DB_LOCATION_URI", a."DESC"
    """.format(**locals())
    h = PostgresHook(METASTORE_POSTGRE_CONN_ID)
    df = h.get_pandas_df(sql)
    df.db = ('<a href="/admin/metastorebrowserview/db/?db=' +
             df.db + '">' + df.db + '</a>')
    table = df.to_html(
        classes="table table-striped table-bordered table-hover",
        index=False,
        escape=False,
        na_rep='',
    )
    return self.render("metastore_browser/dbs.html", table=table)
def setup_new(*args, **kwargs):
    sql = '''select * from measurement where series_id=10261;'''
    db_url = kwargs['db_url']
    engine = create_engine(db_url, echo=True)
    conn = engine.connect()
    conn.execute('CREATE table IF NOT EXISTS data(x float, value float)')
    conn.execute('DELETE FROM data')
    try:
        pg = PostgresHook(postgres_conn_id='openaq-db')
        df = pg.get_pandas_df(sql, parameters=None)
        print(f'got the df: {df}')
        print(f'{df.columns}')
        for x, y in df['value'].iteritems():
            conn.execute(f'INSERT into data(x, value) values({x}, {y})')
        conn.close()
    except Exception:
        # a bare `except:` would also swallow KeyboardInterrupt/SystemExit
        logging.error(
            'Remote database not defined. Use [openaq-db] connection')
    return None
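# Hedged alternative sketch: the row-by-row INSERTs above can be replaced by a single
# bulk load with pandas.DataFrame.to_sql, reusing the `df` and `engine` objects from
# the function (the x column is taken from the series index, as in the loop above).
data_to_load = df['value'].rename_axis('x').reset_index()
data_to_load.to_sql('data', engine, if_exists='append', index=False)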
def execute(self, context):
    # AWS Hook
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    # RedShift Hook
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    # Get number of records in the table
    records = redshift.get_records(
        f"SELECT COUNT(*) FROM {self.table_name}")
    # Fields and data
    df = redshift.get_pandas_df(self.sql)
    fields = list(df.columns.values)
    data_rows = redshift.get_records(self.sql)
    if self.load_mode == "clean":
        # Clear data
        self.log.info(f"Clearing data from {self.table_name} table")
        redshift.run("DELETE FROM {}".format(self.table_name))
        self.log.info(
            f"Deleted {records[0][0]} records from {self.table_name}")
    else:
        job_execution_ts = self.filter_key[0].format(**context)
        next_job_execution_ts = self.filter_key[1].format(**context)
        filtered_df = df[(df['start_time'] >= job_execution_ts) &
                         (df['start_time'] < next_job_execution_ts)]
        data_rows = [tuple(x) for x in filtered_df.values]
    # Populate table
    self.log.info("Populating data to {} table".format(self.table_name))
    redshift.insert_rows(table=self.table_name,
                         rows=data_rows,
                         target_fields=fields,
                         commit_every=1000,
                         replace=False)
    self.log.info("Inserted {} records to {}".format(
        len(data_rows), self.table_name))
rows = metadb_hook.get_pandas_df("""
    select
        t2.type source_type,
        t2.name source_name,
        t2.host source_host,
        t2.port source_port,
        t2.db_name source_db_name,
        t2.db_user source_db_user,
        t2.db_psw source_db_psw,
        t1.id job_id,
        t1.job_num job_num,
        t1.job_name job_name,
        t1.description job_desc,
        t1.layer job_layer,
        t1.sql_text job_sql_text,
        t4.table_name source_table_name,
        t4.id_column source_tbl_id_col,
        t1.target_table target_table_name,
        t1.dependent_jobs dependent_jobs,
        t5.id schedule_id,
        t5.schedule_name schedule_name,
        t5.schedule_interval schedule_interval,
        t1.job_type job_type,
        t1.job_submit_args job_submit_args,
        t1.owner as owner
    from metadata.job t1
    join metadata.datasource t2 on t1.source_id = t2.id and t2.del = false
    left join metadata.ods_table t4 on t1.source_table_id = t4.id and t4.del = false
    join metadata.schedule t5 on t1.schedule_id = t5.id and t5.del = false
    where t1.del = false
      and t1.is_valid = true
      and t1.layer in ('DW', 'ADS')
""")
def compute_similarity_score(*args, **kwargs): dw1 = PostgresHook(postgres_conn_id='dw1_etl') job_post_sql = """ SELECT id, title FROM job_postings WHERE is_deleted = False AND( description IS NOT NULL OR title IS NOT NULL ) ORDER BY id """ job_post_df = dw1.get_pandas_df(job_post_sql) work_experience_sql = """ SELECT id, user_id, job_title, summary FROM users_work_experiences WHERE is_deleted = False AND( job_title IS NOT NULL OR summary IS NOT NULL ) ORDER BY id """ work_experience_df = dw1.get_pandas_df(work_experience_sql) job_post_title_features_df = \ pd.read_csv(OUTPUT_DIR + 'bow_job_post_title_features.csv', header=0) work_experience_title_features_df = \ pd.read_csv(OUTPUT_DIR + 'bow_work_experience_title_features.csv', header=0) job_post_skill_features_df = \ pd.read_csv(OUTPUT_DIR + 'bow_job_post_skill_features.csv', header=0) work_experience_skill_features_df = \ pd.read_csv(OUTPUT_DIR + 'bow_work_experience_skill_features.csv', header=0) datavecsavg_tfidf_job_post_df = \ pd.read_csv(OUTPUT_DIR + 'datavecsavg_tfidf_job_post_features.csv', header=0) datavecsavg_tfidf_work_experience_df = \ pd.read_csv(OUTPUT_DIR + 'datavecsavg_tfidf_work_experience_features.csv', header=0) combine_job_post_features_df = \ pd.merge(datavecsavg_tfidf_job_post_df, job_post_title_features_df, how='outer', on='job_postings_id') combine_job_post_features_df = \ pd.merge(combine_job_post_features_df, job_post_skill_features_df, how='outer', on='job_postings_id') combine_job_post_features_df = \ pd.merge(combine_job_post_features_df, job_post_df, left_on='job_postings_id', right_on='id') # extract the new job post features first before dropping unused columns to_update_job_df = pd.read_csv(OUTPUT_DIR + 'to_update_job_posts.csv', header=0) to_add_job_df = pd.read_csv(OUTPUT_DIR + 'to_add_job_posts.csv', header=0) to_update_job_df = pd.merge(to_update_job_df, to_add_job_df, on='job_postings_id', how='outer') combine_new_job_post_features_df = \ combine_job_post_features_df[ combine_job_post_features_df[ 'job_postings_id'].isin(to_update_job_df['job_postings_id'])] selected_new_post_id = combine_new_job_post_features_df['job_postings_id'] combine_new_job_post_features_df = \ combine_new_job_post_features_df.drop(['job_postings_id', 'id', 'title'], axis=1) combine_new_job_post_features_df = \ combine_new_job_post_features_df.fillna(0) datavecsavg_tfidf_new_job_post_stack = \ combine_new_job_post_features_df.as_matrix() combine_old_job_post_features_df = \ combine_job_post_features_df[ ~combine_job_post_features_df[ 'job_postings_id'].isin(to_update_job_df['job_postings_id'])] selected_old_post_id = combine_old_job_post_features_df['job_postings_id'] combine_old_job_post_features_df = \ combine_old_job_post_features_df.drop(['job_postings_id', 'id', 'title'], axis=1) combine_old_job_post_features_df = \ combine_old_job_post_features_df.fillna(0) datavecsavg_tfidf_old_job_post_stack = \ combine_old_job_post_features_df.as_matrix() del combine_job_post_features_df combine_work_experience_features_df = \ pd.merge(datavecsavg_tfidf_work_experience_df, work_experience_title_features_df, how='outer', on='id') combine_work_experience_features_df = \ pd.merge(combine_work_experience_features_df, work_experience_skill_features_df, how='outer', on='id') combine_work_experience_features_df = \ pd.merge(combine_work_experience_features_df, work_experience_df, left_on='id', right_on='id') to_update_work_experience_df = \ pd.read_csv(OUTPUT_DIR + 'to_update_work_experiences.csv', header=0) to_add_work_experience_df = \ 
pd.read_csv(OUTPUT_DIR + 'to_add_work_experiences.csv', header=0) to_update_work_experience_df = pd.merge(to_update_work_experience_df, to_add_work_experience_df, on='work_experience_id', how='outer') combine_new_work_experience_features_df = \ combine_work_experience_features_df[ combine_work_experience_features_df['id'] .isin(to_update_work_experience_df['work_experience_id'])] selected_work_experience_id = combine_work_experience_features_df['id'] selected_work_experience_user_id = \ combine_work_experience_features_df['user_id'] combine_work_experience_features_df = \ combine_work_experience_features_df.drop(['id', 'user_id', 'job_title', 'summary'], axis=1) combine_work_experience_features_df = \ combine_work_experience_features_df.fillna(0) datavecsavg_tfidf_work_experience_stack = \ combine_work_experience_features_df.as_matrix() selected_new_work_experience_id = \ combine_new_work_experience_features_df['id'] selected_new_work_experience_user_id = \ combine_new_work_experience_features_df['user_id'] combine_new_work_experience_features_df = \ combine_new_work_experience_features_df.drop(['id', 'user_id', 'job_title', 'summary'], axis=1) combine_new_work_experience_features_df = \ combine_new_work_experience_features_df.fillna(0) datavecsavg_tfidf_new_work_experience_stack = \ combine_new_work_experience_features_df.as_matrix() # compute cosine similarity scores_all_work_exp_new_job = [] if datavecsavg_tfidf_new_job_post_stack.shape[0] > 0: scores_matrix_all_work_exp_new_job = \ pairwise.cosine_similarity(datavecsavg_tfidf_work_experience_stack, datavecsavg_tfidf_new_job_post_stack) selected_indices = \ np.where(np.round(scores_matrix_all_work_exp_new_job, 2) > 0.01) for i in np.arange(len(selected_indices[0])): if i % 1000000 == 0: logging.info('Processed %s work experiences vs job post' % (i)) work_id = selected_indices[0][i] job_id = selected_indices[1][i] v = scores_matrix_all_work_exp_new_job[work_id, job_id] scores_all_work_exp_new_job\ .append((selected_work_experience_id[work_id], selected_work_experience_user_id[work_id], selected_new_post_id.iloc[job_id], v)) scores_all_work_exp_new_job_df = \ pd.DataFrame.from_records(scores_all_work_exp_new_job, columns=['work_experience_id', 'user_id', 'similar_job_postings_id', 'score']) scores_new_work_exp_old_job = [] if datavecsavg_tfidf_new_work_experience_stack.shape[0] > 0: scores_matrix_new_work_exp_old_job = \ pairwise.cosine_similarity(datavecsavg_tfidf_new_work_experience_stack, datavecsavg_tfidf_old_job_post_stack) selected_indices = \ np.where(np.round(scores_matrix_new_work_exp_old_job, 2) > 0.01) for i in np.arange(len(selected_indices[0])): if i % 1000000 == 0: logging.info('Processed %s work experiences vs job posts' % (i)) work_id = selected_indices[0][i] job_id = selected_indices[1][i] v = scores_matrix_new_work_exp_old_job[work_id, job_id] scores_new_work_exp_old_job\ .append((selected_new_work_experience_id.iloc[work_id], selected_new_work_experience_user_id.iloc[work_id], selected_old_post_id.iloc[job_id], v)) scores_new_work_exp_old_job_df = \ pd.DataFrame.from_records(scores_new_work_exp_old_job, columns=['work_experience_id', 'user_id', 'similar_job_postings_id', 'score']) scores_df = pd.concat([scores_all_work_exp_new_job_df, scores_new_work_exp_old_job_df]) scores_df['model_id'] = pd.Series(6, index=scores_df.index) output_filename = OUTPUT_DIR + 'scores_work_experience_to_job_posts.csv' scores_df.to_csv(output_filename, index=False, encoding='utf-8')
def get_data(table_name, filepath, **kwargs):
    hook = PostgresHook(postgres_conn_id=POSTGRES_CONN)
    df = hook.get_pandas_df(sql=f"SELECT * FROM {table_name}")
    if not len(df):
        raise ValueError(f"Table {table_name} returned no rows")
    df.to_csv(filepath, index=False)
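# Hedged usage sketch: wiring get_data into a PythonOperator (import path, task id,
# table and file path are illustrative and depend on the Airflow version in use).
from airflow.operators.python import PythonOperator

extract_events = PythonOperator(
    task_id="extract_events",
    python_callable=get_data,
    op_kwargs={"table_name": "events", "filepath": "/tmp/events.csv"},
    dag=dag,
)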
class KModeSurveyRecOperator(BaseOperator): """ This function calculates the centroid based on the data given using the K-mode method. https://arxiv.org/ftp/cs/papers/0603/0603120.pdf Param rs_conn_id: Connection ID to Redshift Param rs_table: Table to get data from Param features: Column to be included as features Param n_cluster: The clusters to classify users into Param n_iter: The number of iterations to produce the centroid Param init_method: The algorithm to use - Cao Method: Cao et al. [2009] OR Huang Method: Huang [1997] This function returns the Centroid of the survey response and the dataframe with predictions """ @apply_defaults def __init__(self, cluster_name, rs_conn_id, rs_table, rs_schema="public", features=[ 'age', 'gender', 'weight', 'existing_conditions', 'light_exercise' ], n_cluster=5, n_iter=5, init_method="Cao", *args, **kwargs): super(KModeSurveyRecOperator, self).__init__(*args, **kwargs) self.cluster_name = cluster_name self.rs_conn_id = rs_conn_id self.rs_table = rs_table self.rs_schema = rs_schema self.features = features self.n_cluster = n_cluster self.n_iter = n_iter self.init_method = init_method def kmode_calculation(self, data): """ This function calculates the centroid using the k-mode algorithm. This functiontakes in the cleaned data and returns: - Column element mapping dictionary - Centroids - The output data with classification """ col_dict = {} for col in data.columns: data[col] = data[col].astype('category') col_dict.update({col: dict(enumerate(data[col].cat.categories))}) # Get all the cols in the DataFrame cols = [col for col in data.columns] # Transform all values into categorical and numerical values for col in cols: data[col] = data[col].astype('category') data[col] = data[col].cat.codes # Run k-modes using the algorithm kmodes_method = KModes(n_clusters=self.n_cluster, init=self.init_method, n_init=self.n_iter, verbose=1) kmode_result = kmodes_method.fit_predict(data[cols]) # Attach the output label for each data point data['classification'] = pd.Series(kmode_result, index=data.index) return col_dict, kmodes_method.cluster_centroids_, data def get_rs_cols(self): """ This function will get all of the columns into a list generated from the questions in the last 6 days. 
""" query = """ SELECT DISTINCT question FROM survey_response WHERE 1=1 AND response_time > (current_timestamp - interval '6 day') """.format(schema=self.rs_schema, table=self.rs_table) # Establish connection to Redshift self.rs_hook = PostgresHook(postgres_conn_id=self.rs_conn_id) # Get the cols in a list df = self.rs_hook.get_pandas_df(query) # Convert into list cols_list = df['question'].values.T.tolist() return cols_list def get_rs_query(self, cols_list): """ This function will generate, using the column information to get all of the data """ rs_query = """ SELECT user_id """ for question in cols_list: if question in self.features: rs_query += """ ,COALESCE(CASE WHEN question = '{question}' THEN regexp_replace(response, '\\[|\\]|"', '') END, 'unspecified') AS {question_cleaned} """.format(question=question, question_cleaned=question.replace(" ", "_")) rs_query += """ FROM {schema}.{table} WHERE 1=1 AND response_time > (current_timestamp - interval '7 day') AND user_id IS NOT NULL """.format(schema=self.rs_schema, table=self.rs_table) return rs_query def get_rs_data(self, query): """ This function returns the survey data in the dataframe format """ # Establish connection to Redshift self.rs_hook = PostgresHook(postgres_conn_id=self.rs_conn_id) # Get the data in dataframe survey_df = self.rs_hook.get_pandas_df(query) return survey_df def rs_execute(self, rs_query): """ This function executes the query passed in. """ logging.info("Connecting to Redshift.") rs_conn = PostgresHook(self.rs_conn_id) logging.info("Connection Successful. Executing query.") if rs_query: rs_conn.run(rs_query, False) logging.info("Query Execution Complete.") else: logging.info("No Query to Execute") def dict_to_sql(self, dict_obj): create_query = """ CREATE TABLE IF NOT EXISTS {schema}.{table}_{name}_dict ( question VARCHAR(64), value VARCHAR(128), value_mapping int); TRUNCATE {schema}.{table}_{name}_dict; """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) insert_query = """ INSERT INTO {schema}.{table}_{name}_dict VALUES """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) for question, sub_dict in dict_obj.items(): for value_mapping, value in sub_dict.items(): insert_query += """ ('{question}','{value}',{value_mapping}), """.format(question=question, value=value, value_mapping=value_mapping) insert_query = insert_query.strip()[:-1] + ';' return create_query, insert_query def col_mapping(self, dataframe, col_dict): for key, value in col_dict.items(): value.update({-1: 'null'}) dataframe[key].replace(value, inplace=True) return dataframe def df_to_sql(self, dataframe): # Generate Create Query rs_df_create_query = """ CREATE TABLE IF NOT EXISTS {schema}.{table}_{name}_cluster ( """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) for column in dataframe.columns: rs_df_create_query += """ {column} VARCHAR(128), """.format(column=column) rs_df_create_query = rs_df_create_query.strip()[:-1] + ');' rs_df_create_query += """ TRUNCATE {schema}.{table}_{name}_cluster; """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) # Generate Insert Query rs_df_insert_query = """ INSERT INTO {schema}.{table}_{name}_cluster VALUES """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) rs_insert_list = [] for index, row in dataframe.iterrows(): rs_insert_list.append( [dataframe[column][index] for column in dataframe.columns]) for row in range(len(rs_insert_list)): if row % 500 == 0 and row != 0: 
rs_df_insert_query = rs_df_insert_query[:-1] + ';' rs_df_insert_query += """ INSERT INTO {schema}.{table}_{name}_cluster VALUES """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) rs_df_insert_query += str(rs_insert_list[row]).replace( "[", "(").replace("]", ")") rs_df_insert_query += ',' rs_df_insert_query = rs_df_insert_query[:-1] + ';' return rs_df_create_query, rs_df_insert_query def list_to_sql(self, list_obj, col_dict): create_query = """ CREATE TABLE IF NOT EXISTS {schema}.{table}_{name}_centroids ( """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) for key in col_dict.keys(): create_query += """ {key} INT, """.format(key=key) create_query += """ cluster INT); TRUNCATE {schema}.{table}_{name}_centroids; """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) insert_query = """ INSERT INTO {schema}.{table}_{name}_centroids VALUES """.format(name=self.cluster_name, schema=self.rs_schema, table=self.rs_table) iter = 0 for item in list_obj: insert_query += str(np.append(item, iter).tolist()).replace( "[", "(").replace("]", ")") insert_query += "," iter = +1 insert_query = insert_query[:-1] + ";" return create_query, insert_query def execute(self, context): # Get the columns from Redshift cols_list = self.get_rs_cols() # Get the query to get the data data_query = self.get_rs_query(cols_list) # Get the data survey_df = self.get_rs_data(data_query) # Calculate clusters using kmodes clustering col_dict, kmodes_centroids, data_result_cat = self.kmode_calculation( survey_df) # Map the data back to original form data_result = self.col_mapping(data_result_cat, col_dict) # Convert the dict object to SQL insert queries and the list to insert query rs_df_create_query, rs_df_insert_query = self.df_to_sql(data_result) rs_col_dict_create_query, rs_col_dict_insert_query = self.dict_to_sql( col_dict) rs_centroid_create_query, rs_centroid_insert_query = self.list_to_sql( kmodes_centroids, col_dict) # Generate list of query to run create_sql_list = [ rs_df_create_query, rs_col_dict_create_query, rs_centroid_create_query ] insert_sql_list = [ rs_df_insert_query, rs_col_dict_insert_query, rs_centroid_insert_query ] # Insert the cluster data into Redshift for create_query in create_sql_list: self.rs_execute(create_query) for insert_query in insert_sql_list: self.rs_execute(insert_query)
def create_stl_table(redshift_conn_id, table, error_table_name, table_id):
    """
    Creates a Redshift table containing all stl error rows associated with the
    input staging table. All columns within the error table will be converted
    to VARCHAR given that the errors may be linked to data type issues.

    Keyword Arguments:
    redshift_conn_id -- Redshift connection ID (str)
    table -- Staging table name (str)
    error_table_name -- Name to be used to create the error table
    table_id -- The staging table's table_id defined in the stl_load_errors table
    """
    get_column_names = """
        SELECT col_name
        FROM (SELECT *
              FROM pg_get_cols('{}')
              COLS(view_schema name, view_name name, col_name name, col_type varchar, col_num int)
             )
    """
    create_error_table = """
        DROP TABLE IF EXISTS {error_table_name};
        CREATE TABLE {error_table_name} (
            {cast},
            err_code INT,
            err_reason VARCHAR(72)
        );
    """
    insert_rows = """
        INSERT INTO {error_table_name}
        SELECT {split_part}, err_code, err_reason
        FROM stl_load_errors stl
        WHERE stl.tbl = {id}
    """
    redshift = PostgresHook(redshift_conn_id)
    # load column names into pandas dataframe
    col_names_df = redshift.get_pandas_df(get_column_names.format(table))
    # put column names into list
    col_names_list = col_names_df['col_name'].values.tolist()
    cast_col = ""
    split_raw_line = ""
    # loop over table's column names
    for i, col in enumerate(col_names_list):
        # if last column don't include ',' at end of string
        if col == col_names_list[-1]:
            # adds CAST statement to cast_col string
            cast_col += "{} VARCHAR".format(col)
            # adds split_part function to split_raw_line string
            split_raw_line += "CAST(split_part(raw_line, ',', {}) AS VARCHAR(500))".format(i+1)
        else:
            cast_col += "{} VARCHAR, ".format(col)
            split_raw_line += "CAST(split_part(raw_line, ',', {}) AS VARCHAR(500)), ".format(i+1)
    format_dict = {
        'table': table,
        'error_table_name': error_table_name,
        'cast': cast_col,
        'split_part': split_raw_line,
        'id': table_id
    }
    print(f'Creating error table: {error_table_name}')
    # creates an empty table with duplicate columns of looped table
    formatted_create_sql = create_error_table.format(**format_dict)
    redshift.run(formatted_create_sql)
    # inserts all stl_load_errors raw_line values as strings into appropriate columns within the empty table
    formatted_insert_sql = insert_rows.format(**format_dict)
    redshift.run(formatted_insert_sql)
    error_table_count = redshift.get_records(f'SELECT COUNT(*) FROM {error_table_name}')[0][0]
    table_count = redshift.get_records(f'SELECT COUNT(*) FROM {table}')[0][0]
    print(f'{table} COUNT: {table_count}')
    print(f'{error_table_name} COUNT: {error_table_count}')
    return error_table_name
def execute(self, context):
    pg_hook = PostgresHook(postgres_conn_id=self._conn_id)
    df = pg_hook.get_pandas_df(sql='SELECT * FROM templates')
    task_instance = context['task_instance']  # type: TaskInstance
    task_instance.xcom_push('database_df', df)
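# Hedged sketch of the matching downstream read: xcom_push above uses 'database_df'
# as the key, so a consumer pulls it back by that key (the upstream task id is an
# illustrative assumption).
def consume_templates(**context):
    df = context['task_instance'].xcom_pull(task_ids='fetch_templates', key='database_df')
    return len(df)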
def compute_description_feature(*args, **kwargs): dw1 = PostgresHook(postgres_conn_id='dw1_etl') job_post_sql = """ SELECT * FROM job_postings WHERE is_deleted = False AND description IS NOT NULL ORDER BY id """ job_post_df = dw1.get_pandas_df(job_post_sql) work_experience_sql = """ SELECT * FROM users_work_experiences WHERE is_deleted = False AND summary IS NOT NULL ORDER BY id """ work_experience_df = dw1.get_pandas_df(work_experience_sql) db1 = PostgresHook(postgres_conn_id='db1_etl') processed_descriptions_sql = """ SELECT * FROM job_posting_description_meta ORDER BY job_posting_id """ processed_descriptions_unpivot_df = \ db1.get_pandas_df(processed_descriptions_sql) processed_descriptions_unpivot_df = \ processed_descriptions_unpivot_df.drop(['created_at', 'updated_at', 'id', 'content_type_metadata'], axis=1) # pivot back processed_descriptions_df = \ processed_descriptions_unpivot_df.pivot(index='job_posting_id', columns='content_type', values='content').reset_index() processed_descriptions_df.sort_values(by='job_posting_id', inplace=True) # flatten words_after_lemma_no_stopwords processed_descriptions_df['words_after_lemma_no_stopwords'] = \ processed_descriptions_df['lemmatized_words_with_no_stopwords']\ .map(lambda x: " ".join([val for sublist in x for val in sublist])) processed_descriptions_df.loc[ processed_descriptions_df['words_after_lemma_no_stopwords'].isnull(), 'words_after_lemma_no_stopwords'] = "" # lowercase processed_descriptions_df['words_after_lemma_no_stopwords'] = \ processed_descriptions_df['words_after_lemma_no_stopwords']\ .map(lambda x: non_letter_removal(x.lower())) processed_descriptions_df['lemmatized_sentences'] = \ processed_descriptions_df['lemmatized_sentences']\ .map(lambda x: [non_letter_removal(sublist.lower()) for sublist in x]) processed_descriptions_df['lemmatized_sentences'] = \ processed_descriptions_df["lemmatized_sentences"]\ .map(lambda y: list(filter(lambda x: x != ' ', y))) sentences_tmp = processed_descriptions_df['lemmatized_sentences'] sentences = [] for description in sentences_tmp: for sentence in description: sentences.append(sentence.split()) files = [] # --------------------train word2vec-------------------------------- # configure logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # Set values for various parameters num_features = 500 # Word vector dimensionality min_word_count = 20 # Minimum word count num_workers = 2 # Number of threads to run in parallel context = 10 # Context window size downsampling = 1e-3 # Downsample setting for frequent words # Initialize and train the model (this will take some time) logging.info('Training model...') # train model with POS and lemmatization model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_word_count, window=context, sample=downsampling) # If you don't plan to train the model any further, calling # init_sims will make the model much more memory-efficient. 
model.init_sims(replace=True) # save the model model_name = OUTPUT_DIR + 'w2v_500features_20minwords_10context' model.save(model_name) files.append(OUTPUT_DIR + 'w2v_500features_20minwords_10context') db1 = PostgresHook(postgres_conn_id='db1_etl') processed_work_summaries_sql = """ SELECT * FROM user_work_experience_meta ORDER BY work_experience_id """ processed_work_summaries_unpivot_df = \ db1.get_pandas_df(processed_work_summaries_sql) processed_work_summaries_unpivot_df = \ processed_work_summaries_unpivot_df.drop(['created_at', 'updated_at', 'id', 'content_type_metadata'], axis=1) # pivot back processed_work_summaries_df = \ processed_work_summaries_unpivot_df.pivot(index='work_experience_id', columns='content_type', values='content')\ .reset_index() processed_work_summaries_df.sort_values(by='work_experience_id', inplace=True) processed_work_summaries_df['words_after_lemma_no_stopwords'] = \ processed_work_summaries_df[ 'lemmatized_words_with_no_stopwords'].map( lambda x: " ".join([val for sublist in x for val in sublist])) processed_work_summaries_df\ .loc[processed_work_summaries_df[ 'words_after_lemma_no_stopwords'].isnull(), 'words_after_lemma_no_stopwords'] = '' processed_work_summaries_df['words_after_lemma_no_stopwords'] = \ processed_work_summaries_df[ 'words_after_lemma_no_stopwords']\ .map(lambda x: non_letter_removal(x.lower())) # ----------Creating features from Vector Averaging using Tfidf weights---- selected_processed_descriptions_df = \ processed_descriptions_df[ processed_descriptions_df[ 'job_posting_id'].isin(job_post_df['id'])] selected_descriptions = \ selected_processed_descriptions_df['words_after_lemma_no_stopwords'] clean_descriptions = \ processed_descriptions_df['words_after_lemma_no_stopwords'] vectorizer = TfidfVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, max_features=5000) vectorizer.fit(clean_descriptions) # need to save when running full model # store the content with open(OUTPUT_DIR + 'description_tfidf.pkl', 'wb') as handle: pickle.dump(vectorizer, handle) files.append(OUTPUT_DIR + 'description_tfidf.pkl') logging.info('Compute feature vectors\n') datavecsavg_tfidf_job_post = \ get_avgfeature_vec_tfidf(selected_descriptions, model, vectorizer, num_features) col_name = [] for i in xrange(0, datavecsavg_tfidf_job_post.shape[1]): col_name.append('feature_%s' % i) datavecsavg_tfidf_job_post_df = \ pd.DataFrame.from_records(datavecsavg_tfidf_job_post, columns=col_name) datavecsavg_tfidf_job_post_df['job_postings_id'] = \ selected_processed_descriptions_df['job_posting_id'] output_filename = OUTPUT_DIR + 'datavecsavg_tfidf_job_post_features.csv' datavecsavg_tfidf_job_post_df.to_csv(output_filename, index=False, encoding='utf-8') selected_processed_work_summaries_df = \ processed_work_summaries_df[ processed_work_summaries_df[ 'work_experience_id'].isin(work_experience_df['id'])] selected_summaries = \ selected_processed_work_summaries_df['words_after_lemma_no_stopwords'] datavecsavg_tfidf_work_experience = \ get_avgfeature_vec_tfidf(selected_summaries, model, vectorizer, num_features) datavecsavg_tfidf_work_experience_df = \ pd.DataFrame.from_records(datavecsavg_tfidf_work_experience, columns=col_name) datavecsavg_tfidf_work_experience_df['id'] = \ selected_processed_work_summaries_df['work_experience_id'].values output_filename = \ OUTPUT_DIR + 'datavecsavg_tfidf_work_experience_features.csv' datavecsavg_tfidf_work_experience_df.to_csv(output_filename, index=False, encoding='utf-8') return files
def compute_similarity_score(*args, **kwargs): dw1 = PostgresHook(postgres_conn_id='dw1_etl') job_post_sql = """ SELECT id, title FROM job_postings WHERE is_deleted = False AND( description IS NOT NULL OR title IS NOT NULL ) ORDER BY id """ job_post_df = dw1.get_pandas_df(job_post_sql) work_experience_sql = """ SELECT id, user_id, job_title, summary FROM users_work_experiences WHERE is_deleted = False AND( job_title IS NOT NULL OR summary IS NOT NULL ) ORDER BY id """ work_experience_df = dw1.get_pandas_df(work_experience_sql) job_post_title_features_df = \ pd.read_csv(OUTPUT_DIR + 'bow_job_post_title_features.csv', header=0) work_experience_title_features_df = \ pd.read_csv(OUTPUT_DIR + 'bow_work_experience_title_features.csv', header=0) job_post_skill_features_df = \ pd.read_csv(OUTPUT_DIR + 'bow_job_post_skill_features.csv', header=0) work_experience_skill_features_df = \ pd.read_csv(OUTPUT_DIR + 'bow_work_experience_skill_features.csv', header=0) datavecsavg_tfidf_job_post_df = \ pd.read_csv(OUTPUT_DIR + 'datavecsavg_tfidf_job_post_features.csv', header=0) datavecsavg_tfidf_work_experience_df = \ pd.read_csv(OUTPUT_DIR + 'datavecsavg_tfidf_work_experience_features.csv', header=0) combine_job_post_features_df = pd.merge(datavecsavg_tfidf_job_post_df, job_post_title_features_df, how='outer', on='job_postings_id') combine_job_post_features_df = pd.merge(combine_job_post_features_df, job_post_skill_features_df, how='outer', on='job_postings_id') combine_job_post_features_df = pd.merge(combine_job_post_features_df, job_post_df, left_on='job_postings_id', right_on='id') selected_post_id = combine_job_post_features_df['job_postings_id'] combine_job_post_features_df = \ combine_job_post_features_df.drop(['job_postings_id', 'id', 'title'], axis=1) combine_job_post_features_df = combine_job_post_features_df.fillna(0) datavecsavg_tfidf_job_post_stack = combine_job_post_features_df.as_matrix() combine_work_experience_features_df = \ pd.merge(datavecsavg_tfidf_work_experience_df, work_experience_title_features_df, how='outer', on='id') combine_work_experience_features_df = \ pd.merge(combine_work_experience_features_df, work_experience_skill_features_df, how='outer', on='id') combine_work_experience_features_df = \ pd.merge(combine_work_experience_features_df, work_experience_df, left_on='id', right_on='id') selected_work_experience_id = combine_work_experience_features_df['id'] selected_work_experience_user_id = \ combine_work_experience_features_df['user_id'] combine_work_experience_features_df = \ combine_work_experience_features_df.drop(['id', 'user_id', 'job_title', 'summary'], axis=1) combine_work_experience_features_df = \ combine_work_experience_features_df.fillna(0) datavecsavg_tfidf_work_experience_stack = \ combine_work_experience_features_df.as_matrix() # compute cosine similarity scores_matrix = \ pairwise.cosine_similarity(datavecsavg_tfidf_work_experience_stack, datavecsavg_tfidf_job_post_stack) # create dataframefrom cosine similarity matrix logging.info('Matrix size: %s x %s' % (scores_matrix.shape[0], scores_matrix.shape[1])) partnum = kwargs['params']['partnum'] partindex = kwargs['params']['partindex'] selected_indices = np.where(np.round(scores_matrix, 2) > 0.01) del combine_work_experience_features_df del datavecsavg_tfidf_work_experience_stack del datavecsavg_tfidf_job_post_stack del combine_job_post_features_df del work_experience_df del job_post_df del job_post_title_features_df del job_post_skill_features_df del datavecsavg_tfidf_job_post_df del 
work_experience_title_features_df del work_experience_skill_features_df del datavecsavg_tfidf_work_experience_df end_points = [0] for i in np.arange(partnum): end_points.append((i + 1) * len(selected_indices[0]) / partnum) scores = [] for i in np.arange(end_points[partindex], end_points[partindex + 1]): if i % 1000000 == 0: logging.info('Processed %s job posts' % (i)) work_id = selected_indices[0][i] job_id = selected_indices[1][i] v = scores_matrix[work_id, job_id] scores.append((selected_work_experience_id[work_id], selected_work_experience_user_id[work_id], selected_post_id[job_id], v)) scores_df = pd.DataFrame.from_records(scores, columns=['work_experience_id', 'user_id', 'similar_job_posting_id', 'score']) scores_df['model_id'] = pd.Series(6, index=scores_df.index) output_filename = \ OUTPUT_DIR + \ 'scores_work_experience_to_job_posts_part%d.csv' % (partindex + 1) scores_df.to_csv(output_filename, index=False, encoding='utf-8')
def compute_title_feature(*args, **kwargs):
    dw1 = PostgresHook(postgres_conn_id='dw1_etl')
    work_experience_sql = """
        SELECT * FROM users_work_experiences
        WHERE is_deleted = False AND job_title IS NOT NULL
        ORDER BY id
    """
    job_post_sql = """
        SELECT * FROM job_postings
        WHERE is_deleted = False AND title IS NOT NULL
        ORDER BY id
    """
    job_post_df = dw1.get_pandas_df(job_post_sql)
    work_experience_df = dw1.get_pandas_df(work_experience_sql)
    job_post_titles = job_post_df['title']
    num_job_post_titles = len(job_post_titles)
    work_experience_titles = work_experience_df['job_title']
    num_work_experience_titles = len(work_experience_titles)
    clean_job_post_titles = []
    logging.info('Cleaning and parsing the job titles...\n')
    count = 0
    for title in job_post_titles:
        # If the index is evenly divisible by 1000, print a message
        if ((count + 1) % 1000 == 0):
            logging.info('job title %d of %d\n' % (count + 1, num_job_post_titles))
        (words, tagged_words) = (text_to_words(title, remove_stopwords=False, use_lem=False))
        clean_job_post_titles.append(words)
        count += 1
    (vectorizer, job_post_title_features) = create_bow_vectors(clean_job_post_titles)
    files = []
    # store title bow
    with open(OUTPUT_DIR + 'title_bow.pkl', 'wb') as handle:
        pickle.dump(vectorizer, handle)
    clean_work_experience_titles = []
    logging.info('Cleaning and parsing the work experience titles...\n')
    count = 0
    for title in work_experience_titles:
        # If the index is evenly divisible by 1000, print a message
        if ((count + 1) % 1000 == 0):
            logging.info('work experience title %d of %d\n' % (count + 1, num_work_experience_titles))
        (words, tagged_words) = (text_to_words(title, remove_stopwords=False, use_lem=False))
        clean_work_experience_titles.append(words)
        count += 1
    # get work exp title feature
    work_experience_title_features = \
        vectorizer.transform(clean_work_experience_titles)
    col_name = []
    for i in range(0, job_post_title_features.shape[1]):
        col_name.append('feature_%s' % i)
    job_post_title_feature_df = \
        pd.DataFrame.from_records(job_post_title_features, columns=col_name)
    job_post_title_feature_df['job_postings_id'] = job_post_df['id']
    output_filename = OUTPUT_DIR + 'bow_job_post_title_features.csv'
    job_post_title_feature_df.to_csv(output_filename, index=False, encoding='utf-8')
    work_experience_title_feature_df = \
        pd.DataFrame.from_records(work_experience_title_features.toarray(), columns=col_name)
    work_experience_title_feature_df['id'] = work_experience_df['id']
    output_filename = OUTPUT_DIR + '/bow_work_experience_title_features.csv'
    work_experience_title_feature_df.to_csv(output_filename, index=False, encoding='utf-8')
    files.append(OUTPUT_DIR + 'title_bow.pkl')
    return files
def execute(self, context):
    log.info('Run Pandas over postgres')
    postgres_instance = PostgresHook(postgres_conn_id=self.connection_id)
    df = postgres_instance.get_pandas_df(self.sql_query)
    self.etl_function(df)
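# Hedged usage sketch (assumes the operator's constructor accepts the connection_id,
# sql_query and etl_function attributes used in execute(); the class name, task id
# and callable are illustrative).
transform_daily = PandasTransformOperator(
    task_id="transform_daily_metrics",
    connection_id="postgres_default",
    sql_query="SELECT * FROM daily_metrics",
    etl_function=lambda df: df.to_parquet("/tmp/daily_metrics.parquet"),
    dag=dag,
)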
def compute_skill_feature(*args, **kwargs):
    dw1 = PostgresHook(postgres_conn_id='dw1_etl')
    job_post_sql = """
        SELECT * FROM job_postings
        WHERE is_deleted = False AND skills_group_id IS NOT NULL
        ORDER BY id
    """
    job_post_df = dw1.get_pandas_df(job_post_sql)
    skills_groups_sql = """SELECT * FROM skills_groups"""
    skills_groups_df = dw1.get_pandas_df(skills_groups_sql)
    join_df = pd.merge(job_post_df, skills_groups_df,
                       left_on='skills_group_id', right_on='id', how='left')
    job_skills_sql = """
        SELECT * FROM job_skills
        WHERE is_deleted = False
        ORDER BY "order"
    """
    job_skills_df = dw1.get_pandas_df(job_skills_sql)
    job_skills = join_df['list']
    work_experience_sql = """
        SELECT * FROM users_work_experiences
        WHERE is_deleted = False
        ORDER BY id
    """
    work_experience_df = dw1.get_pandas_df(work_experience_sql)
    users_sql = """
        SELECT * FROM users
        WHERE is_deleted = False AND skills_group_id IS NOT NULL
        ORDER BY id
    """
    users_df = dw1.get_pandas_df(users_sql)
    users_join_df = pd.merge(users_df, skills_groups_df,
                             left_on='skills_group_id', right_on='id', how='left')
    work_experience_join_df = pd.merge(work_experience_df, users_join_df,
                                       left_on='user_id', right_on='id_x')
    users_skills = work_experience_join_df['list']
    count_vectorizer = CountVectorizer(lowercase=False,
                                       analyzer=json_array_string_to_list,
                                       vocabulary=job_skills_df['name'].tolist(),
                                       max_features=500)
    files = []
    # store the content
    with open(OUTPUT_DIR + 'skills_bow.pkl', 'wb') as handle:
        pickle.dump(count_vectorizer, handle)
    job_post_skills_features = count_vectorizer.fit_transform(job_skills)
    col_name = []
    for i in range(0, job_post_skills_features.shape[1]):
        col_name.append('feature_%s' % i)
    # put back in db temporarily for other tasks to access
    job_post_skill_feature_df = \
        pd.DataFrame.from_records(job_post_skills_features.toarray(), columns=col_name)
    job_post_skill_feature_df['job_postings_id'] = join_df['id_x']
    output_filename = OUTPUT_DIR + 'bow_job_post_skill_features.csv'
    job_post_skill_feature_df.to_csv(output_filename, index=False, encoding='utf-8')
    # users_skills
    work_experience_skills_features = count_vectorizer.fit_transform(users_skills)
    # put back in db temporarily for other tasks to access
    work_experience_skill_feature_df = \
        pd.DataFrame.from_records(work_experience_skills_features.toarray(), columns=col_name)
    work_experience_skill_feature_df['id'] = work_experience_join_df['id']
    output_filename = OUTPUT_DIR + 'bow_work_experience_skill_features.csv'
    work_experience_skill_feature_df.to_csv(output_filename, index=False, encoding='utf-8')
    files.append(OUTPUT_DIR + 'skills_bow.pkl')
    return files