def get_news_meta(**kwargs):
    """Retrieve metadata for news articles published on our set of sources
    during the time period between our last crawl and the current execution
    time, and insert it into the ``news_meta1`` Postgres table.

    Keyword Args:
        ds: Airflow execution date. Airflow passes this as a 'YYYY-MM-DD'
            string; falls back to ``datetime.now()`` when absent.
        api_key: API key for the NewsAPI client.

    Returns:
        The list of article-metadata objects from the LAST page processed
        (empty list if the paginator yielded nothing).
    """
    exec_date = kwargs.get('ds') or datetime.now()
    # Airflow templates 'ds' as a 'YYYY-MM-DD' string; without this
    # conversion the timedelta subtraction below raises TypeError.
    if isinstance(exec_date, str):
        exec_date = datetime.strptime(exec_date, '%Y-%m-%d')
    start_date = exec_date - timedelta(hours=3)

    api_key = kwargs.get('api_key')
    api = NewsAPI(api_key)
    query_args = {
        'sources': SOURCE_LIST,
        'page_size': 100,
        'from': convert_time(start_date),
        'to': convert_time(exec_date),
        'language': 'en'
    }

    pg_hook = PostgresHook(postgres_conn_id='sumzero')
    pagination = api.paginate_articles(**query_args)

    # Initialize so the return below is defined even when pagination
    # yields no responses (previously a NameError).
    meta_list = []
    for resp in pagination:
        meta_list = resp.articles
        if meta_list:
            rows = (meta_to_row(d) for d in meta_list)
            pg_hook.insert_rows('news_meta1',
                                rows=rows,
                                target_fields=('title', 'description',
                                               'url', 'published_on'),
                                commit_every=1000)
    return meta_list
def pull_and_insert(**kwargs):
    """Copy rows from the snql source DB into the snql_staging table.

    Expects ``kwargs['templates_dict']['query']`` to hold the SELECT
    statement (templated in from the sql directory by Airflow).
    """
    # The query text is injected via Airflow's templates_dict mechanism.
    query = kwargs['templates_dict']['query']

    # Fetch the source rows from the snql database.
    results = PostgresHook('snql').get_records(query)

    # Land them in the staging copy of dim_sneakers.
    PostgresHook('snql_staging').insert_rows('dim_sneakers', results)
def populate_target(**kwargs):
    """Rebuild the target dim_sneakers table from the staging database.

    Reads the templated query from ``kwargs['templates_dict']['query']``,
    pulls the staged rows, then replaces the contents of the target table
    (full delete + re-insert).
    """
    # Query text comes from the sql directory via Airflow templating.
    query = kwargs['templates_dict']['query']

    # Pull the freshly staged rows.
    staging_hook = PostgresHook('snql_staging')
    staging_results = staging_hook.get_records(query)

    # Swap the target table's contents: clear it, then load the new rows.
    snql_hook = PostgresHook('snql')
    snql_hook.run("""DELETE FROM dim_sneakers""")
    snql_hook.insert_rows('dim_sneakers', staging_results)
def api_json_to_db(**kwargs):
    """Load TfL car-park occupancy JSON files into Postgres, then delete them.

    Scans the directory named by the ``tfl_park_base_dir`` Airflow Variable
    for ``*.json`` files whose names begin with a numeric timestamp, flattens
    each car park's bay records into tuples, bulk-inserts them into
    ``tfl_data``, and removes the processed files afterwards.
    """
    base_dir = Variable.get("tfl_park_base_dir")
    json_files = [
        f for f in listdir(base_dir)
        # endswith instead of substring test: '.json' in f would also pick
        # up e.g. 'x.json.bak' or '.jsonl' files.
        if isfile(join(base_dir, f)) and f.endswith('.json')
    ]

    # Leading digits of the filename encode the capture timestamp.
    # Compiled once, hoisted out of the loop. '+' rather than '*': with
    # '*' the match always succeeded (possibly on the empty string), so
    # files without a numeric prefix were inserted with an empty t_stamp.
    time_pattern = re.compile(r'[0-9]+')

    db_tuples = []
    for current_file in json_files:
        pattern_match = time_pattern.match(current_file)
        if pattern_match is None:
            # No timestamp prefix — skip rather than insert bogus rows.
            continue
        timestamp = pattern_match.group()

        with open(join(base_dir, current_file), 'r') as f:
            car_parks = json.load(f)

        # Flatten: one row per (car park, bay type).
        for car_park in car_parks:
            park_id = car_park['id']
            park_name = car_park['name']
            for bay in car_park['bays']:
                db_tuples.append((
                    timestamp,
                    park_id,
                    park_name,
                    bay['bayType'],
                    bay['occupied'],
                    bay['free'],
                    bay['bayCount'],
                ))

    target_fields = [
        't_stamp', 'park_id', 'park_name', 'park_bay_type',
        'park_bay_occupied', 'park_bay_free', 'park_bay_count'
    ]
    db_hook = PostgresHook(postgres_conn_id='tfl_db')
    db_hook.insert_rows('tfl_data', db_tuples, target_fields)

    # Remove the source files only after the insert has succeeded, so a
    # failed load leaves them in place for the next run.
    for current_file in json_files:
        remove(join(base_dir, current_file))