def generate_search_terms(**kwargs):
    """Pick the id of a twitter-search task at random.

    Reads every row of the ``twitter_terms`` table and builds task ids of
    the form ``search_<term>_twitter`` (non-word characters stripped from
    the term), then returns one of them chosen at random.
    """
    conn = SqliteHook('twitter_sqlite').get_conn()
    terms = pd.read_sql_query("select * from twitter_terms", conn)
    task_ids = [
        'search_{}_twitter'.format(re.sub(r'\W+', '', term))
        for term in terms.search_term.values
    ]
    return random.choice(task_ids)
def fill_terms(my_terms=SEARCH_TERMS, **kwargs):
    """Seed the sqlite database with a few search terms.

    Best-effort: if the ``twitter_terms`` table already exists,
    ``to_sql`` raises ValueError and the seed is deliberately skipped.
    """
    hook = SqliteHook('twitter_sqlite')
    frame = pd.DataFrame(my_terms, columns=['search_term'])
    try:
        frame.to_sql('twitter_terms', hook.get_conn())
    except ValueError:
        # table already exists
        pass
def generate_search_terms(**kwargs):
    """Return a randomly selected twitter-search task id.

    Loads the stored search terms from the ``twitter_terms`` table,
    converts each to a ``search_<term>_twitter`` task id (stripping any
    non-word characters), and picks one at random.
    """
    sqlite_hook = SqliteHook('twitter_sqlite')
    df = pd.read_sql_query("select * from twitter_terms",
                           sqlite_hook.get_conn())
    choices = []
    for term in df.search_term.values:
        cleaned = re.sub(r'\W+', '', term)
        choices.append('search_{}_twitter'.format(cleaned))
    return random.choice(choices)
def csv_to_sqlite(directory=RAW_TWEET_DIR, **kwargs):
    """Very basic csv to sqlite pipeline using pandas.

    Appends every not-yet-ingested ``*.csv`` file in *directory* to the
    ``tweets`` table, then renames the file with a ``_read`` suffix so it
    is not ingested twice on a later run.

    params:
        directory: str (file path to csv files)
    """
    sqlite = SqliteHook('twitter_sqlite')
    conn = sqlite.get_conn()
    for fname in glob.glob('{}/*.csv'.format(directory)):
        if '_read' in fname:
            # already ingested on a previous run
            continue
        try:
            df = pd.read_csv(fname)
            df.to_sql('tweets', conn, if_exists='append', index=False)
            shutil.move(fname, fname.replace('.csv', '_read.csv'))
        # FIX: pd.io.common.EmptyDataError no longer exists in modern
        # pandas; pandas.errors is the public, stable location.
        except pd.errors.EmptyDataError:
            # probably an io error with another task / open file
            continue
def get_sql_hook(self, sql_conn_id):
    """Return the database hook matching *sql_conn_id*.

    Connection ids containing ``sqlite`` get a SqliteHook; every other
    id falls back to a MySqlHook.
    """
    if 'sqlite' not in sql_conn_id:
        from airflow.hooks import MySqlHook
        return MySqlHook(sql_conn_id)
    from airflow.hooks import SqliteHook
    return SqliteHook(sql_conn_id)
def identify_popular_links(directory=RAW_TWEET_DIR, write_mode='w', **kwargs):
    """Identify the most popular links from the last day of tweets in the db.

    Writes the top five (url, count) pairs to latest_links.txt in the
    RAW_TWEET_DIR (or the ``directory`` kwarg).
    """
    conn = SqliteHook('twitter_sqlite').get_conn()
    query = """select * from tweets where created > date('now', '-1 days') and urls is not null order by favorite_count"""
    tweets = pd.read_sql_query(query, conn)
    # urls column holds stringified Python lists; parse them back.
    tweets.urls = tweets.urls.map(ast.literal_eval)
    link_counts = Counter(itertools.chain.from_iterable(tweets.urls.values))
    out_path = '{}/latest_links.txt'.format(directory)
    with open(out_path, write_mode) as latest:
        csv_out = writer(latest)
        csv_out.writerow(['url', 'count'])
        csv_out.writerows(link_counts.most_common(5))
def execute(self, context):
    """Run ``self.sql`` against the configured sqlite connection.

    Uses lazy %-style logging instead of string concatenation: output is
    identical for a string, and it no longer raises TypeError when
    ``self.sql`` is a list of statements (which ``hook.run`` accepts).
    """
    logging.info('Executing: %s', self.sql)
    hook = SqliteHook(sqlite_conn_id=self.sqlite_conn_id)
    hook.run(self.sql, parameters=self.parameters)