def generate_search_terms(**kwargs):
    """Pick the downstream search task to run for this DAG run.

    Reads every term from the ``twitter_terms`` table and returns the task id
    ``search_<term>_twitter`` for one randomly chosen term, with non-word
    characters stripped so the id is a valid Airflow task id.

    Returns:
        str: the task id of the randomly selected search task.
    """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    conn = dbconn.get_connection()
    try:
        # Only the search_term column is used below — no need to SELECT *.
        df = pd.read_sql_query("select search_term from twitter_terms", conn)
    finally:
        # Close the DBAPI connection so we don't leak it per task run.
        conn.close()
    return random.choice([
        "search_{}_twitter".format(re.sub(r"\W+", "", term))
        for term in df.search_term.values
    ])
def fill_terms(my_terms=SEARCH_TERMS, **kwargs):
    """Seed the ``twitter_terms`` table with the given search terms.

    Args:
        my_terms: iterable of search-term strings; defaults to the
            module-level SEARCH_TERMS constant.

    Best-effort: if the table already exists, pandas raises ValueError and
    we deliberately leave the existing contents untouched.
    """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    conn = dbconn.get_connection()
    df = pd.DataFrame(my_terms, columns=["search_term"])
    try:
        df.to_sql("twitter_terms", conn)
    except ValueError:
        # Table already exists — keep the current terms rather than failing.
        pass
    finally:
        # Release the connection whether or not the insert succeeded.
        conn.close()
def csv_to_sql(directory=RAW_TWEET_DIR, **kwargs):
    """Load raw tweet CSV files into the ``tweets`` table.

    Appends each unread ``*.csv`` in *directory* to the table, then renames
    the file with a ``_read`` suffix so it is not ingested twice.

    Args:
        directory: str, path containing the raw tweet CSV files.
    """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    conn = dbconn.get_connection()
    try:
        for fname in glob.glob("{}/*.csv".format(directory)):
            if "_read" in fname:
                continue  # already ingested on a previous run
            try:
                df = pd.read_csv(fname)
                # BUG FIX: to_sql needs the DB connection, not the hook object.
                df.to_sql("tweets", conn, if_exists="append", index=False)
                # Mark the file as processed so the next run skips it.
                shutil.move(fname, fname.replace(".csv", "_read.csv"))
            except pd.errors.EmptyDataError:
                # Probably an I/O race with another task holding the file open;
                # skip it and pick it up on a later run.
                continue
    finally:
        conn.close()
def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs):
    """Identify the most popular links from the last day of tweets in the db.

    Writes the five most frequent URLs (with counts) to ``latest_links.txt``
    in RAW_TWEET_DIR (or the *directory* kwarg).

    Args:
        directory: str, output directory for latest_links.txt.
        write_mode: file open mode; "w" overwrites, "a" appends.
    """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    conn = dbconn.get_connection()
    # BUG FIX: the previous query used SQLite's date('now', '-1 days'),
    # which is not valid on a MySQL connection; use INTERVAL arithmetic.
    query = """select * from tweets
               where created > now() - interval 1 day
               and urls is not null
               order by favorite_count"""
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    # urls is stored as a stringified Python list; parse it safely.
    df.urls = df.urls.map(ast.literal_eval)
    cntr = Counter(itertools.chain.from_iterable(df.urls.values))
    with open("{}/latest_links.txt".format(directory), write_mode) as latest:
        wrtr = writer(latest)
        wrtr.writerow(["url", "count"])
        wrtr.writerows(cntr.most_common(5))