import ast
import glob
import itertools
import random
import re
import shutil
from collections import Counter
from csv import writer

import pandas as pd
from airflow.providers.mysql.hooks.mysql import MySqlHook  # airflow 1.x path: airflow.hooks.mysql_hook


def generate_search_terms(**kwargs):
    """ Pick the twitter-search task to run next (returns a task id for branching). """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    # pandas needs a SQLAlchemy engine here; a raw DBAPI connection is only
    # officially supported for sqlite
    engine = dbconn.get_sqlalchemy_engine()
    query = "select * from twitter_terms"
    df = pd.read_sql_query(query, engine)
    return random.choice([
        "search_{}_twitter".format(re.sub(r"\W+", "", t))
        for t in df.search_term.values
    ])
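
# Because this callable returns a task id, it is presumably meant for a
# BranchPythonOperator. A minimal wiring sketch; the dag object and task id
# below are illustrative assumptions, not from the original source:
from airflow.operators.python import BranchPythonOperator

pick_search_term = BranchPythonOperator(
    task_id="generate_search_terms",
    python_callable=generate_search_terms,
    dag=dag,  # assumes a DAG object defined elsewhere in the module
)
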
def fill_terms(my_terms=SEARCH_TERMS, **kwargs):
    """ Fill the database with a few search terms. """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    engine = dbconn.get_sqlalchemy_engine()
    df = pd.DataFrame(my_terms, columns=["search_term"])
    try:
        df.to_sql("twitter_terms", engine, index=False)
    except ValueError:
        # table already exists
        pass
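
# These snippets reference module-level constants that are not shown here.
# Plausible definitions (values are illustrative assumptions; in the real
# module they would sit above the function definitions):
SEARCH_TERMS = ["python", "pandas", "airflow", "data wrangling"]
RAW_TWEET_DIR = "/tmp/raw_tweets"  # where the search tasks drop csv files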
Example #3
def csv_to_sql(directory=RAW_TWEET_DIR, **kwargs):
    """ csv to sql pipeline using pandas
        params:
            directory: str (file path to csv files)
    """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    engine = dbconn.get_sqlalchemy_engine()

    for fname in glob.glob("{}/*.csv".format(directory)):
        if "_read" not in fname:
            try:
                df = pd.read_csv(fname)
                df.to_sql("tweets", engine, if_exists="append", index=False)
                # mark the file as processed so it is not loaded twice
                shutil.move(fname, fname.replace(".csv", "_read.csv"))
            except pd.errors.EmptyDataError:
                # probably an io error with another task / open file
                continue
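
# Quick standalone smoke test (assumes the mysql_default connection is
# configured and RAW_TWEET_DIR holds at least one csv not yet marked _read):
if __name__ == "__main__":
    csv_to_sql(directory=RAW_TWEET_DIR)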
Example #4
def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs):
    """ Identify the most popular links from the last day of tweets in the db.
        Writes them to latest_links.txt in the RAW_TWEET_DIR
        (or directory kwarg).
    """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    engine = dbconn.get_sqlalchemy_engine()

    # mysql date arithmetic; the original sqlite form was
    # date('now', '-1 days')
    query = """select * from tweets where
    created > now() - interval 1 day and urls is not null
    order by favorite_count desc"""
    df = pd.read_sql_query(query, engine)
    df.urls = df.urls.map(ast.literal_eval)
    cntr = Counter(itertools.chain.from_iterable(df.urls.values))
    with open("{}/latest_links.txt".format(directory), write_mode, newline="") as latest:
        wrtr = writer(latest)
        wrtr.writerow(["url", "count"])
        wrtr.writerows(cntr.most_common(5))
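
# A minimal sketch of how these callables could be wired into a DAG; the dag
# name, schedule, and task ids below are assumptions for illustration only.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG("twitter_links", start_date=datetime(2024, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:
    load_csvs = PythonOperator(task_id="csv_to_sql",
                               python_callable=csv_to_sql)
    popular_links = PythonOperator(task_id="identify_popular_links",
                                   python_callable=identify_popular_links)
    # load the newest tweets before ranking their links
    load_csvs >> popular_links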