def generate_search_terms(**kwargs):
    """Pick one twitter-search task id at random.

    Reads every row of the ``twitter_terms`` table and builds a
    ``search_<term>_twitter`` task id for each term (non-word characters
    stripped), then returns one of them at random.
    """
    hook = SqliteHook('twitter_sqlite')
    connection = hook.get_conn()
    terms_df = pd.read_sql_query("select * from twitter_terms", connection)
    task_ids = []
    for term in terms_df.search_term.values:
        cleaned = re.sub(r'\W+', '', term)
        task_ids.append('search_{}_twitter'.format(cleaned))
    return random.choice(task_ids)
def fill_terms(my_terms=SEARCH_TERMS, **kwargs):
    """Seed the sqlite database with search terms.

    Writes ``my_terms`` into a single-column ``twitter_terms`` table; a
    ValueError from ``to_sql`` means the table already exists, in which
    case the existing contents are left untouched.
    """
    hook = SqliteHook('twitter_sqlite')
    connection = hook.get_conn()
    terms_df = pd.DataFrame(my_terms, columns=['search_term'])
    try:
        terms_df.to_sql('twitter_terms', connection)
    except ValueError:
        # to_sql raises ValueError when the table already exists
        pass
# --- Example #3 ---
def generate_search_terms(**kwargs):
    """Return a random ``search_<term>_twitter`` task id built from the
    terms stored in the ``twitter_terms`` table."""
    sqlite_hook = SqliteHook('twitter_sqlite')
    db_conn = sqlite_hook.get_conn()
    terms = pd.read_sql_query("select * from twitter_terms", db_conn)
    # strip non-word characters so each term yields a valid task id
    candidates = [
        'search_{}_twitter'.format(re.sub(r'\W+', '', term))
        for term in terms.search_term.values
    ]
    return random.choice(candidates)
# --- Example #4 ---
def fill_terms(my_terms=SEARCH_TERMS, **kwargs):
    """Populate the ``twitter_terms`` table from ``my_terms``.

    A ValueError from ``to_sql`` signals that the table is already
    present, so the call is treated as a no-op in that case.
    """
    db_hook = SqliteHook('twitter_sqlite')
    db_conn = db_hook.get_conn()
    frame = pd.DataFrame(my_terms, columns=['search_term'])
    try:
        frame.to_sql('twitter_terms', db_conn)
    except ValueError:
        # table already exists -- keep what is there
        pass
def csv_to_sqlite(directory=RAW_TWEET_DIR, **kwargs):
    """ Very basic csv to sqlite pipeline using pandas.

        Appends every unprocessed csv in ``directory`` to the ``tweets``
        table, then renames the file with a ``_read`` suffix so it is
        not ingested twice on the next run.

        params:
            directory: str (file path to csv files)
    """
    sqlite = SqliteHook('twitter_sqlite')
    conn = sqlite.get_conn()
    for fname in glob.glob('{}/*.csv'.format(directory)):
        # files already ingested were renamed to *_read.csv -- skip them
        if '_read' not in fname:
            try:
                df = pd.read_csv(fname)
                df.to_sql('tweets', conn, if_exists='append', index=False)
                # rename only after a successful write so failed files retry
                shutil.move(fname, fname.replace('.csv', '_read.csv'))
            except pd.errors.EmptyDataError:
                # pd.io.common.EmptyDataError was a private path removed in
                # modern pandas; pandas.errors is the public location.
                # probably an io error with another task / open file
                continue
# --- Example #6 ---
def csv_to_sqlite(directory=RAW_TWEET_DIR, **kwargs):
    """ Very basic csv to sqlite pipeline using pandas.

        Loads each csv in ``directory`` that has not yet been processed
        into the ``tweets`` table, marking processed files by renaming
        them with a ``_read`` suffix.

        params:
            directory: str (file path to csv files)
    """
    sqlite = SqliteHook('twitter_sqlite')
    conn = sqlite.get_conn()
    for fname in glob.glob('{}/*.csv'.format(directory)):
        # *_read.csv files were handled on a previous run
        if '_read' not in fname:
            try:
                df = pd.read_csv(fname)
                df.to_sql('tweets', conn, if_exists='append', index=False)
                shutil.move(fname, fname.replace('.csv', '_read.csv'))
            except pd.errors.EmptyDataError:
                # public exception location; pd.io.common.EmptyDataError
                # no longer exists in current pandas releases.
                # probably an io error with another task / open file
                continue
# --- Example #7 ---
 def get_sql_hook(self, sql_conn_id):
     """Return a SqliteHook or MySqlHook depending on the connection id.

     Connection ids containing the substring 'sqlite' get a SqliteHook;
     everything else gets a MySqlHook.
     """
     if 'sqlite' in sql_conn_id:
         from airflow.hooks import SqliteHook
         hook_cls = SqliteHook
     else:
         from airflow.hooks import MySqlHook
         hook_cls = MySqlHook
     return hook_cls(sql_conn_id)
def identify_popular_links(directory=RAW_TWEET_DIR, write_mode='w', **kwargs):
    """ Identify the most popular links from the last day of tweets in the db.

        Writes the five most common (url, count) pairs to latest_links.txt
        in the RAW_TWEET_DIR (or ``directory`` kwarg).

        params:
            directory: str (folder the output file is written to)
            write_mode: str (mode for open(); 'w' overwrites, 'a' appends)
    """
    sqlite = SqliteHook('twitter_sqlite')
    conn = sqlite.get_conn()
    query = """select * from tweets where
    created > date('now', '-1 days') and urls is not null
    order by favorite_count"""
    df = pd.read_sql_query(query, conn)
    # urls column holds a stringified python list -- parse it safely
    df.urls = df.urls.map(ast.literal_eval)
    cntr = Counter(itertools.chain.from_iterable(df.urls.values))
    with open('{}/latest_links.txt'.format(directory), write_mode) as latest:
        wrtr = writer(latest)
        wrtr.writerow(['url', 'count'])
        wrtr.writerows(cntr.most_common(5))
# --- Example #9 ---
def identify_popular_links(directory=RAW_TWEET_DIR, write_mode='w', **kwargs):
    """ Identify the most popular links from the last day of tweets in the db.

        Counts urls across all tweets created in the last day and writes
        the top five (url, count) pairs to latest_links.txt in the
        RAW_TWEET_DIR (or ``directory`` kwarg).

        params:
            directory: str (folder the output file is written to)
            write_mode: str (mode for open(); 'w' overwrites, 'a' appends)
    """
    sqlite = SqliteHook('twitter_sqlite')
    conn = sqlite.get_conn()
    query = """select * from tweets where
    created > date('now', '-1 days') and urls is not null
    order by favorite_count"""
    df = pd.read_sql_query(query, conn)
    # each urls cell is a stringified python list
    df.urls = df.urls.map(ast.literal_eval)
    cntr = Counter(itertools.chain.from_iterable(df.urls.values))
    with open('{}/latest_links.txt'.format(directory), write_mode) as latest:
        wrtr = writer(latest)
        wrtr.writerow(['url', 'count'])
        wrtr.writerows(cntr.most_common(5))
# --- Example #10 ---
 def execute(self, context):
     """Run ``self.sql`` against the configured sqlite connection.

     Opens a SqliteHook for ``self.sqlite_conn_id`` and executes the
     statement with ``self.parameters``.
     """
     # lazy %-style args: the message is only formatted if the record
     # is actually emitted, and a non-str self.sql cannot break the call
     logging.info('Executing: %s', self.sql)
     hook = SqliteHook(sqlite_conn_id=self.sqlite_conn_id)
     hook.run(self.sql, parameters=self.parameters)