# Example #1 (score: 0)
# File: dag.py  Project: xyzlat/whirl
def ingest_csv_into_mysql(input_csv):
    """Load *input_csv* into the MySQL ``users`` table, replacing existing rows."""
    hook = MySqlHook(mysql_conn_id='mysql_connection')
    frame = pandas.read_csv(input_csv)
    frame.to_sql(
        con=hook.get_sqlalchemy_engine(),
        name="users",
        if_exists='replace',
    )
# Example #2 (score: 0)
def upload_db(table, tmp_file, mysql_conn_id='default_mysql'):
    """Replace the contents of *table* with the rows of the CSV *tmp_file*.

    The table is truncated first and the dataframe appended afterwards, so
    the end state is exactly the CSV contents.

    :param table: target MySQL table name.  SECURITY NOTE: identifiers cannot
        be bound as query parameters, so this value is interpolated directly
        into the TRUNCATE statement -- it must be a trusted, hard-coded name,
        never user-supplied input.
    :param tmp_file: path of the CSV file to load
    :param mysql_conn_id: Airflow connection id for the MySQL database
    """
    df = pd.read_csv(tmp_file)

    mysql_hook = MySqlHook(mysql_conn_id=mysql_conn_id)
    print(df)
    print(
        '###############################################################################################'
    )
    # Truncate before the append below so stale rows never survive a reload.
    # Close the cursor and connection even if the statement fails (the
    # original leaked both).
    conn = mysql_hook.get_conn()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute('truncate {}'.format(table))
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    print(
        '###############################################################################################'
    )
    df.to_sql(table,
              mysql_hook.get_sqlalchemy_engine(),
              if_exists='append',
              index=False)
    def execute(self, context):
        """Pull a Google Analytics report and load it into a MySQL table.

        Fetches the report for ``self.view_id`` between ``self.since`` and
        ``self.until``, flattens each (dimensions, metrics) row into one flat
        dict per metric set, and writes the resulting DataFrame to
        ``self.destination_table`` via pandas ``to_sql``.
        """
        ga_conn = GoogleAnalyticsHook(self.google_analytics_conn_id,
                                      key_file=self.key_file)
        # Normalize the date bounds to YYYY-MM-DD.  When the value is not a
        # '%Y-%m-%d %H:%M:%S' string (already a plain date string, a date
        # object, ...), fall back to its str() form.  The previous bare
        # `except:` also swallowed KeyboardInterrupt/SystemExit; catch only
        # the exceptions strptime actually raises.
        try:
            since_formatted = datetime.strptime(
                self.since, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
        except (TypeError, ValueError):
            since_formatted = str(self.since)
        try:
            until_formatted = datetime.strptime(
                self.until, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
        except (TypeError, ValueError):
            until_formatted = str(self.until)
        report = ga_conn.get_analytics_report(
            self.view_id,
            since_formatted,
            until_formatted,
            self.sampling_level,
            self.dimensions,
            self.metrics,
            self.page_size,
            self.include_empty_rows,
            dimension_filter_clauses=self.dimension_filter_clauses)

        columnHeader = report.get('columnHeader', {})
        # Right now all dimensions are hardcoded to varchar(255); a type map
        # will be needed if any non-varchar dimensions are used in the future.
        # Unfortunately the API does not send back types for dimensions like
        # it does for metrics (yet..).
        dimensionHeaders = [{
            'name': header.replace('ga:', ''),
            'type': 'varchar(255)'
        } for header in columnHeader.get('dimensions', [])]
        metricHeaders = [{
            'name':
            entry.get('name').replace('ga:', ''),
            'type':
            self.metricMap.get(entry.get('type'), 'varchar(255)')
        } for entry in columnHeader.get('metricHeader', {}).get(
            'metricHeaderEntries', [])]

        rows = report.get('data', {}).get('rows', [])
        all_data = []
        for row in rows:  # row_counter was unused; enumerate dropped
            # Base record shared by every metric set of this row: one key
            # per dimension, keyed by the lower-cased header name.
            root_data_obj = {}
            dimensions = row.get('dimensions', [])
            metrics = row.get('metrics', [])

            for index, dimension in enumerate(dimensions):
                header = dimensionHeaders[index].get('name').lower()
                root_data_obj[header] = dimension
            # Emit one output record per metric set.
            for metric in metrics:
                data = {}
                data.update(root_data_obj)

                for index, value in enumerate(metric.get('values', [])):
                    header = metricHeaders[index].get('name').lower()
                    data[header] = value

                data['viewid'] = self.view_id
                data['timestamp'] = self.since

                all_data.append(data)

        df_google_data = pd.DataFrame(all_data)
        mysql_hook = MySqlHook(self.mysql_conn_id)
        df_google_data.to_sql(name=self.destination_table,
                              con=mysql_hook.get_sqlalchemy_engine(),
                              dtype=self.destination_table_dtypes,
                              if_exists=self.if_exists,
                              schema=self.destination_schema)
# Example #4 (score: 0)
from airflow.hooks.mysql_hook import MySqlHook
from sqlalchemy.orm import sessionmaker
from datetime import datetime

# MySQL config
# NOTE(review): the hook, engine, and session are created at module import
# time, so importing this module connects to the database — presumably
# intentional for this script; confirm before reusing as a library.
mysqlhook = MySqlHook(mysql_conn_id='PTT')

# utf8mb4 so 4-byte UTF-8 characters (e.g. emoji) round-trip through MySQL.
engine_kwargs = {'connect_args': {'charset': 'utf8mb4'}}
Session = sessionmaker(bind=mysqlhook.get_sqlalchemy_engine(engine_kwargs))

# Single module-level session shared by upsert() below.
session = Session()


def upsert(row, table):
    url = row['url']
    hits = row['hits']
    title = row['title']
    board = row['board']
    author = row['author']
    posted_date = row['timestamp'].split('T')[0]
    description = row['description']

    record = session.query(table).filter_by(title=title,
                                            author=author,
                                            board=board,
                                            url=url).first()

    if not record:
        record = table()

    record.url = url