def test_mysql_hook_test_bulk_dump_mock(self, mock_get_conn):
    mock_execute = mock.MagicMock()
    mock_get_conn.return_value.cursor.return_value.execute = mock_execute

    from airflow.hooks.mysql_hook import MySqlHook
    hook = MySqlHook('airflow_ci')
    table = "INFORMATION_SCHEMA.TABLES"
    tmp_file = "/path/to/output/file"
    hook.bulk_dump(table, tmp_file)

    from xTool.utils.tests import assertEqualIgnoreMultipleSpaces
    mock_execute.assert_called_once()
    query = """
        SELECT * INTO OUTFILE '{tmp_file}'
        FROM {table}
    """.format(tmp_file=tmp_file, table=table)
    assertEqualIgnoreMultipleSpaces(self, mock_execute.call_args[0][0], query)
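# The assertEqualIgnoreMultipleSpaces helper imported from xTool.utils.tests
# is not shown in this section; a plausible minimal sketch (hypothetical,
# assuming it only collapses whitespace runs before comparing) would be:
import re

def assertEqualIgnoreMultipleSpaces(case, first, second):
    def _trim(s):
        # Collapse every run of whitespace to a single space, strip the ends.
        return re.sub(r"\s+", " ", s).strip()
    return case.assertEqual(_trim(first), _trim(second))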
def test_mysql_to_hive_type_conversion(self, mock_load_file):
    mysql_conn_id = 'airflow_ci'
    mysql_table = 'test_mysql_to_hive'

    from airflow.hooks.mysql_hook import MySqlHook
    m = MySqlHook(mysql_conn_id)
    try:
        with m.get_conn() as c:
            c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
            c.execute("""
                CREATE TABLE {} (
                    c0 TINYINT,
                    c1 SMALLINT,
                    c2 MEDIUMINT,
                    c3 INT,
                    c4 BIGINT,
                    c5 TIMESTAMP
                )
            """.format(mysql_table))

        from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
        t = MySqlToHiveTransfer(
            task_id='test_m2h',
            mysql_conn_id=mysql_conn_id,
            hive_cli_conn_id='beeline_default',
            sql="SELECT * FROM {}".format(mysql_table),
            hive_table='test_mysql_to_hive',
            dag=self.dag)
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
              ignore_ti_state=True)

        mock_load_file.assert_called_once()
        d = OrderedDict()
        d["c0"] = "SMALLINT"
        d["c1"] = "INT"
        d["c2"] = "INT"
        d["c3"] = "BIGINT"
        d["c4"] = "DECIMAL(38,0)"
        d["c5"] = "TIMESTAMP"
        self.assertEqual(mock_load_file.call_args[1]["field_dict"], d)
    finally:
        with m.get_conn() as c:
            c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
def execute(self, context):
    try:
        _json = {}
        _data = {}
        if 'job' in context['dag_run'].conf:
            logging.debug('{0}: dag_run conf: \n {1}'.format(
                self.task_id, context['dag_run'].conf['job']))
            _json = context['dag_run'].conf['job']

        mysql = MySqlHook(mysql_conn_id=biowardrobe_connection_id)
        with closing(mysql.get_conn()) as conn:
            with closing(conn.cursor()) as cursor:
                if 'biowardrobe_uid' in context['dag_run'].conf:
                    _data = get_biowardrobe_data(
                        cursor, context['dag_run'].conf['biowardrobe_uid'])
                    _json = _data['job']
                update_status(
                    uid=_json['uid'], message='Analysing', code=11,
                    conn=conn, cursor=cursor,
                    optional_column="forcerun=0, dateanalyzes=now()")
                update_status(
                    uid=_json['uid'], message='Analysing', code=11,
                    conn=conn, cursor=cursor,
                    optional_column="dateanalyzed=now()",
                    optional_where="and dateanalyzed is null")

        return self.cwl_dispatch(_json)
        # fragment = urlsplit(self.dag.default_args["workflow"]).fragment
        # fragment = fragment + '/' if fragment else ''
        # job_order_object_extended = {fragment + key: value for key, value in job_order_object.items()}
    except Exception as e:
        # Exceptions are logged and swallowed so the dispatch task itself
        # does not fail outright.
        _logger.info('Dispatch Exception {0}: \n {1} {2}'.format(
            self.task_id, type(e), e))
def test_mysql_to_hive_type_conversion(self, mock_load_file):
    mysql_table = 'test_mysql_to_hive'

    from airflow.hooks.mysql_hook import MySqlHook
    hook = MySqlHook()
    try:
        with hook.get_conn() as conn:
            conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
            conn.execute("""
                CREATE TABLE {} (
                    c0 TINYINT,
                    c1 SMALLINT,
                    c2 MEDIUMINT,
                    c3 INT,
                    c4 BIGINT,
                    c5 TIMESTAMP
                )
            """.format(mysql_table))

        from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
        op = MySqlToHiveTransfer(
            task_id='test_m2h',
            hive_cli_conn_id='hive_cli_default',
            sql="SELECT * FROM {}".format(mysql_table),
            hive_table='test_mysql_to_hive',
            dag=self.dag)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
               ignore_ti_state=True)

        assert mock_load_file.call_count == 1
        ordered_dict = OrderedDict()
        ordered_dict["c0"] = "SMALLINT"
        ordered_dict["c1"] = "INT"
        ordered_dict["c2"] = "INT"
        ordered_dict["c3"] = "BIGINT"
        ordered_dict["c4"] = "DECIMAL(38,0)"
        ordered_dict["c5"] = "TIMESTAMP"
        self.assertEqual(mock_load_file.call_args[1]["field_dict"],
                         ordered_dict)
    finally:
        with hook.get_conn() as conn:
            conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
def get_data_from_mysql(filename, tablename):
    hook = MySqlHook(mysql_conn_id='mysql_baseball')
    sql = "select * from " + tablename
    rows = hook.get_records(sql)
    # Open in text mode with newline='' for the csv module; csv.writer has no
    # close() method, so the underlying file is closed by the context manager.
    with open(filename, "w", newline="") as f:
        c = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
        for row in rows:
            c.writerow(row)
    return rows
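# Alternative sketch: DbApiHook also exposes get_pandas_df(), which covers
# the same fetch-then-dump-to-CSV pattern in two calls (assumes pandas is
# available and the 'mysql_baseball' connection exists; the function name is
# illustrative only):
def get_data_from_mysql_df(filename, tablename):
    hook = MySqlHook(mysql_conn_id='mysql_baseball')
    df = hook.get_pandas_df("select * from {}".format(tablename))
    df.to_csv(filename, index=False, quoting=csv.QUOTE_NONNUMERIC)
    return df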
def get_mysql_dataset(**kwargs):
    if ("mysql_conn_id" not in kwargs or "schema" not in kwargs
            or "sql" not in kwargs):
        raise Exception("Missing parameter mysql_conn_id, schema or sql.")
    maxrows = kwargs.get("maxrows", 0)
    how = kwargs.get("how", 1)
    mysql = MySqlHook(mysql_conn_id=kwargs["mysql_conn_id"],
                      schema=kwargs["schema"])
    conn = mysql.get_conn()
    if not conn.open:
        raise Exception("Could not open connection.")
    conn.query(kwargs["sql"])
    result = conn.store_result()
    dataset = result.fetch_row(maxrows=maxrows, how=how)
    conn.close()
    return dataset
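# Portability note: conn.query()/store_result()/fetch_row() are MySQLdb
# (mysqlclient) specifics. A driver-agnostic sketch of the same fetch using
# the hook's own API (no maxrows/how control; returns a list of tuples):
def get_mysql_dataset_portable(**kwargs):
    mysql = MySqlHook(mysql_conn_id=kwargs["mysql_conn_id"],
                      schema=kwargs["schema"])
    return mysql.get_records(kwargs["sql"])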
def __init__(self,
             src_file="/usr/local/airflow/csv_lz_2/cleaned_depts.csv",
             src_file_attr={"action": "action", "key": "department_id"},
             tgt_table_conn_id='mysql_default',
             db_table=None,
             df=None,
             output_filename="/usr/local/airflow/csv_lz_2/depts_td.csv",
             *args, **kwargs):
    super(LoadOltpOperator, self).__init__(*args, **kwargs)
    self.src_file = src_file
    self.src_file_attr = src_file_attr
    self.tgt_table_conn_id = tgt_table_conn_id
    self.db_table = db_table
    # conn_name_attr is a class attribute of DbApiHook, not a constructor
    # argument; the connection id must be passed as mysql_conn_id.
    self.conn = MySqlHook(mysql_conn_id=tgt_table_conn_id)
    self.output_filename = output_filename
def upload_db(table, tmp_file, mysql_conn_id='default_mysql'):
    df = pd.read_csv(tmp_file)
    mysql_hook = MySqlHook(mysql_conn_id=mysql_conn_id)
    print(df)
    print('###############################################################################################')
    conn = mysql_hook.get_conn()
    cursor = conn.cursor()
    cursor.execute('truncate {}'.format(table))
    conn.commit()
    print('###############################################################################################')
    df.to_sql(table,
              mysql_hook.get_sqlalchemy_engine(),
              if_exists='append',
              index=False)
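# Caveat: table names cannot be bound as query parameters in MySQL, so the
# 'truncate {}'.format(table) above is injection-prone if `table` is ever
# user-controlled. A minimal sketch of an allowlist guard (the table set
# here is a hypothetical example, not from the original code):
ALLOWED_TABLES = {"sales", "customers"}  # hypothetical allowlist

def truncate_table(cursor, table):
    if table not in ALLOWED_TABLES:
        raise ValueError("Refusing to truncate unknown table: {}".format(table))
    cursor.execute("TRUNCATE TABLE `{}`".format(table))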
def identify_popular_links(directory=RAW_TWEET_DIR, write_mode="w", **kwargs):
    """ Identify the most popular links from the last day of tweets in the db
        Writes them to latest_links.txt in the RAW_TWEET_DIR (or directory kwarg)
    """
    dbconn = MySqlHook(mysql_conn_id="mysql_default")
    # get_conn() returns a DBAPI connection; get_connection() would return
    # the Airflow Connection metadata object instead.
    conn = dbconn.get_conn()
    # date('now', '-1 days') is SQLite syntax; against MySQL use DATE_SUB.
    query = """select * from tweets where
               created > date_sub(now(), interval 1 day)
               and urls is not null
               order by favorite_count"""
    df = pd.read_sql_query(query, conn)
    df.urls = df.urls.map(ast.literal_eval)
    cntr = Counter(itertools.chain.from_iterable(df.urls.values))
    with open("{}/latest_links.txt".format(directory), write_mode) as latest:
        wrtr = writer(latest)
        wrtr.writerow(["url", "count"])
        wrtr.writerows(cntr.most_common(5))
def get_users(user_id, context):
    db = MySqlHook(mysql_conn_id='mariadb', schema="dbo")
    sql = """
        select user_id, name, culture, group_id, employee_num, anonymous_name,
               email, theme_code, date_format_code, time_format_code, time_zone,
               row_count, language_code, interface_id, phone, mobile, fax, icon,
               addsign_img, is_plural, is_notification, is_absence, is_deputy
          from users
         where user_id = %s
    """
    task = {}
    rows = db.get_records(sql, parameters=[user_id])
    for row in rows:
        # time_zone is the 11th selected column (index 10); the indices below
        # account for it so the remaining fields don't shift off by one.
        model = {
            'user_id': row[0],
            'name': row[1],
            'culture': row[2],
            'group_id': row[3],
            'employee_num': row[4],
            'anonymous_name': row[5],
            'email': row[6],
            'theme_code': row[7],
            'date_format_code': row[8],
            'time_format_code': row[9],
            'time_zone': row[10],
            'row_count': row[11],
            'language_code': row[12],
            'interface_id': row[13],
            'phone': row[14],
            'mobile': row[15],
            'fax': row[16],
            'icon': row[17],
            'addsign_img': row[18],
            'is_plural': row[19],
            'is_notification': row[20],
            'is_absence': row[21],
            'is_deputy': row[22]
        }
        task = model
    context['ti'].xcom_push(key=USERS, value=task)
    return task
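# A less error-prone sketch of the same row-to-dict mapping: zip the select
# list with each row so the column order is written down exactly once
# (USER_COLUMNS and row_to_model are illustrative names, not from the source):
USER_COLUMNS = [
    'user_id', 'name', 'culture', 'group_id', 'employee_num', 'anonymous_name',
    'email', 'theme_code', 'date_format_code', 'time_format_code', 'time_zone',
    'row_count', 'language_code', 'interface_id', 'phone', 'mobile', 'fax',
    'icon', 'addsign_img', 'is_plural', 'is_notification', 'is_absence',
    'is_deputy',
]

def row_to_model(row):
    return dict(zip(USER_COLUMNS, row))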
def insert_process(**kwargs):
    ti = kwargs['ti']
    # Pull the file produced by the transform step
    source_file = ti.xcom_pull(task_ids='transform_process')
    # Database connection
    db_connection = MySqlHook('airflow_db').get_sqlalchemy_engine()
    df = pd.read_csv(source_file)
    with db_connection.begin() as transaction:
        # Clear out the table on every run before re-loading it
        transaction.execute(
            "DELETE FROM covid.time_series_covid19_confirmed_global WHERE 1=1")
        df.to_sql("time_series_covid19_confirmed_global",  # table name
                  con=transaction,
                  schema="covid",
                  if_exists="append",
                  index=False)
    os.remove(source_file)
def index(self):
    sql = """
        SELECT
            a.name as db, db_location_uri as location,
            count(1) as object_count, a.desc as description
        FROM DBS a
        JOIN TBLS b ON a.DB_ID = b.DB_ID
        GROUP BY a.name, db_location_uri, a.desc
    """
    h = MySqlHook(METASTORE_MYSQL_CONN_ID)
    df = h.get_pandas_df(sql)
    df.db = ('<a href="/admin/metastorebrowserview/db/?db=' +
             df.db + '">' + df.db + '</a>')
    table = df.to_html(
        classes="table table-striped table-bordered table-hover",
        index=False,
        escape=False,
        na_rep='',
    )
    return self.render("metastore_browser/dbs.html", table=table)
def url_details_create_insert(*args, **kwargs):
    with open("/Users/preetiyerkuntwar/documents/Zomato-test/all_restro.json",
              "r") as f:
        result = json.load(f)

    normalizedL1 = pd.json_normalize(result["restaurants"], max_level=1) \
        .drop(columns=['restaurant.apikey']) \
        .set_index('restaurant.id')
    daL2 = normalizedL1.reset_index()
    url_details = daL2[dfNameDict['url']]

    mysql_hook = MySqlHook(mysql_conn_id="mysql_zomato")
    connection = mysql_hook.get_conn()
    curr = connection.cursor()
    for i, row in url_details.iterrows():
        curr.execute(insert_into_sql_statements.INSERT_TABLE_URL_DETAILS, row)
    connection.commit()
    curr.close()
def create_table():
    # Drop and re-create table
    connection = MySqlHook(mysql_conn_id='mysql_default')
    sql = '''CREATE TABLE IF NOT EXISTS `swapi_data`.`swapi_people` (
        `id` int(11) NOT NULL auto_increment,
        `name` varchar(100) NOT NULL default '',
        `birth_year` varchar(100) NOT NULL default '',
        `film` varchar(100) NOT NULL default '',
        `film_name` varchar(100) NOT NULL default '',
        `url` varchar(100) NOT NULL default '',
        `birth_year_number` DECIMAL(4,1) NOT NULL default 0,
        PRIMARY KEY (`id`)
    );'''
    connection.run(sql, autocommit=True, parameters=())
    sql = '''DELETE FROM `swapi_data`.`swapi_people`;'''
    connection.run(sql, autocommit=True, parameters=())
    return True
def execute(self, context):
    biowardrobe_uid = context['dag_run'].conf.get('biowardrobe_uid')
    if not biowardrobe_uid:
        raise Exception('biowardrobe_uid must be provided')
    run_id = context['dag_run'].conf['run_id'] \
        if 'run_id' in context['dag_run'].conf \
        else 'trig__{}__{}'.format(biowardrobe_uid, uuid.uuid4())
    _logger.info('Successfully finished: {}'.format(biowardrobe_uid))

    mysql = MySqlHook(mysql_conn_id=biowardrobe_connection_id)
    with closing(mysql.get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(
                "update ems.labdata set libstatus=10, "
                "libstatustxt='downloaded' where uid=%s",
                (biowardrobe_uid,))
            conn.commit()
            data = get_biowardrobe_data(cursor=cursor,
                                        biowardrobe_uid=biowardrobe_uid)

    dag_id = os.path.basename(os.path.splitext(data['workflow'])[0])
    payload = {'biowardrobe_uid': biowardrobe_uid, 'run_id': run_id}
    _logger.info("Trigger basic analysis with: {}".format(payload))

    session = settings.Session()
    dr = DagRun(dag_id=dag_id,
                run_id=run_id,
                conf=payload,
                execution_date=datetime.now(),
                external_trigger=True)
    logging.info("Creating DagRun {}".format(dr))
    session.add(dr)
    session.commit()
    session.close()
def execute(self, context):
    _job_result, promises = self.cwl_gather(context)

    mysql = MySqlHook(mysql_conn_id=biowardrobe_connection_id)
    with closing(mysql.get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            _data = get_biowardrobe_data(cursor, promises['uid'])
            _params = loads(_data['params'])
            _promoter = _params['promoter'] if 'promoter' in _params else 1000
            _params = _job_result
            _params['promoter'] = _promoter
            try:
                upload_results_to_db2(
                    upload_rules=loads(_data['upload_rules']),
                    uid=promises['uid'],
                    output_folder=self.output_folder,
                    cursor=cursor,
                    conn=conn)
                update_status(
                    uid=promises['uid'], message='Complete:upgraded', code=12,
                    conn=conn, cursor=cursor,
                    optional_column="dateanalyzee=now(),params='{}'".format(
                        dumps(_params)))
            except BiowBasicException as ex:
                update_status(
                    uid=promises['uid'], message=f'Fail:{ex}', code=2010,
                    conn=conn, cursor=cursor,
                    optional_column="dateanalyzee=now(),params='{}'".format(
                        dumps(_params)))
    return _job_result
def insert_db(**context):
    # get variable
    obj = Variable.get('TMDB_API', deserialize_json=True)
    # request trending movies page by page
    page = 1
    # for i in range(data.total_pages):
    for i in range(2):
        response = requests.get(obj["host"] + 'trending/all/week?api_key=' +
                                obj["key"] + '&page=' + str(page))
        data = response.json()
        # collect this page's results into rows
        rows = []
        for key in data['results']:
            if 'title' in key:
                title = key['title'].encode("utf-8")
                ori_title = key['original_title'].encode("utf-8")
                release_date = key['release_date']
            else:
                title = key['name'].encode("utf-8")
                ori_title = key['original_name'].encode("utf-8")
                release_date = key['first_air_date']
            # join the genre ids into a comma-separated string
            genres = ','.join(map(str, key['genre_ids']))
            row = (key['id'], title, release_date, ori_title, genres,
                   key['media_type'], key['vote_average'])
            rows.append(row)
        # save this page's rows; note the connection id keyword is
        # mysql_conn_id, not default_conn_name
        api = MySqlHook(mysql_conn_id='mysql_default')
        api.insert_rows(table='movie', rows=tuple(rows))
        page += 1
    return 'success'
def get_columns_and_exclude(conn_id, table_name, l_columns_exclude):
    """
    :param conn_id: connection id to connect to
    :param table_name: table to get the columns
    :param l_columns_exclude: list of strings of columns to exclude
    :return: list of strings without the excluded columns
    """
    mysql_hook = MySqlHook(conn_id)
    sql_query = "SHOW COLUMNS FROM {}".format(table_name)
    all_records = mysql_hook.get_records(sql_query)
    l_columns_after_exclude = [
        "t.`{}`".format(l_row[0]) for l_row in all_records
        if l_row[0] not in l_columns_exclude
    ]
    logging.debug(
        "Columns after exclude: '{}'".format(l_columns_after_exclude))
    return l_columns_after_exclude
def test_mysql_hook_test_bulk_load(self):
    records = ("foo", "bar", "baz")

    import tempfile
    with tempfile.NamedTemporaryFile() as t:
        t.write("\n".join(records).encode('utf8'))
        t.flush()

        from airflow.hooks.mysql_hook import MySqlHook
        h = MySqlHook('airflow_ci')
        with h.get_conn() as c:
            c.execute("""
                CREATE TABLE IF NOT EXISTS test_airflow (
                    dummy VARCHAR(50)
                )
            """)
            c.execute("TRUNCATE TABLE test_airflow")
            h.bulk_load("test_airflow", t.name)
            c.execute("SELECT dummy FROM test_airflow")
            results = tuple(result[0] for result in c.fetchall())
            self.assertEqual(sorted(results), sorted(records))
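# For context: MySqlHook.bulk_load wraps MySQL's LOAD DATA LOCAL INFILE,
# roughly equivalent to the sketch below (an approximation of the hook's
# behavior; the server and client both need local_infile enabled):
def bulk_load_sketch(hook, table, tmp_file):
    hook.run("""
        LOAD DATA LOCAL INFILE '{tmp_file}'
        INTO TABLE {table}
    """.format(tmp_file=tmp_file, table=table))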
def copy(ds, **kwargs):
    source_query = """select * from address;"""

    # Source side: a plain SQLAlchemy engine against Postgres
    source_engine = create_engine(
        'postgresql+psycopg2://airflow:airflow@postgres/airflow')
    source_conn = source_engine.connect()
    records = source_conn.execute(source_query)

    # Destination side: MySQL via the Airflow hook
    dest_hook = MySqlHook(mysql_conn_id="target", schema="mysql")
    dest_conn = dest_hook.get_conn()
    dest_cursor = dest_conn.cursor()
    if records:
        # logging.info("Inserting rows into MySQL")
        dest_hook.insert_rows(table="address", rows=records)
    dest_cursor.close()
    source_conn.close()
    dest_conn.close()
def user_ratings_insert(*args, **kwargs):
    with open("/Users/preetiyerkuntwar/documents/Zomato-test/all_restro.json",
              "r") as f:
        result = json.load(f)

    normalizedL1 = pd.json_normalize(result["restaurants"], max_level=1) \
        .drop(columns=['restaurant.apikey']) \
        .set_index('restaurant.id')
    daL2 = normalizedL1.reset_index()
    user_rating = daL2[dfNameDict['user_rating']]
    user_rating = pd.concat([
        pd.json_normalize(user_rating.drop(['restaurant.id'], axis=1)
                          ['restaurant.user_rating']),
        user_rating['restaurant.id']
    ], axis=1)

    user_rating['restaurant.id'] = pd.to_numeric(user_rating['restaurant.id'])
    user_rating['aggregate_rating'] = user_rating['aggregate_rating'].astype(float)
    user_rating['rating_text'] = user_rating['rating_text'].astype(str)
    user_rating['rating_color'] = user_rating['rating_color'].astype(str)
    user_rating['rating_obj.title.text'] = user_rating['rating_obj.title.text'].astype(float)
    user_rating['rating_obj.bg_color.type'] = user_rating['rating_obj.bg_color.type'].astype(str)
    user_rating['rating_obj.bg_color.tint'] = user_rating['rating_obj.bg_color.tint'].astype(str)

    mysql_hook = MySqlHook(mysql_conn_id="mysql_zomato")
    connection = mysql_hook.get_conn()
    curr = connection.cursor()
    for i, row in user_rating.iterrows():
        # print(row)
        curr.execute(insert_into_sql_statements.INSERT_TABLE_USER_RATINGS, row)
    connection.commit()
    curr.close()
def copy_from_func(**context):
    biowardrobe_uid = context['dag_run'].conf.get('biowardrobe_uid')
    if not biowardrobe_uid:
        raise Exception('biowardrobe_uid must be provided')

    data = {}
    _tmpfiles1 = []
    _tmpfiles2 = []
    mysql = MySqlHook(mysql_conn_id=biowardrobe_connection_id)
    with closing(mysql.get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            data = get_biowardrobe_data(cursor=cursor,
                                        biowardrobe_uid=biowardrobe_uid)
            cursor.execute("select uid from labdata where id in (" +
                           data['url'].replace(' ', ',') + ")")
            # Rows are accessed by name below, so the connection is expected
            # to hand back a dict-style cursor.
            for row in cursor.fetchall():
                _copy_from = get_biowardrobe_data(cursor=cursor,
                                                  biowardrobe_uid=row['uid'])
                if _copy_from['pair'] == data['pair']:
                    _tmpfiles1.append(_copy_from['fastq_file_upstream'])
                    if data['pair']:
                        _tmpfiles2.append(_copy_from['fastq_file_downstream'])

    pathlib.Path(data['output_folder']).mkdir(parents=True, exist_ok=True,
                                              mode=0o777)
    bufsize = 16 * 1024
    with open(data['fastq_file_upstream'], "wb") as outfile:
        for filename in _tmpfiles1:
            _logger.info("Adding " + filename + "...")
            with open(filename, "rb") as fq_file:
                copyfileobj(fq_file, outfile, bufsize)
    if data['pair']:
        with open(data['fastq_file_downstream'], "wb") as outfile:
            for filename in _tmpfiles2:
                _logger.info("Adding " + filename + "...")
                with open(filename, "rb") as fq_file:
                    copyfileobj(fq_file, outfile, bufsize)
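# row['uid'] above relies on dict-style rows; the default MySQLdb cursor
# returns plain tuples. With Airflow's MySqlHook this can be requested via
# the connection's extras, e.g. {"cursor": "dictcursor"}. A direct MySQLdb
# sketch of the same thing (all connection values here are placeholders):
import MySQLdb
import MySQLdb.cursors

conn = MySQLdb.connect(host="localhost", user="biowardrobe",  # placeholders
                       passwd="...", db="ems",
                       cursorclass=MySQLdb.cursors.DictCursor)  # rows as dicts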
def readS3FilesAndLoadtoMySql(**kwargs):
    """
    Read data from S3 files and load it into MySQL.
    :param kwargs:
    :return:
    """
    s3_files = fetchFilesBasedonPattern(**kwargs)
    tmp_trg_file_path = "/tmp/s3mysqlload_" + \
        str(round(datetime.now().timestamp())) + "/"
    if s3_files is None:
        raise Exception("No files are available to process")

    files_df = pd.DataFrame()
    kwargs['ti'].xcom_push(key='s3_data_files', value=",".join(s3_files))
    s3_client = generateS3Hook(kwargs["aws_conn_id"])
    for path in s3_files:
        file_name = getFileName(path)
        body = io.BytesIO(s3_client.get_key(
            key=path, bucket_name=kwargs['src_bucket']).get()['Body'].read())
        if ".csv" in file_name.lower():
            files_df = files_df.append(pd.read_csv(body))
        elif ".json" in file_name.lower():
            files_df = files_df.append(pd.read_json(body))

    if len(files_df) == 0:
        raise Exception("Source files are empty")

    if not os.path.exists(tmp_trg_file_path):
        os.makedirs(tmp_trg_file_path)
    file_path = tmp_trg_file_path + \
        str(round(datetime.now().timestamp())) + ".tsv"
    files_df.to_csv(file_path, sep="\t", index=False, header=False,
                    line_terminator="\n")
    mysql_client = MySqlHook(mysql_conn_id=kwargs["mysql_conn"])
    mysql_client.bulk_load(table=kwargs["schema"] + "." + kwargs["table"],
                           tmp_file=file_path)
    shutil.rmtree(tmp_trg_file_path)
def send_aggregate_to_requestbin():
    target = 'http://requestbin.net/r/zorarbzo'
    connection = MySqlHook(mysql_conn_id='mysql_default')
    sql = '''
        SELECT film_name, name, birth_year
        FROM `swapi_data`.`swapi_people_aggregate`;
    '''
    result = connection.get_records(sql)
    data = []
    for item in result:
        data.append({
            "film_name": item[0],
            "name": item[1],
            "birth_year": str(item[2])
        })
    result = requests.post(target, data=json.dumps(data))
    return result
def local_to_mysql():
    connection = MySqlHook(mysql_conn_id='youtube_db')
    query = '''
        CREATE TABLE IF NOT EXISTS `group3`.`youtube7` (
            `video_id` VARCHAR(100) NOT NULL,
            `title` VARCHAR(100) NULL,
            `publishedAt` VARCHAR(45) NULL,
            `channelId` VARCHAR(45) NULL,
            `channelTitle` VARCHAR(60) NULL,
            `categoryId` INT NULL,
            `trending_date` DATETIME NULL,
            `tags` LONGTEXT NULL,
            `view_count` INT NULL,
            `likes` INT NULL,
            `dislikes` INT NULL,
            `comment_count` INT NULL,
            `thumbnail_link` VARCHAR(100) NULL,
            `comments_disabled` TINYINT NULL,
            `ratings_disabled` TINYINT NULL,
            `description` LONGTEXT NULL,
            PRIMARY KEY (`video_id`));
    '''
    connection.run(query, autocommit=True)

    # df = pd.read_csv(temp_youtube_trending_vids)
    df = pd.concat(
        [pd.read_csv(f, sep=',') for f in glob.glob('/temp' + "/*.csv")],
        ignore_index=True)
    df = df.where((pd.notnull(df)), None)
    query = '''
        INSERT IGNORE INTO group3.youtube7
        (video_id, title, publishedAt, channelId, channelTitle, categoryId,
         trending_date, tags, view_count, likes, dislikes, comment_count,
         thumbnail_link, comments_disabled, ratings_disabled, description)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    '''
    for i, row in df.iterrows():
        try:
            connection.run(query, autocommit=True, parameters=tuple(row))
        except Exception:
            # Skip rows that fail to insert; a bare `except:` would also
            # swallow KeyboardInterrupt/SystemExit.
            pass
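# Row-at-a-time connection.run() does a round of work per row; a sketch of
# the same load as a single executemany batch (same INSERT IGNORE statement;
# note this variant loses the per-row skip-on-failure behavior above):
def local_to_mysql_batch(connection, df, query):
    conn = connection.get_conn()
    cursor = conn.cursor()
    cursor.executemany(query, [tuple(row) for _, row in df.iterrows()])
    conn.commit()
    cursor.close()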
def objects(self):
    where_clause = ''
    if DB_WHITELIST:
        dbs = ",".join(["'" + db + "'" for db in DB_WHITELIST])
        where_clause = "AND b.name IN ({})".format(dbs)
    if DB_BLACKLIST:
        dbs = ",".join(["'" + db + "'" for db in DB_BLACKLIST])
        where_clause = "AND b.name NOT IN ({})".format(dbs)
    sql = """
        SELECT CONCAT(b.NAME, '.', a.TBL_NAME), TBL_TYPE
        FROM TBLS a
        JOIN DBS b ON a.DB_ID = b.DB_ID
        WHERE
            a.TBL_NAME NOT LIKE '%tmp%' AND
            a.TBL_NAME NOT LIKE '%temp%' AND
            b.NAME NOT LIKE '%tmp%' AND
            b.NAME NOT LIKE '%temp%'
        {where_clause}
        LIMIT {LIMIT};
    """.format(where_clause=where_clause, LIMIT=TABLE_SELECTOR_LIMIT)
    h = MySqlHook(METASTORE_MYSQL_CONN_ID)
    d = [{'id': row[0], 'text': row[0]} for row in h.get_records(sql)]
    return json.dumps(d)
def execute(self, context):
    dest_mysql = MySqlHook(mysql_conn_id=self.dest_mysqls_conn_id)
    # Pull rows from XCom when a source task id is configured; otherwise use
    # the cursor handed to the operator directly.
    if self.data_cursor:
        self.cursor = context['ti'].xcom_pull(key=None,
                                              task_ids=self.data_cursor)
    logging.info("Transferring cursor into new MySQL database.")
    if self.mysql_preoperator:
        logging.info("Running MySQL preoperator")
        dest_mysql.run(self.mysql_preoperator)
    if self.cursor:
        dest_mysql.insert_rows(table=self.dest_table, rows=self.cursor)
        logging.info("%s rows inserted", self.cursor.rowcount)
    else:
        logging.info("No rows inserted")
    if self.mysql_postoperator:
        logging.info("Running MySQL postoperator")
        dest_mysql.run(self.mysql_postoperator)
    logging.info("Done.")
def set_signers(doc, group, context):
    db = MySqlHook(mysql_conn_id='mariadb', schema="dapp")
    sql = """
        insert into signers(instance_id, sign_area_id, sequence,
                            sub_instance_id, sign_section, sign_position,
                            sign_action, is_executed, group_culture, group_id,
                            group_name, created_date, received_date,
                            approved_date)
        values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);
    """
    is_executed = True
    sub_instance_id = 0
    sign_action = STATUS_00  # already approved
    db.run(sql,
           autocommit=True,
           parameters=[
               doc.find('instance_id').text,
               doc.find('sign_area_id').text,
               doc.find('sequence').text,
               sub_instance_id,
               doc.attrib['sign_section'],
               doc.attrib['sign_position'],
               sign_action,
               is_executed,
               group['culture'],
               doc.find('group_id').text,
               group['name'],
               datetime.now(),
               datetime.now(),
               datetime.now()
           ])
def save_transactions(**kwargs):
    transactions = kwargs['task_instance'].xcom_pull(
        key='transactions', task_ids='get_transactions')
    transactions['dt'] = month
    transactions['value'] = transactions['value'].astype(float)
    transactions['date'] = transactions['date'].dt.strftime('%Y-%m-%d')

    mysql = MySqlHook(mysql_conn_id='credit_card_processor')
    mysql_conn = mysql.get_conn()
    cursor = mysql_conn.cursor()
    cursor.execute(config['mysql']['create_transaction_table'])

    wildcards = ','.join(['%s'] * len(transactions.columns))
    colnames = ','.join(transactions.columns)
    insert_sql = config['mysql']['create_transaction'] % (
        config['mysql']['transaction_table'], colnames, wildcards)
    data = [tuple(rw) for rw in transactions.values]
    cursor.executemany(insert_sql, data)
    mysql_conn.commit()
    cursor.close()
def etl_process(**kwargs):
    logger.info(kwargs["execution_date"])
    file_path = FSHook(FILE_CONNECTION_NAME).get_path()
    filename = 'sales.csv'
    mysql_connection = MySqlHook(
        mysql_conn_id=CONNECTION_DB_NAME).get_sqlalchemy_engine()
    full_path = f'{file_path}/{filename}'
    df = (pd.read_csv(full_path,
                      encoding="ISO-8859-1",
                      usecols=COLUMNS.keys(),
                      parse_dates=DATE_COLUMNS)
          .rename(columns=COLUMNS))
    with mysql_connection.begin() as connection:
        connection.execute("DELETE FROM test.sales WHERE 1=1")
        df.to_sql('sales',
                  con=connection,
                  schema='test',
                  if_exists='append',
                  index=False)
    os.remove(full_path)
    logger.info(f"Rows inserted {len(df.index)}")