Example #1
    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        src_pg = PostgresHook(postgres_conn_id=self.src_postgres_conn_id)
        dest_pg = PostgresHook(postgres_conn_id=self.dest_postgress_conn_id)

        logging.info(
            "Transferring Postgres query results into other Postgres database."
        )
        conn = src_pg.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql, self.parameters)

        if self.pg_preoperator:
            logging.info("Running Postgres preoperator")
            dest_pg.run(self.pg_preoperator)

        logging.info("Inserting rows into Postgres")

        dest_pg.insert_rows(table=self.pg_table, rows=cursor)

        if self.pg_postoperator:
            logging.info("Running Postgres postoperator")
            dest_pg.run(self.pg_postoperator)

        logging.info("Done.")
Example #2
    def execute(self, context):
        # AWS Hook
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        # RedShift Hook
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        # Get number of records in the table
        records = redshift.get_records(
            f"SELECT COUNT(*) FROM {self.table_name}")
        data_rows = redshift.get_records(self.sql)

        if self.load_mode == "clean":
            # Clear data
            self.log.info(f"Clearing data from {self.table_name} table")
            redshift.run("DELETE FROM {}".format(self.table_name))
            self.log.info(
                f"Deleted {records[0][0]} records from {self.table_name}")

        # Populate table
        self.log.info("Populating data to {} table".format(self.table_name))
        redshift.insert_rows(table=self.table_name,
                             rows=data_rows,
                             commit_every=1000,
                             replace=False)
        self.log.info("Inserted {} records to {}".format(
            len(data_rows), self.table_name))
Example #3
def add_execution_in_database(
    table, data=None, connection_id="postgres_report_connection"
):
    """Dynamically record information in a PostgreSQL database."""

    data = dict(data or {})

    if not data:
        logging.info(
            "Cannot insert `empty data` into the database. Please verify your data attributes."
        )
        return

    hook = PostgresHook(postgres_conn_id=connection_id)

    try:
        hook.get_conn()
    except AirflowException:
        logging.info("Cannot insert data. Connection '%s' is not configured.", connection_id)
        return

    if data.get("payload"):
        data["payload"] = json.dumps(data["payload"])

    columns = list(data.keys())
    values = list(data.values())

    try:
        hook.insert_rows(table, [values], target_fields=columns)
    except (AirflowException, ProgrammingError) as exc:
        logging.error(exc)
    else:
        logging.info("Registering `%s` into '%s' table.", data, table)
Example #4
def predictions(filepath, table_name, test=True, **kwargs):
    df = preprocess(filepath)
    y_test = df["churn"]
    features = kwargs["ti"].xcom_pull(key="features", task_ids="train")
    with open(f"scaler_{kwargs.get('ds')}.pickle", 'rb') as f:
        scaler = load(f)
    df[['monthly_charges', 'extra_charges']] = scaler.transform(
        df[['monthly_charges', 'extra_charges']])
    with open(f"log_model_{kwargs.get('ds')}.pickle", 'rb') as f:
        log = load(f)
    pred = log.predict(df[features])
    hook = PostgresHook(postgres_conn_id=POSTGRES_CONN)
    if test:
        report = classification_report(y_test, pred, output_dict=True)
        hook.insert_rows(table_name,
                         rows=[(
                             datetime.now(),
                             report["accuracy"],
                             report["macro avg"]["recall"],
                             report["macro avg"]["precision"],
                             report["macro avg"]["f1-score"],
                         )])
    else:
        rows = df[["customer_id", "churn"]]
        rows["churn"] = pred.astype(bool)
        hook.insert_rows(table_name, rows=rows.values.tolist())
Example #5
def copy_data_in_batch(target_base_table, fetch_iterator):

    masterdb_hook = PostgresHook()
    items = []

    # set up the SQL insert values to be executed
    for row in fetch_iterator:
        items.append(list(row))

    result = len(items)

    # execute the SQL insert statement
    # the Airflow PostgresHook.insert_rows instance method runs an "executemany"-style SQL query
    # and also serializes the data to a safe SQL format
    if result:
        try:
            masterdb_hook.insert_rows(target_base_table,
                                      items,
                                      target_fields=None,
                                      commit_every=1000,
                                      replace=False)
        except Exception as e:
            raise Exception("Failed to insert batch data: {}".format(
                str(e)[0:150]))

    return result
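The comments above describe PostgresHook.insert_rows as an executemany-style batch insert. A minimal sketch of how copy_data_in_batch might be driven from a source table follows; the connection id and table names are illustrative assumptions, not part of the original snippet.

from airflow.providers.postgres.hooks.postgres import PostgresHook  # older installs: airflow.hooks.postgres_hook

def copy_example_table():
    # hypothetical source connection and tables, for illustration only
    src_hook = PostgresHook(postgres_conn_id="source_db")
    src_conn = src_hook.get_conn()
    with src_conn.cursor() as cursor:
        cursor.execute("SELECT * FROM public.example_source")
        # a psycopg2 cursor is iterable, so it can serve as the fetch_iterator
        inserted = copy_data_in_batch("public.example_target", cursor)
    src_conn.close()
    print(f"Copied {inserted} rows")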
Example #6
def extract_airtable_shows():
    '''
    Load shows from airtable into a Postgres table
    Airtable shows are identified by their IMDB ID.

    This task stores the IDs and ratings only.
    '''
    hook = PostgresHook(postgres_conn_id='postgres_movies')

    base = airtable.Airtable(base_id, api_key)

    ratings = []
    for record in base.iterate(shows_table):
        fields = record['fields']
        rating = fields.get('Mat Rating')
        imdb_url = fields.get('IMDB')

        if rating is not None and validate_imdb_url(imdb_url):
            ratings.append((imdb_url, rating))

    logger.info('Rated %d movies', len(ratings))

    hook.insert_rows('my_ratings',
                     rows=ratings,
                     target_fields=('imdb_url', 'stars'))
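The snippet above filters records through validate_imdb_url, which is not shown. A minimal stand-in for such a check, assuming IMDB title URLs of the form https://www.imdb.com/title/tt0903747/; the real helper may apply different rules.

import re

# hypothetical stand-in for the validate_imdb_url helper referenced above
IMDB_TITLE_URL = re.compile(r"^https?://(www\.)?imdb\.com/title/tt\d+/?$")

def validate_imdb_url(url):
    """Return True when the value looks like an IMDB title URL."""
    return bool(url) and bool(IMDB_TITLE_URL.match(url))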
Example #7
def upload_to_postgres(*args, **kwargs):
    ti = kwargs['ti']
    file_date, data = ti.xcom_pull(task_ids="get_data_from_api")
    file_date = file_date.strftime("%Y-%m-%d %H:%M:%S%z")
    data = [extract_content_id(i) for i in list(data)]
    # DbApiHook.insert_rows
    pg_hook = PostgresHook(postgres_conn_id='postgres_conn', schema='seoulbike')    
    rows = [[st_data['stationName'], st_data['rackTotCnt'],
             st_data['parkingBikeTotCnt'], file_date] for st_data in data]
    pg_hook.insert_rows(table='bike_realtime_log_tz', rows=rows)  # commits in batches (commit_every defaults to 1000)
    pg_hook.get_conn().close()
Example #8
    def execute(self, context):
        pg_hook = PostgresHook(postgres_conn_id=self.connection_id)
        api_hook = Covid19Hook(http_conn_id='covid19-api', method='GET')
        pg_hook.run("DELETE FROM covid19 where day = %s",
                    parameters=[context['ds']])
        rows = map(
            lambda x: (json.dumps(x[0]), context['ds']),
            filter(
                lambda x: len(x),
                api_hook.get_data(start_date=context['ds'],
                                  end_date=context['ds'])))
        pg_hook.insert_rows("covid19", rows, ["data", "day"])
Example #9
def move_mysql_to_redshift(tablename):
    mysql_hook = MySqlHook(mysql_conn_id='mysql_baseball')
    redshift_hook = PostgresHook(postgres_conn_id='redshift_host')

    sql = "select league_id, name, abbr from "
    sql += tablename

    cur = mysql_hook.get_records(sql)

    redshift_hook.insert_rows(tablename, cur)

    return cur
Example #10
def _insert_file_to_sql(**context):
    df_result = pd.read_csv(f"{STORE_DIR}/output.csv", names=["word", "count"])
    df_result["date"] = context["ds"]
    if not df_result.empty:
        for c in df_result.columns:
            if c == 'count':
                df_result[c] = df_result[c].astype(float)
    df_result = df_result.squeeze()  # squeezing single row dataframe

    df_tuple = [(df_result["date"], df_result["word"], df_result["count"])]

    hook = PostgresHook(postgres_conn_id=CONNECTION_ID)
    hook.insert_rows(SQL_TABLE, df_tuple)
Example #11
    def execute(self, context):
        self.log.info('LoadFactOperator for {}'.format(self.table))

        # get records using sql_queries.songplay_table_insert
        pg_hook = PostgresHook(self.redshift_conn_id)
        records = pg_hook.get_records(SqlQueries.songplay_table_insert)
        self.log.info('get {} records from {}'.format(len(records),
                                                      self.table))
        self.log.info('{}'.format(records[0]))

        # insert records into songplays
        pg_hook.insert_rows(self.table, records)
        self.log.info('inserted into {}'.format(self.table))
Example #12
    def copy_table(self, mysql_conn_id, postgres_conn_id):

        print("### fetching records from MySQL table ###")
        mysqlserver = MySqlHook(mysql_conn_id)
        sql_query = "SELECT * from clean_store_transactions "
        data = mysqlserver.get_records(sql_query)

        print("### inserting records into Postgres table ###")
        postgresserver = PostgresHook(postgres_conn_id)
        postgres_query = "INSERT INTO clean_store_transactions VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s);"
        postgresserver.insert_rows(table='clean_store_transactions', rows=data)

        return True
Example #13
def copy_table(src_conn_id, dst_conn_id, table):
    src = PostgresHook(postgres_conn_id=src_conn_id)
    src_conn = src.get_conn()
    src_cursor = src_conn.cursor()

    dst = PostgresHook(postgres_conn_id=dst_conn_id)
    dst_conn = dst.get_conn()
    dst_cursor = dst_conn.cursor()

    # table names cannot be bound as query parameters, so format them into the SQL
    dst_cursor.execute('SELECT MAX(id) FROM {};'.format(table))
    last_migrated_id = dst_cursor.fetchone()[0]
    if last_migrated_id is None:
        last_migrated_id = 0

    src_cursor.execute('SELECT * FROM {} WHERE id > %s;'.format(table),
                       [last_migrated_id])
    dst.insert_rows(table=table, rows=src_cursor)
Example #14
def insert_data_into_postgres(**kwargs):
    '''
    Inserting data from NFS to Destination DB
    '''
    ti = kwargs['ti']
    fetched_val = ti.xcom_pull(task_ids='read_configuration_dest')
    print(fetched_val)
    postgres_hook = PostgresHook(postgres_conn_id=fetched_val['conn_id'],
                                 schema=fetched_val['schema'])
    f = open("testfile.csv", "r")
    lst = []
    for line in f.readlines():
        print(line.replace('\n', '').split(',')[1:])
        lst.append(tuple(line.replace('\n', '').split(',')[1:]))
    postgres_hook.insert_rows(table=fetched_val['tablename'],
                              rows=iter(lst),
                              target_fields=fetched_val['colname'].split(','))
    print("------- Data stored successfully in Destination DB ------------")
Example #15
def read_and_insert(**kwargs):
    filenames = glob("dags/source_files/*.json")
    rows_to_insert = []
    ti = kwargs['ti']

    table_name = ti.xcom_pull(task_ids='ddl', key='table_name')

    for fn in filenames:
        with open(fn) as f:
            lines = f.readlines()
            for line in lines:
                line_dict = json.loads(line)
                rows_to_insert.append(line_dict.values())

    pg_hook = PostgresHook('postgres_default')
    pg_hook.insert_rows(table_name, rows_to_insert)
    rowcount = int(pg_hook.get_first("SELECT COUNT(1) FROM {table_name}".format(table_name=table_name))[0])
    kwargs['ti'].xcom_push(key='rowcount', value=rowcount)
Example #16
    def execute(self, context):
        pg = PostgresHook(postgres_conn_id=self.pg_conn_id)

        with open(self.source_file_name, 'r') as data:
            lines = data.read().split('\n')

        tuples = []

        for line in lines:
            if len(line) > 0:
                # `timestamp` is expected to be defined elsewhere in the operator
                line_with_default = timestamp + '|' + self.source_file_name + '|' + line
                vals = line_with_default.split('|')
                tuples.append([str(x) for x in vals])

        self.log.info("Inserting %s records to %s" %
                      (str(len(tuples)), self.target_table))

        pg.insert_rows(self.target_table, tuples, commit_every=1000)
Example #17
    def execute(self, context):
        self.log.info('LoadDimensionOperator for {}'.format(self.table))

        # load records using sql_queries
        lookup = {
            'users': SqlQueries.user_table_insert,
            'songs': SqlQueries.song_table_insert,
            'artists': SqlQueries.artist_table_insert,
            'time': SqlQueries.time_table_insert,
        }
        pg_hook = PostgresHook('redshift')
        records = pg_hook.get_records(lookup[self.table])
        self.log.info('get {} records from {}'.format(len(records),
                                                      self.table))

        # insert them into corresponding tables
        if self.replace:
            pg_hook.run(f"DELETE FROM {self.table}")
        pg_hook.insert_rows(self.table, records)
        self.log.info('inserted into {}'.format(self.table))
Example #18
    def execute(self, context):
        # read json data from s3
        s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        s3_obj = s3.get_key(
            key=self.filename,
            bucket_name=self.bucket_name
        )
        bytes_buffer = io.BytesIO()
        s3_obj.download_fileobj(bytes_buffer)
        data_str = bytes_buffer.getvalue().decode('utf-8')
        data = json.loads(data_str)

        cols = ['id', 'publishedAt', 'title', 'description', 'categoryId', 'duration', 'definition', 'viewCount', 'likeCount', 'dislikeCount', 'favoriteCount', 'commentCount', 'embedHtml']
        # prevent duplicates in playlist
        rows = set([tuple(d[k] for k in cols) for d in data])
        # insert into target table
        postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        postgres.insert_rows(
            table=self.table,
            rows=rows
        )
Example #19
def waze_jams_to_db():
    """Waze jams feed to PostGIS."""
    pg_hook = PostgresHook(postgres_conn_id='waze')
    tempfile = conf['temp_data_dir'] + '/waze_temp.csv'

    temp_df = pd.read_csv(tempfile, header=None, encoding='utf-8')

    rows_db = temp_df.values

    cols = ['uuid', 'waze_timestamp', 'street', 'start_node',
            'end_node', 'city', 'length', 'delay', 'speed', 'level',
            'road_type', 'geom']


    logging.info('Pushing Waze data to Postgis.')
    pg_hook.insert_rows('public.waze_jams',
                        rows_db,
                        target_fields=cols,
                        commit_every=0)

    return 'Successfully pushed data to PostGIS.'
Example #20
    def execute(self, context):
        pg_hook = PostgresHook(postgres_conn_id=self.connection_id)
        pg_hook_analytics = PostgresHook(postgres_conn_id=self.connection_id)
        day = context['ds']
        pg_hook_analytics.run("DELETE FROM covid19_stats where day = %s",
                              parameters=[day])
        sql_read = """SELECT data #>> '{Country}' as country, 
                             day,
                             sum((data #>> '{Confirmed}')::int) as confirmed,
                             sum((data #>> '{Deaths}')::int) as deaths,
                             sum((data #>> '{Recovered}')::int) as recovered,   
                             sum((data #>> '{Active}')::int) as active
                      FROM covid19 where day =%s
                      GROUP BY country, day"""

        rows = pg_hook.get_records(sql_read, parameters=[day])
        pg_hook_analytics.insert_rows('covid19_stats',
                                      rows=rows,
                                      target_fields=[
                                          'country', 'day', 'confirmed',
                                          'deaths', 'recovered', 'active'
                                      ])
Example #21
    def execute(self, context):

        http = HttpHook(self.method, http_conn_id=self.http_conn_id)

        self.log.info("Calling HTTP method")

        response = http.run(self.endpoint)

        json_resp = json.loads(response.content)

        self.log.info("Response received, saving results to Postgres")

        pg = PostgresHook(postgres_conn_id=self.pg_conn_id)

        tuples = []

        for res in json_resp:
            tuples.append([str(x) for x in res.values()])

        self.log.info("Inserting " + str(len(tuples)) + " records to Postgres")

        pg.insert_rows(self.destination_table, tuples)
Example #22
    def execute(self, context):
        # AWS Hook
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        # RedShift Hook
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        # Get number of records in the table
        records = redshift.get_records(
            f"SELECT COUNT(*) FROM {self.table_name}")
        # Fields and data
        df = redshift.get_pandas_df(self.sql)
        fields = list(df.columns.values)
        data_rows = redshift.get_records(self.sql)

        if self.load_mode == "clean":
            # Clear data
            self.log.info(f"Clearing data from {self.table_name} table")
            redshift.run("DELETE FROM {}".format(self.table_name))
            self.log.info(
                f"Deleted {records[0][0]} records from {self.table_name}")
        else:
            job_execution_ts = self.filter_key[0].format(**context)
            next_job_execution_ts = self.filter_key[1].format(**context)
            filtered_df = df[(df['start_time'] >= job_execution_ts)
                             & (df['start_time'] < next_job_execution_ts)]
            data_rows = [tuple(x) for x in filtered_df.values]

        # Populate table
        self.log.info("Populating data to {} table".format(self.table_name))
        redshift.insert_rows(table=self.table_name,
                             rows=data_rows,
                             target_fields=fields,
                             commit_every=1000,
                             replace=False)
        self.log.info("Inserted {} records to {}".format(
            len(data_rows), self.table_name))
Example #23
def fetch_and_save_features(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    def get_rock_content_item(originId):
        params = {
            "loadAttributes": "expanded",
            "$orderby": "ModifiedDateTime desc",
            "attributeKeys": "childrenHaveComments",
        }

        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/ContentChannelItems/{originId}",
            params=params,
            headers=headers)
        return r.json()

    # Rock transports { foo: 'bar', baz: 'bat' } as 'foo^bar|baz^bat'
    def parse_key_value_attribute(value):
        if value == '':
            return []

        entries = value.split('|')

        def string_to_dict(s):
            k, v = s.split('^')
            return {"key": k, "value": unquote(v)}

        return map(string_to_dict, entries)

    # def get_

    def get_features(content):
        features = []

        postgres_id, parent_rock_id = pg_hook.get_first(
            f'select "contentItems".id, "parentItem"."originId" from "contentItems" left outer join "contentItems" as "parentItem" on "parentItem".id = "contentItems"."parentId" where "contentItems"."originId" = \'{content["Id"]}\''
        )
        # Add features from a key value list
        # This is now the only way to add scripture and text features
        # We previously supported a textFeature and
        key_value_features = parse_key_value_attribute(
            safeget_no_case(content, 'AttributeValues', 'Features', 'Value')
            or '')
        for kv in key_value_features:
            feature_type = kv["key"]

            if feature_type == "scripture":
                features.append({
                    "type": "Scripture",
                    "data": {
                        "reference": kv["value"]
                    },
                    "parent_id": postgres_id
                })
            if feature_type == "text":
                features.append({
                    "type": "Text",
                    "data": {
                        "text": kv["value"]
                    },
                    "parent_id": postgres_id
                })

        button_link_feature = safeget_no_case(content, 'AttributeValues',
                                              'ButtonText', 'Value')
        if (button_link_feature):
            features.append({
                "type": "ButtonLink",
                "data": {
                    "title":
                    safeget_no_case(content, 'AttributeValues', 'ButtonText',
                                    'Value'),
                    "url":
                    safeget_no_case(content, 'AttributeValues', 'ButtonLink',
                                    'Value'),
                    "action":
                    "OPEN_AUTHENTICATED_URL",
                },
                "parent_id": postgres_id
            })

        comment_feature = safeget_no_case(content, 'AttributeValues',
                                          'comments', 'value') or 'False'
        if comment_feature == 'True':
            features.append({
                "type": "AddComment",
                "data": {
                    "initialPrompt": "Write Something...",
                    "addPrompt": "What stands out to you?"
                },
                "parent_id": postgres_id
            })
            features.append({
                "type": "CommentList",
                "data": {
                    "initialPrompt": "Write Something...",
                    "addPrompt": "What stands out to you?"
                },
                "parent_id": postgres_id
            })
        elif parent_rock_id:
            parent_item = get_rock_content_item(parent_rock_id)
            parent_comment_feature = safeget_no_case(
                parent_item, 'AttributeValues', 'childrenHaveComments',
                'value') or 'False'
            if parent_comment_feature == 'True':
                features.append({
                    "type": "AddComment",
                    "data": {
                        "initialPrompt": "Write Something...",
                        "addPrompt": "What stands out to you?"
                    },
                    "parent_id": postgres_id
                })
                features.append({
                    "type": "CommentList",
                    "data": {
                        "initialPrompt": "Write Something...",
                        "addPrompt": "What stands out to you?"
                    },
                    "parent_id": postgres_id
                })

        return features

    fetched_all = False
    skip = 0
    top = 10000

    while not fetched_all:
        # Fetch people records from Rock.

        params = {
            "$top": top,
            "$skip": skip,
            # "$select": "Id,Content",
            "loadAttributes": "expanded",
            "$orderby": "ModifiedDateTime desc",
            "attributeKeys": "features, comments, buttontext, buttonlink",
        }

        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/ContentChannelItems",
            params=params,
            headers=headers)
        rock_objects = r.json()

        def fix_casing(col):
            return "\"{}\"".format(col)

        features_by_item = list(map(get_features, rock_objects))
        print(features_by_item)
        flat_features_list = [
            item for items in features_by_item for item in items
        ]

        # "createdAt", "updatedAt", "apollosType", "data", "type", "parentId", "parentType"
        def feature_attributes(obj):
            return (kwargs['execution_date'], kwargs['execution_date'],
                    obj["type"] + "Feature", Json(obj['data']), obj["type"],
                    obj['parent_id'], "ContentItem")

        insert_features = list(map(feature_attributes, flat_features_list))

        columns = list(
            map(fix_casing, ("createdAt", "updatedAt", "apollosType", "data",
                             "type", "parentId", "parentType")))

        pg_hook.insert_rows('"features"',
                            list(insert_features),
                            columns,
                            0,
                            True,
                            replace_index=('"parentId"', '"type"', '"data"'))

        add_apollos_ids = """
        UPDATE "features"
        SET "apollosId" = "apollosType" || ':' || id::varchar
        WHERE "apollosId" IS NULL
        """

        pg_hook.run(add_apollos_ids)

        skip += top
        fetched_all = len(r.json()) < top
Example #24
def fetch_and_save_people(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    fetched_all = False
    skip = 0
    top = 10000

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    # Rock stores gender as an integer
    gender_map = ('UNKNOWN', 'MALE', 'FEMALE')

    # Fetch the available campuses.
    campuses = pg_hook.get_records("""
        SELECT "originId", id
        FROM campuses
        WHERE "originType" = 'rock'
    """)

    campus_map = dict(campuses)
    campus_map['None'] = None

    while not fetched_all:
        # Fetch people records from Rock.

        params = {
            "$top": top,
            "$skip": skip,
            "$expand": "Photo",
            "$select":
            "Id,NickName,LastName,Gender,BirthDate,PrimaryCampusId,Email,Photo/Path",
            "$orderby": "ModifiedDateTime desc",
        }

        if not kwargs['do_backfill']:
            params[
                '$filter'] = f"ModifiedDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}' or ModifiedDateTime eq null"

        print(params)

        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/People",
            params=params,
            headers=headers)
        rock_objects = r.json()

        if not isinstance(rock_objects, list):
            print(rock_objects)
            print("oh uh, we might have made a bad request")
            print("top: {top}")
            print("skip: {skip}")
            skip += top
            continue

        skip += top
        fetched_all = len(rock_objects) < top

        def photo_url(path):
            if path is None:
                return None
            elif path.startswith("~"):
                rock_host = (Variable.get(kwargs['client'] +
                                          '_rock_api')).split("/api")[0]
                return path.replace("~", rock_host)
            else:
                return path

        # "createdAt", "updatedAt", "originId", "originType", "apollosType", "firstName", "lastName", "gender", "birthDate", "campusId", "email", "profileImageUrl"
        def update_people(obj):
            return (kwargs['execution_date'], kwargs['execution_date'],
                    obj['Id'], 'rock', 'Person', clean_string(obj['NickName']),
                    clean_string(obj['LastName']), gender_map[obj['Gender']],
                    obj['BirthDate'], campus_map[str(obj["PrimaryCampusId"])],
                    clean_string(obj['Email']),
                    photo_url(safeget(obj, 'Photo', 'Path')))

        def fix_casing(col):
            return "\"{}\"".format(col)

        people_to_insert = list(map(update_people, rock_objects))
        columns = list(
            map(fix_casing,
                ("createdAt", "updatedAt", "originId", "originType",
                 "apollosType", "firstName", "lastName", "gender", "birthDate",
                 "campusId", "email", "profileImageUrl")))

        pg_hook.insert_rows('people',
                            people_to_insert,
                            columns,
                            0,
                            True,
                            replace_index=('"originId"', '"originType"'))

        add_apollos_ids = """
        UPDATE people
        SET "apollosId" = 'Person:' || id::varchar
        WHERE "originType" = 'rock' and "apollosId" IS NULL
        """

        pg_hook.run(add_apollos_ids)
Example #25
def fetch_and_save_persona_tags(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {"Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")}

    fetched_all = False
    skip = 0
    top = 10000

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(
        postgres_conn_id=pg_connection,
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5
    )

    person_entity_id = requests.get(
        f"{Variable.get(kwargs['client'] + '_rock_api')}/EntityTypes",
        params={"$filter": "Name eq 'Rock.Model.Person'"},
        headers=headers
    ).json()[0]['Id']

    while not fetched_all:

        params = {
            "$top": top,
            "$skip": skip,
            "$filter": f"EntityTypeId eq {person_entity_id} and CategoryId eq {186}",
            "$select": "Id,Name,Guid",
            "$orderby": "ModifiedDateTime desc",
        }

        if not kwargs['do_backfill']:
            params['$filter'] += f" and (ModifiedDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}' or ModifiedDateTime eq null or PersistedLastRefreshDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}')"

        print(params)

        r = requests.get(
                f"{Variable.get(kwargs['client'] + '_rock_api')}/DataViews",
                params=params,
                headers=headers)
        rock_objects = r.json()

        if not isinstance(rock_objects, list):
            print(rock_objects)
            print("oh uh, we might have made a bad request")
            print("top: {top}")
            print("skip: {skip}")
            skip += top
            continue


        skip += top
        fetched_all = len(rock_objects) < top

        # "createdAt", "updatedAt", "originId", "originType", "apollosType", "name", "data", "type"
        def update_tags(obj):
            return (
                kwargs['execution_date'],
                kwargs['execution_date'],
                obj['Id'],
                'rock',
                'Tag',
                obj['Name'],
                Json({ "guid": obj["Guid"] }),
                "Persona"
            )

        def fix_casing(col):
            return "\"{}\"".format(col)

        tags_to_insert = list(map(update_tags, rock_objects))
        columns = list(map(fix_casing, ("createdAt","updatedAt", "originId", "originType", "apollosType", "name", "data", "type")))

        pg_hook.insert_rows(
            'tags',
            tags_to_insert,
            columns,
            0,
            True,
            replace_index = ('"originId"', '"originType"')
        )

        add_apollos_ids = """
        UPDATE "tags"
        SET "apollosId" = "apollosType" || ':' || id::varchar
        WHERE "originType" = 'rock' and "apollosId" IS NULL
        """

        pg_hook.run(add_apollos_ids)
Example #26
def load_data(table_name, filepath, **kwargs):
    hook = PostgresHook(postgres_conn_id=POSTGRES_CONN)
    df = pd.read_csv(filepath, dtype=TYPES)
    hook.insert_rows(table_name, rows=df.values.tolist())
Example #27
def fetch_and_save_content_items(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    fetched_all = False
    skip = 0
    top = 10000

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    while not fetched_all:
        # Fetch people records from Rock.

        params = {
            "$top": top,
            "$skip": skip,
            # "$expand": "Photo",
            # "$select": "Id,Content",
            "loadAttributes": "expanded",
            "attributeKeys": "Summary",
            "$orderby": "ModifiedDateTime desc",
        }

        if not kwargs['do_backfill']:
            params[
                '$filter'] = f"ModifiedDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}' or ModifiedDateTime eq null"

        print(params)

        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/ContentChannelItems",
            params=params,
            headers=headers)
        rock_objects = r.json()

        if not isinstance(rock_objects, list):
            print(rock_objects)
            print("oh uh, we might have made a bad request")
            print("top: {top}")
            print("skip: {skip}")
            skip += top
            continue

        skip += top
        fetched_all = len(rock_objects) < top

        config = Variable.get(kwargs['client'] + "_rock_config",
                              deserialize_json=True)

        # "createdAt","updatedAt", "originId", "originType", "apollosType", "summary", "htmlContent", "title", "publishAt"
        def update_content(obj):
            return (
                kwargs['execution_date'],
                kwargs['execution_date'],
                obj['Id'],
                'rock',
                get_typename(obj, config),
                create_summary(obj),
                create_html_content(obj),
                obj['Title'],
                obj['StartDateTime'],
            )

        def fix_casing(col):
            return "\"{}\"".format(col)

        content_to_insert = list(map(update_content, rock_objects))
        columns = list(
            map(fix_casing, ("createdAt", "updatedAt", "originId",
                             "originType", "apollosType", "summary",
                             "htmlContent", "title", 'publishAt')))

        pg_hook.insert_rows('"contentItems"',
                            content_to_insert,
                            columns,
                            0,
                            True,
                            replace_index=('"originId"', '"originType"'))

        add_apollos_ids = """
        UPDATE "contentItems"
        SET "apollosId" = "apollosType" || ':' || id::varchar
        WHERE "originType" = 'rock' and "apollosId" IS NULL
        """

        pg_hook.run(add_apollos_ids)
Example #28
def fetch_and_save_media(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {
        "Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")
    }

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(postgres_conn_id=pg_connection,
                           keepalives=1,
                           keepalives_idle=30,
                           keepalives_interval=10,
                           keepalives_count=5)

    def get_content_item_id(rockOriginId):
        return pg_hook.get_first(
            f'SELECT id FROM "contentItems" WHERE "originId"::Integer = {rockOriginId}'
        )[0]

    def mapContentItems(contentItem):

        nodeId = get_content_item_id(contentItem['Id'])

        def filter_media_attributes(attribute):
            return is_media_image(attribute, contentItem) or is_media_video(
                attribute, contentItem) or is_media_audio(
                    attribute, contentItem)

        def get_media_type(attribute):
            if (is_media_image(attribute, contentItem)):
                return 'IMAGE'
            elif (is_media_video(attribute, contentItem)):
                return 'VIDEO'
            elif (is_media_audio(attribute, contentItem)):
                return 'AUDIO'
            else:
                return 'UNKNOWN_MEDIA_TYPE'

        def get_media_value(attribute):
            mediaType = get_media_type(attribute)
            attributeKey = attribute['Key']
            attributeValue = contentItem['AttributeValues'][attributeKey][
                'Value']

            if (mediaType == 'IMAGE'):
                rock_host = (Variable.get(kwargs['client'] +
                                          '_rock_api')).split("/api")[0]
                return rock_host + "/GetImage.ashx?guid=" + attributeValue if len(
                    attributeValue) > 0 else ''
            else:
                return attributeValue

        def map_attributes(attribute):
            attributeKey = attribute['Key']
            attributeFieldType = attribute['FieldTypeId']
            attributeValue = contentItem['AttributeValues'][attributeKey][
                'Value']
            attributeValueId = str(contentItem['Id']) + "/" + str(
                attribute['Id'])
            mediaType = get_media_type(attribute)
            mediaValue = get_media_value(attribute)
            metadata = {}

            if (mediaType == 'IMAGE' and mediaValue):
                imageDimensions = getsizes(mediaValue)
                metadata['width'] = imageDimensions[0]
                metadata['height'] = imageDimensions[1]

            if (mediaValue):
                return ('Media', kwargs['execution_date'],
                        kwargs['execution_date'], nodeId, 'ContentItem',
                        mediaType, mediaValue, attributeValueId, 'rock',
                        json.dumps(metadata))

            return None

        filteredAttributes = filter(filter_media_attributes,
                                    contentItem['Attributes'].values())
        mappedAttributes = map(map_attributes, filteredAttributes)

        return list(filter(lambda media: bool(media), mappedAttributes))

    fetched_all = False
    skip = 0
    top = 10000

    while not fetched_all:
        # Fetch people records from Rock.

        params = {
            "$top": top,
            "$skip": skip,
            # "$select": "Id,Content",
            "loadAttributes": "expanded",
            "$orderby": "ModifiedDateTime desc",
        }
        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/ContentChannelItems",
            params=params,
            headers=headers)

        def fix_casing(col):
            return "\"{}\"".format(col)

        mediaAttributeLists = list(map(mapContentItems, r.json()))
        mediaAttributes = [
            mediaAttribute for sublist in mediaAttributeLists
            for mediaAttribute in sublist
        ]
        columns = list(
            map(fix_casing,
                ('apollosType', 'createdAt', 'updatedAt', 'nodeId', 'nodeType',
                 'type', 'url', 'originId', 'originType', 'metadata')))

        print('Media Items Added: ')
        print(len(list(mediaAttributes)))

        pg_hook.insert_rows('"media"',
                            list(mediaAttributes),
                            columns,
                            0,
                            True,
                            replace_index=('"originId"', '"originType"'))

        add_apollos_ids = """
        UPDATE "media"
        SET "apollosId" = "apollosType" || ':' || id::varchar
        WHERE "originType" = 'rock' and "apollosId" IS NULL
        """

        pg_hook.run(add_apollos_ids)

        skip += top
        fetched_all = len(r.json()) < top
Example #29
def insertRow(tableName, **context):
    userName = context['ti'].xcom_pull(key=None, task_ids=push_current_user)
    hook = PostgresHook()
    hook.insert_rows(table=tableName,
                     rows=[(random.randint(0, 10000000), userName)],
                     target_fields=["id", "userName"])