Example #1
def load_s3_data(ds, **kwargs):
    """
    Load the table from PG and upload it as a parquet to S3.
    """
    bucket = kwargs.get("bucket")
    if bucket:
        logging.info("Uploading data to s3")
        yesterday = pandas.to_datetime(ds) - pandas.Timedelta(days=1)
        sql = f"""
        SELECT *
        FROM "{SCHEMA}"."{TABLE}"
        WHERE DATE(scheduled_depart) = '{str(yesterday.date())}'
        """
        engine = PostgresHook.get_hook(POSTGRES_ID).get_sqlalchemy_engine()
        df = pandas.read_sql_query(sql, engine)
        if len(df) == 0:
            logging.info(f"Got no trips for {yesterday.date()}. Exiting early")
            return

        name = kwargs.get("name",
                          "dash-trips-{}.parquet".format(yesterday.date()))
        path = os.path.join("/tmp", name)
        # Write to parquet, allowing timestamps to be truncated to millisecond.
        # This is much more precision than we will ever need or get.
        df.to_parquet(path, allow_truncated_timestamps=True)
        s3con = S3Hook(S3_ID)
        s3con.load_file(path, f"dash/{name}", bucket, replace=True)
        os.remove(path)
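A minimal sketch (DAG id, schedule, and bucket name are assumptions) of how a callable like load_s3_data could be wired into a DAG, so that Airflow supplies ds from the task context and the bucket via op_kwargs:
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    "dash_trips",                       # hypothetical DAG id
    start_date=datetime(2019, 1, 1),
    schedule_interval="@daily",
)

load_s3_task = PythonOperator(
    task_id="load_s3_data",
    python_callable=load_s3_data,
    provide_context=True,               # Airflow 1.x: pass ds and the rest of the context as kwargs
    op_kwargs={"bucket": "my-bucket"},  # hypothetical bucket name
    dag=dag,
)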
Example #2
def create_table(**kwargs):
    """
    Create the schema/tables to hold the hare data.
    """
    logging.info("Creating tables")
    engine = PostgresHook.get_hook(POSTGRES_ID).get_sqlalchemy_engine()
    if not engine.dialect.has_schema(engine, SCHEMA):
        engine.execute(sqlalchemy.schema.CreateSchema(SCHEMA))
    metadata.create_all(engine)
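create_all only works if the module-level metadata object has the table definitions registered on it. A minimal sketch, assuming a single dash_trips table; the column names come from Example #3, and the schema/table names are placeholders:
import sqlalchemy

SCHEMA = "transportation"  # hypothetical schema name
TABLE = "dash_trips"       # hypothetical table name

metadata = sqlalchemy.MetaData(schema=SCHEMA)

dash_trips = sqlalchemy.Table(
    TABLE,
    metadata,
    # A primary key on trip_id is what lets the ON CONFLICT DO NOTHING
    # insert in Example #3 skip rows that were already loaded.
    sqlalchemy.Column("trip_id", sqlalchemy.BigInteger, primary_key=True),
    sqlalchemy.Column("run_name", sqlalchemy.String),
    sqlalchemy.Column("vehicle_name", sqlalchemy.String),
    sqlalchemy.Column("scheduled_arrive", sqlalchemy.DateTime(timezone=True)),
    sqlalchemy.Column("scheduled_depart", sqlalchemy.DateTime(timezone=True)),
)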
Example #3
def load_pg_data(ds, **kwargs):
    """
    Query trips data from the Syncromatics REST API and upload it to Postgres.
    """

    # Fetch the data from the rest API for the previous day.
    DOWNTOWN_DASH_ID = "LADOTDT"
    token = get_bearer_token()
    yesterday = str(pandas.to_datetime(ds) - timedelta(1))
    logging.info(f"Fetching DASH data for {yesterday}")
    r = requests.get(
        f"https://track-api.syncromatics.com/1/{DOWNTOWN_DASH_ID}"
        f"/exports/stop_times.json?start={yesterday}&end={yesterday}",
        headers={"Authorization": f"Bearer {token}"},
    )
    time_cols = ["arrive", "depart", "scheduled_arrive", "scheduled_depart"]
    df = pandas.read_json(
        r.content,
        convert_dates=time_cols,
        dtype={
            "run_name": str,
            "vehicle_name": str,
            "arrive_variance": float,
            "depart_variance": float,
        },
    )
    # The trips may be zero due to holidays or missing data.
    if len(df) == 0:
        logging.info("No trips found -- is this a holiday?")
        return

    # Drop unnecessary driver info.
    df = df.drop(columns=["driver_first_name", "driver_last_name"])

    # Drop null trip ids and make sure they are integers.
    df = df.dropna(subset=["trip_id"])
    df.trip_id = df.trip_id.astype("int64")

    # Set the timezone to local time with TZ info
    for col in time_cols:
        df[col] = df[col].dt.tz_localize("UTC").dt.tz_convert(LOCAL_TIMEZONE)

    check_columns(dash_trips, df)

    # Upload the final dataframe to Postgres. Since pandas timestamps conform to the
    # datetime interface, psycopg can correctly handle the timestamps upon insert.
    logging.info("Uploading to PG")
    engine = PostgresHook.get_hook(POSTGRES_ID).get_sqlalchemy_engine()
    insert = sqlalchemy.dialects.postgresql.insert(
        dash_trips).on_conflict_do_nothing()
    conn = engine.connect()
    conn.execute(insert, *df.to_dict(orient="records"))
Example #4
    def execute(self, context):
        df = self.get_data_frame()
        if df is not None:
            destination_hook = PostgresHook.get_hook(self.destination_conn_id)
            if self.preoperator:
                self.log.info("Running preoperator")
                self.log.info(self.preoperator)
                destination_hook.run(self.preoperator)

            self.log.info(
                f"Inserting data frame into {self.destination_conn_id}")
            destination_hook.insert_rows(table=self.destination_table,
                                         rows=df.itertuples())
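This execute method belongs to a custom operator; a minimal skeleton (class name and constructor are assumptions) showing where it would live:
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class DataFrameToPostgresOperator(BaseOperator):  # hypothetical name
    """Pull a data frame from a source system and bulk-insert it into Postgres."""

    @apply_defaults  # conventional on Airflow 1.x for default_args handling
    def __init__(self, destination_conn_id, destination_table,
                 preoperator=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.destination_conn_id = destination_conn_id
        self.destination_table = destination_table
        self.preoperator = preoperator

    def get_data_frame(self):
        # The real operator would query its source here and return a pandas
        # DataFrame, or None when there is nothing to load.
        raise NotImplementedError

    # execute(self, context) as defined above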
Example #5
def copy_pg_to_csv(sql, file_name):
    '''
    Export the results of a SQL query to a local, comma-separated CSV file.

    How it works: obtain a Postgres connection via the hook, then call the
    hook's copy_expert method, which exports the query results to the file.
    '''
    filename = os.path.join("/home/administrator/filestorage/", file_name)
    pg_hook = PostgresHook.get_hook("postgres_mock")
    logging.info(f"Exporting query to file {filename}")
    pg_hook.copy_expert(f"copy ({sql}) to stdout delimiter ',' csv header",
                        filename=filename)
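A hypothetical invocation (query and file name are made up):
copy_pg_to_csv("SELECT * FROM orders WHERE order_date = CURRENT_DATE - 1",
               "orders_yesterday.csv")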
Example #6
def copy_table_to_hdfs(**kwargs):
    client = InsecureClient('http://127.0.0.1:50070/', user='******')

    logging.info("Creating dir /bronze on hadoop")
    client.makedirs('/bronze')

    _table_name = kwargs['table_name']
    pg_hook = PostgresHook.get_hook(POSTGRES_CONN_ID)

    with client.write(f'/bronze/{_table_name}.csv') as csv_file:
        logging.info("Exporting table to csv file '%s'", csv_file.name)
        pg_hook.copy_expert(
            f"COPY (SELECT * FROM {_table_name})  TO STDOUT WITH HEADER CSV",
            filename=csv_file)
Example #7
def load_s3_data(**kwargs):
    """
    Load the table from PG and upload it as a parquet to S3.
    """
    bucket = kwargs.get("bucket")
    name = kwargs.get("name", "bikeshare_trips.parquet")
    if bucket:
        logging.info("Uploading data to s3")
        engine = PostgresHook.get_hook(POSTGRES_ID).get_sqlalchemy_engine()
        df = pandas.read_sql_table(TABLE, engine, schema=SCHEMA)
        path = os.path.join("/tmp", name)
        df.to_parquet(path)
        s3con = S3Hook(S3_ID)
        s3con.load_file(path, name, bucket, replace=True)
        os.remove(path)
Example #8
def load_to_postgres(**kwargs):
    """
    Loads the care, care plus response codes
    to postgres as an upsert
    """
    dataset_id = "jvre-2ecm"
    client = Socrata(
        "data.lacity.org",
        SOCRATA_APP_TOKEN,
        username=SOCRATA_USERNAME,
        password=SOCRATA_PASSWORD,
    )
    results = []
    req_count = 0
    page_size = 2000
    data = None
    while data != []:
        data = client.get(
            dataset_id,  # view, limited to correct reason codes
            content_type="json",
            offset=req_count * page_size,
            limit=page_size,
        )
        req_count += 1
        results.append(data)
    df = pd.DataFrame.from_dict(list(chain.from_iterable(results)))
    srid = 4326
    df["latitude"] = pd.to_numeric(df["latitude"])
    df["longitude"] = pd.to_numeric(df["longitude"])
    df["geom"] = df.dropna(subset=["latitude", "longitude"]).apply(
        lambda x: WKTElement(Point(x.longitude, x.latitude).wkt, srid=srid),
        axis=1)
    df = df.drop("location", axis=1)

    # Create the connection
    engine = PostgresHook.get_hook("postgres_default").get_sqlalchemy_engine()

    # Write the dataframe to the database
    df.to_sql(
        "311-cases-homelessness",
        engine,
        schema="public-health",
        if_exists="replace",
        dtype={"geom": Geometry("POINT", srid=srid)},
    )

    return True
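The paging loop above keeps requesting page_size rows at increasing offsets until Socrata returns an empty page. The same idea, sketched as a reusable generator (helper name is an assumption):
def iter_socrata_pages(client, dataset_id, page_size=2000):
    """Yield successive pages of a Socrata dataset until an empty page."""
    offset = 0
    while True:
        page = client.get(dataset_id, content_type="json",
                          offset=offset, limit=page_size)
        if not page:
            return
        yield page
        offset += page_size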
Example #9
def load_s3_data(**kwargs):
    """
    Load the table from PG and upload it as a parquet to S3.
    """
    bucket = kwargs.get("bucket")
    name = kwargs.get("name", "dash_trips.parquet")
    if bucket:
        logging.info("Uploading data to s3")
        engine = PostgresHook.get_hook(POSTGRES_ID).get_sqlalchemy_engine()
        df = pandas.read_sql_table(TABLE, engine, schema=SCHEMA)
        path = os.path.join("/tmp", name)
        # Write to parquet, allowing timestamps to be truncated to millisecond.
        # This is much more precision than we will ever need or get.
        df.to_parquet(path, allow_truncated_timestamps=True)
        s3con = S3Hook(S3_ID)
        s3con.load_file(path, name, bucket, replace=True)
        os.remove(path)
Example #10
def update_code_75(**kwargs):
    # Connecting to ArcGIS
    arcconnection = BaseHook.get_connection("arcgis")
    gis = arcgis.GIS("http://lahub.maps.arcgis.com/", arcconnection.login,
                     arcconnection.password)

    # Getting Code 75 data
    gis_item = gis.content.get(kwargs.get("arcfeatureid"))
    sdf = gis_item.layers[0].query().sdf
    sdf.drop("SHAPE", axis=1, inplace=True)

    # Creating table if does not exist
    engine = PostgresHook.get_hook("postgres_default").get_sqlalchemy_engine()
    engine.connect().execute('CREATE SCHEMA IF NOT EXISTS "public-health"')
    create_table_statement = """CREATE TABLE IF NOT EXISTS "public-health"."code75s" (
    index BIGINT,
    "OBJECTID" BIGINT,
    address TEXT,
    cd FLOAT(53),
    closeddate TEXT,
    createddate TEXT,
    latitude FLOAT(53),
    longitude FLOAT(53),
    reasoncode BIGINT,
    resolutioncode TEXT,
    srnumber TEXT,
    status TEXT
    )"""
    engine.connect().execute(create_table_statement)

    # Deleting old records
    object_id_list = ",".join(list(sdf["OBJECTID"].astype(str)))
    engine.connect().execute(
        'DELETE FROM "public-health".code75s WHERE "OBJECTID" IN (%s)' %
        object_id_list)

    # Sending updates and new records to postgres
    sdf.to_sql(
        "code75s",
        engine,
        schema="public-health",
        if_exists="append",
    )
Example #11
import boto3
import botocore
import mds
import mds.db
import mds.providers
import requests
from airflow import DAG
from airflow.hooks.base_hook import BaseHook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.python_operator import PythonOperator
from mds.api.auth import AuthorizationToken
from mds.versions import Version

POSTGRES_URI = PostgresHook.get_hook("postgres_default").get_uri()


default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2018, 10, 30),
    "email": [
        "*****@*****.**",
        "*****@*****.**",
        "*****@*****.**",
    ],
    "email_on_failure": True,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=2),
def load_to_postgres(**kwargs):
    """
    Loads the care, care plus response codes
    to postgres as an upsert
    """
    dataset_id = "jvre-2ecm"
    SOCRATA_APP_TOKEN = Variable.get("SOCRATA_APP_TOKEN")
    SOCRATA_USERNAME = Variable.get("SOCRATA_USERNAME")
    SOCRATA_PASSWORD = Variable.get("SOCRATA_PASSWORD")

    client = Socrata(
        "data.lacity.org",
        SOCRATA_APP_TOKEN,
        username=SOCRATA_USERNAME,
        password=SOCRATA_PASSWORD,
    )
    results = []
    req_count = 0
    page_size = 2000
    data = None
    while data != []:
        data = client.get(
            dataset_id,  # view, limited to correct reason codes
            content_type="json",
            offset=req_count * page_size,
            limit=page_size,
        )
        req_count += 1
        results.append(data)
    df = pd.DataFrame.from_dict(list(chain.from_iterable(results)))
    srid = 4326
    df["latitude"] = pd.to_numeric(df["latitude"])
    df["longitude"] = pd.to_numeric(df["longitude"])
    df["geom"] = df.dropna(subset=["latitude", "longitude"]).apply(
        lambda x: WKTElement(Point(x.longitude, x.latitude).wkt, srid=srid),
        axis=1)
    df = df.drop("location", axis=1)

    # Create the connection
    engine = PostgresHook.get_hook("postgres_default").get_sqlalchemy_engine()

    # Write the dataframe to the database
    df.to_sql(
        "311-cases-homelessness",
        engine,
        schema="public-health",
        if_exists="replace",
        dtype={"geom": Geometry("POINT", srid=srid)},
    )

    # Grant ita_readonly and iteam_ro permissions to read the table.
    # We need to do this every time due to replacing existing tables,
    # which wipes previous permissions.
    engine.execute(
        'GRANT SELECT ON TABLE "public-health"."311-cases-homelessness" '
        "TO ita_readonly")
    engine.execute(
        'GRANT SELECT ON TABLE "public-health"."311-cases-homelessness" '
        "TO iteam_ro")

    return True
Example #13
def update_code_55(**kwargs):
    # Collecting each dataset
    code_55_closed_last_90 = get_code_55_df(0, "Closed")
    code_55_pending_schedule = get_code_55_df(1, "Pending Scheduled")
    code_55_scheduled = get_code_55_df(2, "Scheduled")

    # Prioritizing data where Closed Last 90 > Scheduled > Pending Schedule
    code_55_pending_schedule = code_55_pending_schedule.loc[
        ~(code_55_pending_schedule["CompositeID"].isin(
            code_55_closed_last_90["CompositeID"]))
        & ~(code_55_pending_schedule["CompositeID"].isin(
            code_55_scheduled["CompositeID"]))]
    code_55_scheduled = code_55_scheduled.loc[
        ~code_55_scheduled["CompositeID"].
        isin(code_55_closed_last_90["CompositeID"])]

    # Merging all data into one dataframe
    df = pd.concat(
        [code_55_closed_last_90, code_55_pending_schedule, code_55_scheduled])

    # Creating table if does not exist
    engine = PostgresHook.get_hook("postgres_default").get_sqlalchemy_engine()
    engine.connect().execute('CREATE SCHEMA IF NOT EXISTS "public-health"')
    create_table_statement = """CREATE TABLE IF NOT EXISTS "public-health"."code55s" (
    index BIGINT,
    "Id" BIGINT,
    "GlobalID" TEXT,
    "Number" TEXT,
    "AssociatedServiceRequestNumber" TEXT,
    "DateCreated" TIMESTAMP WITHOUT TIME ZONE,
    "DateApproved" TIMESTAMP WITHOUT TIME ZONE,
    "ExpirationDate" TIMESTAMP WITHOUT TIME ZONE,
    "Status" BIGINT,
    "Address" TEXT,
    "City" TEXT,
    "ZipCode" TEXT,
    "CrossStreet" TEXT,
    "CouncilDistrict" TEXT,
    "APREC" TEXT,
    "LocationComments" TEXT,
    "SubmittedBy" TEXT,
    "ReportingPerson" TEXT,
    "RPContactNo" TEXT,
    "Details" TEXT,
    "HEAssessmentBy" TEXT,
    "AssessmentContactNo" TEXT,
    "HEAssessmentDetails" TEXT,
    "AssessmentLocationDescription" TEXT,
    "AnchorPhotos" TEXT,
    "DateCompleted" TIMESTAMP WITHOUT TIME ZONE,
    "CleanupDate" TIMESTAMP WITHOUT TIME ZONE,
    "CompositeID" TEXT,
    "Stage" TEXT
    )"""
    engine.connect().execute(create_table_statement)

    # Deleting old records
    object_id_list = "','".join(list(df["CompositeID"]))
    object_id_list = "'" + object_id_list + "'"
    engine.connect().execute(
        'DELETE FROM "public-health".code55s WHERE "CompositeID" IN (%s)' %
        object_id_list)

    # Sending updates and new records to postgres
    df.to_sql(
        "code55s",
        engine,
        schema="public-health",
        if_exists="append",
    )
Example #14
def load_pg_data(**kwargs):
    """
    Load data from the Tableau server and upload it to Postgres.
    """
    # Sign in to the tableau server.
    TABLEAU_SERVER = "https://10az.online.tableau.com"
    TABLEAU_SITENAME = "echo"
    TABLEAU_VERSION = "2.7"
    TABLEAU_USER = Variable.get("BIKESHARE_TABLEAU_USER")
    TABLEAU_PASSWORD = Variable.get("BIKESHARE_TABLEAU_PASSWORD")
    TRIP_TABLE_VIEW_ID = "7530c937-887e-42da-aa50-2a11d279bf51"
    logging.info("Authenticating with Tableau")
    tableau_auth = tableauserverclient.TableauAuth(
        TABLEAU_USER,
        TABLEAU_PASSWORD,
        TABLEAU_SITENAME,
    )
    tableau_server = tableauserverclient.Server(TABLEAU_SERVER,
                                                TABLEAU_VERSION)
    tableau_server.auth.sign_in(tableau_auth)

    # Get the Trips table view. This is a view specifically created for
    # this DAG. Tableau server doesn't allow the download of underlying
    # workbook data via the API (though one can from the UI). This view
    # allows us to get around that.
    logging.info("Loading Trips view")
    all_views, _ = tableau_server.views.get()
    view = next(v for v in all_views if v.id == TRIP_TABLE_VIEW_ID)
    if not view:
        raise Exception("Cannot find the trips table!")
    tableau_server.views.populate_csv(view)
    df = pandas.read_csv(
        io.BytesIO(b"".join(view.csv)),
        parse_dates=["Start Datetime", "End Datetime"],
        thousands=",",
        dtype={
            "Visible ID": str,
            "End Station": str,
            "Start Station": str
        },
    )

    # The data has a weird structure where trip rows are duplicated, with variations
    # on a "Measure" column, containing trip length, duration, etc. We pivot on that
    # column to create a normalized table containing one row per trip.
    logging.info("Cleaning Data")
    df = pandas.merge(
        df.set_index("Trip ID").groupby(level=0).first().drop(
            columns=["Measure Names", "Measure Values"]),
        df.pivot(index="Trip ID",
                 columns="Measure Names",
                 values="Measure Values"),
        left_index=True,
        right_index=True,
    ).reset_index()
    df = df.rename(
        {
            n: n.lower().strip().replace(" ", "_").replace("(", "").replace(
                ")", "")
            for n in df.columns
        },
        axis="columns",
    )
    check_columns(bike_trips, df)

    # Upload the final dataframe to Postgres. Since pandas timestamps conform to the
    # datetime interface, psycopg can correctly handle the timestamps upon insert.
    logging.info("Uploading to PG")
    engine = PostgresHook.get_hook(POSTGRES_ID).get_sqlalchemy_engine()
    insert = sqlalchemy.dialects.postgresql.insert(
        bike_trips).on_conflict_do_nothing()
    conn = engine.connect()
    conn.execute(insert, *df.to_dict(orient="records"))
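To make the pivot step concrete, a toy illustration with made-up values: each trip arrives as several rows that differ only in the "Measure Names"/"Measure Values" pair, and the merge-plus-pivot collapses them into one row per trip.
import pandas

raw = pandas.DataFrame({
    "Trip ID": [1, 1, 2, 2],
    "Start Station": ["A", "A", "B", "B"],
    "Measure Names": ["Duration", "Distance", "Duration", "Distance"],
    "Measure Values": [10, 1.2, 7, 0.8],
})

wide = pandas.merge(
    # One row per trip, keeping the shared columns...
    raw.set_index("Trip ID").groupby(level=0).first().drop(
        columns=["Measure Names", "Measure Values"]),
    # ...joined to the pivoted measures, one column per measure name.
    raw.pivot(index="Trip ID", columns="Measure Names", values="Measure Values"),
    left_index=True,
    right_index=True,
).reset_index()
# wide columns: Trip ID, Start Station, Distance, Duration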