Example #1
    def execute(self, context):
        # use the super method to list all files in the Azure Data Lake path
        files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
        g_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the ADLS path
            # and only keep those files which are present in
            # ADLS and not in Google Cloud Storage
            bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
            existing_files = g_hook.list(bucket=bucket_name, prefix=prefix)
            files = set(files) - set(existing_files)

        if files:
            hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id
            )

            for obj in files:
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    hook.download_file(local_path=f.name, remote_path=obj)
                    f.flush()
                    dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                    dest_path = os.path.join(dest_gcs_prefix, obj)
                    self.log.info("Saving file to %s", dest_path)

                    g_hook.upload(bucket=dest_gcs_bucket, object=dest_path, filename=f.name)

            self.log.info("All done, uploaded %d files to GCS", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to GCS")

        return files
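A note on the `_parse_gcs_url` helper used above: it splits a `gs://bucket/path` URI into a bucket name and an object prefix. A minimal standalone sketch of that behaviour (mirroring the `parse_gcs_url` method shown later in Example #37; the sample URI is a placeholder):

from urllib.parse import urlparse

def _parse_gcs_url(gsurl):
    # Split "gs://<bucket>/<object>" into its bucket and object parts.
    parsed = urlparse(gsurl)
    return parsed.netloc, parsed.path.lstrip('/')

# e.g. _parse_gcs_url("gs://example-bucket/adls/raw/") -> ("example-bucket", "adls/raw/")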
Example #2
 def _upload_to_gcs(self, files_to_upload):
     """
     Upload all of the file splits (and optionally the schema .json file) to
     Google cloud storage.
     """
     hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                                   delegate_to=self.delegate_to)
     for object, tmp_file_handle in files_to_upload.items():
         hook.upload(self.bucket, object, tmp_file_handle.name, 'application/json')
Example #3
 def _upload_to_gcs(self, files_to_upload):
     """
     Upload all of the file splits (and optionally the schema .json file) to
     Google cloud storage.
     """
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
         delegate_to=self.delegate_to)
     for tmp_file in files_to_upload:
         hook.upload(self.bucket, tmp_file.get('file_name'),
                     tmp_file.get('file_handle').name,
                     mime_type=tmp_file.get('file_mime_type'))
Example #4
    def execute(self, context):
        """
        Uploads the file to Google cloud storage
        """
        hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)

        hook.upload(
            bucket=self.bucket,
            object=self.dst,
            mime_type=self.mime_type,
            filename=self.src)
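For comparison, the same upload can be done by calling the hook directly rather than through an operator. A minimal sketch, with the connection id, bucket, object and local file path all placeholders:

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

hook = GoogleCloudStorageHook(google_cloud_storage_conn_id='google_cloud_default')
hook.upload(bucket='example-bucket',        # destination bucket (placeholder)
            object='reports/report.csv',    # destination object name (placeholder)
            filename='/tmp/report.csv',     # local file to upload (placeholder)
            mime_type='text/csv')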
Example #5
 def _upload_to_gcs(self, files_to_upload):
     """
     Upload all of the file splits (and optionally the schema .json file) to
     Google cloud storage.
     """
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
         delegate_to=self.delegate_to)
     for tmp_file in files_to_upload:
         hook.upload(self.bucket,
                     tmp_file.get('file_name'),
                     tmp_file.get('file_handle').name,
                     mime_type=tmp_file.get('file_mime_type'),
                     gzip=self.gzip if tmp_file.get('file_name')
                     == self.schema_filename else False)
Example #6
 def execute(self, context):
     gcp_text_to_speech_hook = GCPTextToSpeechHook(gcp_conn_id=self.gcp_conn_id)
     result = gcp_text_to_speech_hook.synthesize_speech(
         input_data=self.input_data,
         voice=self.voice,
         audio_config=self.audio_config,
         retry=self.retry,
         timeout=self.timeout,
     )
     with NamedTemporaryFile() as temp_file:
         temp_file.write(result.audio_content)
         cloud_storage_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id)
         cloud_storage_hook.upload(
             bucket=self.target_bucket_name, object=self.target_filename, filename=temp_file.name
         )
Example #7
    def execute(self, context):
        # get data from cloud function API
        httphook = HttpHook(method=self.method, http_conn_id=self.http_conn_id)
        response = httphook.run(endpoint=self.endpoint)
        # store data locally in a temp file
        with NamedTemporaryFile() as tempfile:
            tempfile.write(response.content)
            tempfile.flush()

            # upload to bucket
            gcshook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)
            gcshook.upload(bucket=self.gcs_bucket,
                           object=self.gcs_path,
                           filename=tempfile.name)
Example #8
    def execute(self, context):
        """
        Uploads the file to Google cloud storage
        """
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)

        hook.upload(
            bucket_name=self.bucket,
            object_name=self.dst,
            mime_type=self.mime_type,
            filename=self.src,
            gzip=self.gzip,
        )
Example #9
    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)

        self.log.info("Calling HTTP method")
        response = http.run(self.endpoint)

        with NamedTemporaryFile() as tmp_file_handle:
            tmp_file_handle.write(response.content)
            tmp_file_handle.flush()

            hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcs_conn_id)
            hook.upload(bucket=self.bucket,
                        object=self.gcs_path,
                        filename=tmp_file_handle.name)
Example #10
def get_weather(**kwargs):
    """
    Query openweathermap.com's API to get the weather for
    Jakarta, ID, and then dump the JSON to the /src/data/ directory
    with the file name "<today's date>.json"
    """

    # My API key is defined in my config.py file.
    parameters = {'q': 'Jakarta, ID', 'appid': API_KEY}
    logging.info("API_KEY={}".format(API_KEY))

    result = requests.get("http://api.openweathermap.org/data/2.5/weather?",
                          parameters)

    # If the API call was successful, get the json and dump it to a file with
    # today's date as the title.
    if result.status_code == 200:

        # Get the json data
        json_data = result.json()
        logging.info("Response from API: {}".format(json_data))

        # Save output file
        file_name = str(kwargs["execution_date"]) + '.json'
        dir_path = os.path.join(os.path.dirname(__file__), '..', '..', 'data',
                                kwargs["dag"].dag_id, kwargs["task"].task_id)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        tot_name = os.path.join(dir_path, file_name)
        logging.info("Will write output to {}".format(tot_name))

        with open(tot_name, 'w') as outputfile:
            json.dump(json_data, outputfile)
            logging.info("Successfully write local output file")

        # upload to GCS
        gcs = GoogleCloudStorageHook('gcp_airflow_lab')
        gcs_dest_object = os.path.join(kwargs["dag"].dag_id,
                                       kwargs["task"].task_id, file_name)
        gcs.upload(GCS_BUCKET,
                   gcs_dest_object,
                   tot_name,
                   mime_type='application/octet-stream')
        logging.info(
            "Successfully write output file to GCS: gs://{}/{}".format(
                GCS_BUCKET, gcs_dest_object))
    else:
        raise ValueError('Error in API call.')
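Since `get_weather` reads `execution_date`, `dag` and `task` from `**kwargs`, it needs the task context passed in when it is wired into a DAG. A minimal sketch of that wiring, using the pre-Airflow-2 `provide_context` flag (the task id is a placeholder and `dag` is assumed to be defined elsewhere in the file):

from airflow.operators.python_operator import PythonOperator

get_weather_task = PythonOperator(
    task_id='get_weather',
    python_callable=get_weather,
    provide_context=True,  # exposes execution_date, dag, task, etc. via kwargs
    dag=dag,
)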
Example #11
 def execute(self, context):
     hook = GCPTextToSpeechHook(gcp_conn_id=self.gcp_conn_id)
     result = hook.synthesize_speech(
         input_data=self.input_data,
         voice=self.voice,
         audio_config=self.audio_config,
         retry=self.retry,
         timeout=self.timeout,
     )
     with NamedTemporaryFile() as temp_file:
         temp_file.write(result.audio_content)
         cloud_storage_hook = GoogleCloudStorageHook(
             google_cloud_storage_conn_id=self.gcp_conn_id)
         cloud_storage_hook.upload(bucket_name=self.target_bucket_name,
                                   object_name=self.target_filename,
                                   filename=temp_file.name)
Example #12
    def execute(self, context):
        response = super().execute(context)

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)

        with tempfile.NamedTemporaryFile(prefix="gcs-local") as file:
            file.write(response.encode('utf-8'))
            file.flush()

            hook.upload(
                bucket=self.bucket,
                filename=file.name,
                object=self.filename,
                mime_type=self.mime_type
            )
Example #13
def simpleNumpyToGCS(csv_name: str,
                     folder_name: str,
                     bucket_name="airflow-gcp-bucket",
                     **kwargs):
    hook = GoogleCloudStorageHook()

    data = {'col1': [1, 2], 'col2': [3, 4]}

    df = pd.DataFrame(data=data)

    df.to_csv('example1.csv', index=False)

    hook.upload(bucket_name,
                object='{}/{}.csv'.format(folder_name, csv_name),
                filename='example1.csv',
                mime_type='text/csv')
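`simpleNumpyToGCS` takes its object name, folder and bucket as arguments, so a DAG would typically pass them via `op_kwargs`. A minimal sketch (task id and folder name are placeholders, `dag` assumed defined elsewhere):

from airflow.operators.python_operator import PythonOperator

upload_csv = PythonOperator(
    task_id='upload_example_csv',
    python_callable=simpleNumpyToGCS,
    op_kwargs={
        'csv_name': 'example1',           # stored as <folder_name>/example1.csv
        'folder_name': 'daily_exports',   # placeholder folder
        'bucket_name': 'airflow-gcp-bucket',
    },
    dag=dag,
)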
Example #14
    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)
        gchook = GoogleCloudStorageHook()

        self.log.info("Calling HTTP method")

        response = http.run(self.endpoint, self.data, self.headers,
                            self.extra_options)
        if self.log_response:
            self.log.info(response.text)
        if self.response_check:
            if not self.response_check(response):
                raise AirflowException("Response check returned False.")
        f = open("aaaa", "w")
        f.write(response.text)
        f.close()
        gchook.upload(object="bucketie", filename="aaaa", bucket="buckster")
Example #15
class HttpToGcsOperator(BaseOperator):
    """
    Calls an endpoint on an HTTP system to execute an action

    :param http_conn_id: The connection to run the operator against
    :type http_conn_id: string
    :param endpoint: The relative part of the full url. (templated)
    :type endpoint: string
    :param gcs_path: The path of the GCS to store the result
    :type gcs_path: string
    """

    template_fields = ('endpoint', 'gcs_path')
    template_ext = ()
    ui_color = '#f4a460'

    @apply_defaults
    def __init__(self, endpoint, gcs_path, http_conn_id, gcs_conn_id, *args,
                 **kwargs):
        super(HttpToGcsOperator, self).__init__(*args, **kwargs)

        self.http_conn_id = http_conn_id
        self.gcs_conn_id = gcs_conn_id
        self.gcs_path = gcs_path
        self.endpoint = endpoint

    def execute(self, context):

        self.http_hook = HttpHook(method='GET', http_conn_id=self.http_conn_id)
        self.gcs_hook = GoogleCloudStorageHook(self.gcs_conn_id)

        bucket, blob = self.gcs_hook._parse_gcs_url(self.gcs_path)

        # Parse the query into components, extract query part
        parsed = urlparse(self.endpoint)
        base_url = urlunparse(list(parsed[:4]) + ["", ""])

        # Create a temporary file to stage the response before uploading
        with tempfile.NamedTemporaryFile() as fp:
            # Get the response and write its content to the temporary file
            response = self.http_hook.run(base_url, data=parsed.query)
            fp.write(response.content)
            fp.flush()

            # Upload the file to storage
            self.gcs_hook.upload(bucket, blob, filename=fp.name)
Example #16
    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)

        self.log.info("Calling HTTP method")

        response = http.run(self.endpoint,
                            self.data,
                            self.headers,
                            self.extra_options)
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        local_filename = '/tmp/' + self.filename
        with open(local_filename, "w") as f:
            f.write(response.text)
        hook.upload(bucket=self.bucket, object=self.filename,
                    filename=local_filename, mime_type='application/json')
Example #17
    def execute(self, context):
        # build moat tile
        logging.info('Instantiate Moat Tile')
        tile = MoatTile(self.brand_id, self.level_filter, self.dimensions)

        token = Variable.get('rtf_moat_token')
        ## somehow get token in a better way

        logging.info('Fetch Token')

        filter_id = None
        if self.level_filter:
            filter_id = [*self.level_filter.values()][0]

        time.sleep(random.randint(1, 5))  # random jitter to stagger concurrent requests

        filename = tile.get_data(self.s, self.e, token)

        if filename:
            logging.info('Response Saved Locally @ {}'.format(filename))

        else:
            logging.error('No Response')
            raise AirflowSkipException()

        file_tokens = [self.brand_id, filter_id, self.suffix]

        blob_name = "_".join([str(x) for x in file_tokens
                              if x])  ## PRETTY CLEVER KYLE

        if self.prefix:
            blob_name = str(self.prefix) + blob_name + '.json'

        hook = GoogleCloudStorageHook()

        hook.upload(bucket=self.bucket, object=blob_name,
                    filename=filename)  ## docs don't match repo

        logging.info("{} uploaded to {}".format(blob_name, self.bucket))
        os.remove(filename)

        logging.info("{} deleted from local".format(filename))

        return (
            self.bucket, blob_name
        )  ## should get pushed to xcom if do_xcom_push is set to True in baseclass
Example #18
def text2speech(**kwargs):
    ti = kwargs['ti']
    data = {"message": ti.xcom_pull(task_ids="input")}
    response = requests.post(
        "https://us-central1-devops-218113.cloudfunctions.net/Text2Speech",
        json=data)
    fileName = str(uuid.uuid4())
    with open(fileName, "wb") as outfile:
        outfile.write(response.content)

    gcs = GoogleCloudStorageHook()
    gcs.upload("workflowstorage",
               fileName,
               fileName,
               mime_type='application/octet-stream')

    os.remove(fileName)
    return fileName
Example #19
def compression(**kwargs):
    ti = kwargs['ti']
    fileName = ti.xcom_pull(task_ids="conversion")
    gcs = GoogleCloudStorageHook()
    gcs.download("workflowstorage", fileName, fileName)
    file = {"to_compress": open(fileName, 'rb')}
    response = requests.post(
        "https://us-central1-devops-218113.cloudfunctions.net/Compression",
        files=file)
    newFileName = str(uuid.uuid4())
    with open(newFileName, "wb") as outfile:
        outfile.write(response.content)
    gcs.upload("workflowstorage",
               newFileName,
               newFileName,
               mime_type='application/octet-stream')
    os.remove(newFileName)
    return newFileName
Example #20
    def execute(self, context):
        gcshook = GoogleCloudStorageHook(self.gcp_conn_id)
        self.log.info(gcshook.list("testcovidlinh"))     

        # Create a temporary folder
        if not os.path.exists("tmp"):
            os.mkdir("tmp")
        
        # Track failure 
        failure_count = 0

        # Passing filename to next job
        file_list = []

        # Consume API
        for state in self.state_code: 
            URL = "https://covidtracking.com/api/v1/states/" + state.lower() + "/daily.json"
            # self.log.info(URL)
            response = requests.get(URL).json()
            
            try:
                # If the response contains an error message, count the failure and skip this state
                self.log.info(response["message"])
                failure_count += 1
                continue
            except:
                # Otherwise the request succeeded; write the records as newline-delimited JSON
                filename = "tmp/"+state+".json"
                # self.log.info(filename)
                with open(filename,'w', encoding='utf-8') as f:
                    dict2str = [json.dumps(i,sort_keys=True) for i in response]
                    json_output = "\n".join(dict2str)
                    f.write(json_output)
                    # json.dump(response, f, ensure_ascii=False)
                
                object_name = 'US-' + state + "/" + "covidstat.json"
                file_list.append(object_name)
                gcshook.upload(bucket=self.gcs_bucket, object=object_name, filename=filename)
        
        self.log.info("Number of failure cases: "+str(failure_count))

        task_instance = context['task_instance']
        task_instance.xcom_push(self.xcom_task_id_key, file_list)
Example #21
    def execute(self, context):
        http = HttpHook(self.method, http_conn_id=self.http_conn_id)

        response = http.run(self.endpoint, self.data, self.headers,
                            self.extra_options)

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to,
        )

        tmp_file_handle = NamedTemporaryFile(delete=True)
        tmp_file_handle.write(response.content)
        tmp_file_handle.flush()

        hook.upload(self.bucket, self.filename, tmp_file_handle.name,
                    "application/json")

        tmp_file_handle.close()
Example #22
    def _upload_to_gcs(self, tmp_file_handles):
        """
        Upload all of the file splits (and optionally the schema .json file) to
        Google cloud storage.
        """
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        for object_name, tmp_file_handle in tmp_file_handles.items():
            # File is not empty
            if tmp_file_handle.tell() > 0:
                self.log.info(
                    f'Uploading file {tmp_file_handle.name} to GCS as gs://{self.bucket}/{object_name}'
                )
                hook.upload(self.bucket, object_name, tmp_file_handle.name,
                            'application/json',
                            (self.gzip if object_name != self.schema_filename
                             else False))
Example #23
    def execute(self, context):
        gcs_hook = GoogleCloudStorageHook()

        # splitting the file path to extract the desired parts (which should be a path like gs://bucket/path/file.csv)
        file_parts = self.gcs_file_path.split('/')
        # gets the bucket
        bucket = file_parts[2]
        # getting the path to the file
        file_path = '/'.join(file_parts[3:-1])
        # getting the file name
        file_name = file_parts[-1]

        # setting the local path with a "Pre" and preparing a processed path for the file
        local_file_path = '/home/airflow/gcs/data/Pre_{}'.format(file_name)
        prepared_file_path = '/home/airflow/gcs/data/{}'.format(file_name)

        # obtaining the Geocode job id
        task_instance = context['task_instance']
        create_resp = task_instance.xcom_pull(task_ids=self.create_job_task)
        job_id = create_resp['resourceSets'][0]['resources'][0]['id']

        # calling and downloading the file
        bm_hook = BingMapsHook(bing_maps_conn_id=self.bing_maps_conn_id)

        method = '{}/output/succeeded'.format(job_id)
        bm_hook.call(method=method,
                     api_params={},
                     operation='GET',
                     file_path=local_file_path)

        # processing the file and uploading to the bucket
        with open(local_file_path, 'r') as rf:
            with open(prepared_file_path, 'w') as wf:
                for num, line in enumerate(rf, 1):
                    if num == 1:
                        pass
                    elif num == 2:
                        wf.write(line.replace('/', '_'))
                    else:
                        wf.write(line)

        gcs_hook.upload(bucket, '{}/{}'.format(file_path, file_name),
                        prepared_file_path)
Example #24
    def generateSchema(self, keyword, stagetable_flag=True):
        """
            Generate schema for bigquery
        """

        schema_json = [{
            "name": "date",
            "type": "STRING"
        }, {
            "name": "state",
            "type": "STRING"
        }]
        data_type = 'STRING'
        file_path = self.AIRFLOW_HOME + "/tmp/googletrend_schema.json"
        if stagetable_flag:
            schema_json = [{
                "name": "date",
                "type": "STRING"
            }, {
                "name": "state",
                "type": "STRING"
            }]
            data_type = 'STRING'
            file_path = self.AIRFLOW_HOME + "/tmp/googletrend_schema_stage.json"

        d = {}
        print(keyword)
        for word in keyword:
            d["name"] = word.replace(" ", "_")
            d["type"] = data_type
            schema_json.append(d)
            d = {}

        with open(file_path, "w") as f:
            json.dump(schema_json, f, indent=4)

        # Upload schema to GCS
        object_name = "googletrend_schema.json"
        gcshook = GoogleCloudStorageHook(self.gcp_conn_id)
        gcshook.upload(bucket=self.gcs_bucket,
                       object=object_name,
                       filename=file_path)
Example #25
    def execute(self, context):
        self.log.info("Fetching launch data")
        launch_hook = LaunchHook(conn_id=self._launch_conn_id)
        result = launch_hook.get_launches(start_date=self._start_date,
                                          end_date=self._end_date)
        self.log.info("Fetched data for %d launches", len(result))

        self.log.info("Uploading data to gcs://%s/%s", self._output_bucket,
                      self._output_path)
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self._gcp_conn_id)

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = os.path.join(tmp_dir, "result.json")
            with open(tmp_path, "w") as file_:
                json.dump(result, file_)

            gcs_hook.upload(bucket=self._output_bucket,
                            object=self._output_path,
                            filename=tmp_path)
Example #26
def load_table(**kwargs):
    """
    Processes the transformed CSV data and inserts the rows into the
    Postgres database.
    """

    pg_hook = PostgresHook(postgres_conn_id='weatherdb_postgres_conn')
    gcs = GoogleCloudStorageHook('gcp_airflow_lab')
    prev_task_id = 'transform_data'

    # Set source file
    source_file_name = str(kwargs["execution_date"]) + '.csv'
    source_dir_path = os.path.join(os.path.dirname(__file__), '..', '..',
                                   'data', kwargs["dag"].dag_id, prev_task_id)
    source_full_path = os.path.join(source_dir_path, source_file_name)

    # download from GCS
    gcs_src_object = os.path.join(kwargs["dag"].dag_id, prev_task_id,
                                  source_file_name)
    gcs.download(GCS_BUCKET,
                 gcs_src_object,
                 source_full_path)
    logging.info("Successfully downloaded file from GCS: gs://{}/{}".format(
        GCS_BUCKET, gcs_src_object))

    # open the csv source file and read it in
    with open(source_full_path, 'r') as inputfile:
        csv_reader = csv.reader(inputfile, delimiter=',')
        for row in csv_reader:
            insert_cmd = """INSERT INTO weather 
                            (city, country, latitude, longitude,
                            todays_date, humidity, pressure, 
                            min_temp, max_temp, temp, weather)
                            VALUES
                            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""

            pg_hook.run(insert_cmd, parameters=row)
            logging.info(
                "Successfully inserted into the database using command: {}".format(
                    insert_cmd))
Example #27
def write_str_to_gcp(string: str,
                     gcp_path: str,
                     conn_id: str = 'google_cloud_default'):
    """Dump a string into a file in google bucket"""
    storage_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=conn_id)
    destination_uri = urlparse(gcp_path)
    with tempfile.TemporaryDirectory() as tmp_folder:
        temp_path_abs = os.path.join(tmp_folder, 'config_file')
        with open(temp_path_abs, 'w') as f:
            f.write(string)

        if destination_uri.path.startswith('/'):
            destination_path = destination_uri.path[1:]
        else:
            destination_path = destination_uri.path

        storage_hook.upload(
            destination_uri.netloc,
            destination_path,
            temp_path_abs
        )
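Calling the helper is straightforward; a usage sketch (bucket and object path are placeholders):

write_str_to_gcp('{"retries": 3}', 'gs://example-bucket/configs/job_config.json')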
Example #28
def upload_to_gcs(**kwargs):
    """
    Generates a CSV that is then uploaded to Google Cloud Storage using the
    GoogleCloudStorageHook.

    This is meant to imitate the first step of a traditional ETL DAG: ingesting
    data from some external source.

    """

    df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
                      columns=['col_a', 'col_b', 'col_c', 'col_d'])

    df.to_csv('test_data.csv', index=False)

    hook = GoogleCloudStorageHook(google_cloud_storage_conn_id='astro_gcs')

    hook.upload(bucket='psl-poc-viraj',
                object='test_data.csv',
                filename='test_data.csv',
                mime_type='text/plain')
Example #29
def download_and_transform_erf(self, partner_id=None):
  """Load and Transform ERF files to Newline Delimeted JSON.

  Then upload this file to the project GCS.

  Args:
    self: The operator this is being used in.
    partner_id: A string of the DCM id of the partner.

  Returns:
    entity_read_file_ndj: The filename for the converted entity read file.
  """
  if partner_id:
    self.erf_bucket = 'gdbm-%s' % partner_id
  else:
    self.erf_bucket = 'gdbm-public'

  gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id)
  entity_read_file = tempfile.NamedTemporaryFile(delete=False)
  gcs_hook.download(self.erf_bucket, self.erf_object, entity_read_file.name)
  temp_file = None
  # Creating temp file. Not using the delete-on-close functionality
  # as opening the file for reading while still open for writing
  # will not work on all platforms
  # https://docs.python.org/2/library/tempfile.html#tempfile.NamedTemporaryFile
  try:
    temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
    temp_file.writelines(json_to_jsonlines(entity_read_file.name))
    temp_file.close()
    # Random here used as a nonce for writing multiple files at once.
    filename = '%s_%s_%d.json' % (randint(1, 1000000), self.entity_type,
                                  time.time() * 1e+9)
    gcs_hook.upload(self.gcs_bucket, filename, temp_file.name)

  finally:
    if temp_file:
      temp_file.close()
      os.unlink(temp_file.name)

  return filename
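The `json_to_jsonlines` helper is not shown in this example; presumably it converts the downloaded entity read file (a JSON array) into newline-delimited JSON lines that `writelines` can write. A hedged sketch of what such a helper could look like:

import json

def json_to_jsonlines(json_file_path):
    # Read a file containing a JSON array and return one JSON document per line,
    # each terminated by a newline, as expected by writelines() above.
    with open(json_file_path) as f:
        records = json.load(f)
    return [json.dumps(record) + '\n' for record in records]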
Example #30
    def execute(self, context):
        # use the super method to list all files in the Azure Data Lake path
        files = super().execute(context)
        g_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the ADLS path
            # and only keep those files which are present in
            # ADLS and not in Google Cloud Storage
            bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
            existing_files = g_hook.list(bucket_name=bucket_name, prefix=prefix)
            files = set(files) - set(existing_files)

        if files:
            hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id
            )

            for obj in files:
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    hook.download_file(local_path=f.name, remote_path=obj)
                    f.flush()
                    dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                    dest_path = os.path.join(dest_gcs_prefix, obj)
                    self.log.info("Saving file to %s", dest_path)

                    g_hook.upload(
                        bucket_name=dest_gcs_bucket,
                        object_name=dest_path,
                        filename=f.name,
                        gzip=self.gzip
                    )

            self.log.info("All done, uploaded %d files to GCS", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to GCS")

        return files
Example #31
    def execute(self, context):
        facebook_conn = FacebookAdsHook(self.facebook_conn_id)
        gcs_conn = GoogleCloudStorageHook(self.gcs_conn_id)

        time_range = {
            "since":
            datetime.strptime(self.since,
                              "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d"),
            "until":
            datetime.strptime(self.until,
                              "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d"),
        }

        file_name = "/tmp/{key}.jsonl".format(key=self.gcs_key)
        with open(file_name, "w") as insight_file:
            for account_id in self.account_ids:
                insights = facebook_conn.get_insights_for_account_id(
                    account_id,
                    self.insight_fields,
                    self.breakdowns,
                    time_range,
                    self.time_increment,
                    self.level,
                    self.limit,
                )

                if len(insights) > 0:
                    for insight in insights[:-1]:
                        insight_file.write(json.dumps(insight) + "\n")
                    insight_file.write(json.dumps(insights[-1:][0]))
                else:
                    return

        gcs_conn.upload(filename=file_name,
                        bucket=self.gcs_bucket,
                        object=self.gcs_key,
                        gzip=True)
        os.remove(file_name)
Example #32
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

        logging.info('Extracting data from Hive')
        logging.info(self.hql)

        data = hive.get_pandas_df(self.hql, schema=self.schema)
        gcp_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)
        logging.info('Inserting rows onto google cloud storage')

        with tempfile.NamedTemporaryFile(suffix='.json', prefix='tmp') as tmp_file:
            data = data.to_json(orient='records')
            recs = json.loads(data)
            for record in recs:
                tmp_file.write(json.dumps(record))
                tmp_file.write("\n")
            tmp_file.flush()

            remote_file_name = self.file_pattern.format('aa')
            remote_name = os.path.join(self.subdir, remote_file_name)
            gcp_hook.upload(self.bucket, remote_name, tmp_file.name)

        logging.info('Done.')
Example #33
    def execute(self, context):
        ga_conn = GoogleAnalyticsHook(self.google_analytics_conn_id)
        gcs_conn = GoogleCloudStorageHook(self.gcs_conn_id)
        try:
            since_formatted = datetime.strptime(self.since, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
        except:
            since_formatted = str(self.since)
        try:
            until_formatted = datetime.strptime(self.until, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
        except:
            until_formatted = str(self.until)
        report = ga_conn.get_analytics_report(self.view_id,
                                              since_formatted,
                                              until_formatted,
                                              self.sampling_level,
                                              self.dimensions,
                                              self.metrics,
                                              self.page_size,
                                              self.include_empty_rows)

        columnHeader = report.get('columnHeader', {})
        # Right now all dimensions are hardcoded to varchar(255), will need a map if any non-varchar dimensions are used in the future
        # Unfortunately the API does not send back types for Dimensions like it does for Metrics (yet..)
        dimensionHeaders = [
            {'name': header.replace('ga:', ''), 'type': 'varchar(255)'}
            for header
            in columnHeader.get('dimensions', [])
        ]
        metricHeaders = [
            {'name': entry.get('name').replace('ga:', ''),
             'type': self.metricMap.get(entry.get('type'), 'varchar(255)')}
            for entry
            in columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
        ]

        with NamedTemporaryFile("w") as ga_file:
            rows = report.get('data', {}).get('rows', [])

            for row_counter, row in enumerate(rows):
                root_data_obj = {}
                dimensions = row.get('dimensions', [])
                metrics = row.get('metrics', [])

                for index, dimension in enumerate(dimensions):
                    header = dimensionHeaders[index].get('name').lower()
                    root_data_obj[header] = dimension

                for metric in metrics:
                    data = {}
                    data.update(root_data_obj)

                    for index, value in enumerate(metric.get('values', [])):
                        header = metricHeaders[index].get('name').lower()
                        data[header] = value

                    data['viewid'] = self.view_id
                    data['timestamp'] = self.since

                    ga_file.write(json.dumps(data) + ('' if row_counter == len(rows) - 1 else '\n'))

            ga_file.flush()
            gcs_conn.upload(self.gcs_bucket, self.gcs_object, ga_file.name)
Example #34
 def _upload_to_gcs(self, files_to_upload):
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
         delegate_to=self.delegate_to)
     for object, tmp_file_handle in files_to_upload.items():
         hook.upload(self.bucket, object, tmp_file_handle.name, 'application/json')
Example #35
    def execute(self, context):
        # use the super method to list all the files in an S3 bucket/key
        files = super(S3ToGoogleCloudStorageOperator, self).execute(context)

        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.dest_gcs_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the GCS bucket
            # and only keep those files which are present in
            # S3 and not in Google Cloud Storage
            bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
            existing_files_prefixed = gcs_hook.list(bucket_name,
                                                    prefix=object_prefix)

            existing_files = []

            if existing_files_prefixed:
                # Remove the object prefix itself, an empty directory was found
                if object_prefix in existing_files_prefixed:
                    existing_files_prefixed.remove(object_prefix)

                # Remove the object prefix from all object string paths
                for f in existing_files_prefixed:
                    if f.startswith(object_prefix):
                        existing_files.append(f[len(object_prefix):])
                    else:
                        existing_files.append(f)

            files = list(set(files) - set(existing_files))
            if len(files) > 0:
                self.log.info('{0} files are going to be synced: {1}.'.format(
                    len(files), files))
            else:
                self.log.info(
                    'There are no new files to sync. Have a nice day!')

        if files:
            hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

            for file in files:
                # GCS hook builds its own in-memory file so we have to create
                # and pass the path
                file_object = hook.get_key(file, self.bucket)
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    file_object.download_fileobj(f)
                    f.flush()

                    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                        self.dest_gcs)
                    # There will always be a '/' before file because it is
                    # enforced at instantiation time
                    dest_gcs_object = dest_gcs_object_prefix + file

                    # Sync is sequential and the hook already logs too much
                    # so skip this for now
                    # self.log.info(
                    #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                    #     ' as object {3}'.format(file, self.bucket,
                    #                             dest_gcs_bucket,
                    #                             dest_gcs_object))

                    gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

            self.log.info(
                "All done, uploaded %d files to Google Cloud Storage",
                len(files))
        else:
            self.log.info(
                'In sync, no files needed to be uploaded to Google Cloud '
                'Storage')

        return files
Example #36
class GoogleCampaignManagerDownloadReportOperator(
        GoogleMarketingPlatformBaseOperator):
    """Downloads a Campaign Manager report into Google Cloud Storage.

  Attributes:
    report_id: The DCM report ID with which the report file is associated.
        (templated)
    file_id: The DCM file ID of the report file to download. (templated)
    destination_bucket: The destination Google cloud storage bucket where the
        report should be written. (templated)
    destination_object: The destination name of the object in the destination
        Google cloud storage bucket. (templated)
        If the destination points to an existing folder, the report will be
        written under the specified folder.
    gcp_conn_id: The connection ID to use when fetching connection info.
    delegate_to: The account to impersonate, if any.

  XComs:
    destination_bucket: The Google cloud storage bucket the report was written
        to.
    destination_object: The Google cloud storage URI for the report.
  """

    template_fields = [
        'report_id', 'file_id', 'destination_bucket', 'destination_object'
    ]

    def __init__(self,
                 report_id,
                 file_id,
                 destination_bucket,
                 destination_object=None,
                 gcp_conn_id='google_cloud_default',
                 chunk_size=5 * 1024 * 1024,
                 delegate_to=None,
                 *args,
                 **kwargs):
        super(GoogleCampaignManagerDownloadReportOperator,
              self).__init__(*args, **kwargs)
        self.file_id = file_id
        self.report_id = report_id
        self.destination_bucket = destination_bucket
        self.destination_object = destination_object
        self.chunk_size = chunk_size
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.gcs_hook = None
        self.cm_hook = None

    def _download_report(self, report_id, file_id, destination_file,
                         chunk_size):
        file_metadata = self.cm_hook.get_service().files().get(
            reportId=report_id, fileId=file_id).execute()

        if file_metadata['status'] != 'REPORT_AVAILABLE':
            msg = 'File with ID = %s and Report ID = %s not available, status = %s.' % (
                file_id, report_id, file_metadata['status'])
            raise Exception(msg)

        request = self.cm_hook.get_service().files().get_media(
            reportId=report_id, fileId=file_id)

        downloader = http.MediaIoBaseDownload(destination_file,
                                              request,
                                              chunksize=chunk_size)

        download_finished = False
        while not download_finished:
            _, download_finished = downloader.next_chunk()

        return file_metadata['fileName']

    def _get_destination_uri(self, destination_object, report_file_name):
        report_file_name = '%s.csv.gz' % report_file_name

        if destination_object is None:
            return report_file_name

        if destination_object.endswith('/'):
            return destination_object + report_file_name

        return destination_object

    def execute(self, context):
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id,
                delegate_to=self.delegate_to)
        if self.cm_hook is None:
            self.cm_hook = GoogleCampaignManagerHook(
                gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)

        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            report_file_name = self._download_report(self.report_id,
                                                     self.file_id, temp_file,
                                                     self.chunk_size)

            destination_object_name = self._get_destination_uri(
                self.destination_object, report_file_name)

            self.gcs_hook.upload(bucket=self.destination_bucket,
                                 object=destination_object_name,
                                 filename=temp_file.name,
                                 gzip=True,
                                 multipart=True)

            context['task_instance'].xcom_push('destination_bucket',
                                               self.destination_bucket)
            context['task_instance'].xcom_push('destination_object',
                                               destination_object_name)
        finally:
            temp_file.close()
            os.unlink(temp_file.name)
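As the docstring notes, the operator pushes `destination_bucket` and `destination_object` to XCom, so a downstream task can reassemble the report URI. A minimal sketch of such a pull (the upstream task id is a placeholder):

def build_report_uri(**context):
    ti = context['task_instance']
    bucket = ti.xcom_pull(task_ids='download_cm_report', key='destination_bucket')
    object_name = ti.xcom_pull(task_ids='download_cm_report', key='destination_object')
    return 'gs://{}/{}'.format(bucket, object_name)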
Example #37
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS. Requires
    airflow[gcp_api] and setting the REMOTE_BASE_LOG_FOLDER and
    REMOTE_LOG_CONN_ID configuration options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.download(bkt, blob).decode()
            except:
                pass

        # raise/return error if we get here
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=False):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool

        """
        if self.hook:
            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log

            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                from tempfile import NamedTemporaryFile
                with NamedTemporaryFile(mode='w+') as tmpfile:
                    tmpfile.write(log)
                    # Force the file to be flushed, since we're doing the
                    # upload from within the file context (it hasn't been
                    # closed).
                    tmpfile.flush()
                    self.hook.upload(bkt, blob, tmpfile.name)
            except:
                # raise/return error if we get here
                logging.error('Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')
        else:
            bucket = parsed_url.netloc
            blob = parsed_url.path.strip('/')
            return (bucket, blob)
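A short usage sketch for this class (the log location is a placeholder):

gcs_log = GCSLog()
gcs_log.write('task finished without errors',
              remote_log_location='gs://example-log-bucket/my_dag/my_task/2018-01-01.log')
print(gcs_log.read('gs://example-log-bucket/my_dag/my_task/2018-01-01.log'))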
Example #38
    def execute(self, context):
        # use the super method to list all the files in an S3 bucket/key
        files = super().execute(context)

        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.dest_gcs_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the GCS bucket
            # and only keep those files which are present in
            # S3 and not in Google Cloud Storage
            bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
            existing_files_prefixed = gcs_hook.list(
                bucket_name, prefix=object_prefix)

            existing_files = []

            if existing_files_prefixed:
                # Remove the object prefix itself, an empty directory was found
                if object_prefix in existing_files_prefixed:
                    existing_files_prefixed.remove(object_prefix)

                # Remove the object prefix from all object string paths
                for f in existing_files_prefixed:
                    if f.startswith(object_prefix):
                        existing_files.append(f[len(object_prefix):])
                    else:
                        existing_files.append(f)

            files = list(set(files) - set(existing_files))
            if len(files) > 0:
                self.log.info(
                    '%s files are going to be synced: %s.', len(files), files
                )
            else:
                self.log.info(
                    'There are no new files to sync. Have a nice day!')

        if files:
            hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

            for file in files:
                # GCS hook builds its own in-memory file so we have to create
                # and pass the path
                file_object = hook.get_key(file, self.bucket)
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    file_object.download_fileobj(f)
                    f.flush()

                    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                        self.dest_gcs)
                    # There will always be a '/' before file because it is
                    # enforced at instantiation time
                    dest_gcs_object = dest_gcs_object_prefix + file

                    # Sync is sequential and the hook already logs too much
                    # so skip this for now
                    # self.log.info(
                    #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                    #     ' as object {3}'.format(file, self.bucket,
                    #                             dest_gcs_bucket,
                    #                             dest_gcs_object))

                    gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

            self.log.info(
                "All done, uploaded %d files to Google Cloud Storage",
                len(files))
        else:
            self.log.info(
                'In sync, no files needed to be uploaded to Google Cloud '
                'Storage')

        return files