Example #1
0
class FileToGCS(luigi.Task):
    """Upload a file from the local filesystem to Google Cloud Storage.

    Parameters
    ----------
    client: `luigi.contrib.gcs.GCSClient()` instance, optional
        (default is a new instance)
    source: str
        Local path, e.g. "./path/to/my/file.csv"
    destination: str
        GCS URL, e.g. "gs://bucket/my/file.csv"

    References
    ----------
    https://luigi.readthedocs.io/en/stable/api/luigi.contrib.gcs.html
    """
    # NOTE(review): a live client object as a Parameter default is evaluated
    # at import time and is not a serializable luigi parameter value; consider
    # creating the client lazily instead. Kept as-is for backward
    # compatibility with existing callers.
    client = luigi.Parameter(default=gcs.GCSClient())
    source = luigi.Parameter()
    destination = luigi.Parameter()  # e.g. "gs://bi_poc/my-test.txt"

    def output(self):
        """Completion target: the destination object in GCS."""
        return gcs.GCSTarget(self.destination, client=self.client)

    def run(self):
        """Copy the source file's contents into the GCS target.

        Fix: reuse ``self.output()`` rather than constructing a second,
        duplicate ``GCSTarget`` — target configuration now lives in exactly
        one place (``output``), so the two can never drift apart.
        """
        with open(self.source, 'r') as infile:
            with self.output().open(mode='w') as outfile:
                outfile.write(infile.read())
Example #2
0
    def setUp(self):
        """Build GCS/BigQuery clients, register cleanup, and stage input.

        Cleanup handlers are registered before producing the test input so
        teardown still runs if input production fails partway through.
        """
        self.gcs_client = gcs.GCSClient(CREDENTIALS)
        self.bq_client = bigquery.BigQueryClient(CREDENTIALS)

        self.table_id = "avro_bq_table"
        self.gcs_dir_url = 'gs://{}/foo'.format(BUCKET_NAME)

        # Remove the staged GCS objects and the BigQuery dataset afterwards.
        self.addCleanup(self.gcs_client.remove, self.gcs_dir_url)
        dataset = bigquery.BQDataset(PROJECT_ID, DATASET_ID, EU_LOCATION)
        self.addCleanup(self.bq_client.delete_dataset, dataset)

        self._produce_test_input()
Example #3
0
    def setUp(self):
        """Provision the GCS bucket, the JSON input file, and BQ datasets."""
        self.bq_client = bigquery.BigQueryClient(CREDENTIALS)
        self.gcs_client = gcs.GCSClient(CREDENTIALS)

        # --- GCS input data ---
        bucket_body = {'name': BUCKET_NAME, 'location': EU_LOCATION}
        try:
            self.gcs_client.client.buckets().insert(
                project=PROJECT_ID, body=bucket_body).execute()
        except googleapiclient.errors.HttpError as err:
            # todo verify that existing dataset is not US
            # 409 Conflict means the bucket already exists, which is fine.
            if err.resp.status != 409:
                raise

        root = bucket_url('')
        self.gcs_client.remove(root, recursive=True)
        self.gcs_client.mkdir(root)

        rows = [
            {'field1': 'hi', 'field2': 1},
            {'field1': 'bye', 'field2': 2},
        ]
        text = '\n'.join(json.dumps(row) for row in rows)
        self.gcs_file = bucket_url(self.id())
        self.gcs_client.put_string(text, self.gcs_file)

        # --- BigQuery datasets ---
        # Use the bare test-method name as the table id.
        short_id = self.id().rsplit('.', 1)[-1]
        self.table = bigquery.BQTable(project_id=PROJECT_ID,
                                      dataset_id=DATASET_ID,
                                      table_id=short_id,
                                      location=None)
        self.table_eu = bigquery.BQTable(project_id=PROJECT_ID,
                                         dataset_id=EU_DATASET_ID,
                                         table_id=short_id + '_eu',
                                         location=EU_LOCATION)

        self.addCleanup(self.gcs_client.remove, root, recursive=True)
        self.addCleanup(self.bq_client.delete_dataset, self.table.dataset)
        self.addCleanup(self.bq_client.delete_dataset, self.table_eu.dataset)

        # Recreate both datasets from a clean slate.
        self.bq_client.delete_dataset(self.table.dataset)
        self.bq_client.delete_dataset(self.table_eu.dataset)
        self.bq_client.make_dataset(self.table.dataset, body={})
        self.bq_client.make_dataset(self.table_eu.dataset, body={})
Example #4
0
    def setUp(self):
        """Ensure the shared test bucket exists (once per process) and
        reset its contents so each test starts from an empty root."""
        self.client = gcs.GCSClient(CREDENTIALS)

        global ATTEMPTED_BUCKET_CREATE
        if not ATTEMPTED_BUCKET_CREATE:
            try:
                request = self.client.client.buckets().insert(
                    project=PROJECT_ID, body={'name': BUCKET_NAME})
                request.execute()
            except googleapiclient.errors.HttpError as err:
                # 409 Conflict: the bucket already exists — acceptable.
                # Anything else is a genuine failure.
                if err.resp.status != 409:
                    raise

            ATTEMPTED_BUCKET_CREATE = True

        root = bucket_url('')
        self.client.remove(root, recursive=True)
        self.client.mkdir(root)
Example #5
0
 def __init__(self, path, client=None):
     """Open an atomic GCS file at *path*, building a GCS client from the
     given client's OAuth credentials (or from the default client)."""
     creds_source = client or get_default_client()
     self.gcs_client = gcs.GCSClient(creds_source.oauth())
     super(AtomicGCSFile, self).__init__(path)
Example #6
0
# Standard library.
# Fix: `datetime`/`timedelta` (dates below) and `logging` (logger setup
# below) were used but never imported in this snippet -> NameError.
import logging
from datetime import datetime, timedelta

# Third-party
import google.auth
from google.cloud import storage, bigquery

import luigi
from luigi.contrib import gcs as luigi_gcs
from luigi.contrib import bigquery as luigi_bigquery
from luigi.contrib import external_program

# Google Cloud
PROJECT_ID = 'senpai-io'
BUCKET_NAME = 'senpai-io.appspot.com'
BUCKET_PATH = 'gs://{}'.format(BUCKET_NAME)
BUCKET_SUBDIR = 'quandl-stage'
CREDENTIALS, _ = google.auth.default()  # application-default credentials
GCS_CLIENT = luigi_gcs.GCSClient(CREDENTIALS)
GCS_BUCKET = storage.Client().get_bucket(BUCKET_NAME)
# BQ_CLIENT = luigi_bigquery.BigQueryClient(CREDENTIALS)

# Dates
TODAY = datetime.today()
YESTERDAY = TODAY - timedelta(days=1)

# Logging: append to a per-day file named after yesterday's date.
# NOTE(review): assumes a local "logs/" directory already exists — confirm,
# otherwise FileHandler raises FileNotFoundError at import time.
logger = logging.getLogger('luigi-interface')
logger.setLevel(logging.INFO)
fh = logging.FileHandler(
    'logs/{date:%Y-%m-%d}-luigi.log'.format(date=YESTERDAY))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
Example #7
0
	def run(self):
		"""Copy this task's input file to its GCS output location."""
		gcs_client = gcs.GCSClient(oauth_credentials=self.credentials)
		source_path = self.input().path
		target_path = self.output().path
		gcs_client.put(source_path, target_path)