Example #1
# Tail of a chunked-export loop: cast the code columns to plain strings
# (dropping the ".0" suffix pandas adds to floats), write each chunk to CSV,
# push it to S3, then remove the local file. The chunked reader and the `s3`
# connector are assumed to be set up earlier (see the setup snippets below);
# the loop head shown here is a sketch, not taken from the source.
i = 0
for chunk in chunks:  # e.g. pd.read_csv(..., chunksize=...); source file not shown
    chunk = (chunk.assign(
        year=lambda x: x['year'].astype('str').str.split('.').str[0],
        type=lambda x: x['type'].astype('str').str.split('.').str[0],
        setup=lambda x: x['setup'].astype('str').str.split('.').str[0],
        cic=lambda x: x['cic'].astype('str')))
    chunk.to_csv('ASIF_9807_chunk_{}.csv'.format(i), index=False)
    s3.upload_file('ASIF_9807_chunk_{}.csv'.format(i),
                   "DATA/ECON/FIRM_SURVEY/ASIF_CHINA/UNZIP_DATA_CSV")
    os.remove('ASIF_9807_chunk_{}.csv'.format(i))
    i += 1

# Create schema
# Load schema from
# https://docs.google.com/spreadsheets/d/1gfdmBKzZ1h93atSMFcj_6YgLxC7xX62BCxOngJwf7qE
project = 'valid-pagoda-132423'  # GCP project id
auth = authorization_service.get_authorization(
    path_credential_gcp="{}/creds/service.json".format(parent_path),
    path_credential_drive="{}/creds".format(parent_path),
    verbose=False)

gd_auth = auth.authorization_drive()
drive = connect_drive.drive_operations(gd_auth)

# Read the variable-definition sheet into a DataFrame and drop the rows whose
# `Var_name` is listed in `list_to_remove` (defined earlier, not shown in this snippet).
spreadsheet_id = drive.find_file_id('var_name02-07', to_print=False)
var = (drive.upload_data_from_spreadsheet(
    sheetID=spreadsheet_id, sheetName="var_name02-07.csv",
    to_dataframe=True).loc[lambda x: ~x['Var_name'].isin(list_to_remove)])

# Build the table schema: for each column of the data, look up its row in the
# variable-definition DataFrame.
schema = []
for i in chunk.columns:
    temp = var.loc[lambda x: x['Var_name'].isin([i])]
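    # The loop body stops after the lookup in the source. A minimal sketch of a
    # plausible continuation, assuming the definition sheet carries 'Type' and
    # 'Definition' columns (hypothetical names): collect one schema entry per column.
    if not temp.empty:
        schema.append({
            'Name': i,
            'Type': temp['Type'].values[0],
            'Comment': temp['Definition'].values[0]
        })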

#### SETUP
import os
from pathlib import Path

from GoogleDrivePy.google_drive import connect_drive
from GoogleDrivePy.google_authorization import authorization_service
# aws_connector and service_s3 are assumed imported elsewhere (imports not shown here)

path = os.getcwd()
parent_path = str(Path(path).parent.parent.parent)
name_credential = 'financial_dep_SO2_accessKeys.csv'
region = 'eu-west-3'
bucket = 'datalake-datascience'
path_cred = "{0}/creds/{1}".format(parent_path, name_credential)

#### AWS
con = aws_connector.aws_instantiate(credential=path_cred, region=region)
client = con.client_boto()
s3 = service_s3.connect_S3(client=client, bucket=bucket, verbose=True)
PATH_S3 = "DATA/ECON/LOOKUP_DATA/CHINA/INDUSTRY_CHARACTERISTICS/HIGH_TECH"  # S3 destination prefix (no bucket name, no trailing "/")
### GCP
auth = authorization_service.get_authorization(
    #path_credential_gcp=os.path.join(parent_path, "creds", "service.json"),
    path_credential_drive=os.path.join(parent_path, "creds"),
    verbose=False,
    scope=[
        'https://www.googleapis.com/auth/spreadsheets.readonly',
        "https://www.googleapis.com/auth/drive"
    ])
gd_auth = auth.authorization_drive(
    path_secret=os.path.join(parent_path, "creds", "credentials.json"))
drive = connect_drive.drive_operations(gd_auth)

### DOWNLOAD DATA TO temporary_local_data folder
FILENAME_DRIVE = 'high_tech.csv'
FILEID = drive.find_file_id(FILENAME_DRIVE, to_print=False)

var = (drive.download_file(filename=FILENAME_DRIVE,
                           file_id=FILEID,
                           local_path=os.path.join(parent_path,
                                                   "00_data_catalog",
                                                   # final segment assumed from the folder named in the comment above
                                                   "temporary_local_data")))

#### SETUP
import os
from pathlib import Path

from GoogleDrivePy.google_drive import connect_drive
from GoogleDrivePy.google_authorization import authorization_service

path = os.getcwd()
parent_path = str(Path(path).parent.parent.parent)
name_credential = 'financial_dep_SO2_accessKeys.csv'
region = 'eu-west-3'
bucket = 'datalake-datascience'
path_cred = "{0}/creds/{1}".format(parent_path, name_credential)

con = aws_connector.aws_instantiate(credential=path_cred, region=region)
client = con.client_boto()
s3 = service_s3.connect_S3(client=client, bucket=bucket, verbose=True)

auth = authorization_service.get_authorization(
    path_credential_gcp=os.path.join(parent_path, "creds", "service.json"),
    path_credential_drive=os.path.join(parent_path, "creds"),
    verbose=False
)

gd_auth = auth.authorization_drive()
drive = connect_drive.drive_operations(gd_auth)

### Load the 'Sigmas' sheet of the 'Sigmas_3digit_China' spreadsheet into a DataFrame
FILENAME_SPREADSHEET = "Sigmas_3digit_China"
spreadsheet_id = drive.find_file_id(FILENAME_SPREADSHEET, to_print=False)
sheetName = 'Sigmas'
sigmas = (drive.upload_data_from_spreadsheet(sheetID=spreadsheet_id,
                                             sheetName=sheetName,
                                             to_dataframe=True))
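
# The resulting DataFrame can then be written out and pushed to S3 with the
# connector set up above, mirroring the first snippet. A sketch only: the local
# filename and destination prefix below are hypothetical, not taken from the source.
sigmas.to_csv('sigmas_3digit_china.csv', index=False)
s3.upload_file('sigmas_3digit_china.csv',
               "DATA/ECON/LOOKUP_DATA/CHINA/SIGMAS")  # hypothetical S3 prefix
os.remove('sigmas_3digit_china.csv')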