    def schemaMapping(self, fields):
        schema = {}
        for field in fields:
            if field['type'] == 'bool':
                schema[boa.constrict(field['id'])] = 'bool'
            else:
                schema[boa.constrict(field['id'])] = 'varchar'
        schema['id'] = 'int'

        return schema
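
Every example on this page routes field and column names through boa.constrict before they are used as schema or record keys. A minimal sketch of the assumed behavior, i.e. that the boa package's constrict() normalizes an identifier to snake_case:

import boa

# Assumed behavior: constrict() converts an identifier such as a camelCase
# API field name into snake_case, making it safe to reuse as a database
# column name.
print(boa.constrict('firstName'))   # expected: first_name
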
Example #2
    def schemaMapping(self, fields):
        schema = {}
        for field in fields:
            if type(fields[field]) == int:
                schema[boa.constrict(field)] = 'INTEGER'
            elif type(fields[field]) == str:
                schema[boa.constrict(field)] = 'VARCHAR'
            elif type(fields[field]) == float:
                schema[boa.constrict(field)] = 'FLOAT'
        print(schema)
        return schema
Example #3
    def execute(self, context):

        response = self.get_data()
        response.columns = response.columns.map(boa.constrict)

        json_data = json.loads(response.to_json(orient='records'))
        schema_map = self.schemaMapping(json_data[0])

        s3 = S3Hook(s3_conn_id=self.s3_conn_id)

        if self.s3_key.endswith('.json'):
            split = path.splitext(self.s3_key)
            schema_key = '{0}_schema{1}'.format(split[0], split[1])

        results = [
            dict([boa.constrict(k), v] for k, v in i.items())
            for i in json_data
        ]
        results = '\n'.join([json.dumps(i) for i in results])

        s3.load_string(string_data=str(schema_map),
                       bucket_name=self.s3_bucket,
                       key=schema_key,
                       replace=True)

        s3.load_string(string_data=results,
                       bucket_name=self.s3_bucket,
                       key=self.s3_key,
                       replace=True)
        s3.connection.close()
Example #4
    def outputManager(self, context, output, key, bucket):
        if output is None or len(output) == 0:
            if self.total_output_files == 0:
                logging.info("No records pulled from Hubspot.")

                downstream_tasks = context['task'].get_flat_relatives(upstream=False)

                logging.info('Skipping downstream tasks...')
                logging.debug("Downstream task_ids %s", downstream_tasks)

                if downstream_tasks:
                    self.skip(context['dag_run'],
                              context['ti'].execution_date,
                              downstream_tasks)
        else:
            logging.info('Logging {0} to GCS...'.format(key))

            output = [flatten(e) for e in output]
            output = '\n'.join([json.dumps({boa.constrict(k): v
                               for k, v in i.items()}) for i in output])

            gcs = GoogleCloudStorageHook(self.gcs_conn_id)

            with open("__temp__", "w") as fid:
                fid.write(output)

            gcs.upload(self.gcs_bucket, self.gcs_object, "__temp__")

            self.total_output_files += 1
Example #5
    def outputManager(self, context, output, key, bucket):
        if output is None or len(output) == 0:
            if self.total_output_files == 0:
                logging.info("No records pulled from Hubspot.")

                downstream_tasks = context['task'].get_flat_relatives(upstream=False)

                logging.info('Skipping downstream tasks...')
                logging.debug("Downstream task_ids %s", downstream_tasks)

                if downstream_tasks:
                    self.skip(context['dag_run'],
                              context['ti'].execution_date,
                              downstream_tasks)
        else:
            logging.info('Logging {0} to S3...'.format(key))

            output = [flatten(e) for e in output]
            output = '\n'.join([json.dumps({boa.constrict(k): v
                               for k, v in i.items()}) for i in output])

            s3 = S3Hook(self.s3_conn_id)
            s3.load_string(
                string_data=str(output),
                key=key,
                bucket_name=bucket,
                replace=True
            )
            s3.connection.close()

            self.total_output_files += 1
Example #6
    def download_attachments(self,
                             search_criteria,
                             output_dir,
                             mailbox='INBOX'):
        downloaded_files = []
        self.server.select(mailbox)
        search_result, emails = self.server.search(None, search_criteria)
        for mid in emails[0].split():
            fetch_result, item = self.server.fetch(mid, "(BODY.PEEK[])")
            email_body = item[0][1]
            message = email.message_from_bytes(email_body)
            date = datetime.strptime(
                message['date'][:-6],
                "%a, %d %b %Y %H:%M:%S %z").strftime('%Y_%m_%d_%H_%M_%S')

            if message.get_content_maintype() != 'multipart':
                self.server.store(mid, '+FLAGS', r'\Seen')
                continue
            for part in message.walk():
                if (part.get_content_maintype() != 'multipart'
                        and part.get('Content-Disposition') is not None):
                    file_name = boa.constrict(part.get_filename().rsplit(
                        '.', 1)[0])
                    ext = part.get_filename().rsplit('.', 1)[1]
                    file_path = '{dir}/{date}_{filename}.{ext}'.format(
                        dir=output_dir, date=date, filename=file_name, ext=ext)
                    with open(file_path, 'wb') as attachment_file:
                        attachment_file.write(part.get_payload(decode=True))
                    downloaded_files.append(file_path)
                    self.server.store(mid, '+FLAGS', r'\Seen')

        return downloaded_files
Example #7
    def outputManager(self, hook, output, key, bucket):
        """
        This method handles the output of the data.
        """
        if self.total_output_files == 0:
            logging.info("No records pulled.")
            if self.skip_if_null:
                downstream_tasks = self.context['task'].get_flat_relatives(
                    upstream=False)

                logging.info('Skipping downstream tasks...')
                logging.debug("Downstream task_ids %s", downstream_tasks)

                if downstream_tasks:
                    self.skip(self.context['dag_run'],
                              self.context['ti'].execution_date,
                              downstream_tasks)
        else:
            logging.info('Logging {0} to ...'.format(key))

            output = [flatten(e) for e in output]
            output = '\n'.join([
                json.dumps({boa.constrict(k): v
                            for k, v in i.items()}) for i in output
            ])

            if self.cs_type == 's3':
                hook.load_string(string_data=str(output),
                                 key=key,
                                 bucket_name=bucket,
                                 replace=True)
                hook.connection.close()

                self.total_output_files += 1
Example #8
    def output_manager(self, output):
        def flatten(record, parent_key='', sep='_'):
            flattened_record = []
            for k, v in record.items():
                new_key = parent_key + sep + k if parent_key else k
                if isinstance(v, dict):
                    flattened_record.extend(flatten(v,
                                                    new_key,
                                                    sep=sep).items())
                else:
                    flattened_record.append((new_key, v))
            return dict(flattened_record)

        output = '\n'.join([json.dumps({boa.constrict(k): v
                                        for k, v
                                        in flatten(record).items()})
                            for record in output])

        s3 = S3Hook(self.s3_conn_id)

        s3.load_string(
            string_data=output,
            key=self.s3_key,
            bucket_name=self.s3_bucket,
            replace=True
        )
Example #9
    def output_manager(self, s3, output_name, output_data, context, sheet_name, schema_name=None):
        self.s3_bucket = BaseHook.get_connection(self.s3_conn_id).host
        if self.output_format == 'json':
            output = '\n'.join([json.dumps({boa.constrict(str(k)): v
                                            for k, v in record.items()})
                                for record in output_data])

            enc_output = str.encode(output, 'utf-8')

            # if file is more than bound then apply gzip compression
            if len(enc_output) / 1024 / 1024 >= self.compression_bound:
                logging.info("File is more than {}MB, gzip compression will be applied".format(self.compression_bound))
                output = gzip.compress(enc_output, compresslevel=5)
                self.xcom_push(context, key='is_compressed_{}'.format(sheet_name), value="compressed")
                self.load_bytes(s3,
                                bytes_data=output,
                                key=output_name,
                                bucket_name=self.s3_bucket,
                                replace=True
                                )
            else:
                logging.info("File is less than {}MB, compression will not be applied".format(self.compression_bound))
                self.xcom_push(context, key='is_compressed_{}'.format(sheet_name), value="non-compressed")
                s3.load_string(
                    string_data=output,
                    key=output_name,
                    bucket_name=self.s3_bucket,
                    replace=True
                )

            if self.include_schema is True:
                output_keys = output_data[0].keys()
                schema = [{'name': boa.constrict(a),
                           'type': 'varchar(512)'} for a in output_keys if a is not None]
                schema = {'columns': schema}

                s3.load_string(
                    string_data=json.dumps(schema),
                    key=schema_name,
                    bucket_name=self.s3_bucket,
                    replace=True
                )

            logging.info('Successfully wrote "{}" to S3.'.format(output_name))
Example #10
    def paginate_data(self, endpoint=None, payload=None):
        if not endpoint:
            endpoint = self.endpoint

        def make_request(http_conn_id,
                         endpoint,
                         payload=None,
                         token=None):

            return (MarketoHook(http_conn_id=http_conn_id)
                    .run(endpoint, payload, token=token)
                    .json())

        final_payload = {}

        for param in self.payload:
            final_payload[param] = self.payload[param]

        if payload:
            for param in payload:
                final_payload[param] = payload[param]

        response = make_request(self.marketo_conn_id,
                                self.methodMapper(endpoint),
                                final_payload,
                                self.token)

        if endpoint == 'paging_token':
            return response['nextPageToken']
        else:
            output = response['result']

            if 'moreResult' in list(response.keys()):
                final_payload['moreResult'] = response['moreResult']
            else:
                final_payload['moreResult'] = False

            while final_payload['moreResult']:
                response = make_request(self.marketo_conn_id,
                                        self.methodMapper(endpoint),
                                        final_payload,
                                        self.token)
                if 'result' in response.keys():
                    output += response['result']
                    if 'moreResult' in list(response.keys()):
                        final_payload['moreResult'] = response['moreResult']
                        final_payload['nextPageToken'] = response['nextPageToken']
                    else:
                        final_payload['moreResult'] = False
                else:
                    final_payload['moreResult'] = False

            output = [{boa.constrict(k): v for k, v in i.items()} for i in output]
            return output
Example #11
def create_dag(workflow, schedule_interval=None, dag_cls=None, dag_type=None):
    """
    Creates a DAG instance from a workflow-like dict.
    Workflow objects are expected to have a name, schedule,
    and activityList
    :param workflow: The dict describing the DAG to build
    :type workflow: dict
    :param schedule_interval: A fallback schedule if a workflow does not define
    its own
    :type schedule_interval: string
    :param dag_type: describes the type of DAG being built
    :type dag_type: string
    :return: DAG
    """
    if not dag_cls:
        raise Exception('must pass DAG class to create_dag')
    # override default_args in workflow.default_args e.g. start_date
    workflow_args = workflow.get('default_args')
    if isinstance(workflow_args, dict):
        args = {**default_args, **workflow_args}
    else:
        args = default_args
    id_ = workflow.get('_id')
    workflow_name = boa.constrict(workflow.get('name', '').lower())
    schedule = workflow.get('schedule', schedule_interval)

    if dag_type is not None:
        dag_name = '{workflow_name}__{dag_type}__{id_}'.format(
            workflow_name=workflow_name,
            dag_type=dag_type,
            id_=id_)
    else:
        dag_name = '{workflow_name}__{id_}'.format(
            workflow_name=workflow_name,
            id_=id_)

    print('Building DAG: {name}'.format(name=dag_name))
    dag = dag_cls(dag_name, default_args=args, schedule_interval=schedule)
    create_tasks(dag, workflow)
    return dag
Example #12
def create_dag(workflow, schedule_interval=None, dag_cls=None, dag_type=None):
    """
    Creates a DAG instance from a workflow-like dict.
    Workflow objects are expected to have a name, schedule,
    and activityList
    :param workflow: The dict describing the DAG to build
    :type workflow: dict
    :param schedule_interval: A fallback schedule if a workflow does not define
    its own
    :type schedule_interval: string
    :param dag_type: describes the type of DAG being built
    :type dag_type: string
    :return: DAG
    """
    if not dag_cls:
        raise Exception('must pass DAG class to create_dag')
    # override default_args in workflow.default_args e.g. start_date
    workflow_args = workflow.get('default_args')
    if isinstance(workflow_args, dict):
        args = {**default_args, **workflow_args}
    else:
        args = default_args
    id_ = workflow.get('_id')
    workflow_name = boa.constrict(workflow.get('name', '').lower())
    schedule = workflow.get('schedule', schedule_interval)

    if dag_type is not None:
        dag_name = '{workflow_name}__{dag_type}__{id_}'.format(
            workflow_name=workflow_name, dag_type=dag_type, id_=id_)
    else:
        dag_name = '{workflow_name}__{id_}'.format(workflow_name=workflow_name,
                                                   id_=id_)

    print('Building DAG: {name}'.format(name=dag_name))
    dag = dag_cls(dag_name, default_args=args, schedule_interval=schedule)
    create_tasks(dag, workflow)
    return dag
Example #13
    def execute(self, context):
        imap_conn = ImapHook(self.imap_conn_id)
        s3_conn = S3Hook(self.s3_conn_id)
        tmp_dir = '/tmp/{key}'.format(key=self.s3_key)

        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)

        os.mkdir(tmp_dir)

        criteria = '(FROM "{imap_email}" SUBJECT "{imap_subject}" UNSEEN)'.format(imap_email=self.imap_email,
                                                                                  imap_subject=self.imap_subject)
        attachments = imap_conn.download_attachments(criteria, tmp_dir)

        file_name = '{tmp_dir}/{key}.jsonl'.format(tmp_dir=tmp_dir, key=self.s3_key)
        s3_upload_file = open(file_name, 'w')

        for attachment in attachments:
            with open(attachment, 'r', errors='replace') as f:
                reader = csv.reader(f)
                headers = [boa.constrict(header) for header in next(reader)]
                for row in reader:
                    json_line = {}
                    for index, col in enumerate(row):
                        json_line[headers[index]] = col
                    json.dump(json_line, s3_upload_file)
                    s3_upload_file.write('\n')

        s3_upload_file.close()

        s3_conn.load_file(file_name,
                          self.s3_key,
                          self.s3_bucket,
                          True)

        shutil.rmtree(tmp_dir)
Example #14
def getSalesforceRecords(name, **kwargs):
    sf = get_salesforce_conn()
    formatted_name = "{}.json".format(name.lower())
    templates_dict = kwargs.get('templates_dict', {})
    fields = json.loads(templates_dict.get('fields', '[]'))
    query_string = "SELECT {0} FROM {1}".format(','.join(fields), name)
    print(query_string)
    response = sf.query_all(query_string)
    output = response['records']
    output = '\n'.join([json.dumps(flatten({boa.constrict(k): v
                        for k, v in i.items()})) for i in output])

    with NamedTemporaryFile("w") as f:
        f.write(output)
        f.flush()  # ensure buffered output reaches disk before S3 reads the file
        s3_key = 'salesforce/{}'.format(formatted_name)
        s3 = S3Hook(s3_conn_id='INSERT_S3_CONN_ID_HERE')
        s3.load_file(
            filename=f.name,
            key=s3_key,
            bucket_name='INSERT_S3_BUCKET_NAME_HERE',
            replace=True
        )
        s3.connection.close()
        return s3_key
Example #15
    def paginate_data(self,
                      h,
                      endpoint,
                      context,
                      company_id=None,
                      campaign_id=None):
        """
        This method takes care of request building and pagination.
        It retrieves 100 at a time and continues to make
        subsequent requests until it retrieves less than 100 records.
        """
        output = []
        try:
            initial_offset = Variable.get('INCREMENTAL_KEY__{0}_{1}_vidOffset'.format(context['ti'].dag_id,
                                                                                      context['ti'].task_id))
            print('INITIAL OFFSET: ' + str(initial_offset))
        except:
            initial_offset = 0

        final_payload = {'vidOffset': initial_offset}

        if self.hubspot_object in ('events', 'timeline'):
            final_payload['limit'] = 1000
        elif self.hubspot_object == 'deals':
            final_payload['limit'] = 250
        elif self.hubspot_object == 'contacts':
            final_payload['count'] = 100

        for param in self.hubspot_args:
            # If time used as filter in request and is a string object
            # (e.g. when using {{ execution_date}}), convert the timestamp
            # to Hubspot formatting as needed by Hubspot API.
            if param in ('startTimestamp', 'endTimestamp'):
                param_time = datetime.datetime.strptime(self.hubspot_args[param],
                                                        "%Y-%m-%d %H:%M:%S")
                self.hubspot_args[param] = int(time.mktime(param_time.timetuple())
                                               * 1000)
            final_payload[param] = self.hubspot_args[param]
        logging.info('FINAL PAYLOAD: ' + str(final_payload))
        response = h.run(endpoint, final_payload).json()
        if not response:
            logging.info('Resource Unavailable.')
            return ''
        if self.hubspot_object == 'owners':
            output.extend([e for e in response])
            # output = [self.filterMapper(record) for record in output]
            output = self.subTableMapper(output)
            return output
        elif self.hubspot_object == 'engagements':
            output.extend([e for e in response['results']])
        elif self.hubspot_object == 'contacts_by_company':
            if endpoint == 'companies/v2/companies/paged':
                if response['companies']:
                    output.extend([e for e in response['companies']])
                else:
                    logging.info('No companies currently available.')
                    return ''
            else:
                output.extend([{"vid": e, "company_id": company_id}
                               for e in response['vids']])
        elif self.hubspot_object == 'campaigns':
            if 'email/public/v1/campaigns/' in endpoint:
                output.append(response)
        elif self.hubspot_object in ('deal_pipelines', 'social'):
            output.extend([e for e in response])
        else:
            output.extend([e for e in response[self.hubspot_object]])

        if isinstance(response, dict):
            if 'hasMore' in list(response.keys()):
                more = 'hasMore'
            elif 'has-more' in list(response.keys()):
                more = 'has-more'
            else:
                more = 'has-more'
                response['has-more'] = False
            n = 0

            if 'vid-offset' in list(response.keys()):
                offset_variable = 'vid-offset'
            elif 'offset' in list(response.keys()):
                offset_variable = 'offset'

            while response[more] is True:
                if offset_variable == 'vid-offset':
                    final_payload['vidOffset'] = response['vid-offset']
                    logging.info('Retrieving: ' + str(response['vid-offset']))
                elif offset_variable == 'offset':
                    final_payload['offset'] = response['offset']
                    logging.info('Retrieving: ' + str(response['offset']))
                try:
                    response = h.run(endpoint, final_payload).json()
                except:
                    pass

                if endpoint == 'companies/v2/companies/paged':
                    if response['companies']:
                        output.extend([e for e in response['companies']])
                else:
                    output.extend([e for e in response[self.hubspot_object]])

                n += 1
                time.sleep(0.2)
                if n % 50 == 0:
                    # output = [self.filterMapper(record) for record in output]
                    output = self.subTableMapper(output)
                    if self.hubspot_object == 'contacts_by_company':
                        companies = self.retrieve_data(h, self.methodMapper('companies'))
                        if not companies:
                            logging.info('No companies currently available.')
                            downstream_tasks = context['task'].get_flat_relatives(upstream=False)
                            logging.info('Skipping downstream tasks...')
                            logging.debug("Downstream task_ids %s", downstream_tasks)
                            if downstream_tasks:
                                self.skip(context['dag_run'], context['ti'].execution_date, downstream_tasks)
                            return True
                        final_output = []
                        for company in companies:
                            final_output.extend(output)
                        key = '{0}_core_{1}{2}'.format(self.split[0],
                                                       str(n),
                                                       self.split[1])
                        self.outputManager(context,
                                           output,
                                           key,
                                           self.s3_bucket)

                    else:
                        for e in output:
                            for k, v in e.items():
                                if k == 'core':
                                    key = '{0}_core_{1}{2}'.format(self.split[0],
                                                                   str(n),
                                                                   self.split[1])
                                else:
                                    key = '{0}_{1}_{2}{3}'.format(self.split[0],
                                                                  boa.constrict(k),
                                                                  str(n),
                                                                  self.split[1])
                                logging.info('Sending to Output Manager...')
                                self.outputManager(context,
                                                   v,
                                                   key,
                                                   self.s3_bucket)
                                if self.hubspot_object == 'contacts':
                                    if response[offset_variable] == 0:
                                        logging.info('No new records received.')
                                        logging.info('Offset variable is still: ' + str(initial_offset))
                                    else:
                                        new_offset = ('INCREMENTAL_KEY__{0}_{1}_vidOffset'
                                                      .format(context['ti'].dag_id,
                                                              context['ti'].task_id))
                                        logging.info('New Variable offset is now: ' +\
                                                     str(response[offset_variable]))

                                        Variable.set(new_offset, response[offset_variable])

                    output = []

            if self.hubspot_object == 'contacts':
                if response[offset_variable] == 0:
                    logging.info('No new records received.')
                    logging.info('Offset variable is still: ' + str(initial_offset))
                else:
                    new_offset = ('INCREMENTAL_KEY__{0}_{1}_vidOffset'
                                  .format(context['ti'].dag_id,
                                          context['ti'].task_id))
                    logging.info('New Variable offset is now: ' + str(response[offset_variable]))

                    Variable.set(new_offset, response[offset_variable])


        # output = [self.filterMapper(record) for record in output]
        output = self.subTableMapper(output)

        return output
                "department": "varchar",
                "job_title": "varchar",
                "reports_to": "varchar",
            }

        elif self.method == 'getEmploymentStatus':
            schema = {
                "id": "int",
                "employee_id": "int",
                "date": "date",
                "employment_status": "varchar",
                "benetrac_status": "varchar",
                "gusto": "varchar",
            }

        results = [dict([boa.constrict(k), v]
                        for k, v in i.items()) for i in results]
        results = '\n'.join([json.dumps(i) for i in results])

        s3.load_string(
            string_data=json.dumps(schema),
            bucket_name=self.s3_bucket,
            key=schema_key,
            replace=True
        )

        s3.load_string(
            string_data=results,
            bucket_name=self.s3_bucket,
            key=self.s3_key,
            replace=True
        )
Example #17
    def execute(self, context):
        g_conn = GoogleHook(self.google_conn_id)

        if isinstance(self.sheet_names, str) and ',' in self.sheet_names:
            sheet_names = self.sheet_names.split(',')
        else:
            sheet_names = self.sheet_names

        sheets_object = g_conn.get_service_object('sheets', 'v4', [
            'https://spreadsheets.google.com/feeds',
            'https://www.googleapis.com/auth/drive'
        ])
        print('Retrieved Sheets Object')

        response = sheets_object.spreadsheets().get(
            spreadsheetId=self.sheet_id, includeGridData=True).execute()

        title = response.get('properties').get('title')
        sheets = response.get('sheets')

        final_output = dict()

        total_sheets = []
        for sheet in sheets:
            name = sheet.get('properties').get('title')

            total_sheets.append(name)

            if self.sheet_names:
                if name not in sheet_names:
                    print('{} is not found in available sheet names.'.format(
                        name))
                    continue

            table_name = name
            data = sheet.get('data')[0].get('rowData')
            output = []

            for row in data:
                row_data = []
                values = row.get('values')
                for value in values:
                    ev = value.get('effectiveValue')
                    if ev is None:
                        row_data.append(None)
                    else:
                        for v in ev.values():
                            row_data.append(v)

                output.append(row_data)

            if self.output_format == 'json':
                headers = output.pop(0)
                output = [dict(zip(headers, row)) for row in output]

            final_output[table_name] = output

        s3 = S3Hook(self.s3_conn_id)

        for sheet in final_output:
            output_data = final_output.get(sheet)

            file_name = os.path.splitext(self.s3_path)[0]

            sheet = boa.constrict(sheet)

            output_name = ''.join(
                [self.s3_path, '/', sheet, '.', self.output_format])

            if self.include_schema is True:
                schema_name = ''.join([
                    self.s3_path, '/', sheet, '_schema', '.',
                    self.output_format
                ])
            else:
                schema_name = None

            self.output_manager(s3, output_name, output_data, context, sheet,
                                schema_name)

        dag_id = context['ti'].dag_id

        var_key = '_'.join([dag_id, self.sheet_id])
        Variable.set(key=var_key, value=json.dumps(total_sheets))
        time.sleep(10)

        return boa.constrict(title)
Example #18
    def execute(self, context):
        self.token = (MarketoHook(http_conn_id=self.marketo_conn_id)
                      .run(self.methodMapper('auth'))
                      .json())['access_token']
        if self.endpoint == 'activities':
            paging_token = self.paginate_data(endpoint='paging_token',
                                              payload={'sinceDatetime': '2014-01-01T00:00:00'})
            activity_types = self.paginate_data(endpoint='activity_types')
            activities = [activity['id'] for activity in activity_types]
            output = []
            output += self.paginate_data(payload={'activityTypeIds': activities[0],
                                                  'nextPageToken': paging_token})
        elif self.endpoint == 'leads':
            request = {}
            lead_fields = self.paginate_data(endpoint='lead_description')
            request['fields'] = [record['rest']['name']
                                 for record in lead_fields]
            request['columnHeaderNames'] = {record['rest']['name']: record['rest']['name']
                                            for record in lead_fields}

            request['filter'] = {}
            createdAt = {}
            createdAt['startAt'] = self.start_at
            createdAt['endAt'] = self.end_at
            request['filter']['updatedAt'] = createdAt
            request['format'] = 'CSV'
            get_hook = MarketoHook(http_conn_id=self.marketo_conn_id)

            post_hook = MarketoHook(method='POST',
                                    http_conn_id=self.marketo_conn_id)

            job = post_hook.run(self.methodMapper('leads_create'),
                                data=json.dumps(request),
                                token=self.token).json()
            export_id = [e['exportId'] for e in job['result']][0]

            status = [e['status'] for e in post_hook.run('bulk/v1/leads/export/{0}/enqueue.json'.format(export_id),
                                                         token=self.token).json()['result']][0]
            while status != 'Completed':
                status = [e['status'] for e in get_hook.run('bulk/v1/leads/export/{0}/status.json'.format(export_id),
                                                            token=self.token).json()['result']][0]
                logging.info('Status: ' + str(status))
                sleep(5)

            output = get_hook.run('bulk/v1/leads/export/{0}/file.json'.format(export_id),
                                  token=self.token).text

            output = output.split('\n')
            headers = output.pop(0).split(',')
            del output[0]
            headers = [boa.constrict(header) for header in headers]
            output = [row for row in reader(output)]
            output = [dict(zip(headers, row)) for row in output]
            marketo_schema = schema[self.endpoint]
            field_names = []
            for field in marketo_schema['fields']:
                field_names.append(field['name'])
            logging.info('DIFF: ' + str(set(headers) - set(field_names)))
        else:
            output = self.paginate_data()
            logging.info('Output Length: ' + str(len(output)))

        if output is None or len(output) == 0:
            logging.info("No records pulled from Marketo.")
            downstream_tasks = context['task'].get_flat_relatives(upstream=False)
            logging.info('Skipping downstream tasks...')
            logging.debug("Downstream task_ids %s", downstream_tasks)

            if downstream_tasks:
                self.skip(context['dag_run'],
                          context['ti'].execution_date,
                          downstream_tasks)

            return True
        else:
            self.outputManager(self.nullify_output(output),
                               self.s3_key,
                               self.s3_bucket,
                               self.output_format)
Example #19
def imap_py(**kwargs):
    selenium_conn_id = kwargs.get('templates_dict',
                                  None).get('selenium_conn_id', None)
    filename = kwargs.get('templates_dict', None).get('filename', None)
    s3_conn_id = kwargs.get('templates_dict', None).get('s3_conn_id', None)
    s3_bucket = kwargs.get('templates_dict', None).get('s3_bucket', None)
    s3_key = kwargs.get('templates_dict', None).get('s3_key', None)
    date = kwargs.get('templates_dict', None).get('date', None)

    @provide_session
    def get_conn(conn_id, session=None):
        conn = (session.query(Connection).filter(
            Connection.conn_id == conn_id).first())
        return conn

    url = get_conn(selenium_conn_id).host
    email = get_conn(selenium_conn_id).user
    pwd = get_conn(selenium_conn_id).password

    vdisplay = Xvfb()
    vdisplay.start()
    caps = webdriver.DesiredCapabilities.FIREFOX
    caps["marionette"] = True

    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference('browser.helperApps.neverAsk.saveToDisk',
                           "text/csv")

    logging.info('Profile set...')
    options = Options()
    options.set_headless(headless=True)
    logging.info('Options set...')
    logging.info('Initializing Driver...')
    driver = webdriver.Firefox(firefox_profile=profile,
                               firefox_options=options,
                               capabilities=caps)
    logging.info('Driver Intialized...')
    driver.get(url)
    logging.info('Authenticating...')
    elem = driver.find_element_by_id("email")
    elem.send_keys(email)
    elem = driver.find_element_by_id("password")
    elem.send_keys(pwd)
    elem.send_keys(Keys.RETURN)

    logging.info('Successfully authenticated.')

    sleep_time = 15

    logging.info('Downloading File....Sleeping for {} Seconds.'.format(
        str(sleep_time)))
    time.sleep(sleep_time)

    driver.close()
    vdisplay.stop()

    dest_s3 = S3Hook(s3_conn_id=s3_conn_id)

    os.chdir('/root/Downloads')

    output_json = 'file.json'

    with open(filename, 'r') as csvfile, open(output_json, 'w') as jsonfile:
        reader = csv.DictReader(csvfile)

        for row in reader:
            row = dict((boa.constrict(k), v) for k, v in row.items())
            row['run_date'] = date
            json.dump(row, jsonfile)
            jsonfile.write('\n')

    dest_s3.load_file(filename=output_json,
                      key=s3_key,
                      bucket_name=s3_bucket,
                      replace=True)

    dest_s3.connection.close()