Example #1
  def collect_flow_schedules(self, schedule_file):
    self.logger.info("collect flow schedule")
    schedule_writer = FileWriter(schedule_file)
    query = """
            SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
            unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
            wj.app_path
            FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
            WHERE cj.status = 'RUNNING'
            """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      schedule_record = OozieFlowScheduleRecord(self.app_id,
                                                row['app_path'],
                                                row['time_unit'],
                                                str(row['frequency']),
                                                None,
                                                row['start_time'],
                                                row['end_time'],
                                                row['ref_id'],
                                                self.wh_exec_id)
      schedule_writer.append(schedule_record)

    schedule_writer.close()
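Every example on this page iterates over DbUtil.dict_cursor(cursor). As a point of reference only, here is a minimal sketch of what such a dict-cursor helper might look like over a plain DB-API cursor; the real DbUtil in this codebase may differ (for example, it may stream rows rather than fetch them all at once).

# Hypothetical sketch, not the actual DbUtil implementation.
def dict_cursor(cursor):
    # Map each fetched row to a dict keyed by column name, which is how the
    # examples on this page consume the rows.
    column_names = [description[0] for description in cursor.description]
    for raw_row in cursor.fetchall():
        yield dict(zip(column_names, raw_row))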
Example #2
    def __init__(self, args):
        self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
        self.base_url = args[Constant.BASE_URL_KEY]

        temp_dir = FileUtil.etl_temp_dir(args, "CODESEARCH")
        self.code_search_committer_writer = FileWriter(
            os.path.join(temp_dir, args[Constant.DATABASE_SCM_REPO_OUTPUT_KEY]))
Example #3
  def collect_job_execs(self, job_exec_file, lookback_period):
    self.logger.info("collect job execs")
    job_exec_writer = FileWriter(job_exec_file)
    query = """
            select  a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count,
            unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
            j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE
            """ % (int(lookback_period))
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      job_exec_record = OozieJobExecRecord(self.app_id,
                                           row['app_path'],
                                           row['flow_exec_id'],
                                           row['flow_exec_id'],
                                           row['job_name'],
                                           row['app_path'] + "/" + row['job_name'],
                                           row['job_exec_id'],
                                           row['status'],
                                           row['user_retry_count'],
                                           row['start_time'],
                                           row['end_time'],
                                           self.wh_exec_id)
      job_exec_writer.append(job_exec_record)
    job_exec_writer.close()
Example #4
 def collect_flow_schedules(self, schedule_file):
   # load flow scheduling info from table triggers
   self.logger.info("collect flow schedule")
   timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
   self.aw_cursor.execute(timezone)
   schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
   self.aw_cursor.execute(schema)
   schedule_writer = FileWriter(schedule_file)
   query = \
       """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
          ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
          to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED,
          ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
          to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END
          FROM SO_JOB_TABLE J
          JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
          WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
   self.aw_cursor.execute(query)
   rows = DbUtil.dict_cursor(self.aw_cursor)
   for row in rows:
     schedule_record = AppworxFlowScheduleRecord(self.app_id,
                                                 row['SO_APPLICATION'] + ":" + row['SO_MODULE'],
                                                 row['AW_SCH_NAME'],
                                                 int(row['AW_SCH_INTERVAL']),
                                                 long(row['EFFECT_STARTED']),
                                                 long(row['EFFECT_END']),
                                                 '0',
                                                 self.wh_exec_id
                                                 )
     schedule_writer.append(schedule_record)
   schedule_writer.close()
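The ROUND(...) expressions in the Appworx queries above turn a US/Pacific timestamp into Unix epoch seconds by shifting it to GMT, differencing against 1970-01-01 in days, and multiplying by 86400. A rough Python equivalent of that conversion, shown only for illustration and assuming pytz is available, would be:

import datetime
import pytz  # assumption: pytz (or an equivalent tz database) is available

def pacific_to_epoch_seconds(naive_dt):
    # Mirror the SQL: interpret the value as US/Pacific, shift to GMT, then
    # count the seconds elapsed since 1970-01-01.
    utc_dt = pytz.timezone('US/Pacific').localize(naive_dt).astimezone(pytz.utc)
    epoch = datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)
    return int((utc_dt - epoch).total_seconds())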
Example #5
 def __init__(self):
   self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
   username = args[Constant.HIVE_METASTORE_USERNAME]
   password = args[Constant.HIVE_METASTORE_PASSWORD]
   jdbc_driver = args[Constant.HIVE_METASTORE_JDBC_DRIVER]
   jdbc_url = args[Constant.HIVE_METASTORE_JDBC_URL]
   self.conn_hms = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)
   self.curs = self.conn_hms.cursor()
   dependency_instance_file = args[Constant.HIVE_DEPENDENCY_CSV_FILE_KEY]
   self.instance_writer = FileWriter(dependency_instance_file)
Example #6
  def __init__(self):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
    requests.packages.urllib3.disable_warnings()
    self.app_id = int(args[Constant.APP_ID_KEY])
    self.wh_exec_id = long(args[Constant.WH_EXEC_ID_KEY])
    self.project_writer = FileWriter(args[Constant.GIT_PROJECT_OUTPUT_KEY])
    self.repo_writer = FileWriter(args[Constant.PRODUCT_REPO_OUTPUT_KEY])
    self.repo_owner_writer = FileWriter(args[Constant.PRODUCT_REPO_OWNER_OUTPUT_KEY])

    self.multiproduct = {}
    self.git_repo = {}
    self.product_repo = []
Example #7
  def collect_flow_owners(self, owner_file):
    self.logger.info("collect owners")
    owner_writer = FileWriter(owner_file)
    query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      owner_record = OozieFlowOwnerRecord(self.app_id,
                                          row['app_path'],
                                          row['user_name'],
                                          self.wh_exec_id)
      owner_writer.append(owner_record)
    owner_writer.close()
Example #8
    def collect_flow_schedules(self, schedule_file):
        # load flow scheduling info from table triggers
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = "select * from triggers"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        for row in rows:
            json_column = "data"
            if row[json_column] is not None:
                unzipped_content = gzip.GzipFile(
                    mode="r", fileobj=StringIO.StringIO(row[json_column].tostring())
                ).read()
                try:
                    row[json_column] = json.loads(unzipped_content)
                except Exception as e:
                    self.logger.error(e)
                    pass

                if not "projectId" in row[json_column]["actions"][0]["actionJson"]:
                    continue
                # print json.dumps(row[json_column], indent=4)

                if row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["isRecurring"] == "true":
                    unit, frequency, cron_expr = None, None, None
                    period = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"]
                    if period is not None and period != "null" and period[-1:] in self._period_unit_table:
                        unit = self._period_unit_table[period[-1:]]
                        frequency = int(
                            row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"][:-1]
                        )
                    if "cronExpression" in row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]:
                        cron_expr = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["cronExpression"]
                    schedule_record = AzkabanFlowScheduleRecord(
                        self.app_id,
                        row[json_column]["actions"][0]["actionJson"]["projectName"]
                        + ":"
                        + row[json_column]["actions"][0]["actionJson"]["flowName"],
                        unit,
                        frequency,
                        cron_expr,
                        long(row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["firstCheckTime"])
                        / 1000,
                        int(time.mktime(datetime.date(2099, 12, 31).timetuple())),
                        "0",
                        self.wh_exec_id,
                    )
                    schedule_writer.append(schedule_record)
        schedule_writer.close()
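Example #8 assumes a self._period_unit_table that maps the trailing character of an Azkaban period string such as '30m' to a time-unit name. The actual table is not shown on this page; a plausible mapping, given purely as an illustration, could look like:

# Illustrative mapping only; the real _period_unit_table is defined elsewhere in the extractor.
_period_unit_table = {
    's': 'SECOND',
    'm': 'MINUTE',
    'h': 'HOUR',
    'd': 'DAY',
    'M': 'MONTH',
}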
Example #9
  def collect_flow_owners(self, owner_file):
    # load user info from table project_permissions
    self.logger.info("collect owner&permissions")
    user_writer = FileWriter(owner_file)
    query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \
            "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)

    for row in rows:
      record = AzkabanFlowOwnerRecord(self.app_id,
                                      row['project_name'] + ':' + row["flow_id"],
                                      row["owner"],
                                      AzkabanPermission(row["permissions"]).toFlatString(),
                                      'GROUP' if row['isGroup'] == 1 else 'LDAP',
                                      self.wh_exec_id)
      user_writer.append(record)
    user_writer.close()
Example #10
 def __init__(self):
   self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
   username = args[Constant.HIVE_METASTORE_USERNAME]
   password = args[Constant.HIVE_METASTORE_PASSWORD]
   jdbc_driver = args[Constant.HIVE_METASTORE_JDBC_DRIVER]
   jdbc_url = args[Constant.HIVE_METASTORE_JDBC_URL]
   self.conn_hms = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)
   self.curs = self.conn_hms.cursor()
   dependency_instance_file = args[Constant.HIVE_DEPENDENCY_CSV_FILE_KEY]
   self.instance_writer = FileWriter(dependency_instance_file)
Example #11
  def collect_flow_execs(self, flow_exec_file, lookback_period):
    self.logger.info("collect flow execs")
    flow_exec_writer = FileWriter(flow_exec_file)
    query = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (int(lookback_period))
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      flow_exec_record = OozieFlowExecRecord(self.app_id,
                                             row['app_name'],
                                             row['app_path'],
                                             row['id'],
                                             row['id'],
                                             row['status'],
                                             row['run'],
                                             row['user_name'],
                                             row['start_time'],
                                             row['end_time'],
                                             self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)

    flow_exec_writer.close()
Example #12
  def collect_flow_owners(self, owner_file):
    self.logger.info("collect owner&permissions")
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    user_writer = FileWriter(owner_file)
    query = \
        """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J
             JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
             JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
             WHERE J.SO_COMMAND_TYPE = 'CHAIN' """
    self.aw_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.aw_cursor)

    for row in rows:
      record = AppworxFlowOwnerRecord(self.app_id,
                                      row['SO_APPLICATION'] + ':' + row["SO_MODULE"],
                                      row["SO_USER_NAME"],
                                      'EXECUTE',
                                      'GROUP',
                                      self.wh_exec_id)
      user_writer.append(record)
    user_writer.close()
Example #13
  def __init__(self):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
    requests.packages.urllib3.disable_warnings()
    self.app_id = int(args[Constant.APP_ID_KEY])
    self.wh_exec_id = long(args[Constant.WH_EXEC_ID_KEY])
    self.project_writer = FileWriter(args[Constant.GIT_PROJECT_OUTPUT_KEY])
    self.repo_writer = FileWriter(args[Constant.PRODUCT_REPO_OUTPUT_KEY])
    self.repo_owner_writer = FileWriter(args[Constant.PRODUCT_REPO_OWNER_OUTPUT_KEY])

    self.multiproduct = {}
    self.git_repo = {}
    self.product_repo = []
Example #14
    def collect_flow_owners(self, owner_file):
        self.logger.info("collect owners")
        owner_writer = FileWriter(owner_file)
        query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            owner_record = OozieFlowOwnerRecord(self.app_id, row['app_path'],
                                                row['user_name'],
                                                self.wh_exec_id)
            owner_writer.append(owner_record)
        owner_writer.close()
Example #15
    def collect_flow_owners(self, owner_file):
        # load user info from table project_permissions
        self.logger.info("collect owner&permissions")
        user_writer = FileWriter(owner_file)

        query = "SELECT project_name, workflow_name, owner FROM workflow_info WHERE status is NULL"
        self.lz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.lz_cursor)

        for row in rows:
            record = LhotseFlowOwnerRecord(
                self.app_id, row['project_name'] + ':' + row["workflow_name"],
                row["owner"], 'ADMIN', 'LDAP', self.wh_exec_id)
            user_writer.append(record)
        user_writer.close()
Example #16
    def collect_flow_execs(self, flow_exec_file, lookback_period):
        self.logger.info("collect flow execs")
        flow_exec_writer = FileWriter(flow_exec_file)
        query = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (
            int(lookback_period))
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            flow_exec_record = OozieFlowExecRecord(
                self.app_id, row['app_name'], row['app_path'], row['id'],
                row['id'], row['status'], row['run'], row['user_name'],
                row['start_time'], row['end_time'], self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

        flow_exec_writer.close()
Example #17
    def collect_job_execs(self, job_exec_file, lookback_period):
        self.logger.info("collect job execs")
        job_exec_writer = FileWriter(job_exec_file)
        query = """
            select  a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count,
            unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
            j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE
            """ % (int(lookback_period))
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            job_exec_record = OozieJobExecRecord(
                self.app_id, row['app_path'], row['flow_exec_id'],
                row['flow_exec_id'], row['job_name'],
                row['app_path'] + "/" + row['job_name'], row['job_exec_id'],
                row['status'], row['user_retry_count'], row['start_time'],
                row['end_time'], self.wh_exec_id)
            job_exec_writer.append(job_exec_record)
        job_exec_writer.close()
Example #18
    def collect_flow_schedules(self, schedule_file):
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = """
            SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
            unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
            wj.app_path
            FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
            WHERE cj.status = 'RUNNING'
            """
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            schedule_record = OozieFlowScheduleRecord(
                self.app_id, row['app_path'], row['time_unit'],
                int(row['frequency']), row['start_time'], row['end_time'],
                row['ref_id'], self.wh_exec_id)
            schedule_writer.append(schedule_record)

        schedule_writer.close()
Example #19
  def transform(self, input, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}  # set the prop json

        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        if TableInfo.view_expended_text in prop_json:
          text = prop_json[TableInfo.view_expended_text].replace('`', '')
          array = HiveViewDependency.getViewDependency(text)
          l = []
          for a in array:
            l.append(a)
          prop_json['view_depends_on'] = l

        # process either schema
        flds = {}
        field_detail_list = []
        if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
          sort_id = 0
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
          except ValueError:
            self.logger.error("Schema json error for table : \n" + str(table))
            schema_data = {'fields': []}  # fall back to an empty schema so the table is still emitted
          schema_json = schema_data

          # process each field
          for field in schema_data['fields']:
            field_name = field['name']
            type = field['type']  # could be a list
            default_value = field['default'] if 'default' in field else None
            doc = field['doc'] if 'doc' in field else None

            attributes_json = json.loads(field['attributes_json']) if 'attributes_json' in field else None
            pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = None
            if attributes_json:
              pk = attributes_json['pk'] if 'pk' in attributes_json else None
              delta = attributes_json['delta'] if 'delta' in attributes_json else None
              is_nullable = attributes_json['nullable'] if 'nullable' in attributes_json else None
              inside_type = attributes_json['type'] if 'type' in attributes_json else None
              format = attributes_json['format'] if 'format' in attributes_json else None

            flds[field_name] = {'type': type}
            # String urn, Integer sortId, Integer parentSortId, String parentPath, String fieldName,
            #String dataType, String isNullable, String defaultValue, Integer dataSize, String namespace, String description
            sort_id += 1
            field_detail_list.append(
              ["hive:///%s/%s" % (one_db_info['database'], table['name']), str(sort_id), '0', None, field_name, '',
               type, data_size, None, None, is_nullable, is_indexed, is_partitioned, default_value, None,
               json.dumps(attributes_json)])
        elif TableInfo.field_list in table:
          schema_json = {'type': 'record', 'name': table['name'],
                         'fields': table[TableInfo.field_list]}  # construct a schema for data came from COLUMN_V2
          for field in table[TableInfo.field_list]:
            field_name = field['ColumnName']
            type = field['TypeName']
            # ColumnName, IntegerIndex, TypeName, Comment
            flds[field_name] = {'type': type}
            pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = default_value = None  # TODO ingest
            field_detail_list.append(
              ["hive:///%s/%s" % (one_db_info['database'], table['name']), field['IntegerIndex'], '0', None, field_name,
               '', field['TypeName'], None, None, None, is_nullable, is_indexed, is_partitioned, default_value, None,
               None])

        dataset_schema_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                    json.dumps(flds),
                                                    "hive:///%s/%s" % (one_db_info['database'], table['name']), 'Hive',
                                                    '', table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None,
                                                    table["lastAlterTime"] if table.has_key("lastAlterTime") else None)
        schema_file_writer.append(dataset_schema_record)

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    schema_file_writer.close()
    field_file_writer.close()
Example #20
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    query = "SELECT distinct f.*, p.name as project_name FROM  project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:
      row['version'] = 0 if (row["version"] is None) else row["version"]

      json_column = 'json'
      unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
      try:
        row[json_column] = json.loads(unzipped_content)
      except Exception as e:
        self.logger.error(e)  # leave the raw content in place if it cannot be parsed

      flow_path = row['project_name'] + ":" + row['flow_id']

      flow_record = AzkabanFlowRecord(self.app_id,
                                      row['flow_id'],
                                      row['project_name'],
                                      flow_path,
                                      0,
                                      row['modified_time'] / 1000,
                                      row["version"],
                                      'Y',
                                      self.wh_exec_id)
      flow_writer.append(flow_record)

      # get flow jobs
      nodes = row[json_column]['nodes']
      for node in nodes:
        job_record = AzkabanJobRecord(self.app_id,
                                      flow_path,
                                      row["version"],
                                      node['id'],
                                      flow_path + '/' + node['id'],
                                      node['jobType'],
                                      'Y',
                                      self.wh_exec_id)
        if node['jobType'] == 'flow':
          job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
        job_writer.append(job_record)

      # job dag
      edges = row[json_column]['edges']
      for edge in edges:
        dag_edge = AzkabanFlowDagRecord(self.app_id,
                                        flow_path,
                                        row['version'],
                                        flow_path + '/' + edge['source'],
                                        flow_path + '/' + edge['target'],
                                        self.wh_exec_id)
        dag_writer.append(dag_edge)

      row_count += 1

      if row_count % 1000 == 0:
        flow_writer.flush()
        job_writer.flush()
        dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()
Example #21
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info("collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    flow_id_list = []
    if self.last_execution_unix_time:
      flow_cmd = \
        """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time)
    else:
      flow_cmd = \
        """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= SYSDATE - %d
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period)

    ''' SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow

        select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID
        from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID
    '''
    if self.last_execution_unix_time:
      job_cmd = \
        """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and
           H.SO_CHAIN_ID = %d"""
    else:
      job_cmd = \
        """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and
           H.SO_CHAIN_ID = %d"""

    try:
      self.aw_cursor.execute(flow_cmd)
    except Exception as e:
      self.logger.error(str(e) + "\n" + flow_cmd)

    rows = DbUtil.dict_cursor(self.aw_cursor)
    row_count = 0
    for row in rows:
      flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
      so_flow_id = row['SO_JOBID']
      flow_attempt = 0
      flow_exec_id = 0
      try:
        flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:])*100)
        flow_exec_id = int(so_flow_id)
      except Exception as e:
        self.logger.error(e)
      self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id)

      flow_exec_record = AppworxFlowExecRecord(self.app_id,
                                               long(row['SO_JOB_SEQ']),
                                               row['SO_MODULE'],
                                               flow_path,
                                               0,
                                               flow_exec_id,
                                               row['SO_STATUS_NAME'],
                                               flow_attempt,
                                               row['SO_USER_NAME'] if row['SO_USER_NAME'] else '',
                                               long(row['JOB_STARTED']),
                                               long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0),
                                               self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)

      new_appworx_cursor = self.aw_con.cursor()
      if self.last_execution_unix_time:
        new_appworx_cursor.execute(job_cmd % (long(self.last_execution_unix_time), flow_exec_id))
      else:
        new_appworx_cursor.execute(job_cmd % (int(self.lookback_period), flow_exec_id))
      job_rows = DbUtil.dict_cursor(new_appworx_cursor)

      for job in job_rows:
        so_job_id = job['SO_JOBID']
        job_attempt = 0
        job_exec_id = 0
        try:
          job_attempt = int(float(str(so_job_id - int(so_job_id))[1:])*100)
          job_exec_id = int(so_job_id)
        except Exception as e:
          self.logger.error(e)

        job_exec_record = AppworxJobExecRecord(self.app_id,
                                               long(row['SO_JOB_SEQ']),
                                               flow_path,
                                               0,
                                               flow_exec_id,
                                               long(job['JOB_ID']),
                                               job['SO_TASK_NAME'],
                                               flow_path + "/" + job['SO_TASK_NAME'],
                                               job_exec_id,
                                               job['SO_STATUS_NAME'],
                                               job_attempt,
                                               long(job['JOB_STARTED']),
                                               long(job['JOB_FINISHED']),
                                               self.wh_exec_id)

        job_exec_writer.append(job_exec_record)
        row_count += 1
      if row_count % 10000 == 0:
        flow_exec_writer.flush()
        job_exec_writer.flush()

    flow_exec_writer.close()
    job_exec_writer.close()
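The attempt handling in Example #21 unpacks Appworx's fractional SO_JOBID into an integer execution id plus a two-decimal attempt counter. With a hypothetical id whose fractional part is exactly representable in floating point, the arithmetic works out as follows:

so_flow_id = 987.25                                                     # hypothetical SO_JOBID
flow_exec_id = int(so_flow_id)                                          # 987
flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:]) * 100)  # 25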
Example #22
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info( "collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)

    cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
    self.az_cursor.execute(cmd)
    rows = DbUtil.dict_cursor(self.az_cursor)
    row_count = 0
    for row in rows:
      json_column = 'flow_data'
      unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
      try:
        row[json_column] = json.loads(unzipped_content)
      except Exception as e:
        self.logger.error(e)
        pass
      flow_data = row[json_column]
      flow_path = flow_data['projectName'] + ":" + flow_data['flowId']
      flow_exec_record = AzkabanFlowExecRecord(self.app_id,
                                               flow_data['flowId'],
                                               flow_path,
                                               row['version'],
                                               row['exec_id'],
                                               flow_data['status'],
                                               flow_data['attempt'],
                                               row['submit_user'],
                                               long(row['start_time']) / 1000,
                                               long(row['end_time']) / 1000,
                                               self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)
      nodes = flow_data['nodes']
      job_exec_records = []
      for node in nodes:
        job_exec_record = AzkabanJobExecRecord(self.app_id,
                                                flow_path,
                                                row['version'],
                                                row['exec_id'],
                                                node['id'],
                                                flow_path + "/" + node['id'],
                                                None,
                                                node['status'],
                                                node['attempt'],
                                                long(node['startTime']) / 1000,
                                                long(node['endTime']) / 1000,
                                                self.wh_exec_id)
        job_exec_records.append(job_exec_record)

      AzkabanJobExecUtil.sortAndSet(job_exec_records)
      for r in job_exec_records:
        job_exec_writer.append(r)

      row_count += 1
      if row_count % 10000 == 0:
        flow_exec_writer.flush()
        job_exec_writer.flush()
    flow_exec_writer.close()
    job_exec_writer.close()
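Examples #8, #20, and #22 all repeat the same idiom for gunzipping a JSON BLOB column fetched through the zxJDBC cursor. A small helper that factors this out, sketched under the assumption of a Python 2 / Jython environment where the BLOB object exposes tostring(), might read:

import gzip
import json
import StringIO  # Python 2 / Jython

def load_gzipped_json_column(blob):
    # blob is assumed to expose tostring(), as implied by the row[json_column].tostring() calls above.
    raw_bytes = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(blob.tostring())).read()
    return json.loads(raw_bytes)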
Example #23
  def run(self, database_name, table_name, schema_output_file, sample_output_file, sample=True):
    """
    The entrance of the class, extract schema and sample data
    Notice the database need to have a order that the databases have more info (DWH_STG) should be scaned first.
    :param database_name:
    :param table_name:
    :param schema_output_file:
    :return:
    """
    cur = self.conn_td.cursor()
    schema = []

    f_log = open(self.log_file, "a")

    schema_json = open(schema_output_file, 'wb')
    os.chmod(schema_output_file, 0666)


    if database_name is None and table_name is None:  # default route: process everything
      for database_name in self.databases:
        self.logger.info("Collecting tables in database : " + database_name)
        # table info
        rows = []
        begin = datetime.datetime.now().strftime("%H:%M:%S")
        rows.extend(self.get_table_info(database_name, table_name))
        if len(rows) > 0:
          self.format_table_metadata(rows, schema)
        end = datetime.datetime.now().strftime("%H:%M:%S")
        f_log.write("Get table info %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

        # view info
        rows = []
        begin = datetime.datetime.now().strftime("%H:%M:%S")
        rows.extend(self.get_view_info(database_name, table_name))
        if len(rows) > 0:
          self.format_view_metadata(rows, schema)
        end = datetime.datetime.now().strftime("%H:%M:%S")
        f_log.write("Get view  info %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

      scaned_dict = {}  # a cache of {name : {urn : _, data : _}} to avoid repeated computation

      if sample:
        open(sample_output_file, 'wb')
        os.chmod(sample_output_file, 0666)
        sample_file_writer = FileWriter(sample_output_file)

        # collect sample data
        for onedatabase in schema:
          database_name = onedatabase['database']
          if 'tables' in onedatabase:
            alltables = onedatabase['tables']
          else:
            alltables = onedatabase['views']

          for onetable in alltables:
            table_name = onetable['original_name'].split('.')[1]
            if table_name in scaned_dict:
              sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name,
                                               scaned_dict[table_name]['ref_urn'], scaned_dict[table_name]['data'])
            else:
              (ref_urn, sample_data) = self.get_sample_data(database_name, table_name)
              sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name, '', sample_data)
              scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data}
            sample_file_writer.append(sample_record)
        sample_file_writer.close()

    # print 'byte size of schema : ' + str(sys.getsizeof(schema))
    schema_json.write(json.dumps(schema, indent=None) + '\n')
    cur.close()
    schema_json.close()
    f_log.close()
Example #24
 def __init__(self):
     self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
     self.base_url = args[Constant.BASE_URL_KEY]
     self.code_search_committer_writer = FileWriter(args[Constant.DATABASE_SCM_REPO_OUTPUT_KEY])
Example #25
class CodeSearchExtract:
    """
    Lists all repos for oracle & espresso databases. Since this feature is not
    available through the UI, we need to use http://go/codesearch to discover
    the multiproduct repos that use 'li-db' plugin.
    """

    # verbose = False
    limit_search_result = 500
    # limit_multiproduct = None
    # limit_plugin = None

    def __init__(self):
        self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
        self.base_url = args[Constant.BASE_URL_KEY]
        self.code_search_committer_writer = FileWriter(args[Constant.DATABASE_SCM_REPO_OUTPUT_KEY])

    def run(self):
        offset_min = 1
        offset_max = 100
        databases = []
        search_request = \
            {"request":
                {
                    "other":{"CurrentResult":str(offset_min),"requestTimeout":"200000000"},
                    "queryContext":{"numToScore":1000,"docDataSet":"results","rawQuery":"type:gradle plugin:*'li-db'"},
                    "paginationContext":{"numToReturn":offset_max}
                }
            }
        while True:
            resp = requests.post(self.base_url + '/galene-codesearch?action=search',
                                 json=search_request,
                                 verify=False)
            if resp.status_code != 200:
                # This means something went wrong.
                d = resp.json()
                self.logger.info("Request Error! Stack trace {}".format(d['stackTrace']))
                # raise Exception('Request Error', 'POST /galene-codesearch?action=search %s' % (resp.status_code))
                break

            result = resp.json()['value']
            self.logger.debug("Pagination offset = {}".format(result['total']))
            for element in result['elements']:
                fpath = element['docData']['filepath']
                ri = fpath.rindex('/')
                prop_file = fpath[:ri] + '/database.properties'
                # e.g. identity-mt/database/Identity/database.properties
                #      network/database/externmembermap/database.properties
                #      cap-backend/database/campaigns-db/database.properties
                try:
                    databases.append( {'filepath': prop_file, 'app_name': element['docData']['mp']} )
                except:
                    self.logger.error("Exception happens with prop_file {}".format(prop_file))

            if result['total'] < 100:
                break
            offset_min += int(result['total'])
            offset_max += 100 # if result['total'] < 100 else result['total']
            search_request['request']['other']['CurrentResult'] = str(offset_min)
            search_request['request']['paginationContext']['numToReturn'] = offset_max
            self.logger.debug("Property file path {}".format(search_request))

        self.logger.debug(" length of databases is {}".format(len(databases)))

        owner_count = 0
        committers_count = 0
        for db in databases:
            prop_file = db['filepath']
            file_request = \
                {"request":{
                    "other":{"filepath":prop_file,
                             "TextTokenize":"True",
                             "CurrentResult":"1",
                             "requestTimeout":"2000000000"
                             },
                    "queryContext":{"numToScore":10,"docDataSet":"result"},
                    "paginationContext":{"numToReturn":1}
                }
                }
            resp = requests.post(self.base_url + '/galene-codesearch?action=search',
                                 json=file_request,
                                 verify=False)
            if resp.status_code != 200:
                # This means something went wrong.
                d = resp.json()
                self.logger.info("Request Error! Stack trace {}".format(d['stackTrace']))
                continue
            result = resp.json()['value']
            if result['total'] < 1:
                self.logger.info("Nothing found for {}".format(prop_file))
                continue
            if "repoUrl" in result['elements'][0]['docData']:
                db['scm_url'] = result['elements'][0]['docData']['repoUrl']
                db['scm_type'] = result['elements'][0]['docData']['repotype']
                db['committers'] = ''

                if db['scm_type'] == 'SVN':
                    schema_in_repo = re.sub(r"http://(\w+)\.([\w\.\-/].*)database.properties\?view=markup",
                                            "http://svn." + r"\2" + "schema", db['scm_url'])
                    db['committers'] = self.get_svn_committers(schema_in_repo)
                    committers_count += 1
                    self.logger.info("Committers for {} => {}".format(schema_in_repo,db['committers']))

            else:
                self.logger.info("Search request {}".format(prop_file))

            code = result['elements'][0]['docData']['code']
            try:
                code_dict = dict(line.split("=", 1) for line in code.strip().splitlines())

                db['database_name'] = code_dict['database.name']
                db['database_type'] = code_dict['database.type']

                owner_record = SCMOwnerRecord(
                    db['scm_url'],
                    db['database_name'],
                    db['database_type'],
                    db['app_name'],
                    db['filepath'],
                    db['committers'],
                    db['scm_type']
                )
                owner_count += 1
                self.code_search_committer_writer.append(owner_record)
            except Exception as e:
                self.logger.error(e)
                self.logger.error("Exception happens with code {}".format(code))


        self.code_search_committer_writer.close()
        self.logger.info('Finish Fetching committers, total {} committers entries'.format(committers_count))
        self.logger.info('Finish Fetching SVN owners, total {} records'.format(owner_count))


    def get_svn_committers(self, svn_repo_path):
        """Collect recent committers from the cmd
           svn log %s | grep '^\(A=\|r[0-9]* \)' | head -10
           e.g.
           r1617887 | htang | 2016-09-21 14:27:40 -0700 (Wed, 21 Sep 2016) | 12 lines
           A=shanda,pravi
           r1600397 | llu | 2016-08-08 17:14:22 -0700 (Mon, 08 Aug 2016) | 3 lines
           A=rramakri,htang
        """
        #svn_cmd = """svn log %s | grep '^\(A=\|r[0-9]* \)' | head -10"""
        committers = []
        possible_svn_paths = [svn_repo_path, svn_repo_path + "ta"]
        for svn_repo_path in possible_svn_paths:
            p = subprocess.Popen('svn log ' + svn_repo_path + " |grep '^\(A=\|r[0-9]* \)' |head -10",
                                 shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            svn_log_output, svn_log_err = p.communicate()
            if svn_log_err[:12] == 'svn: E160013':
                continue  # try the next possible path

            for line in svn_log_output.split('\n'):
                if re.match(r"r[0-9]+", line):
                    committer = line.split('|')[1].strip()
                    if committer not in committers:
                        committers.append(committer)
                elif line[:2] == 'A=':
                    for apvr in line[2:].split(','):
                        if apvr not in committers:
                            committers.append(apvr)


            if len(committers) > 0:
                self.logger.debug(" {}, ' => ', {}".format(svn_repo_path,committers))
                break

        return ','.join(committers)
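The SVN branch in Example #25 rewrites a viewvc-style database.properties URL into the bare schema directory before asking svn log for recent committers. With a made-up URL (no host here comes from the source), the re.sub() behaves like this:

import re

# Hypothetical input URL, used only to illustrate the substitution in Example #25.
scm_url = "http://viewvc.corp.example.com/repo/identity-db/database.properties?view=markup"
schema_in_repo = re.sub(r"http://(\w+)\.([\w\.\-/].*)database.properties\?view=markup",
                        "http://svn." + r"\2" + "schema", scm_url)
# schema_in_repo == "http://svn.corp.example.com/repo/identity-db/schema"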
Example #26
  def transform(self, raw_metadata, metadata_output, field_metadata_output):

    # sys.setdefaultencoding("UTF-8")

    input_json_file = open(raw_metadata, 'r')
    schema_file_writer = FileWriter(metadata_output)
    field_file_writer = FileWriter(field_metadata_output)
    i = 0
    self.sort_id = 0
    o_urn = ''
    p = ''

    def fields_json_to_csv(output_list_, parent_field_path, field_list_):
      # output_list_: list of csv rows to append to; parent_field_path: dotted path of the parent field;
      # field_list_: list of field dicts to flatten into rows
      parent_id = self.sort_id

      for f in field_list_:
        self.sort_id += 1

        o_field_name = f['name']
        o_field_data_type = ''
        o_field_data_size = None
        o_field_nullable = 'N'
        o_field_default = ''
        o_field_namespace = ''
        o_field_doc = ''
        effective_type_index_in_type = -1

        if f.has_key('namespace'):
          o_field_namespace = f['namespace']

        if f.has_key('default') and f['default'] is not None:
          o_field_default = f['default']

        if not f.has_key('type'):
          o_field_data_type = None
        elif type(f['type']) == list:
          i = effective_type_index = -1
          for data_type in f['type']:
            i += 1  # current index
            if data_type is None or data_type == 'null':
              o_field_nullable = 'Y'
            elif type(data_type) == dict:
              o_field_data_type = data_type['type']
              effective_type_index_in_type = i

              if data_type.has_key('namespace'):
                o_field_namespace = data_type['namespace']
              elif data_type.has_key('name'):
                o_field_namespace = data_type['name']

              if data_type.has_key('size'):
                o_field_data_size = data_type['size']
              else:
                o_field_data_size = None

            else:
              o_field_data_type = data_type
              effective_type_index_in_type = i
        elif type(f['type']) == dict:
          o_field_data_type = f['type']['type']
        else:
          o_field_data_type = f['type']
          if f.has_key('attributes') and f['attributes'].has_key('nullable'):
            o_field_nullable = 'Y' if f['attributes']['nullable'] else 'N'
          if f.has_key('attributes') and f['attributes'].has_key('size'):
            o_field_data_size = f['attributes']['size']

        if f.has_key('doc'):
          if len(f['doc']) == 0 and f.has_key('attributes'):
            o_field_doc = json.dumps(f['attributes'])
          else:
            o_field_doc = f['doc']
        elif f.has_key('comment'):
          o_field_doc = f['comment']

        output_list_.append(
          [o_urn, self.sort_id, parent_id, parent_field_path, o_field_name, o_field_data_type, o_field_nullable,
           o_field_default, o_field_data_size, o_field_namespace,
           o_field_doc.replace("\n", ' ') if o_field_doc is not None else None])

        # check if this field is a nested record
        if type(f['type']) == dict and f['type'].has_key('fields'):
          current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
          fields_json_to_csv(output_list_, current_field_path, f['type']['fields'])
        elif type(f['type']) == dict and f['type'].has_key('items') and type(f['type']['items']) == dict and f['type']['items'].has_key('fields'):
          current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
          fields_json_to_csv(output_list_, current_field_path, f['type']['items']['fields'])

        if effective_type_index_in_type >= 0 and type(f['type'][effective_type_index_in_type]) == dict:
          if f['type'][effective_type_index_in_type].has_key('items') and type(
              f['type'][effective_type_index_in_type]['items']) == list:

            for item in f['type'][effective_type_index_in_type]['items']:
              if type(item) == dict and item.has_key('fields'):
                current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                fields_json_to_csv(output_list_, current_field_path, item['fields'])
          elif f['type'][effective_type_index_in_type].has_key('items') and f['type'][effective_type_index_in_type]['items'].has_key('fields'):
            # type: [ null, { type: array, items: { name: xxx, type: record, fields: [] } } ]
            current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
            fields_json_to_csv(output_list_, current_field_path, f['type'][effective_type_index_in_type]['items']['fields'])
          elif f['type'][effective_type_index_in_type].has_key('fields'):
            # if f['type'][effective_type_index_in_type].has_key('namespace'):
            # o_field_namespace = f['type'][effective_type_index_in_type]['namespace']
            current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
            fields_json_to_csv(output_list_, current_field_path, f['type'][effective_type_index_in_type]['fields'])

            # End of function

    for line in input_json_file:
      try:
        j = json.loads(line)
      except:
        self.logger.error("    Invalid JSON:\n%s" % line)
        continue

      i += 1
      o_field_list_ = []
      parent_field_path = ''
      self.sort_id = 0

      if not (j.has_key('attributes_json') or j.has_key('attributes')):
        o_properties = {"doc": null}
      else:
        o_properties = {}
        if j.has_key('attributes_json'):
          o_properties = json.loads(j['attributes_json'])
          del j['attributes_json']
        if j.has_key('attributes'):
          o_properties = dict(j['attributes'].items() + o_properties.items())
          del j['attributes']

      if j.has_key('uri'):
        o_urn = j['uri']
      elif o_properties.has_key('uri'):
        o_urn = o_properties['uri']
      else:
        self.logger.info('*** Warning: "uri" is not found in %s' % j['name'])
        o_urn = ''

      if o_urn.find('hdfs://') == 0:
        o_name = o_urn[o_urn.rfind('/') + 1:]
      elif o_properties.has_key('table_name'):
        o_name = o_properties['table_name']
      elif j.has_key('name') and j['name'][0:5] != 'TUPLE':
        o_name = j['name']
      else:
        o_name = o_urn[o_urn.rfind('/') + 1:]

      if j.has_key('id') or not j.has_key('fields'):  # esWritable schema
        o_fields = {}
        for k in j:
          if not (k == 'uri' or k == 'attributes' or k == 'doc'):
            if type(j[k]) == list:
              o_fields[k] = {"name": k, "type": 'list', "doc": str(j[k])}
            elif type(j[k]) == dict:
              o_fields[k] = {"name": k, "type": 'dict', "doc": str(j[k])}
            else:
              o_fields[k] = {"name": k, "type": j[k], "doc": None}

            self.sort_id += 1
            o_field_list_.append([o_urn, self.sort_id, 0, '', k, o_fields[k]['type'], '', '', '',
                                  o_fields[k]['doc'].replace("\n", ' ') if o_fields[k]['doc'] is not None else None])

      elif j.has_key('fields'):
        o_fields = {}
        for f in j['fields']:
          o_field_name = f['name']
          o_fields[o_field_name] = dict(f)  # for schema output
          if f.has_key('attributes_json'):
            f['attributes'] = json.loads(f['attributes_json'])
            del f['attributes_json']

        fields_json_to_csv(o_field_list_, '', j['fields'])

      else:
        o_fields = {"doc": None}

      if j.has_key('attributes') and not o_properties.has_key('source'):
        o_properties['source'] = j['attributes']['source']

      if o_urn.startswith('hdfs:///') and self.file_regex_source_map is not None:
        o_source = self.get_source(o_urn[7:])
      else:
        self.logger.warn("property : " + Constant.HDFS_FILE_SOURCE_MAP_KEY +
                         " is None, will use default source for all dataset")
        o_source = 'Hdfs'

      self.logger.info(
        "%4i (%6i): %4i fields, %4i total fields(including nested) found in [%s]@%s with source %s" % (i, len(j), len(o_fields), len(o_field_list_), o_name, o_urn, o_source))

      dataset_schema_record = DatasetSchemaRecord(o_name, json.dumps(j, sort_keys=True),
                                                  json.dumps(o_properties, sort_keys=True), json.dumps(o_fields), o_urn,
                                                  o_source, None, None, None)
      schema_file_writer.append(dataset_schema_record)

      for fields in o_field_list_:
        field_record = DatasetFieldRecord(fields)
        field_file_writer.append(field_record)

    schema_file_writer.close()
    field_file_writer.close()
    input_json_file.close()
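
A note on the recursive flattening above: fields_json_to_csv walks an Avro-style 'fields' list and emits one row per field, joining nested record names with '.' through parent_field_path. Below is a minimal standalone sketch of that idea; it ignores union types and the extra CSV columns, and the sample schema is made up for illustration.

def flatten_fields(output_rows, parent_path, fields):
  # emit [dotted_field_path, type] per field, recursing into nested record types
  for f in fields:
    path = f['name'] if parent_path == '' else parent_path + '.' + f['name']
    output_rows.append([path, str(f['type'])])
    if isinstance(f['type'], dict) and 'fields' in f['type']:
      flatten_fields(output_rows, path, f['type']['fields'])

rows = []
sample_fields = [{'name': 'id', 'type': 'long'},
                 {'name': 'address', 'type': {'type': 'record', 'name': 'Address',
                                              'fields': [{'name': 'city', 'type': 'string'}]}}]
flatten_fields(rows, '', sample_fields)
# rows -> [['id', 'long'], ['address', "{...record...}"], ['address.city', 'string']]  (second type abbreviated)
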
Example #27
0
  def transform(self, input, td_metadata, td_field_metadata):
    '''
    convert from json to csv
    :param input: input json file
    :param td_metadata: output data file for teradata metadata
    :param td_field_metadata: output data file for teradata field metadata
    :return:
    '''
    f_json = open(input)
    data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(td_metadata)
    field_file_writer = FileWriter(td_field_metadata)

    for d in data:
      i = 0
      for k in d.keys():
        if k not in ['tables', 'views']:
          continue
        self.logger.info("%s %4d %s" % (datetime.datetime.now().strftime("%H:%M:%S"), len(d[k]), k))
        for t in d[k]:
          self.logger.info("%4d %s" % (i, t['name']))
          if t['name'] == 'HDFStoTD_2464_ERR_1':
            continue
          i += 1
          output = {}
          prop_json = {}
          output['name'] = t['name']
          output['original_name'] = t['original_name']

          prop_json["createTime"] = t["createTime"] if t.has_key("createTime") else None
          prop_json["lastAlterTime"] = t["lastAlterTime"] if t.has_key("lastAlterTime") else None
          prop_json["lastAccessTime"] = t["lastAccessTime"] if t.has_key("lastAccessTime") else None
          prop_json["accessCount"] = t["accessCount"] if t.has_key("accessCount") else None
          prop_json["sizeInMbytes"] = t["sizeInMbytes"] if t.has_key("sizeInMbytes") else None
          if "type" in t:
            prop_json["storage_type"] = t["type"]
          if "partition" in t:
            prop_json["partition"] = t["partition"]
          if "partitions" in t:
            prop_json["partitions"] = t["partitions"]
          if "hashKey" in t:
            prop_json["hashKey"] = t["hashKey"]
          if "indices" in t:
            prop_json["indices"] = t["indices"]
          if "referenceTables" in t:
            prop_json["referenceTables"] = t["referenceTables"]
          if "viewSqlText" in t:
            prop_json["viewSqlText"] = t["viewSqlText"]

          output['fields'] = []
          flds = {}
          field_detail_list = []
          sort_id = 0
          for c in t['columns']:
            # output['fields'].append(
            #                    { 'name' : t['name'].encode('latin-1'),
            #                      'type' : None if c['data_type'] is None else c['data_type'].encode('latin-1'),
            #                      'attributes_json' : c}
            #                output['fields'][c['name'].encode('latin-1')].append({ "doc" : "", "type" : [None if c['data_type'] is None else c['data_type'].encode('latin-1')]})
            sort_id += 1
            output['fields'].append({"name": c['name'], "doc": '', "type": c['dataType'] if c['dataType'] else None,
                                     "nullable": c['nullable'], "maxByteLength": c['maxByteLength'],
                                     "format": c['columnFormat'] if c.has_key('columnFormat') else None,
                                     "accessCount": c['accessCount'] if c.has_key('accessCount') else None,
                                     "lastAccessTime": c['lastAccessTime'] if c.has_key("lastAccessTime") else None})

            flds[c['name']] = {'type': c['dataType'], "maxByteLength": c['maxByteLength']}

            field_detail_list.append(
              ["teradata:///%s/%s" % (d['database'], output['name']), str(sort_id), '0', '', c['name'], '',
               c['dataType'] if 'dataType' in c and c['dataType'] is not None else '',
               str(c['maxByteLength']) if 'maxByteLength' in c else '0',
               str(c['precision']) if 'precision' in c and c['precision'] is not None else '',
               str(c['scale']) if 'scale' in c and c['scale'] is not None else '',
               c['nullable'] if 'nullable' in c and c['nullable'] is not None else 'Y', '', '', '', '', '', '', ''])

          dataset_schema_record = DatasetSchemaRecord(output['name'], json.dumps(output), json.dumps(prop_json),
                                                      json.dumps(flds),
                                                      "teradata:///%s/%s" % (d['database'], output['name']), 'Teradata',
                                                      output['original_name'],
                                                      (self.convert_timestamp(t["createTime"]) if t.has_key("createTime") else None),
                                                      (self.convert_timestamp(t["lastAlterTime"]) if t.has_key("lastAlterTime") else None))
          schema_file_writer.append(dataset_schema_record)

          for fields in field_detail_list:
            field_record = DatasetFieldRecord(fields)
            field_file_writer.append(field_record)

        schema_file_writer.flush()
        field_file_writer.flush()
        self.logger.info("%20s contains %6d %s" % (d['database'], i, k))

    schema_file_writer.close()
    field_file_writer.close()
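
The prop_json block above copies table-level attributes one key at a time: the first five default to None when absent, "type" is renamed to "storage_type", and the rest are copied only when present. A compact sketch of the same behavior follows; the helper name and constants are ours, not from the source, and has_key is replaced by the equivalent in/.get checks.

ALWAYS_PROPS = ["createTime", "lastAlterTime", "lastAccessTime", "accessCount", "sizeInMbytes"]
OPTIONAL_PROPS = ["partition", "partitions", "hashKey", "indices", "referenceTables", "viewSqlText"]

def build_prop_json(t):
  # same behavior as the inline block above
  prop_json = dict((k, t.get(k)) for k in ALWAYS_PROPS)   # default to None when absent
  if "type" in t:
    prop_json["storage_type"] = t["type"]                 # renamed, as in the loop above
  for k in OPTIONAL_PROPS:
    if k in t:
      prop_json[k] = t[k]
  return prop_json
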
Example #28
0
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    if self.last_execution_unix_time:
        time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(self.last_execution_unix_time)
    else:
        time_filter = "SYSDATE - %d" % int(self.lookback_period)
    flow_query = \
        """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID
           FROM SO_JOB_TABLE J JOIN (
           SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID
           FROM
           ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY
             WHERE SO_JOB_FINISHED >= %s
               AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE
             WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
               AND SO_CHILD_COUNT > 0
           )
           GROUP BY SO_JOB_SEQ
           ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ
           WHERE SO_COMMAND_TYPE = 'CHAIN'
           ORDER BY 2,3
        """ % time_filter
    job_query = \
        """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID,
            t.* FROM SO_CHAIN_DETAIL d
            JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ
            WHERE d.SO_CHAIN_SEQ = %d
            ORDER BY d.SO_CHAIN_ORDER
        """
    self.aw_cursor.execute(flow_query)
    rows = DbUtil.dict_cursor(self.aw_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:

      flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']

      flow_record = AppworxFlowRecord(self.app_id,
                                      long(row['SO_JOB_SEQ']),
                                      row['SO_MODULE'],
                                      row['SO_APPLICATION'],
                                      flow_path,
                                      0,
                                      0,
                                      0,
                                      'Y',
                                      self.wh_exec_id)
      flow_writer.append(flow_record)
      new_appworx_cursor = self.aw_con.cursor()
      new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ'])
      job_rows = DbUtil.dict_cursor(new_appworx_cursor)
      for job in job_rows:
        job_record = AppworxJobRecord(self.app_id,
                                      long(row['SO_JOB_SEQ']),
                                      flow_path,
                                      0,
                                      long(job['JOB_ID']),
                                      job['SO_TASK_NAME'],
                                      flow_path + '/' + job['SO_TASK_NAME'],
                                      job['SO_MODULE'],
                                      'Y',
                                      self.wh_exec_id)
        command_type = job['SO_COMMAND_TYPE']
        if command_type and command_type == 'CHAIN':
          job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" + job['SO_MODULE'])
          job_record.setJobType('CHAIN')

        job_writer.append(job_record)

        predecessors_str = job['PREDECESSORS']
        if predecessors_str:
          predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str)
          if predecessors:
            for predecessor in predecessors:
              dag_edge = AppworxFlowDagRecord(self.app_id,
                                             long(row['SO_JOB_SEQ']),
                                             flow_path,
                                             0,
                                             flow_path + '/' + predecessor,
                                             flow_path + '/' + job['SO_TASK_NAME'],
                                             self.wh_exec_id)
              dag_writer.append(dag_edge)
      row_count += 1

      if row_count % 1000 == 0:
        flow_writer.flush()
        job_writer.flush()
        dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()
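
The DAG edges above are recovered by parsing the SO_PREDECESSORS condition string with re.findall(r"\&\/(.+?)\s\=\sS", ...). A small sketch with a made-up predecessor string (the real Appworx condition syntax may differ):

import re

predecessors_str = '&/LOAD_STAGING = SUCCESS AND &/VALIDATE_INPUT = SUCCESS'   # hypothetical value
predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str)
# predecessors -> ['LOAD_STAGING', 'VALIDATE_INPUT']
# each name yields an edge: flow_path + '/' + predecessor  ->  flow_path + '/' + task_name
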
Example #29
0
class MultiproductLoad:

  def __init__(self):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
    requests.packages.urllib3.disable_warnings()
    self.app_id = int(args[Constant.APP_ID_KEY])
    self.wh_exec_id = long(args[Constant.WH_EXEC_ID_KEY])
    self.project_writer = FileWriter(args[Constant.GIT_PROJECT_OUTPUT_KEY])
    self.repo_writer = FileWriter(args[Constant.PRODUCT_REPO_OUTPUT_KEY])
    self.repo_owner_writer = FileWriter(args[Constant.PRODUCT_REPO_OWNER_OUTPUT_KEY])

    self.multiproduct = {}
    self.git_repo = {}
    self.product_repo = []


  def get_multiproducts(self):
    '''
    fetch all products and their owners from the Multiproduct service
    '''
    resp = requests.get(args[Constant.MULTIPRODUCT_SERVICE_URL], verify=False)
    if resp.status_code != 200:
      # This means something went wrong.
      raise Exception('Request Error', 'GET /api/v1/mpl {}'.format(resp.status_code))

    # print resp.content
    re_git_repo_name = re.compile(r":(.*)\.git$")
    re_svn_repo_name = re.compile(r"/(.*)/trunk$")
    if resp.headers['content-type'].split(';')[0] == 'application/json':
      for product_name, product_info in resp.json()['products'].items():
        scm_type = product_info["scm"]["name"]
        try:
          if scm_type == 'git':
            repo_fullname = re_git_repo_name.search(product_info["uris"]["trunk"]).group(1)
            repo_key = 'git:' + repo_fullname
          elif scm_type == 'svn':
            repo_fullname = re_svn_repo_name.search(product_info["uris"]["trunk"]).group(1)
            repo_key = 'svn:' + repo_fullname
          else:
            # unknown scm type: skip so a repo_key from a previous iteration is not reused
            continue
        except:
          self.logger.debug("Error parsing repo full name {} - {}".format(product_name, product_info["uris"]))
          continue

        self.multiproduct[repo_key] = {
          "scm_repo_fullname": repo_fullname,
          "scm_type": scm_type,
          "multiproduct_name": product_name,
          "product_type": product_info["type"],
          "namespace": product_info["org"],
          "owner_name": ",".join(product_info["owners"]),
          "product_version": product_info["product-version"]
        }
      self.logger.info("Fetched {} Multiproducts".format(len(self.multiproduct)))
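
re_git_repo_name and re_svn_repo_name above reduce the trunk URI to a repo key. A quick sketch with a hypothetical scp-style git URI; the svn pattern works the same way on '.../trunk' URIs.

import re

re_git_repo_name = re.compile(r":(.*)\.git$")

trunk_uri = 'git@gitserver.example.com:tools/data-pipeline.git'   # hypothetical URI
repo_fullname = re_git_repo_name.search(trunk_uri).group(1)       # 'tools/data-pipeline'
repo_key = 'git:' + repo_fullname                                 # 'git:tools/data-pipeline'
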


  def get_project_repo(self):
    '''
    fetch details and repos of all git projects
    '''
    re_git_project_name = re.compile(r"(.*)/(.*)$")
    re_git_repo_name = re.compile(r"git://[\w\.-]+/(.*)\.git$")

    project_nonexist = []
    project_names = {}
    for key, product in self.multiproduct.iteritems():
      if product["scm_type"] == 'svn':
        continue

      project_name = re_git_project_name.search(product['scm_repo_fullname']).group(1)
      if project_name in project_names:
        continue
      project_url = '{}/{}?format=xml'.format(args[Constant.GIT_URL_PREFIX], project_name)

      try:
        resp = requests.get(project_url, verify=False)
      except Exception as ex:
        self.logger.info("Error getting /{}.xml - {}".format(project_name, ex.message))
        continue
      if resp.status_code != 200:
        # This means something went wrong.
        self.logger.debug('Request Error: GET /{}.xml {}'.format(project_name, resp.status_code))
        project_nonexist.append(project_name)
        continue

      # print resp.content
      if resp.headers['content-type'].split(';')[0] == 'application/xml':
        xml = ET.fromstring(resp.content)
        current_project = MultiproductProjectRecord(
          self.app_id,
          xml.find('slug').text,
          'git',
          xml.find('owner').attrib['kind'],
          xml.find('owner').text,
          xml.find('created-at').text,
          xml.find('license').text,
          self.trim_newline(xml.find('description').text),
          self.wh_exec_id
        )

        project_repo_names = []
        for repo in xml.findall('repositories/mainlines/repository'):
          repo_fullname = re_git_repo_name.search(repo.find('clone_url').text).group(1)
          project_repo_names.append(repo_fullname)
          repo_key = 'git:' + repo_fullname
          self.git_repo[repo_key] = {
            'scm_repo_fullname': repo_fullname,
            'scm_type': 'git',
            'repo_id': repo.find('id').text,
            'project': project_name,
            'owner_type': repo.find('owner').attrib['kind'],
            'owner_name': repo.find('owner').text
          }

        project_repo_num = len(project_repo_names)
        current_project.setRepos(project_repo_num, ','.join(project_repo_names))
        self.project_writer.append(current_project)
        project_names[project_name] = project_repo_num
        # self.logger.debug("Project: {} - Repos: {}".format(project_name, project_repo_num))

    self.project_writer.close()
    self.logger.info("Finish Fetching git projects and repos")
    self.logger.debug('Non-exist projects: {}'.format(project_nonexist))
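
get_project_repo assumes the project XML exposes slug, owner, created-at, license, description and a repositories/mainlines/repository list. A minimal ElementTree sketch against a hypothetical response body:

import xml.etree.ElementTree as ET

sample_xml = """<project>
  <slug>data-pipeline</slug>
  <owner kind="team">data-infra</owner>
  <created-at>2015-06-01</created-at>
  <license>internal</license>
  <description>ETL jobs</description>
  <repositories><mainlines>
    <repository>
      <id>42</id>
      <owner kind="team">data-infra</owner>
      <clone_url>git://gitserver.example.com/tools/data-pipeline.git</clone_url>
    </repository>
  </mainlines></repositories>
</project>"""                                               # hypothetical payload

xml = ET.fromstring(sample_xml)
slug = xml.find('slug').text                                # 'data-pipeline'
owner_kind = xml.find('owner').attrib['kind']               # 'team'
clone_urls = [r.find('clone_url').text
              for r in xml.findall('repositories/mainlines/repository')]
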


  def merge_product_repo(self):
    '''
    merge multiproduct and repo into same product_repo store
    '''
    for key, repo in self.git_repo.iteritems():
      record = MultiproductRepoRecord(
        self.app_id,
        repo['scm_repo_fullname'],
        repo['scm_type'],
        int(repo['repo_id']),
        repo['project'],
        repo['owner_type'],
        repo['owner_name'],
        self.wh_exec_id
      )
      if key in self.multiproduct:
        mp = self.multiproduct[key]
        record.setMultiproductInfo(
          mp["multiproduct_name"],
          mp["product_type"],
          mp["product_version"],
          mp["namespace"]
        )
      self.repo_writer.append(record)
      self.product_repo.append(record)

    for key, product in self.multiproduct.iteritems():
      if key not in self.git_repo:
        record = MultiproductRepoRecord(
          self.app_id,
          product["scm_repo_fullname"],
          product["scm_type"],
          0,
          None,
          None,
          product["owner_name"],
          self.wh_exec_id
        )
        record.setMultiproductInfo(
          product["multiproduct_name"],
          product["product_type"],
          product["product_version"],
          product["namespace"],
        )
        self.repo_writer.append(record)
        self.product_repo.append(record)

    self.repo_writer.close()
    self.logger.info("Merged products and repos, total {} records".format(len(self.product_repo)))


  def get_acl_owners(self):
    '''
    fetch owner information from the acl files
    '''
    re_acl_owners = re.compile(r"owners\:\s*\[([^\[\]]+)\]")
    re_acl_path = re.compile(r"paths\:\s*\[([^\[\]]+)\]")
    re_svn_acl_url = re.compile(r'href=\"[\w\/\-]+[\/\:]acl\/([\w\-\/]+)\.acl(\?revision=\d+)&amp;view=markup\"')
    re_git_acl_url = re.compile(r'href=\"[\w\/\-]+\/source\/([\w\:]*)acl\/([\w\-]+)\.acl\"')

    owner_count = 0
    for repo in self.product_repo:
      repo_fullname = repo.getScmRepoFullname()
      scm_type = repo.getScmType()
      repo_id = repo.getRepoId()

      if scm_type == "git":
        repo_url = '{}/{}/source/acl'.format(args[Constant.GIT_URL_PREFIX], repo_fullname)
      elif scm_type == "svn":
        repo_url = '{}/{}/acl'.format(args[Constant.SVN_URL_PREFIX], repo_fullname)
      else:
        # unknown scm type: nothing to fetch
        continue

      try:
        resp = requests.get(repo_url, verify=False)
      except Exception as ex:
        self.logger.info("Error getting acl {} - {}".format(repo_url, ex.message))
        continue
      if resp.status_code != 200:
        self.logger.debug('Request Error: GET repo {} acls - {}'.format(repo, resp.status_code))
        continue

      if resp.headers['content-type'].split(';')[0] == 'text/html':
        re_acl_url = re_git_acl_url if scm_type == "git" else re_svn_acl_url

        for acl_url in re_acl_url.finditer(resp.content):
          if scm_type == "git":
            acl_name = acl_url.group(2)
            commit_hash = acl_url.group(1)
            full_acl_url = '{}/{}/raw/{}acl/{}.acl'.format(args[Constant.GIT_URL_PREFIX],
                                                           repo_fullname, commit_hash, acl_name)
          elif scm_type == "svn":
            acl_name = acl_url.group(1)
            commit_hash = acl_url.group(2)
            full_acl_url = '{}/{}.acl{}'.format(repo_url, acl_name, commit_hash)

          try:
            resp = requests.get(full_acl_url, verify=False)
          except Exception as ex:
            self.logger.info("Error getting acl {} - {}".format(full_acl_url, ex.message))
            continue
          if resp.status_code != 200:
            self.logger.debug('Request Error: GET acl {} - {}'.format(full_acl_url, resp.status_code))
            continue

          owners_string = re_acl_owners.search(resp.content)
          path_string = re_acl_path.search(resp.content)
          if owners_string:
            owners = self.parse_owners(owners_string.group(1))
            paths = self.trim_path(path_string.group(1)) if path_string else None
            sort_id = 0
            for owner in owners:
              owner_record = MultiproductRepoOwnerRecord(
                self.app_id,
                repo_fullname,
                scm_type,
                repo_id,
                acl_name.title(),
                owner,
                sort_id,
                paths,
                self.wh_exec_id
              )
              self.repo_owner_writer.append(owner_record)
              sort_id += 1
              owner_count += 1
            # self.logger.debug('{} - {} owners: {}'.format(repo_fullname, acl_name, len(owners)))

    self.repo_owner_writer.close()
    self.logger.info('Finish Fetching acl owners, total {} records'.format(owner_count))


  def trim_newline(self, line):
    return line.replace('\n', ' ').replace('\r', ' ').encode('ascii', 'ignore') if line else None

  def trim_path(self, line):
    return line.strip().replace('\n', ' ').replace('\r', ' ').replace('&apos;', "'")

  def parse_owners(self, line):
    elements = [s.strip() for l in line.splitlines() for s in l.split(',')]
    return [x for x in elements if x and not x.startswith('#')]
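
parse_owners splits the captured owners block by line and by comma and drops comment entries, and re_acl_owners supplies that block. A quick sketch against a hypothetical .acl fragment:

import re

acl_text = """owners: [ jdoe, asmith,
  # former-alias
  data-infra-team ]
paths: [ /src/main ]"""                                     # hypothetical acl content

owners_block = re.search(r"owners\:\s*\[([^\[\]]+)\]", acl_text).group(1)
elements = [s.strip() for l in owners_block.splitlines() for s in l.split(',')]
owners = [x for x in elements if x and not x.startswith('#')]
# owners -> ['jdoe', 'asmith', 'data-infra-team']
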


  def run(self):
    begin = datetime.datetime.now().strftime("%H:%M:%S")
    self.get_multiproducts()
    self.get_project_repo()
    self.merge_product_repo()
    mid = datetime.datetime.now().strftime("%H:%M:%S")
    self.logger.info("Finish getting multiproducts and repos [{} -> {}]".format(str(begin), str(mid)))
    self.get_acl_owners()
    end = datetime.datetime.now().strftime("%H:%M:%S")
    self.logger.info("Extract Multiproduct and gitli metadata [{} -> {}]".format(str(begin), str(end)))
Example #30
0
  def transform(self, input, hive_instance, hive_metadata, hive_field_metadata, view_dependency):
    """
    convert from json to csv
    :param input: input json file
    :param hive_instance: output data file for hive instance
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :param view_dependency: output data file for hive view dependency
    :return:
    """
    all_data = []
    with open(input) as input_file:
        for line in input_file:
            all_data.append(json.loads(line))

    dataset_idx = -1

    instance_file_writer = FileWriter(hive_instance)
    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)
    dependency_file_writer = FileWriter(view_dependency)

    depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}  # set the prop json

        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        view_expanded_text = ''

        if TableInfo.view_expended_text in prop_json:
          view_expanded_text = prop_json[TableInfo.view_expended_text]
          text = prop_json[TableInfo.view_expended_text].replace('`', '')	# this will be fixed after switching to Hive AST
          array = []
          try:
            array = HiveViewDependency.getViewDependency(text)
          except:
            self.logger.error("HiveViewDependency.getViewDependency(%s) failed!" % (table['name']))

          l = []
          for a in array:
            l.append(a)
            names = str(a).split('.')
            if names and len(names) >= 2:
              db_name = names[0].lower()
              table_name = names[1].lower()
              if db_name and table_name:
                self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                rows = self.curs.fetchall()
                self.conn_hms.commit()
                if rows and len(rows) > 0:
                  for row_index, row_value in enumerate(rows):
                    dependent_record = HiveDependencyInstanceRecord(
                                          one_db_info['type'],
                                          table['type'],
                                          "/%s/%s" % (one_db_info['database'], table['name']),
                                          'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                                          if one_db_info['type'].lower() == 'dalids'
                                          else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                                          'depends on',
                                          'Y',
                                          row_value[3],
                                          row_value[4],
                                          row_value[2],
                                          row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
                    dependency_file_writer.append(dependent_record)
          prop_json['view_depends_on'] = l
          dependency_file_writer.flush()

        # process either schema
        flds = {}
        field_detail_list = []

        if TableInfo.schema_literal in table and \
           table[TableInfo.schema_literal] is not None and \
           table[TableInfo.schema_literal].startswith('{'):
          sort_id = 0
          urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          self.logger.info("Getting schema literal for: %s" % (urn))
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
            schema_json = schema_data
            acp = AvroColumnParser(schema_data, urn = urn)
            result = acp.get_column_list_result()
            field_detail_list += result
          except ValueError:
            self.logger.error("Schema Literal JSON error for table: " + str(table))

        elif TableInfo.field_list in table:
          # Convert to avro
          uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          if one_db_info['type'].lower() == 'dalids':
            uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          else:
            uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          self.logger.info("Getting column definition for: %s" % (uri))
          try:
            hcp = HiveColumnParser(table, urn = uri)
            schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
            field_detail_list += hcp.column_type_list
          except:
            self.logger.error("HiveColumnParser(%s) failed!" % (uri))
            schema_json = {'fields' : {}, 'type' : 'record', 'name' : table['name'], 'uri' : uri}

        if one_db_info['type'].lower() == 'dalids':
          dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        else:
          dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])

        dataset_instance_record = DatasetInstanceRecord('dalids:///' + one_db_info['database'] + '/' + table['name']
                                                if one_db_info['type'].lower() == 'dalids'
                                                else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                                                'grid',
                                                '',
                                                '',
                                                '*',
                                                True,
                                                table['native_name'],
                                                table['logical_name'],
                                                table['version'],
                                                table['create_time'],
                                                json.dumps(schema_json),
                                                json.dumps(view_expanded_text),
                                                dataset_urn)
        instance_file_writer.append(dataset_instance_record)

        if dataset_urn not in self.dataset_dict:
          dataset_schema_record = DatasetSchemaRecord(table['dataset_name'], json.dumps(schema_json),
                                                      json.dumps(prop_json),
                                                      json.dumps(flds), dataset_urn, 'Hive', one_db_info['type'],
                                                      table['type'], '',
                                                      table.get(TableInfo.create_time),
                                                      int(table.get(TableInfo.source_modified_time, "0")))
          schema_file_writer.append(dataset_schema_record)

          dataset_idx += 1
          self.dataset_dict[dataset_urn] = dataset_idx

          for fields in field_detail_list:
            field_record = DatasetFieldRecord(fields)
            field_file_writer.append(field_record)

      instance_file_writer.flush()
      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    instance_file_writer.close()
    schema_file_writer.close()
    field_file_writer.close()
    dependency_file_writer.close()
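
One detail worth noting in the transform above: depends_sql is filled with str.format while the '{version}' placeholder must survive into the emitted dataset_name expression, so the call passes version='{version}' to re-insert the literal token. A tiny sketch of that pattern:

template = "concat(base_name, '_{version}') -- db = '{db_name}'"
filled = template.format(db_name='tracking', version='{version}')
# filled -> "concat(base_name, '_{version}') -- db = 'tracking'"
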
Example #31
0
  def transform(self, input, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}  # set the prop json

        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        if TableInfo.view_expended_text in prop_json:
          text = prop_json[TableInfo.view_expended_text].replace('`', '')
          array = HiveViewDependency.getViewDependency(text)
          l = []
          for a in array:
            l.append(a)
          prop_json['view_depends_on'] = l

        # process either schema
        flds = {}
        field_detail_list = []

        if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
          sort_id = 0
          urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
            schema_json = schema_data
            acp = AvroColumnParser(schema_data, urn = urn)
            result = acp.get_column_list_result()
            field_detail_list += result
          except ValueError:
            self.logger.error("Schema json error for table : \n" + str(table))
        elif TableInfo.field_list in table:
          # Convert to avro
          uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          hcp = HiveColumnParser(table, urn = uri)
          schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
          field_detail_list += hcp.column_type_list

        dataset_schema_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                    json.dumps(flds),
                                                    "hive:///%s/%s" % (one_db_info['database'], table['name']), 'Hive',
                                                    '',
                                                    table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None,
                                                    table["lastAlterTime"] if table.has_key("lastAlterTime") else None)
        schema_file_writer.append(dataset_schema_record)

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    schema_file_writer.close()
    field_file_writer.close()
Example #32
0
  def transform(self, input, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()
    depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}  # set the prop json

        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        if TableInfo.view_expended_text in prop_json:
          text = prop_json[TableInfo.view_expended_text].replace('`', '')
          array = HiveViewDependency.getViewDependency(text)
          l = []
          for a in array:
            l.append(a)
            names = str(a).split('.')
            if names and len(names) >= 2:
              db_name = names[0]
              table_name = names[1]
              if db_name and table_name:
                rows = []
                self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                rows = self.curs.fetchall()
                if rows and len(rows) > 0:
                  for row_index, row_value in enumerate(rows):
                    dependent_record = HiveDependencyInstanceRecord(
                                          one_db_info['type'],
                                          table['type'],
                                          "/%s/%s" % (one_db_info['database'], table['name']),
                                          'dalids:///' + one_db_info['database'] + '/' + table['name']
                                          if one_db_info['type'].lower() == 'dalids'
                                          else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                                          'depends on',
                                          'is used by',
                                          row_value[3],
                                          row_value[4],
                                          row_value[2],
                                          row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
                    self.instance_writer.append(dependent_record)
          prop_json['view_depends_on'] = l
          self.instance_writer.flush()

        # process either schema
        flds = {}
        field_detail_list = []

        if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
          sort_id = 0
          urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
            schema_json = schema_data
            acp = AvroColumnParser(schema_data, urn = urn)
            result = acp.get_column_list_result()
            field_detail_list += result
          except ValueError:
            self.logger.error("Schema json error for table : \n" + str(table))

        elif TableInfo.field_list in table:
          # Convert to avro
          uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          if one_db_info['type'].lower() == 'dalids':
            uri = "dalids:///%s/%s" % (one_db_info['database'], table['name'])
          else:
            uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          hcp = HiveColumnParser(table, urn = uri)
          schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
          field_detail_list += hcp.column_type_list

        if one_db_info['type'].lower() == 'dalids':
          dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['name'])
        else:
          dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        dataset_schema_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                    json.dumps(flds),
                                                    dataset_urn,
                                                    'Hive', one_db_info['type'], table['type'],
                                                    '',
                                                    table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None,
                                                    table["lastAlterTime"] if table.has_key("lastAlterTime") else None)
        schema_file_writer.append(dataset_schema_record)

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    schema_file_writer.close()
    field_file_writer.close()
Example #33
0
  def transform(self, input, td_metadata, td_field_metadata):
    '''
    convert from json to csv
    :param input: input json file
    :param td_metadata: output data file for teradata metadata
    :param td_field_metadata: output data file for teradata field metadata
    :return:
    '''
    f_json = open(input)
    data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(td_metadata)
    field_file_writer = FileWriter(td_field_metadata)

    for d in data:
      i = 0
      for k in d.keys():
        if k not in ['tables', 'views']:
          continue
        self.logger.info("%s %4d %s" % (datetime.datetime.now().strftime("%H:%M:%S"), len(d[k]), k))
        for t in d[k]:
          self.logger.info("%4d %s" % (i, t['name']))
          if t['name'] == 'HDFStoTD_2464_ERR_1':
            continue
          i += 1
          output = {}
          prop_json = {}
          output['name'] = t['name']
          output['original_name'] = t['original_name']

          prop_json["createTime"] = t["createTime"] if t.has_key("createTime") else None
          prop_json["lastAlterTime"] = t["lastAlterTime"] if t.has_key("lastAlterTime") else None
          prop_json["lastAccessTime"] = t["lastAccessTime"] if t.has_key("lastAccessTime") else None
          prop_json["accessCount"] = t["accessCount"] if t.has_key("accessCount") else None
          prop_json["sizeInMbytes"] = t["sizeInMbytes"] if t.has_key("sizeInMbytes") else None
          if "type" in t:
            prop_json["storage_type"] = t["type"]
          if "partition" in t:
            prop_json["partition"] = t["partition"]
          if "partitions" in t:
            prop_json["partitions"] = t["partitions"]
          if "hashKey" in t:
            prop_json["hashKey"] = t["hashKey"]
          if "indices" in t:
            prop_json["indices"] = t["indices"]
          if "referenceTables" in t:
            prop_json["referenceTables"] = t["referenceTables"]
          if "viewSqlText" in t:
            prop_json["viewSqlText"] = t["viewSqlText"]

          output['fields'] = []
          flds = {}
          field_detail_list = []
          sort_id = 0
          for c in t['columns']:
            # output['fields'].append(
            #                    { 'name' : t['name'].encode('latin-1'),
            #                      'type' : None if c['data_type'] is None else c['data_type'].encode('latin-1'),
            #                      'attributes_json' : c}
            #                output['fields'][c['name'].encode('latin-1')].append({ "doc" : "", "type" : [None if c['data_type'] is None else c['data_type'].encode('latin-1')]})
            sort_id += 1
            output['fields'].append({"name": c['name'], "doc": '', "type": c['dataType'] if c['dataType'] else None,
                                     "nullable": c['nullable'], "maxByteLength": c['maxByteLength'],
                                     "format": c['columnFormat'] if c.has_key('columnFormat') else None,
                                     "accessCount": c['accessCount'] if c.has_key('accessCount') else None,
                                     "lastAccessTime": c['lastAccessTime'] if c.has_key("lastAccessTime") else None})

            flds[c['name']] = {'type': c['dataType'], "maxByteLength": c['maxByteLength']}

            field_detail_list.append(
              ["teradata:///%s/%s" % (d['database'], output['name']), str(sort_id), '0', '', c['name'], '',
               c['dataType'] if 'dataType' in c and c['dataType'] is not None else '',
               str(c['maxByteLength']) if 'maxByteLength' in c else '0',
               str(c['precision']) if 'precision' in c and c['precision'] is not None else '',
               str(c['scale']) if 'scale' in c and c['scale'] is not None else '',
               c['nullable'] if 'nullable' in c and c['nullable'] is not None else 'Y', '', '', '', '', '', '', ''])

          dataset_schema_record = DatasetSchemaRecord(output['name'], json.dumps(output), json.dumps(prop_json),
                                                      json.dumps(flds),
                                                      "teradata:///%s/%s" % (d['database'], output['name']), 'Teradata',
                                                      output['original_name'],
                                                      (self.convert_timestamp(t["createTime"]) if t.has_key("createTime") else None),
                                                      (self.convert_timestamp(t["lastAlterTime"]) if t.has_key("lastAlterTime") else None))
          schema_file_writer.append(dataset_schema_record)

          for fields in field_detail_list:
            field_record = DatasetFieldRecord(fields)
            field_file_writer.append(field_record)

        schema_file_writer.flush()
        field_file_writer.flush()
        self.logger.info("%20s contains %6d %s" % (d['database'], i, k))

    schema_file_writer.close()
    field_file_writer.close()
Example #34
0
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    query = """
            SELECT a.*, b.created_time FROM
              (SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time
              from WF_JOBS w LEFT JOIN WF_JOBS s
              ON w.app_path = s.app_path AND w.created_time < s.created_time
              WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a
              JOIN
              (SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b
              ON a.app_path = b.app_path
            """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      flow_record = OozieFlowRecord(self.app_id,
                                    row['app_name'],
                                    row['app_path'],
                                    0,
                                    row['source_version'],
                                    row['created_time'],
                                    row['last_modified_time'],
                                    self.wh_exec_id)
      flow_writer.append(flow_record)
      query = """
              select name, type, transition from WF_ACTIONS
              where wf_id = '{source_version}'
              """.format(source_version=row['source_version'])
      new_oz_cursor = self.oz_con.cursor()
      new_oz_cursor.execute(query)
      nodes = DbUtil.dict_cursor(new_oz_cursor)

      for node in nodes:
        job_record = OozieJobRecord(self.app_id,
                                    row['app_path'],
                                    row['source_version'],
                                    node['name'],
                                    row['app_path'] + "/" + node['name'],
                                    node['type'],
                                    self.wh_exec_id)
        job_writer.append(job_record)

        if node['transition'] != "*" and node['transition'] is not None:
          dag_edge = OozieFlowDagRecord(self.app_id,
                                        row['app_path'],
                                        row['source_version'],
                                        row['app_path'] + "/" + node['name'],
                                        row['app_path'] + "/" + node['transition'],
                                        self.wh_exec_id)
          dag_writer.append(dag_edge)
      new_oz_cursor.close()

    dag_writer.close()
    job_writer.close()
    flow_writer.close()
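
In the Oozie collector above, each WF_ACTIONS row becomes a job record and its 'transition' column, when present and not '*', becomes a directed edge to the next node. A small sketch with hypothetical action rows:

app_path = '/user/etl/workflows/daily_load'                 # hypothetical workflow path
nodes = [{'name': 'extract', 'type': 'shell', 'transition': 'load'},
         {'name': 'load',    'type': 'hive',  'transition': 'end'},
         {'name': 'end',     'type': 'end',   'transition': None}]

edges = [(app_path + '/' + n['name'], app_path + '/' + n['transition'])
         for n in nodes if n['transition'] is not None and n['transition'] != '*']
# edges -> [('.../daily_load/extract', '.../daily_load/load'), ('.../daily_load/load', '.../daily_load/end')]
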
Example #35
0
class HiveTransform:
  dataset_dict = {}
  def __init__(self):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
    username = args[Constant.HIVE_METASTORE_USERNAME]
    password = args[Constant.HIVE_METASTORE_PASSWORD]
    jdbc_driver = args[Constant.HIVE_METASTORE_JDBC_DRIVER]
    jdbc_url = args[Constant.HIVE_METASTORE_JDBC_URL]
    self.conn_hms = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)
    self.curs = self.conn_hms.cursor()
    dependency_instance_file = args[Constant.HIVE_DEPENDENCY_CSV_FILE_KEY]
    self.instance_writer = FileWriter(dependency_instance_file)

  def transform(self, input, hive_instance, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_instance: output data file for hive instance
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    dataset_idx = -1

    instance_file_writer = FileWriter(hive_instance)
    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()
    depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}  # set the prop json

        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        view_expanded_text = ''

        if TableInfo.view_expended_text in prop_json:
          view_expanded_text = prop_json[TableInfo.view_expended_text]
          text = prop_json[TableInfo.view_expended_text].replace('`', '')
          array = HiveViewDependency.getViewDependency(text)
          l = []
          for a in array:
            l.append(a)
            names = str(a).split('.')
            if names and len(names) >= 2:
              db_name = names[0].lower()
              table_name = names[1].lower()
              if db_name and table_name:
                rows = []
                self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                rows = self.curs.fetchall()
                if rows and len(rows) > 0:
                  for row_index, row_value in enumerate(rows):
                    dependent_record = HiveDependencyInstanceRecord(
                                          one_db_info['type'],
                                          table['type'],
                                          "/%s/%s" % (one_db_info['database'], table['name']),
                                          'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                                          if one_db_info['type'].lower() == 'dalids'
                                          else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                                          'depends on',
                                          'Y',
                                          row_value[3],
                                          row_value[4],
                                          row_value[2],
                                          row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
                    self.instance_writer.append(dependent_record)
          prop_json['view_depends_on'] = l
          self.instance_writer.flush()

        # process either schema
        flds = {}
        field_detail_list = []

        if TableInfo.schema_literal in table and \
           table[TableInfo.schema_literal] is not None and \
           table[TableInfo.schema_literal].startswith('{'):
          sort_id = 0
          urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          self.logger.info("Getting schema literal for: %s" % (urn))
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
            schema_json = schema_data
            acp = AvroColumnParser(schema_data, urn = urn)
            result = acp.get_column_list_result()
            field_detail_list += result
          except ValueError:
            self.logger.error("Schema Literal JSON error for table: " + str(table))

        elif TableInfo.field_list in table:
          # Convert to avro
          uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          if one_db_info['type'].lower() == 'dalids':
            uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          else:
            uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          self.logger.info("Getting column definition for: %s" % (uri))
          hcp = HiveColumnParser(table, urn = uri)
          schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
          field_detail_list += hcp.column_type_list

        if one_db_info['type'].lower() == 'dalids':
          dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        else:
          dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])

        dataset_instance_record = DatasetInstanceRecord('dalids:///' + one_db_info['database'] + '/' + table['name']
                                                if one_db_info['type'].lower() == 'dalids'
                                                else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                                                'grid',
                                                'eat1',
                                                'eat1-nertz',
                                                '*',
                                                0,
                                                table['native_name'],
                                                table['logical_name'],
                                                table['version'],
                                                table['create_time'],
                                                json.dumps(schema_json),
                                                json.dumps(view_expanded_text),
                                                dataset_urn)
        instance_file_writer.append(dataset_instance_record)

        if dataset_urn not in self.dataset_dict:
          dataset_schema_record = DatasetSchemaRecord(table['dataset_name'], json.dumps(schema_json), json.dumps(prop_json),
                                                      json.dumps(flds),
                                                      dataset_urn,
                                                      'Hive', one_db_info['type'], table['type'],
                                                      '',
                                                      table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None,
                                                      table["lastAlterTime"] if table.has_key("lastAlterTime") else None)
          schema_file_writer.append(dataset_schema_record)

          dataset_idx += 1
          self.dataset_dict[dataset_urn] = dataset_idx

          for fields in field_detail_list:
            field_record = DatasetFieldRecord(fields)
            field_file_writer.append(field_record)



      instance_file_writer.flush()
      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    instance_file_writer.close()
    schema_file_writer.close()
    field_file_writer.close()

  def convert_timestamp(self, time_string):
    return int(time.mktime(time.strptime(time_string, "%Y-%m-%d %H:%M:%S")))
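
convert_timestamp expects '%Y-%m-%d %H:%M:%S' strings and returns epoch seconds via the local timezone; a short usage sketch:

import time

def convert_timestamp(time_string):
  return int(time.mktime(time.strptime(time_string, "%Y-%m-%d %H:%M:%S")))

created = convert_timestamp("2016-03-01 12:30:00")
# -> 1456835400 when the process runs in UTC; mktime uses local time, so the value shifts with the timezone
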