Ejemplo n.º 1
0
  def get_tasks(self, release_id):
    """Retrieve all task information for a release.

    Releases present in `self._releases` are served from the JSON-encoded
    `tasks` attribute of the cached entry. Any other release is looked up in
    the database of every configuration in `self._configs`.

    Args:
      release_id: str, unique release identifier

    Returns:
      A list of Task objects when the release is cached. Otherwise the result
      of `read_tasks` for the last configuration whose query returned rows,
      or an empty dict when no configuration has data for the release.
    """
    if release_id in self._releases:
      # Look the entry up by the actual id, not the literal string
      # 'release_id'.
      tasks = json.loads(self._releases[release_id].tasks)
      # A JSON object decodes to a dict, and iterating a dict yields only its
      # keys, so take the values explicitly. Any other decoded shape is
      # treated as a sequence of (key, value) pairs.
      if isinstance(tasks, dict):
        task_values = tasks.values()
      else:
        task_values = [value for _, value in tasks]
      return [Task(value) for value in task_values]

    dag_id, execution_date = release_id_parser(release_id)
    execution_date = time.mktime(execution_date.timetuple())
    # build SQL query; to_sql_tasks takes the DAG id as well as the timestamp
    task_query = to_sql_tasks(dag_id, execution_date)
    # get data from SQL
    task_objects = {}
    for config in self._configs:
      raw_task_data = config.db.query(task_query)  # it's a tuple, so no .append()
      if len(raw_task_data):
        # package the data into task objects with all necessary info; store
        # it in the variable that is actually returned
        task_objects = read_tasks(raw_task_data)

    return task_objects
Ejemplo n.º 2
0
  def get_logs(self, release_id, task_name, log_file='1.log'):
    """Gets the logs for a task from GCS.

    Each configuration in `self._configs` is tried in order. The first one
    whose database knows the DAG run and whose bucket yields a readable log
    file wins.

    Args:
      release_id: str
      task_name: str
      log_file: str

    Returns:
      The contents of the log file as returned by the GCS file's `read()`,
      or None when no configuration produced a readable log.
    """
    # !!! Note that because of overlapping datetimes and dag_ids, some logs WILL be broken.
    #     This cannot be helped, because of the way which Airflow defines a release !!!
    dag_id, execution_date = release_id_parser(release_id)
    # Derive both representations once and leave `execution_date` untouched.
    # Overwriting it with a string inside the loop made `.timetuple()` fail
    # on the next configuration after a failed GCS read.
    timestamp = float(time.mktime(execution_date.timetuple()))
    gcs_date = str(execution_date).replace(' ', 'T')  # put into same format as gcs bucket
    dag_run_query = to_sql_tasks(dag_id, timestamp)

    for config in self._configs:
      dag_run = config.db.query(dag_run_query)
      if not len(dag_run):
        continue
      logging.debug('Bucket name: ' + config.bucket_name)
      filename = os.path.join(os.path.sep, config.bucket_name, 'logs', dag_id, task_name, gcs_date, log_file)
      logging.info('Retrieving from GCS: ' + str(filename))
      try:
        gcs_file = gcs.open(filename)
        try:
          return gcs_file.read()
        finally:
          # Close the handle even when read() raises.
          gcs_file.close()
      except Exception:
        # Best effort: a missing or unreadable log in one bucket must not stop
        # the search, but the failure is recorded instead of silently dropped.
        logging.warning('Could not read %s from GCS; trying next configuration',
                        filename, exc_info=True)
        continue
    return None
Ejemplo n.º 3
0
  def get_task(self, task_name, release_id):
    """Look up a single task belonging to a release.

    Args:
      task_name: str
      release_id: str, unique release identifier

    Returns:
      A one-element list holding the cached task when the release is present
      in `self._releases`. Otherwise the result of `read_tasks` for the last
      configuration whose query returned rows, or an empty dict if none did.
    """
    if release_id in self._releases:
      cache_key = task_name + '@' + release_id
      return [self._tasks[cache_key]]

    dag_id, run_date = release_id_parser(release_id)
    query = to_sql_task(dag_id, task_name, time.mktime(run_date.timetuple()))

    found = {}
    for cfg in self._configs:
      rows = cfg.db.query(query)  # a tuple, hence reassignment rather than .append()
      if len(rows) == 0:
        continue
      # wrap the raw rows into task objects carrying all necessary info
      found = read_tasks(rows)
    return found
Ejemplo n.º 4
0
  def get_release(self, release_id):
    """Fetch the release identified by release_id from every configuration.

    Args:
      release_id: str, unique release identifier

    Returns:
      A dict merged from what `read_releases` returns for each configuration
      in `self._configs`; later configurations overwrite equal keys.
    """
    dag_id, execution_date = release_id_parser(release_id)
    query = to_sql_release(dag_id, execution_date)

    merged = {}
    for cfg in self._configs:
      # the query result is a tuple, so it is converted rather than appended to
      rows = cfg.db.query(query)
      releases = read_releases(rows, cfg.db)
      merged.update(releases)
    return merged
Ejemplo n.º 5
0
# Dump script: reads every DAG run from the Airflow database and builds
# `releases`, a dict mapping release_id to a JSON-style release record that
# embeds the records of its tasks.

# Connect to the Airflow database using the CLOUDSQL_* settings.
airflow_db = AirflowDB(host=CLOUDSQL_HOST,
                       user=CLOUDSQL_USER,
                       password=CLOUDSQL_PASSWORD,
                       db=CLOUDSQL_DB)

# Pull the (dag_id, execution_date) pair of every row in dag_run.
raw_release_data = airflow_db.query(
    'SELECT dag_id, execution_date FROM dag_run')
# NOTE(review): read_releases is used below as a dict of release objects
# (only .values() is read) -- confirm its key type against its definition.
release_objects = read_releases(raw_release_data, airflow_db)

releases = {}  # release_id -> release record with its tasks embedded
releases_read = 0  # number of releases processed so far
for release in release_objects.values():
    releases_read += 1
    tasks = []
    release_id = release.release_id
    dag_id, execution_date = release_id_parser(release_id)
    # Convert the parsed date to an integer epoch timestamp; time.mktime
    # interprets the time tuple as local time.
    execution_date = int(time.mktime(execution_date.timetuple()))
    for task in release.tasks:
        # One query per task named on the release.
        sql_query = to_sql_task(dag_id, task, execution_date)
        raw_task_data = airflow_db.query(sql_query)
        task_data = read_tasks(raw_task_data)
        # Only the first element is used; this raises if the query produced
        # nothing. `task` is rebound from the task name to its record here.
        task = task_data[0].to_json()
        # Coerce the timestamps to int.
        task['last_modified'] = int(task['last_modified'])
        task['started'] = int(task['started'])
        tasks.append(task)
    # `release` is rebound from the release object to its record, after
    # release_id and release.tasks have been read from the object.
    release = release.to_json()
    release['tasks'] = tasks
    release['started'] = int(release['started'])
    release['last_modified'] = int(release['last_modified'])
    releases[release_id] = release
    print 'Release "%s" downloaded' % release_id