Example #1
def populate_compounds(
    session: Session,
    properties: pd.DataFrame,
    cross_references: pd.DataFrame,
    batch_size: int,
) -> None:
    """
    Populate the compound and identifier tables using information from MetaNetX.

    Parameters
    ----------
    session : sqlalchemy.orm.Session
        An active database session.
    properties : pd.DataFrame
        A table of MetaNetX chemical properties.
    cross_references : pd.DataFrame
        A table of MetaNetX chemical cross-references.
    batch_size : int
        The number of rows to insert per batch.

    Warnings
    --------
    The function uses bulk inserts for performance and thus assumes empty
    tables. Do **not** use it for updating content.

    """
    prefix2registry = get_mnx_mapping(session)
    grouped_xref = cross_references[
        (cross_references["prefix"] != "metanetx.chemical")
        & cross_references["accession"].notnull()
    ].groupby("mnx_id", sort=False)
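    # First pass: create the compound rows themselves in batches.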
    with tqdm(total=len(properties), desc="Compounds") as pbar:
        for index in range(0, len(properties), batch_size):
            batch = properties[index:index + batch_size]
            compounds = [
                create_compound_object(row)
                for row in batch.itertuples(index=False)
            ]
            session.bulk_insert_mappings(Compound, compounds)
            session.commit()
            pbar.update(len(compounds))
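    # Second pass: attach cross-reference identifiers, batching over the
    # compounds just created.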
    with tqdm(total=len(properties), desc="Cross-References") as pbar:
        for index in range(0, len(properties), batch_size):
            identifiers = []
            counter = 0
            query = session.query(Compound.id, Compound.mnx_id)
            for row in query.slice(index, index + batch_size):
                try:
                    identifiers.extend(
                        create_compound_identifier_objects(
                            row.id,
                            grouped_xref.get_group(row.mnx_id),
                            prefix2registry,
                        )
                    )
                except KeyError:
                    logger.debug(
                        "Compound '%s' has no cross-references.", row.mnx_id
                    )
                counter += 1
            session.bulk_insert_mappings(CompoundIdentifier, identifiers)
            session.commit()
            pbar.update(counter)
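
For orientation, a minimal driver sketch for populate_compounds (not part of the original snippet): the engine URL, file names, and batch size are placeholders, and the ORM models plus helper functions are assumed importable from the host project. Per the docstring warning, the target tables must be empty.

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("sqlite:///compounds.db")  # placeholder URL
session = sessionmaker(bind=engine)()

# The frames are assumed to already carry the columns the function reads:
# 'mnx_id' in both, plus 'prefix' and 'accession' in the cross-references.
properties = pd.read_csv("chem_prop_processed.tsv", sep="\t")
cross_references = pd.read_csv("chem_xref_processed.tsv", sep="\t")

populate_compounds(session, properties, cross_references, batch_size=10_000)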
Example #2
    def _create_task_instances(
        self,
        dag_id: str,
        tasks: Iterable["Operator"],
        created_counts: Dict[str, int],
        hook_is_noop: bool,
        *,
        session: Session,
    ) -> None:
        """
        Create the necessary task instances from the given tasks.

        :param dag_id: DAG ID associated with the dagrun
        :param tasks: the tasks to create the task instances from
        :param created_counts: a mapping of task type to the number of task instances created for it
        :param hook_is_noop: whether the task_instance_mutation_hook is a no-op
        :param session: the session to use

        """
        try:
            if hook_is_noop:
                session.bulk_insert_mappings(TI, tasks)
            else:
                session.bulk_save_objects(tasks)

            for task_type, count in created_counts.items():
                Stats.incr(f"task_instance_created-{task_type}", count)
            session.flush()
        except IntegrityError:
            self.log.info(
                'Hit IntegrityError while creating the TIs for %s - %s',
                dag_id,
                self.run_id,
                exc_info=True,
            )
            self.log.info('Doing session rollback.')
            # TODO[HA]: We probably need to savepoint this so we can keep the transaction alive.
            session.rollback()
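
For background on the two branches above, a self-contained sketch contrasting the two SQLAlchemy bulk APIs; the User model and the in-memory SQLite database are illustrative assumptions, not Airflow code.

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class User(Base):
    __tablename__ = "users"
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

# bulk_insert_mappings() takes plain dicts and skips ORM instrumentation,
# which is why the no-op-hook branch can use it.
session.bulk_insert_mappings(User, [{"name": "a"}, {"name": "b"}])

# bulk_save_objects() takes ORM instances, so a per-object mutation hook
# can run before the save (the non-noop branch).
session.bulk_save_objects([User(name="c")])
session.commit()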
Example #3
    def verify_integrity(self, session: Session = NEW_SESSION):
        """
        Verify the DagRun by checking for removed tasks or tasks that are not yet
        in the database. Task instances are marked as removed or created as
        required.

        :param session: SQLAlchemy ORM Session
        :type session: Session
        """
        from airflow.settings import task_instance_mutation_hook

        dag = self.get_dag()
        tis = self.get_task_instances(session=session)

        # check for removed or restored tasks
        task_ids = set()
        for ti in tis:
            task_instance_mutation_hook(ti)
            task_ids.add(ti.task_id)
            task = None
            try:
                task = dag.get_task(ti.task_id)
            except AirflowException:
                if ti.state == State.REMOVED:
                    pass  # ti has already been removed, just ignore it
                elif self.state != State.RUNNING and not dag.partial:
                    self.log.warning(
                        "Failed to get task '%s' for dag '%s'. Marking it as removed.",
                        ti, dag)
                    Stats.incr(f"task_removed_from_dag.{dag.dag_id}", 1, 1)
                    ti.state = State.REMOVED

            should_restore_task = task is not None and ti.state == State.REMOVED
            if should_restore_task:
                self.log.info(
                    "Restoring task '%s' which was previously removed from DAG '%s'",
                    ti, dag)
                Stats.incr(f"task_restored_to_dag.{dag.dag_id}", 1, 1)
                ti.state = State.NONE
            session.merge(ti)

        def task_filter(task: "BaseOperator") -> bool:
            return task.task_id not in task_ids and (
                self.is_backfill or task.start_date <= self.execution_date)

        created_counts: Dict[str, int] = defaultdict(int)

        # The default (empty) hook in airflow.settings sets is_noop = True; a
        # replacement hook will not, so the attribute marks whether the hook
        # has been changed.
        hook_is_noop = getattr(task_instance_mutation_hook, 'is_noop', False)

        if hook_is_noop:

            def create_ti_mapping(task: "BaseOperator"):
                created_counts[task.task_type] += 1
                return TI.insert_mapping(self.run_id, task)

        else:

            def create_ti(task: "BaseOperator") -> TI:
                ti = TI(task, run_id=self.run_id)
                task_instance_mutation_hook(ti)
                created_counts[ti.operator] += 1
                return ti

        # Create missing tasks
        tasks = list(filter(task_filter, dag.task_dict.values()))
        try:
            if hook_is_noop:
                session.bulk_insert_mappings(TI, map(create_ti_mapping, tasks))
            else:
                session.bulk_save_objects(map(create_ti, tasks))

            for task_type, count in created_counts.items():
                Stats.incr(f"task_instance_created-{task_type}", count)
            session.flush()
        except IntegrityError as err:
            self.log.info(str(err))
            self.log.info(
                'Hit IntegrityError while creating the TIs for %s - %s',
                dag.dag_id, self.run_id)
            self.log.info('Doing session rollback.')
            # TODO[HA]: We probably need to savepoint this so we can keep the transaction alive.
            session.rollback()
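
The hook_is_noop flag checked above relies on a marker attribute: Airflow's default, empty task_instance_mutation_hook carries is_noop = True, while a user-supplied replacement normally does not. A small sketch of the pattern; all names below are hypothetical.

def default_hook(ti):
    """Does nothing, so callers may take the fast bulk-insert path."""

default_hook.is_noop = True  # marker set on the default hook

def user_hook(ti):
    ti.queue = "priority"  # a real hook mutates each task instance

hook = user_hook  # imagine settings swapped this in
if getattr(hook, "is_noop", False):
    print("fast path: bulk_insert_mappings with plain dicts")
else:
    print("per-object path: run the hook, then bulk_save_objects")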
Example #4
    def verify_integrity(self, session: Session = NEW_SESSION):
        """
        Verify the DagRun by checking for removed tasks or tasks that are not yet
        in the database. Task instances are marked as removed or created as
        required.

        :param session: SQLAlchemy ORM Session
        """
        from airflow.settings import task_instance_mutation_hook

        dag = self.get_dag()
        tis = self.get_task_instances(session=session)

        # check for removed or restored tasks
        task_ids = set()
        for ti in tis:
            task_instance_mutation_hook(ti)
            task_ids.add(ti.task_id)
            task = None
            try:
                task = dag.get_task(ti.task_id)

                should_restore_task = task is not None and ti.state == State.REMOVED
                if should_restore_task:
                    self.log.info(
                        "Restoring task '%s' which was previously removed from DAG '%s'",
                        ti, dag)
                    Stats.incr(f"task_restored_to_dag.{dag.dag_id}", 1, 1)
                    ti.state = State.NONE
            except AirflowException:
                if ti.state == State.REMOVED:
                    pass  # ti has already been removed, just ignore it
                elif self.state != State.RUNNING and not dag.partial:
                    self.log.warning(
                        "Failed to get task '%s' for dag '%s'. Marking it as removed.",
                        ti, dag)
                    Stats.incr(f"task_removed_from_dag.{dag.dag_id}", 1, 1)
                    ti.state = State.REMOVED
                continue

            if not task.is_mapped:
                continue
            task = cast("MappedOperator", task)
            num_mapped_tis = task.parse_time_mapped_ti_count
            # Check if the number of mapped literals has changed and we need to mark this TI as removed
            if num_mapped_tis is not None:
                if ti.map_index >= num_mapped_tis:
                    self.log.debug(
                        "Removing task '%s' as the map_index is longer than the literal mapping list (%s)",
                        ti,
                        num_mapped_tis,
                    )
                    ti.state = State.REMOVED
                elif ti.map_index < 0:
                    self.log.debug(
                        "Removing the unmapped TI '%s' as the mapping can now be performed",
                        ti)
                    ti.state = State.REMOVED
                else:
                    self.log.info("Restoring mapped task '%s'", ti)
                    Stats.incr(f"task_restored_to_dag.{dag.dag_id}", 1, 1)
                    ti.state = State.NONE
            else:
                # What if it is _now_ dynamically mapped, but wasn't before?
                total_length = task.run_time_mapped_ti_count(
                    self.run_id, session=session)

                if total_length is None:
                    # Not all upstreams finished, so we can't tell what should be here. Remove everything.
                    if ti.map_index >= 0:
                        self.log.debug(
                            "Removing the unmapped TI '%s' as the mapping can't be resolved yet",
                            ti)
                        ti.state = State.REMOVED
                    continue
                # Upstreams finished, check there aren't any extras
                if ti.map_index >= total_length:
                    self.log.debug(
                        "Removing task '%s' as the map_index is longer than the resolved mapping list (%d)",
                        ti,
                        total_length,
                    )
                    ti.state = State.REMOVED

        def task_filter(task: "Operator") -> bool:
            return task.task_id not in task_ids and (
                self.is_backfill
                or task.start_date <= self.execution_date
                and (task.end_date is None or self.execution_date <= task.end_date)
            )

        created_counts: Dict[str, int] = defaultdict(int)

        # The default (empty) hook in airflow.settings sets is_noop = True; a
        # replacement hook will not, so the attribute marks whether the hook
        # has been changed.
        hook_is_noop = getattr(task_instance_mutation_hook, 'is_noop', False)

        if hook_is_noop:

            def create_ti_mapping(
                task: "Operator", indexes: Tuple[int, ...]
            ) -> Generator:
                created_counts[task.task_type] += 1
                for map_index in indexes:
                    yield TI.insert_mapping(self.run_id, task, map_index=map_index)

            creator = create_ti_mapping

        else:

            def create_ti(
                task: "Operator", indexes: Tuple[int, ...]
            ) -> Generator:
                for map_index in indexes:
                    ti = TI(task, run_id=self.run_id, map_index=map_index)
                    task_instance_mutation_hook(ti)
                    created_counts[ti.operator] += 1
                    yield ti

            creator = create_ti

        # Create missing tasks -- and expand any MappedOperator that _only_ have literals as input
        def expand_mapped_literals(
            task: "Operator",
        ) -> Tuple["Operator", Sequence[int]]:
            if not task.is_mapped:
                return (task, (-1,))
            task = cast("MappedOperator", task)
            count = task.parse_time_mapped_ti_count or task.run_time_mapped_ti_count(
                self.run_id, session=session)
            if not count:
                return (task, (-1,))
            return (task, range(count))

        tasks_and_map_idxs = map(expand_mapped_literals,
                                 filter(task_filter, dag.task_dict.values()))
        tasks = itertools.chain.from_iterable(
            itertools.starmap(creator, tasks_and_map_idxs))

        try:
            if hook_is_noop:
                session.bulk_insert_mappings(TI, tasks)
            else:
                session.bulk_save_objects(tasks)

            for task_type, count in created_counts.items():
                Stats.incr(f"task_instance_created-{task_type}", count)
            session.flush()
        except IntegrityError:
            self.log.info(
                'Hit IntegrityError while creating the TIs for %s - %s',
                dag.dag_id,
                self.run_id,
                exc_info=True,
            )
            self.log.info('Doing session rollback.')
            # TODO[HA]: We probably need to savepoint this so we can keep the transaction alive.
            session.rollback()
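
To make the expansion pipeline at the end easier to follow, a toy version with plain tuples standing in for operators and task instances; every name below is illustrative.

import itertools

def expand(task):
    # Unmapped tasks get the sentinel index -1; mapped ones get a range.
    name, count = task
    return (name, (-1,)) if count is None else (name, range(count))

def creator(name, indexes):
    for map_index in indexes:
        yield (name, map_index)  # stand-in for one TI per map index

tasks = [("plain", None), ("mapped", 3)]
tis = itertools.chain.from_iterable(
    itertools.starmap(creator, map(expand, tasks)))
print(list(tis))
# [('plain', -1), ('mapped', 0), ('mapped', 1), ('mapped', 2)]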