def _match_source_to_existing_locations(
    source: location.NormalizedLocation,
    existing: rtree.index.Index,
    candidate_distance: float,
    enable_match: bool = True,
    enable_create: bool = False,
) -> Optional[load.ImportMatchAction]:
    """Attempt to match source location to existing locations"""
    if not source.location:
        return None

    candidates = list(_find_candidates(source, existing, candidate_distance))

    if not candidates:
        logger.info("%s is a new location - nothing close", source.name)
        if enable_create:
            return load.ImportMatchAction(action="new")
        else:
            return None

    # Filter out candidates that are too different to be a match
    candidates = [loc for loc in candidates if not _is_different(source, loc)]

    if not candidates:
        logger.info("%s is a new location", source.name)
        if enable_create:
            return load.ImportMatchAction(action="new")
        else:
            return None

    # Filter to candidates that are similar enough to be the same
    candidates = [loc for loc in candidates if _is_match(source, loc)]

    # If there is one remaining high-confidence match, use it.
    if len(candidates) == 1:
        logger.info("%s is an existing location", source.name)
        if enable_match:
            return load.ImportMatchAction(
                action="existing",
                id=candidates[0]["properties"]["id"],
            )
        else:
            return None

    logger.info("%d matches, not sure about %s", len(candidates), source.name)
    return None
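A minimal sketch of how this matcher might be driven. The index construction below is an assumption, not shown in the snippet: it supposes the rtree index stores GeoJSON-style feature dicts via obj=, which is what the candidates[0]["properties"]["id"] lookup above implies.

# Hypothetical driver; build_existing_index, existing_features, and
# source_location are illustrative names, not part of the snippet above.
import rtree.index

existing_features = [  # tiny illustrative fixture
    {"geometry": {"coordinates": [-122.4, 37.8]},
     "properties": {"id": "vial-1", "name": "Example Clinic"}},
]

def build_existing_index(existing_features):
    """Index existing feature dicts by their (lng, lat) point."""
    idx = rtree.index.Index()
    for i, feature in enumerate(existing_features):
        lng, lat = feature["geometry"]["coordinates"]
        idx.insert(i, (lng, lat, lng, lat), obj=feature)
    return idx

idx = build_existing_index(existing_features)
action = _match_source_to_existing_locations(
    source_location, idx, candidate_distance=0.6
)

A second version of the same function follows, with per-decision logging (NEW/MATCH/AMBIGUOUS/MISSING) and the source id included in every message.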
def _match_source_to_existing_locations(
    source: location.NormalizedLocation,
    existing: rtree.index.Index,
    candidate_distance: float,
    enable_match: bool = True,
    enable_create: bool = False,
) -> Optional[load.ImportMatchAction]:
    """Attempt to match source location to existing locations"""
    if not source.location:
        return None

    nearby_candidates = list(
        _find_candidates(source, existing, candidate_distance))

    if not nearby_candidates:
        logger.info("NEW: %s (%s): No candidates nearby", source.id,
                    source.name)
        if enable_create:
            return load.ImportMatchAction(action="new")
        else:
            return None

    # Filter out candidates that are too different to be a match
    matchable_candidates = [
        loc for loc in nearby_candidates if not _is_different(source, loc)
    ]

    if not matchable_candidates:
        logger.info(
            "NEW: %s (%s): %d nearby candidates were not matchable",
            source.id,
            source.name,
            len(nearby_candidates),
        )
        if enable_create:
            return load.ImportMatchAction(action="new")
        else:
            return None

    # Filter to candidates that are similar enough to be the same
    candidates = [
        loc for loc in matchable_candidates if _is_match(source, loc)
    ]

    # If there is one remaining high-confidence match, use it.
    if len(candidates) == 1:
        match_candidate = candidates[0]

        logger.info(
            "MATCH: %s (%s) matched to %s (%s)",
            source.id,
            source.name,
            match_candidate["properties"]["id"],
            match_candidate["properties"]["name"],
        )
        if enable_match:
            return load.ImportMatchAction(
                action="existing",
                id=match_candidate["properties"]["id"],
            )
        else:
            return None

    if len(candidates) > 1:
        logger.info(
            "AMBIGUOUS: %s (%s) has %d matches e.g. %s (%s), %s (%s)",
            source.id,
            source.name,
            len(candidates),
            candidates[0]["properties"]["id"],
            candidates[0]["properties"]["name"],
            candidates[1]["properties"]["id"],
            candidates[1]["properties"]["name"],
        )
    else:
        logger.info("MISSING: %s (%s) has no matching candidates", source.id,
                    source.name)

    return None
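Neither version shows _find_candidates. A plausible sketch, assuming candidate_distance is a bounding-box half-width in coordinate degrees and the index was populated with obj= feature dicts as above:

def _find_candidates(source, existing, candidate_distance):
    """Yield stored feature dicts within a square box around the source point.

    Sketch only: the real helper may use a proper geodesic distance
    instead of this degree-based bounding box.
    """
    lng = source.location.longitude
    lat = source.location.latitude
    bounds = (
        lng - candidate_distance,
        lat - candidate_distance,
        lng + candidate_distance,
        lat + candidate_distance,
    )
    for item in existing.intersection(bounds, objects=True):
        yield item.object

The loader below drives this matcher over the output of the enrich stage.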
def run_load_to_vial(
    site_dir: pathlib.Path,
    output_dir: pathlib.Path,
    dry_run: bool,
    vial_http: urllib3.connectionpool.ConnectionPool,
    import_run_id: str,
    locations: Optional[rtree.index.Index],
    matched_ids: Optional[Collection[str]],
    enable_match: bool = True,
    enable_create: bool = False,
    enable_rematch: bool = False,
    match_ids: Optional[Dict[str, str]] = None,
    create_ids: Optional[Collection[str]] = None,
    candidate_distance: float = 0.6,
    import_batch_size: int = 500,
) -> Optional[List[load.ImportSourceLocation]]:
    """Load source to vial source locations"""
    ennrich_run_dir = outputs.find_latest_run_dir(output_dir,
                                                  site_dir.parent.name,
                                                  site_dir.name,
                                                  PipelineStage.ENRICH)
    if not ennrich_run_dir:
        logger.warning(
            "Skipping load for %s because there is no data from enrich stage",
            site_dir.name,
        )
        return None

    if not outputs.data_exists(
            enrich_run_dir, suffix=STAGE_OUTPUT_SUFFIX[PipelineStage.ENRICH]):
        logger.warning("No enriched data available to load for %s.",
                       site_dir.name)
        return None

    num_imported_locations = 0
    num_new_locations = 0
    num_match_locations = 0
    num_already_matched_locations = 0

    # Guard the final return against a run with no enriched data files
    import_locations: List[load.ImportSourceLocation] = []
    for filepath in outputs.iter_data_paths(
            enrich_run_dir, suffix=STAGE_OUTPUT_SUFFIX[PipelineStage.ENRICH]):
        import_locations = []
        with filepath.open() as src_file:
            for line in src_file:
                try:
                    normalized_location = location.NormalizedLocation.parse_raw(
                        line)
                except pydantic.ValidationError as e:
                    logger.warning(
                        "Skipping source location because it is invalid: %s\n%s",
                        line,
                        str(e),
                    )
                    continue

                match_action = None
                if match_ids and normalized_location.id in match_ids:
                    match_action = load.ImportMatchAction(
                        action="existing",
                        id=match_ids[normalized_location.id],
                    )

                elif create_ids and normalized_location.id in create_ids:
                    match_action = load.ImportMatchAction(action="new")

                elif (enable_match or enable_create) and locations is not None:
                    # Match source locations if we are re-matching, or if we
                    # haven't matched this source location yet
                    if enable_rematch or (matched_ids
                                          and normalized_location.id
                                          not in matched_ids):
                        match_action = _match_source_to_existing_locations(
                            normalized_location,
                            locations,
                            candidate_distance,
                            enable_match=enable_match,
                            enable_create=enable_create,
                        )
                    else:
                        num_already_matched_locations += 1

                import_location = _create_import_location(
                    normalized_location, match_action=match_action)

                num_imported_locations += 1
                if match_action:
                    if match_action.action == "existing":
                        num_match_locations += 1
                    elif match_action.action == "new":
                        num_new_locations += 1

                import_locations.append(import_location)

        if not import_locations:
            logger.warning(
                "No locations to import in %s in %s",
                filepath.name,
                site_dir.name,
            )
            continue

        if not dry_run:
            try:
                vial.import_source_locations(
                    vial_http,
                    import_run_id,
                    import_locations,
                    import_batch_size=import_batch_size,
                )
            except HTTPError as e:
                logger.warning(
                    "Failed to import some source locations for %s in %s. Because this is action spans multiple remote calls, some locations may have been imported: %s",
                    filepath.name,
                    site_dir.name,
                    e,
                )

    num_unknown_locations = (num_imported_locations - num_new_locations -
                             num_match_locations -
                             num_already_matched_locations)

    if enable_rematch:
        logger.info(
            "Imported %d source locations for %s (%d new, %d matched, %d unknown)",
            num_imported_locations,
            site_dir.name,
            num_new_locations,
            num_match_locations,
            num_unknown_locations,
        )
    else:
        logger.info(
            "Imported %d source locations for %s "
            "(%d new, %d matched, %d unknown, %d had existing match)",
            num_imported_locations,
            site_dir.name,
            num_new_locations,
            num_match_locations,
            num_unknown_locations,
            num_already_matched_locations,
        )

    return import_locations
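A hedged invocation sketch in dry-run mode. The URL, directory layout, and run id are placeholders; connection_from_url is a real urllib3 helper that returns a ConnectionPool:

import pathlib
import urllib3

# Placeholder VIAL endpoint; real use would also need auth headers.
vial_http = urllib3.connectionpool.connection_from_url(
    "https://vial.example.org", maxsize=1
)

run_load_to_vial(
    site_dir=pathlib.Path("out/us/some_site"),  # placeholder paths
    output_dir=pathlib.Path("out"),
    dry_run=True,            # match and log without importing
    vial_http=vial_http,
    import_run_id="run-123",  # placeholder id
    locations=idx,            # rtree index of existing locations, as above
    matched_ids=set(),
)

The fragment below is a later revision of the loader's inner loop, rewritten as a generator so locations stream to the importer; it also skips source locations whose content hash is unchanged since the last import.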
Example #4
    def _process_locations(
        enrich_run_dir: pathlib.Path,
    ) -> Iterator[load.ImportSourceLocation]:
        nonlocal num_imported_locations
        nonlocal num_new_locations
        nonlocal num_match_locations
        nonlocal num_already_matched_locations
        nonlocal num_already_imported_locations

        for filepath in outputs.iter_data_paths(
            enrich_run_dir, suffix=STAGE_OUTPUT_SUFFIX[PipelineStage.ENRICH]
        ):
            with filepath.open(mode="rb") as src_file:
                for line in src_file:
                    try:
                        loc_dict = orjson.loads(line)
                    except json.JSONDecodeError as e:
                        logger.warning(
                            "Skipping source location because it is invalid json: %s\n%s",
                            line,
                            str(e),
                        )
                        continue

                    try:
                        normalized_location = location.NormalizedLocation.parse_obj(
                            loc_dict
                        )
                    except pydantic.ValidationError as e:
                        logger.warning(
                            "Skipping source location because it is invalid: %s\n%s",
                            line,
                            str(e),
                        )
                        continue

                    # Skip source locations that haven't changed since last load
                    source_summary = None
                    if source_summaries:
                        source_summary = source_summaries.get(normalized_location.id)

                        if (
                            not enable_reimport
                            and source_summary
                            and source_summary.content_hash
                        ):
                            incoming_hash = normalize.calculate_content_hash(
                                normalized_location
                            )

                            if incoming_hash == source_summary.content_hash:
                                num_already_imported_locations += 1
                                continue

                    match_action = None
                    if match_ids and normalized_location.id in match_ids:
                        match_action = load.ImportMatchAction(
                            action="existing",
                            id=match_ids[normalized_location.id],
                        )

                    elif create_ids and normalized_location.id in create_ids:
                        match_action = load.ImportMatchAction(action="new")

                    elif (enable_match or enable_create) and locations is not None:
                        # Match source locations if we are re-matching, or if we
                        # haven't matched this source location yet
                        if (
                            enable_rematch
                            or not source_summary
                            or not source_summary.matched
                        ):
                            match_action = _match_source_to_existing_locations(
                                normalized_location,
                                locations,
                                candidate_distance,
                                enable_match=enable_match,
                                enable_create=enable_create,
                            )
                        else:
                            num_already_matched_locations += 1

                    import_location = _create_import_location(
                        normalized_location, match_action=match_action
                    )

                    num_imported_locations += 1
                    if match_action:
                        if match_action.action == "existing":
                            num_match_locations += 1
                        elif match_action.action == "new":
                            num_new_locations += 1

                    yield import_location

                    if import_limit and num_imported_locations >= import_limit:
                        logger.info(
                            "Reached import limit of %d and starting load", import_limit
                        )
                        return
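Because _process_locations is a generator, the caller can stream it into fixed-size batches instead of materializing every location first. A minimal consumption sketch; the _batched helper is an assumption, not part of the fragment:

import itertools

def _batched(iterable, size):
    """Yield lists of up to `size` items from `iterable`."""
    it = iter(iterable)
    while batch := list(itertools.islice(it, size)):
        yield batch

for batch in _batched(_process_locations(enrich_run_dir), import_batch_size):
    vial.import_source_locations(vial_http, import_run_id, batch)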