Example #1
def collect_entities_by_type(
    relation_types: Dictionary,
    entity_configs: Dict[str, EntitySchema],
    relation_configs: List[RelationSchema],
    edge_paths: List[str],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
    entity_min_count: int,
) -> Dict[str, Dictionary]:

    counters: Dict[str, Counter[str]] = {}
    for entity_name in entity_configs.keys():
        counters[entity_name] = Counter()

    print("Searching for the entities in the edge files...")
    for edgepath in edge_paths:
        with open(edgepath, "rt") as tf:
            for line_num, line in enumerate(tf, start=1):
                words = line.split()
                try:
                    lhs_word = words[lhs_col]
                    rhs_word = words[rhs_col]
                    rel_word = words[rel_col] if rel_col is not None else None
                except IndexError:
                    raise RuntimeError(
                        "Line %d of %s has only %d words" %
                        (line_num, edgepath, len(words))) from None

                if dynamic_relations or rel_col is None:
                    rel_id = 0
                else:
                    try:
                        rel_id = relation_types.get_id(rel_word)
                    except KeyError:
                        raise RuntimeError(
                            "Could not find relation type %r in config"
                            % rel_word) from None

                counters[relation_configs[rel_id].lhs][lhs_word] += 1
                counters[relation_configs[rel_id].rhs][rhs_word] += 1

    entities_by_type: Dict[str, Dictionary] = {}
    for entity_name, counter in counters.items():
        print("Entity type %s:" % entity_name)
        print("- Found %d entities" % len(counter))
        if entity_min_count > 0:
            print("- Removing the ones with fewer than %d occurrences..." %
                  entity_min_count)
            counter = Counter(
                {k: c
                 for k, c in counter.items() if c >= entity_min_count})
            print("- Left with %d entities" % len(counter))
        print("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)
        entities_by_type[entity_name] = Dictionary(
            names, num_parts=entity_configs[entity_name].num_partitions)

    return entities_by_type
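
A minimal, self-contained sketch (standalone toy data, not part of the
function above) of the filter-and-shuffle step: entities seen fewer than
entity_min_count times are dropped, and the survivors are shuffled before
offsets are assigned.

import random
from collections import Counter

counts = Counter({"a": 5, "b": 1, "c": 3})
entity_min_count = 2
names = [name for name, c in counts.items() if c >= entity_min_count]
random.shuffle(names)
print(names)  # e.g. ['c', 'a'] -- 'b' was filtered out as too rare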
Example #2
def generate_edge_path_files_fast(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    edgelist_reader: EdgelistReader,
) -> None:
    processed = 0
    skipped = 0

    log("Taking the fast train!")
    data = []
    for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
        if rel_word is None:
            rel_id = 0
        else:
            try:
                rel_id = relation_types.get_id(rel_word)
            except KeyError:
                # Ignore edges whose relation type is not known.
                skipped += 1
                continue

        lhs_type = relation_configs[rel_id].lhs
        rhs_type = relation_configs[rel_id].rhs

        try:
            _, lhs_offset = entities_by_type[lhs_type].get_partition(lhs_word)
            _, rhs_offset = entities_by_type[rhs_type].get_partition(rhs_word)
        except KeyError:
            # Ignore edges whose entities are not known.
            skipped += 1
            continue

        data.append((lhs_offset, rhs_offset, rel_id))

        processed += 1
        if processed % 100000 == 0:
            log(f"- Processed {processed} edges so far...")

    if not data:
        raise RuntimeError(f"No valid edges found in {edge_file_in}")
    lhs_offsets, rhs_offsets, rel_ids = zip(*data)
    edge_list = EdgeList(
        EntityList.from_tensor(torch.tensor(list(lhs_offsets), dtype=torch.long)),
        EntityList.from_tensor(torch.tensor(list(rhs_offsets), dtype=torch.long)),
        torch.tensor(list(rel_ids), dtype=torch.long),
    )
    edge_storage.save_edges(0, 0, edge_list)

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
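
The zip(*data) idiom above transposes the buffered (lhs, rhs, rel) rows into
three column tuples; a standalone illustration:

rows = [(0, 3, 1), (2, 5, 0)]
lhs, rhs, rel = zip(*rows)  # transpose rows into columns
print(lhs, rhs, rel)        # (0, 2) (3, 5) (1, 0)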
Example #3
def collect_entities_by_type(
    relation_types: Dictionary,
    entity_configs: Dict[str, EntitySchema],
    relation_configs: List[RelationSchema],
    edge_paths: List[Path],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
    entity_min_count: int,
) -> Dict[str, Dictionary]:

    counters: Dict[str, Counter[str]] = {}
    for entity_name in entity_configs.keys():
        counters[entity_name] = Counter()

    log("Searching for the entities in the edge files...")
    for edgepath in edge_paths:
        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edgepath):
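            # With dynamic relations, relation types are not enumerated in the
            # config, so all edges are counted under the entity types of the
            # first relation config.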
            if dynamic_relations or rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    raise RuntimeError(
                        f"Could not find relation type {rel_word!r} in config"
                    ) from None

            counters[relation_configs[rel_id].lhs][lhs_word] += 1
            counters[relation_configs[rel_id].rhs][rhs_word] += 1

    entities_by_type: Dict[str, Dictionary] = {}
    for entity_name, counter in counters.items():
        log(f"Entity type {entity_name}:")
        log(f"- Found {len(counter)} entities")
        if entity_min_count > 0:
            log(
                f"- Removing the ones with fewer than {entity_min_count} occurrences..."
            )
            counter = Counter(
                {k: c for k, c in counter.items() if c >= entity_min_count}
            )
            log(f"- Left with {len(counter)} entities")
        log("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)
        entities_by_type[entity_name] = Dictionary(
            names, num_parts=entity_configs[entity_name].num_partitions
        )

    return entities_by_type
Example #4
def generate_edge_path_files(
    edge_file_in: str,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:

    basename, _ = os.path.splitext(edge_file_in)
    edge_path_out = basename + '_partitioned'

    print("Preparing edge path %s, out of the edges found in %s" %
          (edge_path_out, edge_file_in))
    os.makedirs(edge_path_out, exist_ok=True)

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print("- Edges will be partitioned in %d x %d buckets." %
          (num_lhs_parts, num_rhs_parts))
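    # Each edge is appended to an in-memory bucket keyed by the partitions of
    # its two endpoints; every bucket is written to disk at the end.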

    buckets: DefaultDict[Tuple[int, int], List[Tuple[int, int, int]]] = \
        defaultdict(list)
    processed = 0
    skipped = 0

    with open(edge_file_in, "rt") as tf:
        for line in tf:
            words = line.split()
            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(words[rel_col])
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(words[lhs_col])
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(words[rhs_col])
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            buckets[lhs_part, rhs_part].append(
                (lhs_offset, rhs_offset, rel_id))

            processed += 1
            if processed % 100000 == 0:
                print("- Processed %d edges so far..." % processed)

    print("- Processed %d edges in total" % processed)
    if skipped > 0:
        print(
            "- Skipped %d edges because their relation type or entities were "
            "unknown (either not given in the config or filtered out as too "
            "rare)." % skipped)

    for i in range(num_lhs_parts):
        for j in range(num_rhs_parts):
            print("- Writing bucket (%d, %d), containing %d edges..." %
                  (i, j, len(buckets[i, j])))
            edges = np.asarray(buckets[i, j], dtype=np.int64).reshape(-1, 3)
            with h5py.File(
                    os.path.join(edge_path_out, "edges_%d_%d.h5" % (i, j)),
                    "w") as hf:
                hf.attrs["format_version"] = 1
                hf.create_dataset("lhs", data=edges[:, 0])
                hf.create_dataset("rhs", data=edges[:, 1])
                hf.create_dataset("rel", data=edges[:, 2])
Example #5
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
    n_flush_edges: int = 100000,
) -> None:
    log(
        f"Preparing edge path {edge_path_out}, "
        f"out of the edges found in {edge_file_in}"
    )
    edge_storage.prepare()

    num_lhs_parts = max(
        entities_by_type[rconfig.lhs].num_parts for rconfig in relation_configs
    )
    num_rhs_parts = max(
        entities_by_type[rconfig.rhs].num_parts for rconfig in relation_configs
    )

    log(f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets.")

    processed = 0
    skipped = 0
    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
        data: Dict[Tuple[int, int], List[Tuple[int, int, int]]] = {}
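        # Edges are buffered per bucket and flushed to its appender in chunks
        # of n_flush_edges, amortizing the cost of each storage append.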

        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
            if rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = entities_by_type[lhs_type].get_partition(
                    lhs_word
                )
                rhs_part, rhs_offset = entities_by_type[rhs_type].get_partition(
                    rhs_word
                )
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part)
                )
                data[lhs_part, rhs_part] = []

            part_data = data[lhs_part, rhs_part]
            part_data.append((lhs_offset, rhs_offset, rel_id))
            if len(part_data) > n_flush_edges:
                append_to_file(part_data, appenders[lhs_part, rhs_part])
                part_data.clear()

            processed += 1
            if processed % 100000 == 0:
                log(f"- Processed {processed} edges so far...")

        for (lhs_part, rhs_part), part_data in data.items():
            if len(part_data) > 0:
                append_to_file(part_data, appenders[lhs_part, rhs_part])
                part_data.clear()

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
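
A self-contained sketch of the same buffer-and-flush pattern, with plain text
files standing in for the edge appenders (file names and chunk size are
hypothetical):

from contextlib import ExitStack

rows = [("a", 1), ("b", 2), ("a", 3)]
flush_every = 2
with ExitStack() as stack:
    files = {}
    buffers = {}
    for key, value in rows:
        if key not in files:
            # Lazily open one appender per bucket; ExitStack closes them all.
            files[key] = stack.enter_context(open(f"bucket_{key}.txt", "a"))
            buffers[key] = []
        buffers[key].append(value)
        if len(buffers[key]) >= flush_every:
            files[key].writelines(f"{v}\n" for v in buffers[key])
            buffers[key].clear()
    # Flush whatever is left in each buffer before the files are closed.
    for key, buf in buffers.items():
        files[key].writelines(f"{v}\n" for v in buf)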
Example #6
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
) -> None:
    log(
        f"Preparing edge path {edge_path_out}, "
        f"out of the edges found in {edge_file_in}"
    )
    edge_storage.prepare()

    num_lhs_parts = max(
        entities_by_type[rconfig.lhs].num_parts for rconfig in relation_configs
    )
    num_rhs_parts = max(
        entities_by_type[rconfig.rhs].num_parts for rconfig in relation_configs
    )

    if not dynamic_relations and num_lhs_parts == 1 and num_rhs_parts == 1:
        log("Using the fast version")
        return generate_edge_path_files_fast(
            edge_file_in,
            edge_path_out,
            edge_storage,
            entities_by_type,
            relation_types,
            relation_configs,
            edgelist_reader,
        )

    log(f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets.")

    processed = 0
    skipped = 0

    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
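        # Slow path: each edge is appended to its bucket one at a time as a
        # one-row EdgeList, with no in-memory buffering.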
        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
            if rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = entities_by_type[lhs_type].get_partition(
                    lhs_word
                )
                rhs_part, rhs_offset = entities_by_type[rhs_type].get_partition(
                    rhs_word
                )
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part)
                )
            appenders[lhs_part, rhs_part].append_edges(
                EdgeList(
                    EntityList.from_tensor(
                        torch.tensor([lhs_offset], dtype=torch.long)
                    ),
                    EntityList.from_tensor(
                        torch.tensor([rhs_offset], dtype=torch.long)
                    ),
                    torch.tensor([rel_id], dtype=torch.long),
                )
            )

            processed += 1
            if processed % 100000 == 0:
                log(f"- Processed {processed} edges so far...")

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
Example #7
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:
    print(f"Preparing edge path {edge_path_out}, "
          f"out of the edges found in {edge_file_in}")
    edge_storage.prepare()

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print(
        f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets."
    )

    processed = 0
    skipped = 0

    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with edge_file_in.open("rt") as tf, ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
        for line_num, line in enumerate(tf, start=1):
            words = line.split()
            try:
                lhs_word = words[lhs_col]
                rhs_word = words[rhs_col]
                rel_word = words[rel_col] if rel_col is not None else None
            except IndexError:
                raise RuntimeError(
                    f"Line {line_num} of {edge_file_in} has only {len(words)} words"
                ) from None

            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(lhs_word)
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(rhs_word)
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part))
            appenders[lhs_part, rhs_part].append_edges(
                EdgeList(
                    EntityList.from_tensor(
                        torch.tensor([lhs_offset], dtype=torch.long)),
                    EntityList.from_tensor(
                        torch.tensor([rhs_offset], dtype=torch.long)),
                    torch.tensor([rel_id], dtype=torch.long),
                ))

            processed += 1
            if processed % 100000 == 0:
                print(f"- Processed {processed} edges so far...")

    print(f"- Processed {processed} edges in total")
    if skipped > 0:
        print(f"- Skipped {skipped} edges because their relation type or "
              f"entities were unknown (either not given in the config or "
              f"filtered out as too rare).")
Example #8
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:
    print(f"Preparing edge path {edge_path_out}, "
          f"out of the edges found in {edge_file_in}")
    edge_storage.prepare()

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print(
        f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets."
    )

    buckets: DefaultDict[Tuple[int, int], List[Tuple[int, int, int]]] = \
        defaultdict(list)
    processed = 0
    skipped = 0

    with edge_file_in.open("rt") as tf:
        for line_num, line in enumerate(tf, start=1):
            words = line.split()
            try:
                lhs_word = words[lhs_col]
                rhs_word = words[rhs_col]
                rel_word = words[rel_col] if rel_col is not None else None
            except IndexError:
                raise RuntimeError(
                    f"Line {line_num} of {edge_file_in} has only {len(words)} words"
                ) from None

            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(lhs_word)
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(rhs_word)
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            buckets[lhs_part, rhs_part].append(
                (lhs_offset, rhs_offset, rel_id))

            processed += 1
            if processed % 100000 == 0:
                print(f"- Processed {processed} edges so far...")

    print(f"- Processed {processed} edges in total")
    if skipped > 0:
        print(f"- Skipped {skipped} edges because their relation type or "
              f"entities were unknown (either not given in the config or "
              f"filtered out as too rare).")

    for i in range(num_lhs_parts):
        for j in range(num_rhs_parts):
            print(f"- Writing bucket ({i}, {j}), "
                  f"containing {len(buckets[i, j])} edges...")
            edges = torch.tensor(buckets[i, j], dtype=torch.long).view((-1, 3))
            edge_storage.save_edges(
                i, j,
                EdgeList(
                    EntityList.from_tensor(edges[:, 0]),
                    EntityList.from_tensor(edges[:, 1]),
                    edges[:, 2],
                ))
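
All of these functions rely on Dictionary.get_partition mapping an entity
name to a (partition, offset) pair. A minimal standalone sketch of that idea
(a hypothetical toy class, not the Dictionary type used above):

class TinyDictionary:
    def __init__(self, names, num_parts=1):
        self.num_parts = num_parts
        # Split the name list into num_parts contiguous chunks.
        self.part_size = (len(names) + num_parts - 1) // num_parts
        self.index = {name: i for i, name in enumerate(names)}

    def get_partition(self, name):
        i = self.index[name]  # raises KeyError for unknown entities
        return i // self.part_size, i % self.part_size

d = TinyDictionary(["a", "b", "c", "d"], num_parts=2)
print(d.get_partition("c"))  # (1, 0)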