Example 1
    async def __aiter__(self) -> AsyncGenerator[SamplePair, None]:
        """
        Asynchronously generate chunked sample pairs from all unordered
        2-combinations of the prepared input files.

        A pair is classified SAME_AUTHOR when both files map to the same
        author in ``self._input_files`` and DIFFERENT_AUTHORS otherwise.
        A ``PairBuildingProgressEvent`` is published for each generated pair
        before it is yielded.
        """
        await self._prepare()

        # Total number of unordered pairs: n * (n - 1) / 2.
        # Plain arithmetic instead of the previous factorial formula, which
        # raised ValueError (factorial of a negative number) when fewer than
        # two input files were present; this yields 0 in that case and the
        # loop below simply produces nothing.
        num_files = len(self._input_files)
        num_combinations = num_files * (num_files - 1) // 2
        pair_num = 0

        for f1, f2 in combinations(self._input_files.keys(), 2):
            f1_contents = await self.await_file(f1)
            f2_contents = await self.await_file(f2)

            cls = self.Class.SAME_AUTHOR if self._input_files[f1] == self._input_files[f2] \
                else self.Class.DIFFERENT_AUTHORS
            pair = SamplePairImpl(cls, self.chunk_tokenizer)
            await pair.chunk([f1_contents], [f2_contents])

            # group id is derived from the two (prefixed) file names
            group_id = PairBuildingProgressEvent.generate_group_id(
                ["a:" + f1] + ["b:" + f2])
            await EventBroadcaster().publish(
                "onPairGenerated",
                PairBuildingProgressEvent(group_id, pair_num, num_combinations,
                                          pair, [f1], [f2]), self.__class__)
            yield pair
            pair_num += 1
Example 2
    async def __aiter__(self) -> AsyncGenerator[SamplePair, None]:
        """
        Asynchronously generate chunked sample pairs from all unordered
        2-combinations of the texts in ``self._text_dict``.

        A pair is SAME_AUTHOR when both texts share the same author key,
        DIFFERENT_AUTHORS when the keys differ, and UNSPECIFIED when either
        author key is None.  A ``PairBuildingProgressEvent`` is published
        for each pair before it is yielded.
        """
        # Flatten the author -> texts mapping into (author, text, sha1-hex)
        # tuples; the SHA-1 digest serves as a stable text identifier for
        # the progress-event group id.
        texts = []
        for a in self._text_dict:
            texts.extend([(a, t, hashlib.sha1(t.encode('utf-8')).hexdigest())
                          for t in self._text_dict[a]])

        # Total number of unordered pairs: n * (n - 1) / 2.
        # Plain arithmetic instead of the previous factorial formula, which
        # raised ValueError for fewer than two texts; this yields 0 instead.
        num_combinations = len(texts) * (len(texts) - 1) // 2

        for pair_num, (t1, t2) in enumerate(combinations(texts, 2)):
            if t1[0] is None or t2[0] is None:
                cls = self.Class.UNSPECIFIED
            elif t1[0] == t2[0]:
                cls = self.Class.SAME_AUTHOR
            else:
                cls = self.Class.DIFFERENT_AUTHORS

            pair = SamplePairImpl(cls, self.chunk_tokenizer)
            await pair.chunk([t1[1]], [t2[1]])

            group_id = PairBuildingProgressEvent.generate_group_id(
                [t1[2], t2[2]])
            await EventBroadcaster().publish(
                "onPairGenerated",
                PairBuildingProgressEvent(group_id, pair_num + 1,
                                          num_combinations, pair, [t1[0]],
                                          [t2[0]]), self.__class__)
            yield pair
Example 3
    async def __aiter__(self) -> AsyncGenerator[SamplePair, None]:
        """
        Asynchronously generate one chunked sample pair per case directory
        of the corpus.

        Each valid case directory contains an ``unknown.txt`` and at least
        one ``known01.txt``; the unknown text forms side A and all
        ``known??.txt`` files form side B.  If a ``truth.txt`` file exists
        in the corpus root, it maps case names to Y/N and determines the
        pair class; cases without a truth entry are UNSPECIFIED.
        """
        # Parse the optional ground truth file: "<case> <Y|N>" per line.
        truth_path = os.path.join(self.corpus_path, "truth.txt")
        ground_truth = {}
        if os.path.isfile(truth_path):
            with open(truth_path,
                      "r",
                      encoding="utf-8",
                      errors="ignore") as truth_file:
                for raw_line in truth_file:
                    fields = [
                        field.strip().replace("\ufeff", "")
                        for field in re.split("[ \t]+", raw_line)
                    ]
                    # ignore malformed lines
                    if len(fields) != 2:
                        continue
                    ground_truth[fields[0]] = fields[1].upper() == "Y"

        pair_num = 0
        total_num_pairs = len(ground_truth)

        for case_dir in tqdm(glob(os.path.join(self.corpus_path, "*")),
                             desc='Text pairs'):
            # guard clauses: only process complete case directories
            if not os.path.isdir(case_dir):
                continue
            if not os.path.isfile(os.path.join(case_dir, "unknown.txt")):
                continue
            if not os.path.isfile(os.path.join(case_dir, "known01.txt")):
                continue

            case = os.path.basename(case_dir)

            file_name_a = os.path.join(self.corpus_path, case, "unknown.txt")
            chunks_a = [await self.await_file(file_name_a)]

            file_names_b = sorted(
                glob(os.path.join(self.corpus_path, case, "known??.txt")))
            chunks_b = [await self.await_file(known) for known in file_names_b]

            if case in ground_truth:
                cls = (self.Class.SAME_AUTHOR
                       if ground_truth[case] else self.Class.DIFFERENT_AUTHORS)
            else:
                cls = self.Class.UNSPECIFIED

            pair = SamplePairImpl(cls, self.chunk_tokenizer)
            await pair.chunk(chunks_a, chunks_b)
            group_id = PairBuildingProgressEvent.generate_group_id(
                [pair.pair_id])
            await EventBroadcaster().publish(
                "onPairGenerated",
                PairBuildingProgressEvent(group_id, pair_num, total_num_pairs,
                                          pair, [file_name_a], file_names_b),
                self.__class__)

            yield pair
            pair_num += 1
Example 4
    async def __aiter__(self) -> AsyncGenerator[SamplePair, None]:
        """
        Asynchronously generate chunked sample pairs from a JSONL corpus.

        The corpus directory must contain one pairs .jsonl file and may
        contain a second file ending in ``-truth.jsonl`` with one truth
        record per pair line.  Without a truth file, every pair is yielded
        with class UNSPECIFIED.

        :raise RuntimeError: if the corpus does not contain one or two
                             .jsonl files, or the second file does not end
                             in -truth.jsonl
        :raise ValueError: if a truth record's id does not match its pair's
        """
        input_files = glob(os.path.join(self.corpus_path, "*.jsonl"))
        if not 1 <= len(input_files) <= 2:
            raise RuntimeError(
                "Corpus must contain one or two .jsonl files, found {}".format(
                    len(input_files)))

        if len(input_files) > 1:
            # sort so the -truth.jsonl file comes second
            input_files = sorted(input_files,
                                 key=lambda x: x.endswith("-truth.jsonl"))
            if not input_files[1].endswith("-truth.jsonl"):
                raise RuntimeError(
                    "One of the input files must end with -truth.jsonl")

        truths = None
        if len(input_files) > 1:
            with open(input_files[1], "r") as truth_file:
                truths = truth_file.readlines()

        # BUGFIX: the previous version called iter(truths) and len(truths)
        # unconditionally, raising TypeError whenever no truth file was
        # present (truths is None).  Guard both and report None as the total
        # when the number of pairs is unknown.
        truth_it = iter(truths) if truths is not None else None
        num_pairs = len(truths) if truths is not None else None

        # "with" instead of try/finally guarantees the file is closed even
        # if the consumer abandons this generator.
        with open(input_files[0], "r") as pair_file:
            for pair_num, pair_line in enumerate(pair_file):
                truth_line = next(truth_it) if truth_it is not None else None

                pair_json = json.loads(pair_line)
                truth_json = json.loads(
                    truth_line) if truth_line is not None else None

                if truth_json and truth_json["id"] != pair_json["id"]:
                    raise ValueError(
                        "IDs of pair and truth must match, found: {} and {}".
                        format(truth_json["id"], pair_json["id"]))

                cls = self.Class.UNSPECIFIED
                if truth_json:
                    cls = self.Class.SAME_AUTHOR if truth_json[
                        "same"] else self.Class.DIFFERENT_AUTHORS

                pair = SamplePairImpl(cls, self.chunk_tokenizer)
                await pair.chunk([pair_json["pair"][0]],
                                 [pair_json["pair"][1]])
                # NOTE(review): unlike the other generators, this event is
                # built with only one file-name list — confirm against the
                # PairBuildingProgressEvent signature.
                await EventBroadcaster().publish(
                    "onPairGenerated",
                    PairBuildingProgressEvent(pair_json["id"], pair_num,
                                              num_pairs, pair,
                                              [pair_json["id"]]),
                    self.__class__)

                yield pair
Example 5
    async def __aiter__(self) -> AsyncGenerator[SamplePair, None]:
        """
        Asynchronously generate chunked sample pairs matching each input
        text of one author against all remaining texts of a (possibly
        identical) author.

        Pairs are SAME_AUTHOR when both sides belong to the same author and
        DIFFERENT_AUTHORS otherwise.  When side B degenerates to a single
        text, the unordered text pair is deduplicated so each such pair is
        compared only once.  The published progress events carry no total
        pair count (None).
        """
        await self._prepare()

        pair_num = 0
        # Unordered {f1, f2} pairs already compared.  A set of frozensets
        # gives O(1) membership tests instead of the previous O(n) list
        # scan (O(n^2) overall) while keeping identical semantics.
        single_file_sets = set()

        for a1, a2 in combinations_with_replacement(self._input_authors.keys(),
                                                    2):
            for f1 in self._input_authors[a1]:
                f2 = sorted([f for f in self._input_authors[a2] if f != f1])
                if not f2:
                    # skip if author has only one file
                    continue

                if len(f2) == 1:
                    fs = frozenset((f1, f2[0]))
                    if fs in single_file_sets:
                        # We already compared these two texts
                        continue
                    single_file_sets.add(fs)

                f1_contents = await self.await_file(f1)
                f2_contents = [await self.await_file(f) for f in f2]

                cls = self.Class.SAME_AUTHOR if a1 == a2 else self.Class.DIFFERENT_AUTHORS
                pair = SamplePairImpl(cls, self.chunk_tokenizer)
                await pair.chunk([f1_contents], f2_contents)

                group_id = PairBuildingProgressEvent.generate_group_id(
                    ["a:" + f1] + ["b:" + ",".join(f2)])
                await EventBroadcaster().publish(
                    "onPairGenerated",
                    PairBuildingProgressEvent(group_id, pair_num, None, pair,
                                              [f1], f2), self.__class__)

                yield pair
                pair_num += 1
Example 6
    async def __aiter__(self) -> AsyncGenerator[SamplePair, None]:
        """
        Asynchronously generate chunked sample pairs by randomly matching
        texts of one single-text class against texts of another.

        Corpus XML files are grouped by the class assigned through
        ``self._class_assigner`` (UNSPECIFIED texts are discarded).  For
        each unordered combination of classes that resolves to a member of
        ``self.PairClass``, up to ``self._samples`` texts per side are
        drawn at random without replacement, combined into one
        ``SamplePairImpl``, published as a ``PairBuildingProgressEvent``
        (with total pair count None) and yielded.
        """
        # class -> list of (file_path, parsed XML root) tuples
        texts_by_class = {}
        for ds in self._datasets:
            ds_path = os.path.join(self.corpus_path, ds)
            files = os.listdir(ds_path)

            for f in files:
                file_path = os.path.join(ds_path, f)

                # only regular .xml files are considered
                if not os.path.isfile(file_path) or not f.endswith(".xml"):
                    continue

                xml = etree.parse(file_path).getroot()
                cls = self._class_assigner(xml)
                if cls == self.SingleTextClass.UNSPECIFIED:
                    continue

                if cls not in texts_by_class:
                    texts_by_class[cls] = []
                texts_by_class[cls].append((file_path, xml))

        # compound classes to build
        processed_comp_classes = []

        pair_num = 0

        for cls1 in texts_by_class:
            num_texts1 = len(texts_by_class[cls1])

            for cls2 in texts_by_class:
                # Resolve the pair class by enum-name lookup.  Both name
                # orders are tried; if "<cls1>_<cls2>" exists its value is
                # kept only until "<cls2>_<cls1>" overwrites it, so the
                # reversed order wins when both exist.  If neither exists,
                # pair_class stays None and the combination is skipped.
                # NOTE(review): the immediate overwrite of the first lookup
                # looks suspicious — confirm the precedence is intentional.
                pair_class = None
                try:
                    pair_class = self.PairClass[str(cls1) + "_" + str(cls2)]
                    pair_class = self.PairClass[str(cls2) + "_" + str(cls1)]
                except KeyError:
                    if pair_class is None:
                        continue

                # skip unordered class combinations already processed
                # (comp_class is a set, so {A, B} == {B, A})
                comp_class = {cls1, cls2}
                if comp_class in processed_comp_classes:
                    continue
                processed_comp_classes.append(comp_class)

                num_texts2 = len(texts_by_class[cls2])

                # list to keep track of already drawn texts, so we don't use them again
                drawn_a = []
                drawn_b = []

                # number of already matched chunk / text pairs
                pair_counter = 0

                # final chunks of a pair
                chunks_a = []
                chunks_b = []

                # file names of drawn texts
                file_names_a = []
                file_names_b = []

                # skip if both classes have too few samples
                if num_texts1 + num_texts2 < self._samples or \
                   cls1 == cls2 and num_texts1 < self._samples // 4:
                    continue

                # Draw random texts from each side until self._samples pairs
                # are matched or either side runs out of unused texts.  The
                # chained comparison reads: len(drawn_b) < num_texts2 and
                # num_texts2 > 0.
                while pair_counter < self._samples and len(
                        drawn_a) < num_texts1 and len(
                            drawn_b) < num_texts2 > 0:
                    idx1 = random.randint(0, num_texts1 - 1)
                    idx2 = random.randint(0, num_texts2 - 1)
                    if cls1 == cls2:
                        # make sure the cut between both sets is always empty
                        # when comparing a class against itself
                        if idx1 == idx2:
                            continue
                        if idx1 in drawn_a or idx1 in drawn_b:
                            continue
                        if idx2 in drawn_a or idx2 in drawn_b:
                            continue

                    # never reuse a text on the same side (sampling without
                    # replacement per side)
                    if idx1 in drawn_a or idx2 in drawn_b:
                        continue

                    # extract the first <mainText> element of each drawn text
                    for e in texts_by_class[cls1][idx1][1]:
                        if e.tag == "mainText":
                            chunks_a.append(str(e.text))
                            file_names_a.append(texts_by_class[cls1][idx1][0])
                            break
                    drawn_a.append(idx1)
                    for e in texts_by_class[cls2][idx2][1]:
                        if e.tag == "mainText":
                            chunks_b.append(str(e.text))
                            file_names_b.append(texts_by_class[cls2][idx2][0])
                            break
                    drawn_b.append(idx2)

                    pair_counter += 1

                    # break earlier when we are comparing a class with itself, since
                    # we only need half the number of iterations
                    if cls1 == cls2 and len(drawn_a) >= num_texts1 // 2:
                        break

                    # generate more samples by random oversampling when one class has less
                    # than self._samples // 2 samples
                    #if pair_counter < self._samples // 2 and len(drawn_a) >= num_texts1:
                    #    drawn_a = []
                    #elif pair_counter < self._samples // 2 and len(drawn_b) >= num_texts2:
                    #    drawn_b = []

                # one pair aggregates all drawn chunks of this combination
                pair = SamplePairImpl(pair_class, self.chunk_tokenizer)
                await pair.chunk(chunks_a, chunks_b)
                group_id = PairBuildingProgressEvent.generate_group_id(
                    [pair.pair_id])
                await EventBroadcaster().publish(
                    "onPairGenerated",
                    PairBuildingProgressEvent(group_id, pair_num, None, pair,
                                              file_names_a, file_names_b),
                    self.__class__)
                pair_num += 1
                yield pair
Example 7
    async def __aiter__(self) -> AsyncGenerator[SamplePair, None]:
        """
        Asynchronously generate chunked sample pairs by randomly matching
        article texts from one news portal against texts from another.

        Articles are read from the corpus' XML files and grouped by the
        hostname of their <uri> element; portals with fewer than 50 texts
        are discarded.  For every ordered portal combination, up to
        ``self._samples`` texts per side are drawn at random (with random
        oversampling of a side that runs out of unused texts) and combined
        into one ``SamplePairImpl`` classified SAME_PORTAL or
        DIFFERENT_PORTALS.  Each pair is published as a
        ``PairBuildingProgressEvent`` (total pair count None) and yielded.
        """
        # portal hostname -> list of (file_path, main_text) tuples
        texts_by_portals = {}

        for ds in self._datasets:
            ds_path = os.path.join(self.corpus_path, ds)
            files = os.listdir(ds_path)

            for f in files:
                file_path = os.path.join(ds_path, f)

                # only regular .xml files are considered
                if not os.path.isfile(file_path) or not f.endswith(".xml"):
                    continue

                # extract portal hostname (from <uri>) and article text
                # (from <mainText>); stop scanning once both are found
                xml = etree.parse(file_path).getroot()
                portal = ""
                main_text = ""
                done = 0
                for e in xml:
                    if e.tag == "uri":
                        portal = urlparse(str(e.text)).hostname
                        done += 1
                    if e.tag == "mainText":
                        main_text = str(e.text)
                        done += 1
                    if done >= 2:
                        break

                # skip articles missing either the portal or the text
                if portal == "" or main_text == "":
                    continue

                if portal not in texts_by_portals:
                    texts_by_portals[portal] = []
                texts_by_portals[portal].append((file_path, main_text))

        # discard all portals with too few texts
        discard = []
        for p in texts_by_portals:
            if len(texts_by_portals[p]) < 50:
                discard.append(p)
        texts_by_portals = {
            k: v
            for (k, v) in texts_by_portals.items() if k not in discard
        }

        pair_num = 0

        for cls1 in texts_by_portals:
            num_texts1 = len(texts_by_portals[cls1])

            for cls2 in texts_by_portals:
                num_texts2 = len(texts_by_portals[cls2])

                # number of already matched chunk / text pairs
                pair_counter = 0

                # keep track of already drawn texts
                drawn_a = []
                drawn_b = []

                # final chunks of a pair
                chunks_a = []
                chunks_b = []

                # file names of drawn texts
                file_names_a = []
                file_names_b = []

                # draw random texts from each side until self._samples pairs
                # are matched or either side runs out of unused texts
                while pair_counter < self._samples and len(
                        drawn_a) < num_texts1 and len(drawn_b) < num_texts2:
                    idx1 = random.randint(0, num_texts1 - 1)
                    idx2 = random.randint(0, num_texts2 - 1)
                    if cls1 == cls2:
                        # make sure the cut between both sets is always empty
                        # when comparing a class against itself
                        if idx1 == idx2:
                            continue
                        if idx1 in drawn_a or idx1 in drawn_b:
                            continue
                        if idx2 in drawn_a or idx2 in drawn_b:
                            continue

                    # never reuse a text on the same side
                    if idx1 in drawn_a or idx2 in drawn_b:
                        continue

                    chunks_a.append(texts_by_portals[cls1][idx1][1])
                    chunks_b.append(texts_by_portals[cls2][idx2][1])
                    file_names_a.append(texts_by_portals[cls1][idx1][0])
                    file_names_b.append(texts_by_portals[cls2][idx2][0])
                    drawn_a.append(idx1)
                    drawn_b.append(idx2)

                    pair_counter += 1

                    # break earlier when we are comparing a class with itself, since
                    # we only need half the number of iterations
                    if cls1 == cls2 and len(drawn_a) >= num_texts1 // 2:
                        break

                    # generate more samples by random oversampling when one class has less
                    # than self._samples // 2 samples
                    if pair_counter < self._samples // 2 and len(
                            drawn_a) >= num_texts1:
                        drawn_a = []
                    elif pair_counter < self._samples // 2 and len(
                            drawn_b) >= num_texts2:
                        drawn_b = []

                pair_class = self.Class.DIFFERENT_PORTALS
                if cls1 == cls2:
                    pair_class = self.Class.SAME_PORTAL

                # one pair aggregates all drawn chunks of this combination
                pair = SamplePairImpl(pair_class, self.chunk_tokenizer)
                await pair.chunk(chunks_a, chunks_b)
                group_id = PairBuildingProgressEvent.generate_group_id(
                    [pair.pair_id])
                await EventBroadcaster().publish(
                    "onPairGenerated",
                    PairBuildingProgressEvent(group_id, pair_num, None, pair,
                                              file_names_a, file_names_b),
                    self.__class__)
                pair_num += 1
                yield pair