Example #1
    def test_partial_load_invalid_end_index(self):
        user_byte_arr1 = bytearray([120, 3, 255, 0, 100])
        user_byte_arr2 = bytearray([110, 3, 255, 90])
        utt_byte_arr1 = bytearray([99, 44, 33])
        utt_byte_arr2 = bytearray([110, 200, 220, 28])

        corpus1 = Corpus(utterances=[
            Utterance(id=0,
                      text="hello world",
                      user=User(name="alice",
                                meta={'user_binary_data': user_byte_arr1}),
                      meta={'utt_binary_data': utt_byte_arr1}),
            Utterance(id=1,
                      text="my name is bob",
                      user=User(name="bob",
                                meta={'user_binary_data': user_byte_arr2}),
                      meta={'utt_binary_data': utt_byte_arr2}),
            Utterance(id=2, text="this is a test", user=User(name="charlie")),
        ])

        corpus1.dump('test_corpus', './')

        corpus2 = Corpus(filename="test_corpus", utterance_end_index=-1)

        self.assertEqual(len(list(corpus2.iter_utterances())), 0)
Example #2
    def test_partial_load_start_idx_specified_only(self):
        user_byte_arr1 = bytearray([120, 3, 255, 0, 100])
        user_byte_arr2 = bytearray([110, 3, 255, 90])
        utt_byte_arr1 = bytearray([99, 44, 33])
        utt_byte_arr2 = bytearray([110, 200, 220, 28])

        corpus1 = Corpus(utterances=[
            Utterance(id="0",
                      text="hello world",
                      user=User(name="alice",
                                meta={'user_binary_data': user_byte_arr1}),
                      meta={'utt_binary_data': utt_byte_arr1}),
            Utterance(id="1",
                      text="my name is bob",
                      user=User(name="bob",
                                meta={'user_binary_data': user_byte_arr2}),
                      meta={'utt_binary_data': utt_byte_arr2}),
            Utterance(id="2", text="this is a test", user=User(
                name="charlie")),
        ])

        corpus1.dump('test_corpus', './')

        corpus2 = Corpus(filename="test_corpus", utterance_start_index=1)

        self.assertEqual(len(list(corpus2.iter_utterances())), 2)
        self.assertEqual(corpus1.get_utterance("1"),
                         corpus2.get_utterance("1"))
        self.assertEqual(corpus1.get_utterance("2"),
                         corpus2.get_utterance("2"))
Example #3
def convert_df_to_corpus(df: DataFrame, id_col: str, text_col: str,
                         meta_cols: List[str]) -> Corpus:
    """ Helper function to convert data to Corpus format
     
    Arguments:
        df {DataFrame} -- Actual data, in a pandas Dataframe
        id_col {str} -- name of the column that corresponds to utterances ids 
        text_col {str} -- name of the column that stores texts of the utterances  
        meta_cols {List[str]} -- set of columns that stores relevant metadata 
    
    Returns:
        Corpus -- the converted corpus
    """

    # in this particular case, user, reply_to, and timestamp information is not applicable,
    # so we either create a placeholder entry or leave the field as None

    user = User(id="user")
    time = "NOT_RECORDED"

    utterance_list = []
    for index, row in tqdm(df.iterrows()):

        # extracting meta data
        metadata = {}
        for meta_col in meta_cols:
            metadata[meta_col] = row[meta_col]

        utterance_list.append(Utterance(id=str(row[id_col]), user=user,
                                        root=str(row[id_col]), reply_to=None,
                                        timestamp=time, text=row[text_col],
                                        meta=metadata))

    return Corpus(utterances=utterance_list)
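A minimal usage sketch, assuming pandas is available and convert_df_to_corpus is imported alongside ConvoKit; the column names post_id, body, and score are hypothetical:

import pandas as pd

df = pd.DataFrame({
    'post_id': ['a1', 'a2'],
    'body': ['first message', 'second message'],
    'score': [10, 3],
})
# each row becomes a standalone Utterance whose id (and root) is the stringified post_id
corpus = convert_df_to_corpus(df, id_col='post_id', text_col='body', meta_cols=['score'])
print(corpus.get_utterance('a1').meta['score'])  # 10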
Example #4
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    utterances = []
    unknown_len = set()
    complete_utterances = set()
    block_hashes_to_segments = {}
    block_hashes_to_utt_ids = {}
    for block_hash, block in accum.blocks.items():
        if block.user not in users:
            users[block.user] = User(id=block.user)
        segments = accum.segment_contiguous_blocks(block.reply_chain)
        for seg in segments[:-1]:
            sos = helpers.string_of_seg(seg)
            complete_utterances.add(sos)

        assert (block_hash == segments[-1][-1])
        if not accum.blocks[segments[-1][-1]].is_followed:
            complete_utterances.add(helpers.string_of_seg(segments[-1]))
        block_hashes_to_segments[block_hash] = segments

    for utt in iter(complete_utterances):
        block_hashes = utt.split(" ")
        belongs_to_segment = block_hashes_to_segments[block_hashes[0]]
        first_block = accum.blocks[block_hashes[0]]

        u_id = block_hashes[0]
        u_user = users[first_block.user]
        u_root = belongs_to_segment[0][0]
        u_replyto = _find_reply_to_from_segment(belongs_to_segment)
        u_timestamp = first_block.timestamp
        u_text = "\n".join([accum.blocks[h].text for h in block_hashes])
        u_meta = {}
        u_meta["constituent_blocks"] = block_hashes

        for each_hash in block_hashes:
            block_hashes_to_utt_ids[each_hash] = u_id

        this_utterance = Utterance(id=u_id,
                                   user=u_user,
                                   root=u_root,
                                   reply_to=u_replyto,
                                   timestamp=u_timestamp,
                                   text=u_text,
                                   meta=u_meta)
        # this_utterance.meta = u_meta

        utterances.append(this_utterance)

    corpus = Corpus(utterances=utterances)
    corpus.meta["reverse_block_index"] = block_hashes_to_utt_ids

    return corpus
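The reverse_block_index stored in the corpus metadata maps every constituent block hash back to the id of the utterance it was merged into. A small lookup sketch, assuming corpus is the value returned by convert_intermediate_to_corpus above:

reverse_index = corpus.meta["reverse_block_index"]
for block_hash, utt_id in reverse_index.items():
    utterance = corpus.get_utterance(utt_id)
    # every hash recorded in the reverse index appears among that utterance's blocks
    assert block_hash in utterance.meta["constituent_blocks"]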
Example #5
    def test_dump_and_load_with_binary(self):
        """
        Dump a corpus containing users with binary metadata and utterances with binary metadata
        Check that dumped corpus is successfully loaded with the same data
        """

        user_byte_arr1 = bytearray([120, 3, 255, 0, 100])
        user_byte_arr2 = bytearray([110, 3, 255, 90])
        utt_byte_arr1 = bytearray([99, 44, 33])
        utt_byte_arr2 = bytearray([110, 200, 220, 28])

        corpus1 = Corpus(utterances=[
            Utterance(id="0",
                      text="hello world",
                      user=User(name="alice",
                                meta={
                                    'user_binary_data': user_byte_arr1,
                                    'index': 99
                                }),
                      meta={'utt_binary_data': utt_byte_arr1}),
            Utterance(id="1",
                      text="my name is bob",
                      user=User(name="bob",
                                meta={'user_binary_data': user_byte_arr2}),
                      meta={'utt_binary_data': utt_byte_arr2}),
            Utterance(id="2", text="this is a test", user=User(
                name="charlie")),
        ])

        alice = corpus1.utterances["0"].user
        bob = corpus1.utterances["1"].user

        corpus1.dump('test_corpus', './')
        corpus2 = Corpus(filename="test_corpus")

        alice2 = corpus2.utterances["0"].user
        bob2 = corpus2.utterances["1"].user

        self.assertEqual(alice.meta, alice2.meta)
        self.assertEqual(corpus1.utterances["0"].meta,
                         corpus2.utterances["0"].meta)
        self.assertEqual(bob.meta, bob2.meta)
        self.assertEqual(corpus1.utterances["1"].meta,
                         corpus2.utterances["1"].meta)
Example #6
def reconstruct_corpus(dataset):
    # build a User for every distinct user id, skipping missing ids,
    # plus a fallback 'none' user for events without a recorded user id
    users = [utt['event_user_id'] for utt in dataset if utt['event_user_id'] is not None]
    users = np.unique(users)
    users_dict = {user: User(name=user) for user in users}
    users_dict['none'] = User(name='none')

    utterances = []

    for utt in tqdm(dataset):
        user_id = utt['event_user_id']
        user = users_dict[user_id] if user_id is not None else users_dict['none']
        utterances.append(
            Utterance(id=utt['revision_id'],
                      user=user,
                      # note: the comment column key carries a literal trailing '\n'
                      text=utt['event_comment\n']))

    corpus = Corpus(utterances=utterances)

    return corpus
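A minimal usage sketch with a hand-built toy dataset, including one event without a recorded user id to exercise the 'none' fallback; the dict keys, including the literal trailing '\n' in the comment key, follow exactly what the function reads:

toy_dataset = [
    {'revision_id': 'r1', 'event_user_id': 'alice', 'event_comment\n': 'created the page'},
    {'revision_id': 'r2', 'event_user_id': 'bob', 'event_comment\n': 'fixed a typo'},
    {'revision_id': 'r3', 'event_user_id': None, 'event_comment\n': 'bot cleanup'},
]
toy_corpus = reconstruct_corpus(toy_dataset)
print(len(list(toy_corpus.iter_utterances())))  # expected: 3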
Example #7
def rough_convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a rougher approximation of a Corpus from an Intermediate.
    Does not worry about reply_to structure, and instead sorts replies by the 
    chronological order in which utterances are posted to discussions.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    utterances = []
    unknown_len = set()
    complete_utterances = set()
    block_hashes_to_segments = {}
    block_hashes_to_utt_ids = {}
    for block_hash, block in accum.blocks.items():
        try:
            if block.user not in users:
                users[block.user] = User(id=block.user)
            segments = accum.segment_contiguous_blocks(block.reply_chain)
            assert (block_hash == segments[-1][-1])
            # any complete contiguous block is a complete utterance
            for seg in segments[:-1]:
                sos = helpers.string_of_seg(seg)
                complete_utterances.add(sos)
            if block.is_header or not accum.blocks[segments[-1][-1]].is_followed:
                complete_utterances.add(helpers.string_of_seg(segments[-1]))
            block_hashes_to_segments[block_hash] = segments
        except Exception as e:
            logging.debug(e, exc_info=True)
            logging.warning(
                'Issue with conversion to corpus; skipping adding block "%s..."',
                block.text[:32])

    children_of_root = {}

    for utt in iter(complete_utterances):
        block_hashes = utt.split(" ")
        belongs_to_segment = block_hashes_to_segments[block_hashes[0]]
        first_block = accum.blocks[block_hashes[0]]

        u_id = block_hashes[0]
        u_user = users[first_block.user]
        u_root = accum.find_ultimate_hash(first_block.root_hash)
        u_timestamp = first_block.timestamp
        u_text = "\n".join([accum.blocks[h].text for h in block_hashes])
        u_meta = {}
        u_meta["last_revision"] = first_block.revision_ids[
            -1] if first_block.revision_ids[-1] != "unknown" else 0

        this_utterance = Utterance(id=u_id,
                                   user=u_user,
                                   root=u_root,
                                   reply_to=None,
                                   timestamp=u_timestamp,
                                   text=u_text,
                                   meta=u_meta)

        if u_root in children_of_root:
            children_of_root[u_root].append(this_utterance)
        else:
            children_of_root[u_root] = [this_utterance]

    utterances = []
    for root, utt_list in children_of_root.items():
        if root is None:
            continue

        utt_list.sort(key=lambda x: x.timestamp)

        ind_of_root = 0
        try:
            while utt_list[ind_of_root].id != root:
                ind_of_root += 1
        except Exception as e:
            logging.debug(e, exc_info=True)
            logging.warning(
                'Skipping section in conversion to corpus: could not find section header for root %s',
                root)
            continue

        if ind_of_root > 0:
            utt_list.insert(0, utt_list.pop(ind_of_root))

        utterances.append(utt_list[0])
        added = set([utt_list[0].id])
        i, j = 0, 1
        while j < len(utt_list):
            if utt_list[j].id not in added:
                utt_list[j].reply_to = utt_list[i].id
                added.add(utt_list[j].id)
                utterances.append(utt_list[j])
                i = j
            j += 1

        # for i in range(1, len(utt_list)):
        #     if utt_list[i-1].id == utt_list[i].id:
        #         logging.warning("Skipping utterance in conversion to corpus: reply to self %s", utt_list[i].id)
        #     else:
        #         utt_list[i].reply_to = utt_list[i-1].id
        #         utterances.append(utt_list[i])

    corpus = Corpus(utterances=utterances)
    return corpus
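The while loop above threads each newly seen utterance under the previous distinct one, so a root followed by chronologically sorted replies becomes a simple chain. The same idea in isolation, on a hypothetical list of (id, timestamp) pairs with one duplicate id:

# toy illustration of the chronological reply-chaining step, independent of ConvoKit
items = [("root", 0), ("b", 5), ("b", 5), ("c", 9)]  # sorted by timestamp, root first
chained, added = [items[0]], {items[0][0]}
replies_to = {}
i, j = 0, 1
while j < len(items):
    if items[j][0] not in added:
        replies_to[items[j][0]] = items[i][0]  # j replies to the previous distinct item
        added.add(items[j][0])
        chained.append(items[j])
        i = j
    j += 1
print(replies_to)  # {'b': 'root', 'c': 'b'}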
Example #8
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    utterances = []
    unknown_len = set()
    complete_utterances = set()
    block_hashes_to_segments = {}
    block_hashes_to_utt_ids = {}
    for block_hash, block in accum.blocks.items():
        try:
            if block.user not in users:
                users[block.user] = User(id=block.user)
            segments = accum.segment_contiguous_blocks(block.reply_chain)
            assert (block_hash == segments[-1][-1])
            # any complete contiguous block is a complete utterance
            for seg in segments[:-1]:
                sos = helpers.string_of_seg(seg)
                complete_utterances.add(sos)
            if block.is_header or not accum.blocks[segments[-1][-1]].is_followed:
                complete_utterances.add(helpers.string_of_seg(segments[-1]))
            block_hashes_to_segments[block_hash] = segments
        except Exception as e:
            logging.debug(e, exc_info=True)
            logging.warning(
                'Issue with conversion to corpus; skipping adding block "%s..."',
                block.text[:32])

    for utt in iter(complete_utterances):
        block_hashes = utt.split(" ")
        belongs_to_segment = block_hashes_to_segments[block_hashes[0]]
        first_block = accum.blocks[block_hashes[0]]

        u_id = block_hashes[0]
        u_user = users[first_block.user]
        u_root = accum.find_ultimate_hash(first_block.root_hash)
        u_replyto = _find_reply_to_from_segment(belongs_to_segment)
        u_timestamp = first_block.timestamp
        u_text = "\n".join([accum.blocks[h].text for h in block_hashes])
        u_meta = {}
        u_meta["constituent_blocks"] = block_hashes
        u_meta["last_revision"] = first_block.revision_ids[
            -1] if first_block.revision_ids[-1] != "unknown" else 0
        for each_hash in block_hashes:
            block_hashes_to_utt_ids[each_hash] = u_id

        this_utterance = Utterance(id=u_id,
                                   user=u_user,
                                   root=u_root,
                                   reply_to=u_replyto,
                                   timestamp=u_timestamp,
                                   text=u_text,
                                   meta=u_meta)

        utterances.append(this_utterance)

    corpus = Corpus(utterances=utterances)
    corpus.meta["reverse_block_index"] = block_hashes_to_utt_ids

    return corpus