def test_partial_load_invalid_end_index(self):
    """Loading a dumped corpus with a negative utterance_end_index should
    yield an empty corpus: no utterance index falls in [start, -1].
    """
    user_byte_arr1 = bytearray([120, 3, 255, 0, 100])
    user_byte_arr2 = bytearray([110, 3, 255, 90])
    utt_byte_arr1 = bytearray([99, 44, 33])
    utt_byte_arr2 = bytearray([110, 200, 220, 28])
    # Use string utterance ids for consistency with the other
    # partial-load tests (they all construct ids as "0", "1", "2").
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world",
                  user=User(name="alice",
                            meta={'user_binary_data': user_byte_arr1}),
                  meta={'utt_binary_data': utt_byte_arr1}),
        Utterance(id="1", text="my name is bob",
                  user=User(name="bob",
                            meta={'user_binary_data': user_byte_arr2}),
                  meta={'utt_binary_data': utt_byte_arr2}),
        Utterance(id="2", text="this is a test",
                  user=User(name="charlie")),
    ])
    corpus1.dump('test_corpus', './')
    corpus2 = Corpus(filename="test_corpus", utterance_end_index=-1)
    self.assertEqual(len(list(corpus2.iter_utterances())), 0)
def test_partial_load_start_idx_specified_only(self):
    """Dump a three-utterance corpus, reload it with only
    utterance_start_index=1, and verify that exactly the last two
    utterances are present and equal to the originals.
    """
    alice_bytes = bytearray([120, 3, 255, 0, 100])
    bob_bytes = bytearray([110, 3, 255, 90])
    utt0_bytes = bytearray([99, 44, 33])
    utt1_bytes = bytearray([110, 200, 220, 28])
    alice = User(name="alice", meta={'user_binary_data': alice_bytes})
    bob = User(name="bob", meta={'user_binary_data': bob_bytes})
    charlie = User(name="charlie")
    source = Corpus(utterances=[
        Utterance(id="0", text="hello world", user=alice,
                  meta={'utt_binary_data': utt0_bytes}),
        Utterance(id="1", text="my name is bob", user=bob,
                  meta={'utt_binary_data': utt1_bytes}),
        Utterance(id="2", text="this is a test", user=charlie),
    ])
    source.dump('test_corpus', './')
    reloaded = Corpus(filename="test_corpus", utterance_start_index=1)
    self.assertEqual(len(list(reloaded.iter_utterances())), 2)
    for utt_id in ("1", "2"):
        self.assertEqual(source.get_utterance(utt_id),
                         reloaded.get_utterance(utt_id))
def convert_df_to_corpus(df: DataFrame, id_col: str, text_col: str,
                         meta_cols: List[str]) -> Corpus:
    """
    Helper function to convert data to Corpus format

    Arguments:
        df {DataFrame} -- Actual data, in a pandas Dataframe
        id_col {str} -- name of the column that corresponds to utterances ids
        text_col {str} -- name of the column that stores texts of the utterances
        meta_cols {List[str]} -- set of columns that stores relevant metadata

    Returns:
        Corpus -- the converted corpus
    """
    # In this particular case user, reply_to, and timestamp information are
    # all not applicable, so we create placeholder entries / leave them None.
    user = User(id="user")
    time = "NOT_RECORDED"

    utterance_list = []
    # The row index is unused: each utterance is keyed by its id_col value.
    for _, row in tqdm(df.iterrows()):
        # extract the requested metadata columns for this row
        metadata = {meta_col: row[meta_col] for meta_col in meta_cols}
        # every utterance is its own conversation root with no parent
        utterance_list.append(Utterance(id=str(row[id_col]), user=user,
                                        root=str(row[id_col]), reply_to=None,
                                        timestamp=time, text=row[text_col],
                                        meta=metadata))
    return Corpus(utterances=utterance_list)
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    utterances = []
    complete_utterances = set()
    block_hashes_to_segments = {}
    block_hashes_to_utt_ids = {}
    for block_hash, block in accum.blocks.items():
        if block.user not in users:
            users[block.user] = User(id=block.user)
        segments = accum.segment_contiguous_blocks(block.reply_chain)
        # every contiguous segment except the trailing one is already a
        # complete utterance
        for seg in segments[:-1]:
            sos = helpers.string_of_seg(seg)
            complete_utterances.add(sos)
        # the reply chain must terminate at this block
        assert (block_hash == segments[-1][-1])
        # the trailing segment is complete only if no later block follows it
        if not accum.blocks[segments[-1][-1]].is_followed:
            complete_utterances.add(helpers.string_of_seg(segments[-1]))
        block_hashes_to_segments[block_hash] = segments
    for utt in complete_utterances:
        # each complete utterance is a space-joined sequence of block hashes
        block_hashes = utt.split(" ")
        belongs_to_segment = block_hashes_to_segments[block_hashes[0]]
        first_block = accum.blocks[block_hashes[0]]
        u_id = block_hashes[0]
        u_user = users[first_block.user]
        u_root = belongs_to_segment[0][0]
        u_replyto = _find_reply_to_from_segment(belongs_to_segment)
        u_timestamp = first_block.timestamp
        u_text = "\n".join([accum.blocks[h].text for h in block_hashes])
        u_meta = {"constituent_blocks": block_hashes}
        for each_hash in block_hashes:
            block_hashes_to_utt_ids[each_hash] = u_id
        this_utterance = Utterance(id=u_id, user=u_user, root=u_root,
                                   reply_to=u_replyto,
                                   timestamp=u_timestamp,
                                   text=u_text, meta=u_meta)
        utterances.append(this_utterance)
    corpus = Corpus(utterances=utterances)
    # allow callers to map any constituent block hash back to its utterance
    corpus.meta["reverse_block_index"] = block_hashes_to_utt_ids
    return corpus
def test_dump_and_load_with_binary(self):
    """
    Dump a corpus containing users with binary metadata and utterances
    with binary metadata, then check that the dumped corpus is
    successfully loaded with the same data.
    """
    alice_meta = {
        'user_binary_data': bytearray([120, 3, 255, 0, 100]),
        'index': 99
    }
    bob_meta = {'user_binary_data': bytearray([110, 3, 255, 90])}
    utt0_meta = {'utt_binary_data': bytearray([99, 44, 33])}
    utt1_meta = {'utt_binary_data': bytearray([110, 200, 220, 28])}
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world",
                  user=User(name="alice", meta=alice_meta),
                  meta=utt0_meta),
        Utterance(id="1", text="my name is bob",
                  user=User(name="bob", meta=bob_meta),
                  meta=utt1_meta),
        Utterance(id="2", text="this is a test",
                  user=User(name="charlie")),
    ])
    corpus1.dump('test_corpus', './')
    corpus2 = Corpus(filename="test_corpus")
    # compare user metadata and utterance metadata for each round-tripped id
    for utt_id in ("0", "1"):
        self.assertEqual(corpus1.utterances[utt_id].user.meta,
                         corpus2.utterances[utt_id].user.meta)
        self.assertEqual(corpus1.utterances[utt_id].meta,
                         corpus2.utterances[utt_id].meta)
def reconstruct_corpus(dataset):
    """Rebuild a Corpus from a list of revision-event dicts.

    Each event is expected to carry 'event_user_id', 'revision_id', and
    'event_comment\n' keys; events with a missing (None) user id are
    attributed to a shared fallback 'none' user.

    :param dataset: iterable of event dicts
    :return: the reconstructed Corpus
    """
    # Bug fix: the original built users_dict only from observed ids, then
    # looked up users_dict['none'] for None user ids — raising KeyError on
    # the first anonymous event (and np.unique could choke on None entries).
    user_ids = [utt['event_user_id'] for utt in dataset
                if utt['event_user_id'] is not None]
    users_dict = {user: User(name=user) for user in np.unique(user_ids)}
    users_dict.setdefault('none', User(name='none'))
    utterances = []
    for utt in tqdm(dataset):
        uid = utt['event_user_id']
        user = users_dict[uid] if uid is not None else users_dict['none']
        # NOTE(review): this key literally contains a trailing newline
        # ('event_comment\n') — presumably an artifact of how the dataset
        # was parsed; confirm against the data source before changing.
        utterances.append(
            Utterance(id=utt['revision_id'], user=user,
                      text=utt['event_comment\n']))
    corpus = Corpus(utterances=utterances)
    return corpus
def rough_convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a rougher approximation of a Corpus from an Intermediate.

    Does not worry about reply_to structure, and instead sorts replies by
    the chronological order in which utterances are posted to discussions.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    complete_utterances = set()
    block_hashes_to_segments = {}
    for block_hash, block in accum.blocks.items():
        try:
            if block.user not in users:
                users[block.user] = User(id=block.user)
            segments = accum.segment_contiguous_blocks(block.reply_chain)
            # the reply chain must terminate at this block
            assert (block_hash == segments[-1][-1])
            # any complete contiguous block is a complete utterance
            for seg in segments[:-1]:
                complete_utterances.add(helpers.string_of_seg(seg))
            # headers and unfollowed trailing segments are also complete
            if block.is_header or not accum.blocks[segments[-1][-1]].is_followed:
                complete_utterances.add(helpers.string_of_seg(segments[-1]))
            block_hashes_to_segments[block_hash] = segments
        except Exception as e:
            # best-effort conversion: log and skip malformed blocks
            logging.debug(e, exc_info=True)
            logging.warning(
                'Issue with conversion to corpus; skipping adding block "%s..."',
                block.text[:32])

    # group utterances by the section (root) they belong to
    children_of_root = {}
    for utt in complete_utterances:
        # each complete utterance is a space-joined sequence of block hashes
        block_hashes = utt.split(" ")
        first_block = accum.blocks[block_hashes[0]]
        u_id = block_hashes[0]
        u_user = users[first_block.user]
        u_root = accum.find_ultimate_hash(first_block.root_hash)
        u_timestamp = first_block.timestamp
        u_text = "\n".join([accum.blocks[h].text for h in block_hashes])
        u_meta = {}
        # 0 acts as a sentinel when the last revision id is unknown
        u_meta["last_revision"] = first_block.revision_ids[
            -1] if first_block.revision_ids[-1] != "unknown" else 0
        this_utterance = Utterance(id=u_id, user=u_user, root=u_root,
                                   reply_to=None, timestamp=u_timestamp,
                                   text=u_text, meta=u_meta)
        children_of_root.setdefault(u_root, []).append(this_utterance)

    utterances = []
    for root, utt_list in children_of_root.items():
        if root is None:
            continue
        utt_list.sort(key=lambda x: x.timestamp)
        # locate the section header (the utterance whose id equals the root)
        ind_of_root = 0
        try:
            while utt_list[ind_of_root].id != root:
                ind_of_root += 1
        except IndexError as e:
            logging.debug(e, exc_info=True)
            logging.warning(
                'Skipping section in conversion to corpus: could not find section header for root %s',
                root)
            continue
        # move the header to the front so it anchors the reply chain
        if ind_of_root > 0:
            utt_list.insert(0, utt_list.pop(ind_of_root))
        utterances.append(utt_list[0])
        added = {utt_list[0].id}
        # chain each new utterance as a reply to the previous distinct one,
        # skipping duplicates so an utterance never replies to itself
        i, j = 0, 1
        while j < len(utt_list):
            if utt_list[j].id not in added:
                utt_list[j].reply_to = utt_list[i].id
                added.add(utt_list[j].id)
                utterances.append(utt_list[j])
                i = j
            j += 1
    corpus = Corpus(utterances=utterances)
    return corpus
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    utterances = []
    complete_utterances = set()
    block_hashes_to_segments = {}
    block_hashes_to_utt_ids = {}
    for block_hash, block in accum.blocks.items():
        try:
            if block.user not in users:
                users[block.user] = User(id=block.user)
            segments = accum.segment_contiguous_blocks(block.reply_chain)
            # the reply chain must terminate at this block
            assert (block_hash == segments[-1][-1])
            # any complete contiguous block is a complete utterance
            for seg in segments[:-1]:
                complete_utterances.add(helpers.string_of_seg(seg))
            # headers and unfollowed trailing segments are also complete
            if block.is_header or not accum.blocks[segments[-1][-1]].is_followed:
                complete_utterances.add(helpers.string_of_seg(segments[-1]))
            block_hashes_to_segments[block_hash] = segments
        except Exception as e:
            # best-effort conversion: log and skip malformed blocks
            logging.debug(e, exc_info=True)
            logging.warning(
                'Issue with conversion to corpus; skipping adding block "%s..."',
                block.text[:32])
    for utt in complete_utterances:
        # each complete utterance is a space-joined sequence of block hashes
        block_hashes = utt.split(" ")
        belongs_to_segment = block_hashes_to_segments[block_hashes[0]]
        first_block = accum.blocks[block_hashes[0]]
        u_id = block_hashes[0]
        u_user = users[first_block.user]
        u_root = accum.find_ultimate_hash(first_block.root_hash)
        u_replyto = _find_reply_to_from_segment(belongs_to_segment)
        u_timestamp = first_block.timestamp
        u_text = "\n".join([accum.blocks[h].text for h in block_hashes])
        u_meta = {}
        u_meta["constituent_blocks"] = block_hashes
        # 0 acts as a sentinel when the last revision id is unknown
        u_meta["last_revision"] = first_block.revision_ids[
            -1] if first_block.revision_ids[-1] != "unknown" else 0
        for each_hash in block_hashes:
            block_hashes_to_utt_ids[each_hash] = u_id
        this_utterance = Utterance(id=u_id, user=u_user, root=u_root,
                                   reply_to=u_replyto,
                                   timestamp=u_timestamp,
                                   text=u_text, meta=u_meta)
        utterances.append(this_utterance)
    corpus = Corpus(utterances=utterances)
    # allow callers to map any constituent block hash back to its utterance
    corpus.meta["reverse_block_index"] = block_hashes_to_utt_ids
    return corpus