Beispiel #1
0
 def read_all(path: Path, add_eos):
     """Lazily yield the dialogs stored in a tab-separated dialog file.

     Every non-blank line is one utterance. Its last two columns are the
     character id and a whitespace-separated list of integer token ids.
     When a line has more than two columns the first one is kept as the
     utterance uid; when it has more than three, the second one is parsed
     as a float weight. A blank line ends the dialog being accumulated.

     :param path: file to read; opened through ``IO.reader``
     :param add_eos: when truthy, ``EOS_TOK_IDX`` is appended to each token
      sequence that does not already end with it
     :return: generator of non-empty ``Dialog`` objects; the number of
      dialogs is logged once the generator has been fully consumed
     """
     count = 0
     with IO.reader(path) as reader:
         current = Dialog()
         for raw in reader:
             raw = raw.strip()
             if not raw:
                 # blank line: hand out the dialog gathered so far, if any
                 if len(current) > 0:
                     yield current
                     count += 1
                     current = Dialog()
                 continue

             cols = raw.split("\t")
             char, seq = cols[-2:]  # the trailing pair is always required
             uid = None
             if len(cols) > 2:
                 uid = cols[0]
             weight = None
             if len(cols) > 3:
                 weight = float(cols[1])
             char = int(char)
             seq = [int(tok) for tok in seq.strip().split()]
             if add_eos and seq[-1] != EOS_TOK_IDX:
                 seq.append(EOS_TOK_IDX)
             current.append(Utterance(char, seq, uid=uid, weight=weight))
         # the file may end without a closing blank line
         if len(current) > 0:
             count += 1
             yield current
     log.info(f"Read {count} dialogs")
Beispiel #2
0
def read_msg_resp(path: str):
    """Read tab-separated (message, response) pairs into two parallel lists.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    that do not split into exactly two tab-separated columns are skipped.

    :param path: a ``str`` or ``pathlib.Path`` naming a file to open through
     ``IO.reader``, or an already-open iterable of lines that is consumed
     directly
    :return: tuple ``(msgs, resps)`` of two equally long lists of strings
    """
    from pathlib import Path  # local import keeps this block self-contained

    def _read(rdr):
        msgs, resps = [], []
        for line in rdr:
            cols = line.strip().split('\t')
            # a blank line splits into a single empty column, so the
            # two-column check drops it as well
            if len(cols) == 2:
                msgs.append(cols[0])
                resps.append(cols[1])
        return msgs, resps

    # isinstance (not an exact type match) so that str subclasses and Path
    # objects are opened as files instead of being iterated as line sources
    if isinstance(path, (str, Path)):
        with IO.reader(path) as r:
            return _read(r)
    return _read(path)
Beispiel #3
0
    def __init__(self,
                 inp: Union[str, Path, TextIO, Iterator[str]],
                 text_field: Field = None,
                 char_field: LookupField = None,
                 max_seq_len: int = 100,
                 add_eos=True):
        """
        Store the dialog source and the reading options on the instance.

        :param inp: dialog seq file; a ``str`` or ``Path`` is opened through
         ``IO.reader``, any other value is stored unchanged as ``self.reader``
        :param text_field: provide this field if you want to map text to word ids.
         by default it splits words by white space and return words as sequence
        :param char_field: provide this field if you want to map character name to id.
        :param max_seq_len: stored as ``self.max_seq_len``; presumably the
         longest sequence kept by the reading code -- TODO confirm
        :param add_eos: stored as ``self.add_eos``; presumably whether an
         end-of-sequence token is appended to sequences -- TODO confirm
        :raises AssertionError: if ``inp`` names a path that does not exist
        """
        # isinstance rather than an exact type match, so str subclasses are
        # treated as paths too
        if isinstance(inp, str):
            inp = Path(inp)
        if isinstance(inp, Path):
            assert inp.exists(), f"{inp} does not exist"
            # NOTE(review): the handle opened here is not closed in this
            # method; confirm the class releases it elsewhere
            inp = IO.reader(inp).open()
        self.reader = inp
        self.text_field = text_field
        self.char_field = char_field
        self.max_seq_len = max_seq_len
        self.add_eos = add_eos
        # starts at zero; presumably updated once records are read
        self.num_cols = 0
Beispiel #4
0
 def read_raw_lines(dialog_path: Union[str, Path]) -> Iterator[RawRecord]:
     """Lazily yield ``(character, text)`` pairs from a tab-separated file.

     Only the last two columns of each line are used, each stripped of
     surrounding whitespace. Lines with fewer than two columns (for example
     blank lines) and lines where either value is empty after stripping are
     skipped.

     :param dialog_path: file to read; opened through ``IO.reader``
     :return: generator of 2-tuples ``(char, dialog)`` of non-empty strings
     """
     with IO.reader(dialog_path) as lines:
         for line in lines:
             cols = line.split("\t")[-2:]
             # a line without a tab yields a single column; unpacking it
             # into two names would raise ValueError, so skip it up front
             if len(cols) < 2:
                 continue
             char, dialog = cols[0].strip(), cols[1].strip()
             if char and dialog:
                 yield char, dialog
Beispiel #5
0
 def _read_char_names():
     """Yield the second-to-last tab-separated column of each line.

     Reads the file named by ``path`` from the enclosing scope through
     ``IO.reader``. Lines that have fewer than two columns after stripping
     are skipped.
     """
     with IO.reader(path) as inp:
         rows = (raw.strip().split('\t') for raw in inp)
         yield from (cols[-2] for cols in rows if len(cols) >= 2)
Beispiel #6
0
def read_lines(path: Union[str, Path]):
    """Return all lines of a file with surrounding whitespace removed.

    :param path: file to read; opened through ``IO.reader``
    :return: list with one stripped string per line, in file order
    """
    with IO.reader(path) as handle:
        return [row.strip() for row in handle.readlines()]
Beispiel #7
0
def read_tsv(path: str):
    """Lazily yield the rows of a tab-separated file as lists of columns.

    The line terminator is removed before splitting so the last column does
    not carry a trailing newline. All other whitespace is kept, so empty
    columns are preserved.

    Because this is a generator, the existence check and the opening of the
    file happen on the first iteration, not when the function is called.

    :param path: location of the TSV file; opened through ``IO.reader``
    :return: generator of lists of strings, one list per line
    :raises AssertionError: if ``path`` does not exist
    """
    assert os.path.exists(path), f"{path} does not exist"
    with IO.reader(path) as f:
        for line in f:
            # strip only the end-of-line characters, never the tabs
            yield line.rstrip('\r\n').split('\t')
Beispiel #8
0
def read_lines(path):
    """Lazily yield whatever ``read_lines_reader`` produces for a source.

    :param path: a ``str`` naming a file to open through ``IO.reader``, or an
     already-open reader that is passed to ``read_lines_reader`` unchanged
    :return: generator over the items produced by ``read_lines_reader``
    """
    if isinstance(path, str):
        with IO.reader(path) as reader:
            yield from read_lines_reader(reader)
    else:
        # This function is a generator, so a plain `return value` would end
        # iteration without yielding anything; delegate with `yield from`.
        yield from read_lines_reader(path)