Example #1
def read_system_training_data(filename):
    insts = []
    for inst in pd.read_csv(filename, index_col=None,
                            encoding='UTF-8').to_dict('records'):
        insts.append({
            'dataset': 'E2E',
            'mr': DA.parse_diligent_da(inst['mr']).to_cambridge_da_string(),
            'delex_mr': DA.parse_diligent_da(inst['mr']).get_delexicalized(
                set(['name', 'near'])).to_cambridge_da_string(),
            'system': 'HUMAN',
            'system_ref': None,
            'orig_ref': inst['ref'],
            'informativeness': None,
            'naturalness': None,
            'quality': None,
            'is_real': 0
        })
    log_info(
        "Using %d different training human references to create fake pairs" %
        len(insts))
    return insts
Example #2
def reclassify_mr(ref, gold_mr=DA()):
    """Classify the MR given a text. Can use a gold-standard MR to make the classification more
    precise (in case of ambiguity, goes with the gold-standard value). Returns a dict-based MR format
    for the system output MR and the gold-standard MR."""
    # convert MR to dict for comparing & checking against
    mr_dict = {}
    for dai in gold_mr.dais:
        mr_dict[dai.slot] = mr_dict.get(dai.slot, {})
        val = CAPITALIZE[dai.slot][dai.value.lower()]
        mr_dict[dai.slot][val] = mr_dict[dai.slot].get(val, 0) + 1

    # create MR dict representation of the output text
    # first, collect all value matches
    matches = []
    for slot in REALIZATIONS.keys():
        # verbatim slot
        if not isinstance(REALIZATIONS[slot], dict):
            matches.extend([
                Match(slot, CAPITALIZE[slot][match.group(0).lower()], match)
                for match in REALIZATIONS[slot].finditer(ref)
            ])
        # slot with variable realizations
        else:
            # collect all matches for all values
            for value in REALIZATIONS[slot].keys():
                matches.extend([
                    Match(slot, CAPITALIZE[slot][value.lower()], match)
                    for match in REALIZATIONS[slot][value].finditer(ref)
                ])

    # then filter out those that are substrings/duplicates (let only one value match,
    # preferably the one indicated by the true MR -- check with the MR dict)
    filt_matches = []
    for match in matches:
        skip = False
        for other_match in matches:
            if match is other_match:
                continue
            if (match.is_substring(other_match) or (
                    match.is_same_string(other_match) and
                (other_match.value in mr_dict.get(other_match.slot, {}).keys()
                 or other_match in filt_matches))):
                skip = True
                break
        if not skip:
            filt_matches.append(match)

    # now put it all into a dict
    out_dict = {}
    for match in filt_matches:
        out_dict[match.slot] = out_dict.get(match.slot, {})
        out_dict[match.slot][match.value] = out_dict[match.slot].get(match.value, 0) + 1

    return DA.parse_dict(out_dict)
Example #3
 def _delex_das(self):
     """Delexicalize DAs in the buffers, save them separately."""
     out = []
     for da in self._das:
         delex_da = DA()
         for dai in da:
             delex_dai = DAI(
                 dai.da_type, dai.slot, 'X-' + dai.slot if
                 (dai.value not in [None, 'none', 'dont_care']
                  and dai.slot in self._abst_slots) else dai.value)
             delex_da.append(delex_dai)
         out.append(delex_da)
     self._delexed_das = out
Example #4
def parse_cambridge_da(da_text):
    """Parse a DA string into DAIs (DA types, slots, and values)."""

    da = DA()

    for dai_text in re.finditer(r'(\??[a-z_]+)\(([^)]*)\)', da_text):
        da_type, svps = dai_text.groups()

        if not svps:  # no slots/values (e.g. 'hello()')
            da.append(DAI(da_type, None, None))
            continue

        # we have some slots/values – split them into DAIs
        svps = re.split('(?<! )[,;]', svps)
        for svp in svps:

            if '=' not in svp:  # no value, e.g. '?request(near)'
                da.append(DAI(da_type, svp, None))
                continue

            # we have a value
            slot, value = svp.split('=', 1)
            if re.match(r'^\'.*\'$', value):
                value = value[1:-1]
            assert not re.match(r'^\'', value) and not re.search(r'\'$', value)

            da.append(DAI(da_type, slot, value))

    return da
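
As a point of reference, here is a minimal standalone sketch (not taken from any of the projects above) of the DA string format that the regex in parse_cambridge_da() expects: one or more da_type(slot=value, ...) groups, with optional quoting of values. Only the standard-library re module is used.

import re

sample = "?request(near)inform(name='Blue Spice';food=Italian)"  # made-up example string
for dai_match in re.finditer(r'(\??[a-z_]+)\(([^)]*)\)', sample):
    da_type, svps = dai_match.groups()
    if not svps:  # no slots/values, e.g. 'hello()'
        print('%s(%s, %s)' % (da_type, None, None))
        continue
    for svp in re.split('(?<! )[,;]', svps):
        if '=' not in svp:  # slot without value, e.g. '?request(near)'
            print('%s(%s, %s)' % (da_type, svp, None))
        else:
            slot, value = svp.split('=', 1)
            print('%s(%s, %s)' % (da_type, slot, value.strip("'")))
# prints: ?request(near, None) / inform(name, Blue Spice) / inform(food, Italian)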
Example #5
 def _delex_das(self):
     """Delexicalize DAs in the buffers, save them separately."""
     out = []
     for da in self._das:
         delex_da = DA()
         for dai in da:
             delex_dai = DAI(dai.da_type, dai.slot,
                             'X-' + dai.slot
                             if (dai.value not in [None, 'none', 'dont_care'] and
                                 dai.slot in self._abst_slots)
                             else dai.value)
             delex_da.append(delex_dai)
         out.append(delex_da)
     self._delexed_das = out
Example #6
    def evaluate_file(self, das_file, ttree_file):
        """Evaluate the reranking classifier on a given pair of DA/tree files (show the
        total Hamming distance and total number of DAIs)

        @param das_file: DA file path
        @param ttree_file: trees/sentences file path
        @return: a tuple (total DAIs, distance)
        """
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees/tokens from ' + ttree_file + '...')
        trees = read_trees_or_tokens(ttree_file, self.mode, self.language, self.selector)
        if self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

        tot_len = 0
        tot_dist = 0
        classif_das = []
        for da, tree in zip(das, trees):
            tot_len += len(da)
            dist, classif = self.dist_to_da(da, [tree], return_classif=True)
            tot_dist += dist[0]
            classif_das.append(DA.parse_features(classif[0]))

        return tot_len, tot_dist, classif_das
Example #7
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.
        """
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([
                self.tree_embs.get_embeddings(tree)
                for tree in self.train_trees
            ])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False,
                                            binarize_numeric=True)
            self.X = [
                self.tree_feats.get_features(tree, {})
                for tree in self.train_trees
            ]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [
            self.da_feats.get_features(None, {'da': da})
            for da in self.train_das
        ]
        self.y = self.da_vect.fit_transform(self.y)

        # initialize I/O shapes
        self.input_shape = [list(self.X[0].shape)]
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
Example #8
    def process_dataset(self, input_data):
        """Load DAs & sentences, obtain abstraction instructions, and store it all in member
        variables (to be used later by writing methods).
        @param input_data: path to the input JSON file with the data
        """
        # load data from JSON
        self._das = []
        self._texts = []
        with codecs.open(input_data, 'r', encoding='UTF-8') as fh:
            data = json.load(fh)
            for inst in data:
                da = DA.parse_cambridge_da(inst['da'])
                da.sort()
                self._das.append(da)
                self._texts.append(self.analyze(inst['text']))

        # delexicalize DAs and sentences
        self._create_delex_texts()
        self._create_delex_das()

        # return the result
        out = []
        for da, text, delex_da, delex_text, abst in zip(self._das, self._texts, self._delex_das, self._delex_texts, self._absts):
            out.append(Inst(da, text, delex_da, delex_text, abst))
        return out
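
The JSON file read by process_dataset() is expected to be a list of objects with 'da' and 'text' keys; the snippet below is a made-up illustration of that layout (the DA string and sentence are invented):

import json

sample = [{"da": "inform(name='Blue Spice';food=Italian)",
           "text": "Blue Spice serves Italian food."}]
print(json.dumps(sample, indent=2))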
Example #9
 def process_files(self, input_text_file, input_da_file, skip_hello=False):
     """Load DAs & sentences, obtain abstraction instructions, and store it all in member
     variables (to be used later by writing methods).
     @param input_text_file: path to the input file with sentences
     @param input_da_file: path to the input file with DAs
     @param skip_hello: skip hello() DAs (remove them from the output?)
     """
     # load DAs
     self._das = []
     with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._das.append(DA.parse(line.strip()))
     # load & process sentences
     self._sents = []
     with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._sents.append(self.analyze(line.strip()))
     assert (len(self._das) == len(self._sents))
     # skip hello() DAs, if required
     if skip_hello:
         pos = 0
         while pos < len(self._das):
             da = self._das[pos]
             if len(da) == 1 and da[0].da_type == 'hello':
                 del self._das[pos]
                 del self._sents[pos]
             else:
                 pos += 1
     # delexicalize DAs and sentences
     self._delex_texts()
     self._delex_das()
Example #10
 def process_files(self, input_text_file, input_da_file, skip_hello=False):
     """Load DAs & sentences, obtain abstraction instructions, and store it all in member
     variables (to be used later by writing methods).
     @param input_text_file: path to the input file with sentences
     @param input_da_file: path to the input file with DAs
     @param skip_hello: skip hello() DAs (remove them from the output?)
     """
     # load DAs
     self._das = []
     with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._das.append(DA.parse(line.strip()))
     # load & process sentences
     self._sents = []
     with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._sents.append(self.analyze(line.strip()))
     assert(len(self._das) == len(self._sents))
     # skip hello() DAs, if required
     if skip_hello:
         pos = 0
         while pos < len(self._das):
             da = self._das[pos]
             if len(da) == 1 and da[0].da_type == 'hello':
                 del self._das[pos]
                 del self._sents[pos]
             else:
                 pos += 1
     # delexicalize DAs and sentences
     self._delex_texts()
     self._delex_das()
Example #11
def main(in_file, out_mrs, out_refs):

    abst_das = []
    conc_das = []
    conc_da_texts = []
    abst_texts = []
    with codecs.open(in_file, 'r', 'UTF-8') as fh:
        for line in fh:
            line = line.strip()

            if line.startswith('FULL_DA'):
                line = re.sub('^FULL_DA = ', '', line)
                conc_das.append(DA.parse_cambridge_da(line))
                conc_da_texts.append(line)
            elif line.startswith('ABSTRACT_DA'):
                line = re.sub('^ABSTRACT_DA = ', '', line)
                abst_das.append(DA.parse_cambridge_da(line))
            elif line.startswith('->'):
                line = re.sub('^-> "', '', line)
                line = re.sub(r'";\s*$', '', line)
                line = re.sub(r'\[([a-z]+)\+X\]X', r'X-\1', line)
                line = re.sub(r'\[[^\]]*\]', '', line)
                abst_texts.append(line)

    conc_texts = []
    for abst_da, conc_da, abst_text in zip(abst_das, conc_das, abst_texts):
        text = abst_text
        for abst_dai, conc_dai in zip(abst_da.dais, conc_da.dais):
            assert abst_dai.slot == conc_dai.slot
            if abst_dai.value.startswith('X'):
                text = text.replace('X-' + abst_dai.slot, conc_dai.value, 1)
        text = re.sub(r'the The', 'The', text)
        conc_texts.append(text)

    with codecs.open(out_mrs, 'w', 'UTF-8') as fh:
        fh.write("\n".join(conc_da_texts))

    with codecs.open(out_refs, 'w', 'UTF-8') as fh:
        fh.write("\n".join(conc_texts))
Example #12
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.
        """
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
            self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
        self.y = self.da_vect.fit_transform(self.y)

        # initialize I/O shapes
        self.input_shape = [list(self.X[0].shape)]
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
Example #13
 def process_files(self, input_text_file, input_da_file):
     """Load DAs & sentences, obtain abstraction instructions, and store it all in member
     variables (to be used later by writing methods)."""
     # load DAs
     self._das = []
     with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._das.append(DA.parse(line.strip()))
     # load & process sentences
     self._sents = []
     with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
         for line in fh:
             self._sents.append(self.analyze(line.strip()))
     assert (len(self._das) == len(self._sents))
     # delexicalize DAs and sentences
     self._delex_texts()
     self._delex_das()
Example #14
def interactive_input(das_type='cambridge',
                      delex_slots=set(),
                      delex_slot_names=False,
                      delex_das=False,
                      input_da=True,
                      input_ref=False):

    da = None
    if input_da:
        da = raw_input('Enter DA             : ').decode('utf-8').strip()
        if not da:
            return None
        if das_type == 'text':
            da = [(tok, None)
                  for tok in preprocess_sent(None, da, False, False)]
        else:
            da = DA.parse_cambridge_da(da)
            if delex_das:
                da = da.get_delexicalized(delex_slots)
    ref = None
    if input_ref:
        ref = raw_input('Enter reference      : ').decode('utf-8').strip()
        if not ref:
            return None
        ref = [
            (tok, None)
            for tok in preprocess_sent(da, ref, delex_slots, delex_slot_names)
        ]

    hyp = raw_input('Enter system output 1: ').decode('utf-8').strip()
    if not hyp:
        return None
    hyp = [(tok, None)
           for tok in preprocess_sent(da, hyp, delex_slots, delex_slot_names)]

    hyp2 = raw_input('Enter system output 2: ').decode('utf-8').strip()
    if not hyp2:
        hyp2 = []
    else:
        hyp2 = [
            (tok, None)
            for tok in preprocess_sent(da, hyp2, delex_slots, delex_slot_names)
        ]

    return (da, ref, hyp, hyp2)
Example #15
def read_outputs(filename):
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    if isinstance(data.iloc[len(data) - 1]['mr'], float):
        # XXX workaround to a strange bug that sometimes happens -- not sure how to get rid of it,
        # probably an error in Pandas
        print(
            '!!!Strangely need to remove an empty instance from the end of %s'
            % filename)
        data = data[:-1]
    das = [DA.parse_cambridge_da(da) for da in data['mr']]

    # force string data type for empty human references
    data['orig_ref'] = data['orig_ref'].apply(
        lambda x: '' if not isinstance(x, basestring) else x)
    texts_ref = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['orig_ref']]
    texts_hyp = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['system_output']]
    if 'system_output2' not in data:
        data['system_output2'] = [None] * len(data)
    texts_hyp2 = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                  if isinstance(sent, basestring) else None
                  for sent in data['system_output2']]
    inputs = [(da, text_ref, text_hyp, text_hyp2)
              for da, text_ref, text_hyp, text_hyp2 in zip(
                  das, texts_ref, texts_hyp, texts_hyp2)]

    # find out which columns were used for ratings
    target_cols = [
        c[:-len('_system_rating')] for c in data.columns
        if c.endswith('_system_rating')
    ]
    assert target_cols
    # compile data from all these columns
    outputs = {}
    for target_col in target_cols:
        outputs[target_col] = {
            subcol: list(data[target_col + '_' + subcol])
            for subcol in [
                'human_rating_raw', 'human_rating', 'system_rating_raw',
                'system_rating', 'rank_loss', 'rank_ok'
            ]
        }
    return (inputs, outputs)
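
A small self-contained sketch (with invented column names) of the rating-column detection used in read_outputs(): any column named '<target>_system_rating' marks <target> as a rated quantity.

import pandas as pd

data = pd.DataFrame(columns=['mr', 'orig_ref', 'system_output',
                             'quality_system_rating', 'quality_human_rating',
                             'naturalness_system_rating', 'naturalness_human_rating'])
target_cols = [c[:-len('_system_rating')] for c in data.columns
               if c.endswith('_system_rating')]
print(target_cols)  # -> ['quality', 'naturalness']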
Example #16
def process_file(tagger_model, input_file):
    detok = Detokenizer()
    df = pd.read_csv(input_file, sep="\t", encoding="UTF-8")
    raw_mrs = list(df['MR'])
    raw_refs = [detok.detokenize(text) for text in list(df['output'])]
    mrs = [DA.parse_diligent_da(mr) for mr in raw_mrs]
    tagger = MorphoTagger(tagger_model)
    tagged_refs = [tagger.tag(line) for line in raw_refs]

    for ff in ['ngram', 'lca', 'collins']:
        write_output(tagged_refs, ff,
                     re.sub(r'\.tsv', '.tag.%s.txt' % ff, input_file))

    stats = data_stats(mrs, tagged_refs, {
        'name': [],
        'near': []
    }, re.sub(r'\.tsv', '', input_file))
    return stats
Example #17
def convert(args):
    src = pd.read_csv(args.src_file, index_col=None, encoding='utf-8')
    df = pd.DataFrame(index=np.arange(len(src)), columns=COLUMNS)
    for src_col, trg_col in COLUMN_MAP.iteritems():
        if isinstance(trg_col, list):
            for trg_col_ in trg_col:
                df[trg_col_] = src[src_col]
        else:
            df[trg_col] = src[src_col]
    df['mr'] = [
        DA.parse_diligent_da(da).to_cambridge_da_string() for da in src['mr']
    ]
    df['is_real'] = np.ones(len(src), dtype=np.int32)
    df['dataset'] = ['INLG'] * len(src)
    df['system'] = ['human'] * len(src)
    df.to_csv(args.out_file,
              columns=COLUMNS,
              sep=b"\t",
              index=False,
              encoding='UTF-8')
Example #18
def convert(args):
    src = lines_to_list(args.src_file)
    if args.das:
        src = [DA.parse(da_text).to_cambridge_da_string() for da_text in src]
    ref = lines_to_list(args.ref_file)
    columns = ['mr', 'orig_ref']
    df = pd.DataFrame.from_dict({'mr': src, 'orig_ref': ref})

    if args.system_output:
        sys = lines_to_list(args.system_output)
        df['system_ref'] = sys
        columns.append('system_ref')

    if args.score:
        score = [float(score) for score in lines_to_list(args.score)]
        df['quality'] = score
        columns.append('quality')

    df.to_csv(args.out_file,
              columns=columns,
              sep=b"\t",
              index=False,
              encoding='UTF-8')
Example #19
def read_sfx_data():
    with codecs.open('data/sfrest-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/sfrest-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse(mr.strip()) for mr in fh.readlines()]
    return mrs, refs
Example #20
def delex_sent(da, conc, abst_slots, use_slot_names=True, delex_slot_names=False):
    """Abstract the given slots in the given sentence (replace them with X).

    @param da: concrete DA
    @param conc: concrete sentence text (string -- split only on whitespace, or list of tokens)
    @param abst_slots: a set of slots to be abstracted
    @param use_slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @param delex_slot_names: boolean -- also delexicalize slot names mentioned in the text?
    @return: a tuple of the abstracted text (in the same format as conc), abstracted DA, \
        and abstraction instructions
    """
    return_string = False
    if isinstance(conc, basestring):
        toks = conc.split(' ')
        return_string = True
    else:
        toks = conc
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the abstracted DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value) if dai.value is not None else 0,
                      reverse=True):
        # first, create the 'abstracted' DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        pos = find_value(dai.value, toks, toks_mask)
        # if the value is to be abstracted, replace the value in the abstracted DAI
        # and save abstraction instruction (even if not found in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            # save the abstraction instruction
            absts.append(Abst(dai.slot, dai.value, surface_form=' '.join(toks[pos[0]:pos[1]]),
                              start=pos[0], end=pos[1]))

    if delex_slot_names:
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot),
                          reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in abst_slots:
                absts.append(Abst(dai.slot, None, surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be abstracted
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be abstracted on the output
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X' if not delex_slot_names else 'X-value']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
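
The final replacement loop above relies on a slice-replacement-with-shift trick: each multi-token value collapses into a single placeholder, so later spans must be shifted left by the number of tokens removed so far. A standalone illustration with an invented sentence and spans:

toks = 'the Blue Spice is near Cafe Rouge in the city centre'.split(' ')
spans = [('name', 1, 3), ('near', 5, 7)]  # (slot, start, end) token positions
shift = 0
for slot, start, end in spans:
    toks[start - shift:end - shift] = ['X-' + slot]
    shift += (end - start) - 1
print(' '.join(toks))  # -> the X-name is near X-near in the city centre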
Example #21
def read_bagel_data():
    with codecs.open('data/bagel-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/bagel-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse_cambridge_da(mr) for mr in fh.readlines()]
    return mrs, refs
Example #22
def parse_mr(mr_text):
    return DA.parse_diligent_da(mr_text).get_delexicalized(
        set(['name', 'near']))
Example #23
def delex_sent(da,
               sent,
               delex_slots,
               use_slot_names=True,
               delex_slot_names=False,
               repeated=False):
    """Delexicalize ("abstract") the given slots in the given sentence (replace them with X
    or X-slot_name).

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on whitespace, or list of tokens)
    @param delex_slots: a set of slots to be delexicalized, or a dict (with a set of values to \
        leave untouched for each slot)
    @param use_slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @param delex_slot_names: boolean -- also delexicalize slot names mentioned in the text?
    @param repeated: boolean -- delexicalize repeated occurrences of the same value?
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA, \
        and abstraction instructions
    """
    return_string = False
    if isinstance(sent, basestring):
        toks = sent.split(' ')
        return_string = True
    else:
        toks = sent
    if isinstance(delex_slots, set):  # convert sets to dicts
        delex_slots = {slot: set() for slot in delex_slots}
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the delexicalized DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value)
                      if dai.value is not None else 0,
                      reverse=True):
        # first, create the delexicalized (abstracted) DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # search for the 1st or all occurrences
        found = 0
        pos = (-1, -1)
        while found < 1 or (repeated and pos != (-1, -1)):
            pos = find_value(dai.value, toks, toks_mask)
            # if the value is to be delexicalize, replace the value in the delexicalized DAI
            # and save abstraction instruction (even if not found in the sentence)
            if (dai.slot in delex_slots
                    and dai.value not in delex_slots[dai.slot]
                    and dai.value != 'dont_care' and (found == 0 or pos !=
                                                      (-1, -1))):

                abst_da[-1].value = 'X-' + dai.slot
                # save the abstraction instruction
                absts.append(
                    Abst(dai.slot,
                         dai.value,
                         surface_form=' '.join(toks[pos[0]:pos[1]]),
                         start=pos[0],
                         end=pos[1]))
            found += 1

    if delex_slot_names:
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot),
                          reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in delex_slots:
                absts.append(
                    Abst(dai.slot,
                         None,
                         surface_form=' '.join(toks[pos[0]:pos[1]]),
                         start=pos[0],
                         end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be delexicalized
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be delexicalized on the output
        if (abst.slot not in delex_slots
                or abst.value in delex_slots[abst.slot]
                or abst.value == 'dont_care' or abst.start < 0):
            continue
        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end -
                 shift] = ['X' if not delex_slot_names else 'X-value']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
Example #24
def read_data(filename,
              target_cols,
              das_type='cambridge',
              delex_slots=set(),
              delex_slot_names=False,
              delex_das=False):
    """Read the input data from a TSV file."""

    refs_cache = {}

    def cached_preprocess_sent(da, sent):
        """we're caching since with generated data, we're likely to parse the same sentence many times."""
        if (da, sent) not in refs_cache:
            refs_cache[(da, sent)] = preprocess_sent(da, sent, delex_slots,
                                                     delex_slot_names)
        return list(refs_cache[(da, sent)])

    log_info("Reading %s..." % filename)
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    log_info("Loaded %d instances." % len(data))

    # force data type to string if the data set doesn't contain human references
    data['orig_ref'] = data['orig_ref'].apply(
        lambda x: '' if not isinstance(x, basestring) else x)
    log_info("Adapted refs data type.")

    if das_type == 'text':  # for MT output classification
        das = [[(tok, None)
                for tok in preprocess_sent(None, sent, False, False)]
               for sent in data['mr']]
    else:
        das = [DA.parse_cambridge_da(da) for da in data['mr']]
    log_info("Parsed DAs.")

    texts_ref = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                 for da, sent in zip(das, data['orig_ref'])]
    log_info("Preprocessed human refs.")
    texts_hyp = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                 for da, sent in zip(das, data['system_ref'])]
    log_info("Preprocessed system outputs.")

    # alternative reference with rating difference / use to compare
    if 'system_ref2' in data.columns:
        texts_hyp2 = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                      if isinstance(sent, basestring) else None
                      for da, sent in zip(das, data['system_ref2'])]
    else:
        texts_hyp2 = [None] * len(texts_hyp)
    log_info("Preprocessed 2nd system outputs.")

    # DA delexicalization must take place after text delexicalization
    if das_type != 'text' and delex_das:
        das = [da.get_delexicalized(delex_slots) for da in das]
    log_info("Delexicalized DAs.")

    # fake data indicator
    if 'is_real' in data.columns:
        real_indics = [0 if indic == 0 else 1 for indic in data['is_real']]
    else:
        real_indics = [1 for _ in xrange(len(data))]
    log_info("Retrieved is_real indications.")

    inputs = [(da, ref, hyp, hyp2, ri) for da, ref, hyp, hyp2, ri in zip(
        das, texts_ref, texts_hyp, texts_hyp2, real_indics)]
    log_info("Built inputs list.")

    targets = np.array(
        data[[target_cols] if not isinstance(target_cols, list) else target_cols],
        dtype=np.float)
    log_info("Built targets list.")

    return inputs, targets
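
A quick standalone check (toy data, not repository code) of the target-column handling at the end of read_data(): a single column name is wrapped in a list so that the DataFrame selection, and hence the resulting array, is always 2-D.

import numpy as np
import pandas as pd

data = pd.DataFrame({'quality': [5.0, 3.0], 'naturalness': [6.0, 4.0]})
for target_cols in ('quality', ['quality', 'naturalness']):
    targets = np.array(data[[target_cols] if not isinstance(target_cols, list)
                            else target_cols], dtype=float)
    print(targets.shape)  # -> (2, 1), then (2, 2)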
Example #25
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    turns = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)  # store the non-delexicalized version of the DA

        # delexicalize
        text, da, abst = delex_sent(da, conc, slots_to_abstract,
                                    args.slot_names)
        da.sort()  # delexicalization does not keep DAI order, need to sort again

        # store the DA
        text = fix_capitalization(text)
        conc = fix_capitalization(conc)

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for dialogue in data:
            if isinstance(dialogue, dict):
                for turn in dialogue['dial']:
                    da = DA.parse_cambridge_da(turn['S']['dact'])
                    if args.skip_hello and len(da) == 1 and da[0].da_type == 'hello':
                        continue  # skip hello() DAs
                    conc = postprocess_sent(turn['S']['ref'])
                    process_instance(da, conc)
                    turns += 1
            else:
                da = DA.parse_cambridge_da(dialogue[0])
                conc = postprocess_sent(dialogue[1])
                process_instance(da, conc)
                turns += 1

        print 'Processed', turns, 'turns.'
        print '%d different DAs.' % len(da_keys)
        print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) /
                                            float(len(das)))

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them up, as Wen does)
        total = float(sum(data_sizes))
        remain = turns
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(ceil(turns * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [turns]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        # create multiple lexicalized references for each instance by relexicalizing sentences
        # with the same DA from the same part
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:

            # group sentences with the same DA
            da_groups = {}
            for da, text, abst in zip(das[0:part_size], texts[0:part_size],
                                      absts[0:part_size]):
                da_groups[unicode(da)] = da_groups.get(unicode(da), [])
                da_groups[unicode(da)].append(
                    (text, filter_abst(abst, slots_to_abstract)))

            for da_str in da_groups.keys():
                seen = set()
                uniq = []
                for text, abst in da_groups[da_str]:
                    sig = text + "\n" + ' '.join(
                        [a.slot + str(a.start) for a in abst])
                    if sig not in seen:
                        seen.add(sig)
                        uniq.append((text, abst))
                da_groups[da_str] = uniq

            # relexicalize all abstract sentences for each DA
            relex = []
            for da, abst in zip(das[0:part_size], absts[0:part_size]):
                relex.append(
                    relexicalize(da_groups[unicode(da)],
                                 filter_abst(abst, slots_to_abstract)))

            with open(part_name + '-ref.txt', 'w') as fh:
                for relex_pars in relex:
                    fh.write("\n".join(relex_pars).encode('utf-8') + "\n\n")

        with open(part_name + '-das.txt', 'w') as fh:
            for da in das[0:part_size]:
                fh.write(unicode(da).encode('utf-8') + "\n")
            del das[0:part_size]

        with open(part_name + '-conc_das.txt', 'w') as fh:
            for conc_da in conc_das[0:part_size]:
                fh.write(unicode(conc_da).encode('utf-8') + "\n")
            del conc_das[0:part_size]

        with open(part_name + '-conc.txt', 'w') as fh:
            for conc in concs[0:part_size]:
                fh.write(conc.encode('utf-8') + "\n")
            del concs[0:part_size]

        with open(part_name + '-abst.txt', 'w') as fh:
            for abst in absts[0:part_size]:
                fh.write("\t".join([unicode(a)
                                    for a in abst]).encode('utf-8') + "\n")
            del absts[0:part_size]

        with open(part_name + '-text.txt', 'w') as fh:
            for text in texts[0:part_size]:
                fh.write(text.encode('utf-8') + "\n")
            del texts[0:part_size]
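
The split-size computation in convert() can be checked in isolation. The sketch below uses made-up numbers (an 8:1:1 split of 1000 turns): every part except the first is rounded up, and the first part absorbs the remainder, so the sizes always sum to the total.

from math import ceil

turns = 1000
data_sizes = [8, 1, 1]  # e.g. from a command-line argument "--split 8:1:1"
total = float(sum(data_sizes))
remain = turns
for part_no in range(len(data_sizes) - 1, 0, -1):
    part_size = int(ceil(turns * (data_sizes[part_no] / total)))
    data_sizes[part_no] = part_size
    remain -= part_size
data_sizes[0] = remain
print(data_sizes)  # -> [800, 100, 100]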
Example #26
def delex_sent(da, sent, delex_slots, use_slot_names=True, delex_slot_names=False, repeated=False):
    """Delexicalize ("abstract") the given slots in the given sentence (replace them with X
    or X-slot_name).

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on whitespace, or list of tokens)
    @param delex_slots: a set of slots to be delexicalized, or a dict (with a set of values to \
        leave untouched for each slot)
    @param use_slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @param delex_slot_names: boolean -- also delexicalize slot names mentioned in the text?
    @param repeated: boolean -- delexicalize repeated occurrences of the same value?
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA, \
        and abstraction instructions
    """
    return_string = False
    if isinstance(sent, basestring):
        toks = sent.split(' ')
        return_string = True
    else:
        toks = sent
    if isinstance(delex_slots, set):  # convert sets to dicts
        delex_slots = {slot: set() for slot in delex_slots}
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the delexicalized DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value) if dai.value is not None else 0,
                      reverse=True):
        # first, create the delexicalized (abstracted) DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # search for the 1st or all occurrences
        found = 0
        pos = (-1, -1)
        while found < 1 or (repeated and pos != (-1, -1)):
            pos = find_value(dai.value, toks, toks_mask)
            # if the value is to be delexicalize, replace the value in the delexicalized DAI
            # and save abstraction instruction (even if not found in the sentence)
            if (dai.slot in delex_slots and
                    dai.value not in delex_slots[dai.slot] and
                    dai.value != 'dont_care' and
                    (found == 0 or pos != (-1, -1))):

                abst_da[-1].value = 'X-' + dai.slot
                # save the abstraction instruction
                absts.append(Abst(dai.slot, dai.value, surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))
            found += 1

    if delex_slot_names:
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot),
                          reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in delex_slots:
                absts.append(Abst(dai.slot, None, surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be delexicalized
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be delexicalized on the output
        if (abst.slot not in delex_slots or abst.value in delex_slots[abst.slot]
                or abst.value == 'dont_care' or abst.start < 0):
            continue
        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X' if not delex_slot_names else 'X-value']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
Example #27
def create_fake_data(real_data, columns, score_type='nlg'):
    """Given some real data, create additional fake data, using human references and
    distorting them. Will start from scores provided, or default to best possible score.
    @param real_data: a real data set, as pd.DataFrame
    @param columns: list of columns for the fake data set
    @param score_type: switch between Likert scale 1-6 ('nlg'), HTER ('hter'), and ranking ('rank')
    @return: a fake data set, with the given columns, some of them empty
    """
    def target_score(src_score, distort_step):
        if score_type == 'hter':
            return src_score + distort_step
        elif score_type == 'rank':
            return 1.  # ignore scores for ranks
        return max(1, min(4., src_score - distort_step))

    normalize = False
    best_score = 6.
    num_steps = 4
    if score_type == 'hter':
        normalize = True
        best_score = 0.
        num_steps = 5
    elif score_type == 'rank':
        best_score = 1.

    fake_data = pd.DataFrame(index=np.arange(len(real_data) * (num_steps + 1)),
                             columns=columns)
    vocab = {}

    # add references as perfect data items
    for idx, row in enumerate(real_data.itertuples()):
        fake_data.loc[idx]['orig_ref'] = row.orig_ref
        fake_data.loc[idx]['system_ref'] = row.orig_ref
        fake_data.loc[idx]['mr'] = row.mr
        fake_data.loc[idx]['is_real'] = 0
        for quant in ['naturalness', 'quality', 'informativeness']:
            fake_data.loc[idx][quant] = (getattr(row, quant) if (
                hasattr(row, quant) and getattr(row, quant) is not None
                and not np.isnan(getattr(row, quant))) else best_score)

        for tok in tokenize(row.orig_ref).split(' '):
            vocab[tok] = vocab.get(tok, 0) + 1

    lexicalizer = Lexicalizer(cfg={'mode': 'tokens'})  # default lexicalizer
    vocab = build_vocab(vocab)

    for distort_step in xrange(1, num_steps + 1):
        for idx, row in enumerate(real_data.itertuples(),
                                  start=distort_step * len(real_data)):

            fake_data.loc[idx]['orig_ref'] = row.orig_ref
            fake_data.loc[idx]['mr'] = row.mr
            fake_data.loc[idx]['is_real'] = 0

            # delexicalize data
            da = DA.parse_cambridge_da(row.mr)
            sent, _, lex_instr = delex_sent(da,
                                            tokenize(row.orig_ref).split(' '),
                                            DELEX_SLOTS)
            ref_len = len(sent)
            # distort
            sent = distort_sent(sent, distort_step, vocab)
            # lexicalize again
            sent = lexicalizer._tree_to_sentence([(tok, None) for tok in sent],
                                                 lex_instr)
            fake_data.loc[idx]['system_ref'] = ' '.join(sent)

            for quant in ['naturalness', 'quality', 'informativeness']:
                score = (getattr(row, quant) if (
                    hasattr(row, quant) and getattr(row, quant) is not None
                    and not np.isnan(getattr(row, quant))) else best_score)
                score = target_score(score, distort_step)
                fake_data.loc[idx][quant] = (((score / ref_len) *
                                              100) if normalize else score)

    return fake_data
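
A standalone sketch (with assumed inputs) of the score distortion in create_fake_data(): under the default 'nlg' setting, each distortion step lowers the reference score, clipped into the 1-4 band used by target_score().

def target_score(src_score, distort_step, score_type='nlg'):
    if score_type == 'hter':
        return src_score + distort_step   # HTER grows with each distortion
    elif score_type == 'rank':
        return 1.                         # ranks ignore the score entirely
    return max(1, min(4., src_score - distort_step))

print([target_score(6., step) for step in range(1, 5)])  # -> [4.0, 4.0, 3.0, 2.0]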
Example #28
    def _init_training(self, das, trees, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (0.0-1.0)
        """
        # read input from files or take it directly from parameters
        if not isinstance(das, list):
            log_info('Reading DAs from ' + das + '...')
            das = read_das(das)
        if not isinstance(trees, list):
            log_info('Reading t-trees from ' + trees + '...')
            ttree_doc = read_ttrees(trees)
            if self.mode == 'tokens':
                tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
                trees = self._tokens_to_flat_trees(tokens)
            elif self.mode == 'tagged_lemmas':
                tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
                trees = self._tokens_to_flat_trees(tls, use_tags=True)
            else:
                trees = trees_from_doc(ttree_doc, self.language, self.selector)
        elif self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # ignore contexts, if they are contained in the DAs
        if isinstance(self.train_das[0], tuple):
            self.train_das = [da for (context, da) in self.train_das]
        # delexicalize if DAs are lexicalized and we don't want that
        if self.delex_slots:
            self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
            self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
        self.y = self.da_vect.fit_transform(self.y)
        log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))

        # initialize I/O shapes
        if not self.tree_embs:
            self.input_shape = list(self.X[0].shape)
        else:
            self.input_shape = self.tree_embs.get_embeddings_shape()
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
        # initialize the NN variables
        self.session.run(tf.global_variables_initializer())
Example #29
def abstract_sent(da, conc, abst_slots, slot_names):
    """Abstract the given slots in the given sentence (replace them with X).

    @param da: concrete DA
    @param conc: concrete sentence text
    @param abst_slots: a set of slots to be abstracted
    @param slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @return: a tuple of the abstracted text, abstracted DA, and abstraction instructions
    """
    toks = conc.split(' ')
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the abstracted DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value)
                      if dai.value is not None else 0,
                      reverse=True):
        # first, create the 'abstracted' DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        # try to find the value in the sentence (first exact, then fuzzy)
        # while masking tokens of previously found values
        val_toks = dai.value.split(' ')
        pos = find_substr(val_toks,
                          [t if m else '' for t, m in zip(toks, toks_mask)])
        if pos is None:
            pos = find_substr_approx(
                val_toks, [t if m else '' for t, m in zip(toks, toks_mask)])
        if pos is not None:
            for idx in xrange(pos[0], pos[1]):  # mask found things so they're not found twice
                toks_mask[idx] = False
        if pos is None or pos == (0, 0):  # default to -1 for unknown positions
            pos = -1, -1
        # if the value is to be abstracted, replace the value in the abstracted DAI
        # and save abstraction instruction (even if not found in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            # save the abstraction instruction
            absts.append(Abst(dai.slot, dai.value, start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be abstracted
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be abstracted on the output
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # replace the text
        if slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks), abst_da, absts
Example #30
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    turns = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)  # store the non-delexicalized version of the DA

        # delexicalize
        text, da, abst = delex_sent(da, conc, slots_to_abstract, args.slot_names)
        da.sort()  # delexicalization does not keep DAI order, need to sort again

        # store the DA
        text = fix_capitalization(text)
        conc = fix_capitalization(conc)

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for dialogue in data:
            if isinstance(dialogue, dict):
                for turn in dialogue['dial']:
                    da = DA.parse_cambridge_da(turn['S']['dact'])
                    if args.skip_hello and len(da) == 1 and da[0].da_type == 'hello':
                        continue  # skip hello() DAs
                    conc = postprocess_sent(turn['S']['ref'])
                    process_instance(da, conc)
                    turns += 1
            else:
                da = DA.parse_cambridge_da(dialogue[0])
                conc = postprocess_sent(dialogue[1])
                process_instance(da, conc)
                turns += 1

        print 'Processed', turns, 'turns.'
        print '%d different DAs.' % len(da_keys)
        print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das)))

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them up, as Wen does)
        total = float(sum(data_sizes))
        remain = turns
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(ceil(turns * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [turns]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        # create multiple lexicalized references for each instance by relexicalizing sentences
        # with the same DA from the same part
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:

            # group sentences with the same DA
            da_groups = {}
            for da, text, abst in zip(das[0:part_size], texts[0:part_size], absts[0:part_size]):
                da_groups[unicode(da)] = da_groups.get(unicode(da), [])
                da_groups[unicode(da)].append((text, filter_abst(abst, slots_to_abstract)))

            for da_str in da_groups.keys():
                seen = set()
                uniq = []
                for text, abst in da_groups[da_str]:
                    sig = text + "\n" + ' '.join([a.slot + str(a.start) for a in abst])
                    if sig not in seen:
                        seen.add(sig)
                        uniq.append((text, abst))
                da_groups[da_str] = uniq

            # relexicalize all abstract sentences for each DA
            relex = []
            for da, abst in zip(das[0:part_size], absts[0:part_size]):
                relex.append(relexicalize(da_groups[unicode(da)],
                                          filter_abst(abst, slots_to_abstract)))

            with open(part_name + '-ref.txt', 'w') as fh:
                for relex_pars in relex:
                    fh.write("\n".join(relex_pars).encode('utf-8') + "\n\n")

        with open(part_name + '-das.txt', 'w') as fh:
            for da in das[0:part_size]:
                fh.write(unicode(da).encode('utf-8') + "\n")
            del das[0:part_size]

        with open(part_name + '-conc_das.txt', 'w') as fh:
            for conc_da in conc_das[0:part_size]:
                fh.write(unicode(conc_da).encode('utf-8') + "\n")
            del conc_das[0:part_size]

        with open(part_name + '-conc.txt', 'w') as fh:
            for conc in concs[0:part_size]:
                fh.write(conc.encode('utf-8') + "\n")
            del concs[0:part_size]

        with open(part_name + '-abst.txt', 'w') as fh:
            for abst in absts[0:part_size]:
                fh.write("\t".join([unicode(a) for a in abst]).encode('utf-8') + "\n")
            del absts[0:part_size]

        with open(part_name + '-text.txt', 'w') as fh:
            for text in texts[0:part_size]:
                fh.write(text.encode('utf-8') + "\n")
            del texts[0:part_size]
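
As a side note, here is a minimal standalone sketch (toy data) of the signature-based deduplication in the multi-reference block above: two references count as duplicates only if both the text and the placeholder positions match.

# toy references: (delexicalized text, list of (slot, start) placeholder positions)
refs = [('X-name is cheap', [('name', 0)]),
        ('X-name is cheap', [('name', 0)]),      # exact duplicate -> dropped
        ('cheap food at X-name', [('name', 3)])]

seen, uniq = set(), []
for text, abst in refs:
    sig = text + "\n" + ' '.join(slot + str(start) for slot, start in abst)
    if sig not in seen:
        seen.add(sig)
        uniq.append((text, abst))

print(len(uniq))  # 2
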
Example #31
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    insts = 0

    # canonical slot order of the E2E NLG data (used to sort DAIs consistently)
    slot_order = ['name', 'eat_type', 'food', 'price_range',
                  'rating', 'area', 'family_friendly', 'near']

    def process_instance(conc_da, conc):
        # sort the DA using the same order as in the E2E NLG data
        conc_da.dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))
        conc_das.append(conc_da)

        text, da, abst = delex_sent(conc_da, tokenize(conc), slots_to_abstract,
                                    args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))

        da_keys[str(da)] = da_keys.get(str(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    data = pd.read_csv(args.in_file, sep=',', encoding='UTF-8')
    data['mr'] = data['mr'].fillna('')
    for inst in data.itertuples():
        da = DA.parse_diligent_da(inst.mr)
        process_instance(da, inst.ref)
        insts += 1
        if insts % 100 == 0:
            print('%d...' % insts, end='', flush=True, file=sys.stderr)

    print('Processed', insts, 'instances.', file=sys.stderr)
    print('%d different DAs.' % len(da_keys), file=sys.stderr)
    print('%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das))),
          file=sys.stderr)
    print('Max DA len: %d, max text len: %d' % (max([len(da) for da in das]),
                                                max([text.count(' ') + 1 for text in texts])),
          file=sys.stderr)

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()  # keep the original order (by 1st occurrence of DA)
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts):
            group = groups.get(str(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[str(conc_da)] = group

        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.values():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join(["\t".join([str(a) for a in absts_])
                                    for absts_ in group['abst']]) + "\n")
    else:
        # convert abstraction instructions to strings (to match the multi-ref output format)
        absts = ["\t".join([str(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(str(da) + "\n")

    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(str(conc_da) + "\n")

    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")

    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")

    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n")
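
For illustration, a minimal sketch (toy DA strings, hypothetical) of the multi-ref grouping used above: references that share the same lexicalized DA are collected under one key, in order of first occurrence.

from collections import OrderedDict

# toy (lexicalized DA, reference text) pairs, in corpus order
pairs = [('inform(name=Bar)', 'Bar is nice'),
         ('inform(name=Foo)', 'Foo is cheap'),
         ('inform(name=Bar)', 'Bar is a nice place')]

groups = OrderedDict()           # keeps the order of first occurrence
for da, text in pairs:
    groups.setdefault(da, []).append(text)

for da, texts in groups.items():
    print(da, '->', ' | '.join(texts))
# inform(name=Bar) -> Bar is nice | Bar is a nice place
# inform(name=Foo) -> Foo is cheap
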
Example #32
def read_e2e_data():
    with codecs.open('data/e2e-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/e2e-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse_diligent_da(mr) for mr in fh.readlines()]
    return mrs, refs
Example #33
def convert(args):
    src = pd.read_csv(args.src_file, index_col=None, encoding='utf-8')
    data = []
    src_col = args.column
    trg_col = COLUMN_MAP[src_col[:3]]
    unique_mrs = set()

    for _, src_inst in src.iterrows():
        mr = DA.parse_diligent_da(src_inst['mr']).to_cambridge_da_string()
        delex_mr = DA.parse_diligent_da(src_inst['mr']).get_delexicalized(
            set(['name', 'near'])).to_cambridge_da_string()
        unique_mrs.add(delex_mr)
        syss = [{
            'sys': src_inst['sys%d' % i],
            'ref': src_inst['ref%d' % i],
            'val': src_inst['%s%d' % (src_col, i)]
        } for i in xrange(1, 6)]

        for sys1, sys2 in itertools.combinations(syss, 2):
            if sys1['val'] < sys2['val']:  # without loss of generality
                sys1, sys2 = sys2, sys1
            if sys1['val'] == sys2['val']:  # ignore those that are equal
                continue
            trg_inst = {
                'dataset': 'E2E',
                'system': SYSTEMS_MAP[sys1['sys']],
                'system2': SYSTEMS_MAP[sys2['sys']],
                'orig_ref': None,
                'mr': mr,
                'delex_mr': delex_mr,
                'system_ref': sys1['ref'],
                'system_ref2': sys2['ref'],
                'is_real': 1,
                'informativeness': None,
                'naturalness': None,
                'quality': None
            }
            trg_inst[trg_col] = 1
            data.append(trg_inst)

    unique_mrs = sorted(list(unique_mrs))
    random.shuffle(unique_mrs)

    part_sizes = [int(p) for p in args.ratio.split(':')]
    part_sizes = [
        int(round(p * len(unique_mrs) / float(sum(part_sizes))))
        for p in part_sizes
    ]
    part_sizes[0] = len(unique_mrs) - sum(part_sizes[1:])
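    # e.g. --ratio 8:1:1 with 250 unique MRs (hypothetical numbers): the parts get
    # round(250 * 8/10) = 200, 25 and 25; the 1st part is then reset to 250 - 25 - 25 = 200
    # so that the sizes sum up exactly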
    part_labels = args.labels.split(':')
    part_start = 0
    log_info('Data sizes in MRs: %s' % ':'.join([str(p) for p in part_sizes]))

    # remove ambiguous instances (MR + system pairs that won against each other equally often)
    if args.unambiguous:
        occs = Counter([(inst['mr'], inst['system'], inst['system2'])
                        for inst in data])
        ambig = set()
        for mr, sys1, sys2 in occs.iterkeys():
            if occs.get((mr, sys2, sys1), 0) == occs[(mr, sys1, sys2)]:
                ambig.add((mr, sys1, sys2))

        uniq_data = []
        used_insts = set()
        for inst in data:
            mr, sys1, sys2 = inst['mr'], inst['system'], inst['system2']
            if (mr, sys1, sys2) in ambig or (mr, sys1, sys2) in used_insts:
                continue
            uniq_data.append(inst)
            used_insts.add((mr, sys1, sys2))
        data = uniq_data

    # mark down the configuration
    with codecs.open(os.path.join(args.out_path, 'config'),
                     'wb',
                     encoding='UTF-8') as fh:
        fh.write(pprint.pformat(vars(args), indent=4, width=100))

    # split the output
    for part_no, (part_size,
                  part_label) in enumerate(zip(part_sizes, part_labels)):
        part_mrs = set(unique_mrs[part_start:part_start + part_size])
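        # filtering by delexicalized MR keeps all pairs built from the same underlying MR
        # (regardless of name/near values) inside a single data part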
        part_data = [inst for inst in data if inst['delex_mr'] in part_mrs]

        if args.shuffle:
            random.shuffle(part_data)

        part_df = pd.DataFrame(part_data)

        if part_no == 0 and args.fake_data:
            # create fake data
            indiv_sys_outputs = get_sys_outputs(part_data)
            if args.fake_data_from:
                indiv_sys_outputs.extend(
                    read_system_training_data(args.fake_data_from))
            fake_insts = create_fake_data(
                pd.DataFrame.from_records(indiv_sys_outputs),
                part_df.columns,
                score_type='rank')
            fake_pairs = create_fake_pairs(fake_insts, len(indiv_sys_outputs))
            part_df = part_df.append(fake_pairs, sort=True)

        out_file = os.path.join(args.out_path, part_label + '.tsv')
        log_info('File: %s, total size %d' % (out_file, len(part_df)))
        part_df.to_csv(out_file,
                       columns=COLUMNS,
                       sep=b"\t",
                       index=False,
                       encoding='UTF-8')

        part_start += part_size
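
A small self-contained sketch (hypothetical system names and ranks) of the pairwise expansion used above: every pair of ranked outputs becomes one ordered instance with the better-rated system first, and ties are skipped.

import itertools

# hypothetical outputs for one MR, each with a shared quality rank
syss = [{'sys': 'A', 'val': 3}, {'sys': 'B', 'val': 1}, {'sys': 'C', 'val': 3}]

pairs = []
for sys1, sys2 in itertools.combinations(syss, 2):
    if sys1['val'] < sys2['val']:    # w.l.o.g.: better-rated system goes first
        sys1, sys2 = sys2, sys1
    if sys1['val'] == sys2['val']:   # equal ratings carry no preference -> skip
        continue
    pairs.append((sys1['sys'], sys2['sys']))

print(pairs)  # [('A', 'B'), ('C', 'B')]
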
Example #34
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    insts = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
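        # note: csv here is presumably the unicodecsv package -- the Python 2 stdlib
        # csv.reader does not accept an encoding argument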
        csvread = csv.reader(fh, encoding='UTF-8')
        csvread.next()  # skip header
        for mr, text in csvread:
            da = DA.parse_diligent_da(mr)
            process_instance(da, text)
            insts += 1

        print 'Processed', insts, 'instances.'
        print '%d different DAs.' % len(da_keys)
        print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das)))
        print 'Max DA len: %d, max text len: %d' % (max([len(da) for da in das]),
                                                    max([text.count(' ') + 1 for text in texts]))

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts):
            group = groups.get(unicode(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[unicode(conc_da)] = group

        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.itervalues():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join(["\t".join([unicode(a) for a in absts_])
                                    for absts_ in group['abst']]) + "\n")
    else:
        # convert abstraction instructions to strings (to match the multi-ref output format)
        absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(unicode(da) + "\n")

    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(unicode(conc_da) + "\n")

    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")

    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")

    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n")
Example #35
    def _init_training(self, das, trees, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (0.0-1.0)
        """
        # read input from files or take it directly from parameters
        if not isinstance(das, list):
            log_info('Reading DAs from ' + das + '...')
            das = read_das(das)
        if not isinstance(trees, list):
            log_info('Reading t-trees from ' + trees + '...')
            ttree_doc = read_ttrees(trees)
            if self.mode == 'tokens':
                tokens = tokens_from_doc(ttree_doc, self.language,
                                         self.selector)
                trees = self._tokens_to_flat_trees(tokens)
            elif self.mode == 'tagged_lemmas':
                tls = tagged_lemmas_from_doc(ttree_doc, self.language,
                                             self.selector)
                trees = self._tokens_to_flat_trees(tls, use_tags=True)
            else:
                trees = trees_from_doc(ttree_doc, self.language, self.selector)
        elif self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(
                trees, use_tags=self.mode == 'tagged_lemmas')

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # ignore contexts, if they are contained in the DAs
        if isinstance(self.train_das[0], tuple):
            self.train_das = [da for (context, da) in self.train_das]
        # delexicalize if DAs are lexicalized and we don't want that
        if self.delex_slots:
            self.train_das = [
                da.get_delexicalized(self.delex_slots) for da in self.train_das
            ]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network from keeping any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([
                self.tree_embs.get_embeddings(tree)
                for tree in self.train_trees
            ])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False,
                                            binarize_numeric=True)
            self.X = [
                self.tree_feats.get_features(tree, {})
                for tree in self.train_trees
            ]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [
            self.da_feats.get_features(None, {'da': da})
            for da in self.train_das
        ]
        self.y = self.da_vect.fit_transform(self.y)
        log_info('Number of binary classes: %d.' %
                 len(self.da_vect.get_feature_names()))

        # initialize I/O shapes
        if not self.tree_embs:
            self.input_shape = list(self.X[0].shape)
        else:
            self.input_shape = self.tree_embs.get_embeddings_shape()
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
        # initialize the NN variables
        self.session.run(tf.global_variables_initializer())
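
Features and DictVectorizer above appear to be the project's own classes (scikit-learn's DictVectorizer has no binarize_numeric argument). Purely as an illustration of the same idea, turning per-DA feature dicts into a fixed-width binary matrix, here is a rough sketch using scikit-learn's DictVectorizer.

# illustration only: scikit-learn's DictVectorizer standing in for the project's own class
from sklearn.feature_extraction import DictVectorizer

feats = [{'dat=inform': 1, 'svp=food=Italian': 1},
         {'dat=inform': 1, 'svp=area=riverside': 1}]

vect = DictVectorizer(sparse=False)
y = vect.fit_transform(feats)    # one binary column per distinct feature
print(y.shape)                   # (2, 3)
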
Example #36
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # initialize storage
    items = 0
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions
    contexts = []  # abstracted contexts
    conc_contexts = []  # lexicalized contexts

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for item in data:
            da = convert_abstr_da(DA.parse(item['response_da']))
            context = convert_abstractions(item['context_utt'])
            context_l = item['context_utt_l']
            conc_da = DA.parse(item['response_da_l'])
            concs_ = [tokenize(s) for s in item['response_nl_l']]
            absts_ = []
            texts_ = []
            for abst_text in item['response_nl']:
                text, abst = get_abstraction(
                    abst_text, conc_da, args.slot_names)  # convert *SLOT -> X
                absts_.append(abst)
                texts_.append(text)

            das.append(da)
            contexts.append(context)
            conc_contexts.append(context_l)
            concs.append(concs_)
            absts.append(absts_)
            texts.append(texts_)
            items += 1

        print 'Processed', items, 'items.'

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them)
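        # e.g. --split 3:1:1 with 998 items (hypothetical numbers): parts 2 and 1 each get
        # round(998 * 1/5) = 200, leaving 598 items for the 1st part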
        total = float(sum(data_sizes))
        remain = items
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(round(items * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [items]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        repeat_num = len(concs[0])
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
            repeat_num = 1

        # repeat DAs and contexts for synonymous paraphrases, except for test data in multi-ref mode
        write_part(part_name + '-das.txt', das, part_size, repeat_num)
        write_part(part_name + '-context.txt', contexts, part_size, repeat_num)
        write_part(part_name + '-conc_context.txt', conc_contexts, part_size,
                   repeat_num)

        # write everything else just once (each instance here is a list, which will be unrolled)
        write_part(part_name + '-conc.txt', concs, part_size)
        write_part(part_name + '-abst.txt', absts, part_size)
        write_part(part_name + '-text.txt', texts, part_size)
Example #37
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    insts = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da,
                                    tokenize(conc),
                                    slots_to_abstract,
                                    args.slot_names,
                                    repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        csvread = csv.reader(fh, encoding='UTF-8')
        csvread.next()  # skip header
        for mr, text, voice in csvread:
            da = DA.parse_diligent_da(mr, voice)
            process_instance(da, text)
            insts += 1

        print 'Processed', insts, 'instances.'
        print '%d different DAs.' % len(da_keys)
        print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) /
                                            float(len(das)))
        print 'Max DA len: %d, max text len: %d' % (max([len(da) for da in das]),
                                                    max([text.count(' ') + 1 for text in texts]))

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts,
                                                 absts):
            group = groups.get(unicode(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[unicode(conc_da)] = group

        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.itervalues():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join([
                "\t".join([unicode(a) for a in absts_])
                for absts_ in group['abst']
            ]) + "\n")
    else:
        # convert abstraction instructions to strings (to match the multi-ref output format)
        absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(unicode(da) + "\n")

    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(unicode(conc_da) + "\n")

    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")

    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")

    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n")
Example #38
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # initialize storage
    items = 0
    conc_das = [] # concrete DAs
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions
    contexts = []  # abstracted contexts
    conc_contexts = []  # lexicalized contexts

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for item in data:
            da = convert_abstr_da(DA.parse(item['response_da']))
            context = convert_abstractions(item['context_utt'])
            context_l = item['context_utt_l']
            conc_da = DA.parse(item['response_da_l'])
            concs_ = [tokenize(s) for s in item['response_nl_l']]
            absts_ = []
            texts_ = []
            for abst_text in item['response_nl']:
                text, abst = get_abstraction(abst_text, conc_da, args.slot_names)  # convert *SLOT -> X
                absts_.append(abst)
                texts_.append(text)

            das.append(da)
            conc_das.append(conc_da)
            contexts.append(context)
            conc_contexts.append(context_l)
            concs.append(concs_)
            absts.append(absts_)
            texts.append(texts_)
            items += 1

        print 'Processed', items, 'items.'

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them)
        total = float(sum(data_sizes))
        remain = items
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(round(items * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [items]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        repeat_num = len(concs[0])
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
            repeat_num = 1

        # repeat DAs and contexts for synonymous paraphrases, except for test data in multi-ref mode
        write_part(part_name + '-das.txt', das, part_size, repeat_num)
        write_part(part_name + '-conc_das.txt', conc_das, part_size, repeat_num)
        write_part(part_name + '-context.txt', contexts, part_size, repeat_num)
        write_part(part_name + '-conc_context.txt', conc_contexts, part_size, repeat_num)

        # write everything else just once (each instance here is a list, which will be unrolled)
        write_part(part_name + '-ref.txt', concs, part_size, trunc=False, separate=True)
        write_part(part_name + '-conc.txt', concs, part_size)
        write_part(part_name + '-abst.txt', absts, part_size)
        write_part(part_name + '-text.txt', texts, part_size)