Example 1
    def process_dataset(self, input_data):
        """Load DAs & sentences, obtain abstraction instructions, and store it all in member
        variables (to be used later by writing methods).
        @param input_data: path to the input JSON file with the data
        """
        # load data from JSON
        self._das = []
        self._texts = []
        with codecs.open(input_data, 'r', encoding='UTF-8') as fh:
            data = json.load(fh)
            for inst in data:
                da = DA.parse_cambridge_da(inst['da'])
                da.sort()
                self._das.append(da)
                self._texts.append(self.analyze(inst['text']))

        # delexicalize DAs and sentences
        self._create_delex_texts()
        self._create_delex_das()

        # return the result
        out = []
        for da, text, delex_da, delex_text, abst in zip(self._das, self._texts, self._delex_das, self._delex_texts, self._absts):
            out.append(Inst(da, text, delex_da, delex_text, abst))
        return out
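
For context, a minimal usage sketch of this method; the reader class name `DataReader` and the input path are assumptions, and `Inst` is assumed to expose its constructor arguments (da, text, delex_da, delex_text, abst) as attributes:

reader = DataReader()  # hypothetical class that defines process_dataset() and analyze()
for inst in reader.process_dataset('data/train.json'):
    print(inst.da)          # original, lexicalized DA
    print(inst.delex_text)  # delexicalized analysis of the corresponding sentence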
Example 2
def main(in_file, out_mrs, out_refs):

    abst_das = []
    conc_das = []
    conc_da_texts = []
    abst_texts = []
    with codecs.open(in_file, 'r', 'UTF-8') as fh:
        for line in fh:
            line = line.strip()

            if line.startswith('FULL_DA'):
                line = re.sub('^FULL_DA = ', '', line)
                conc_das.append(DA.parse_cambridge_da(line))
                conc_da_texts.append(line)
            elif line.startswith('ABSTRACT_DA'):
                line = re.sub('^ABSTRACT_DA = ', '', line)
                abst_das.append(DA.parse_cambridge_da(line))
            elif line.startswith('->'):
                line = re.sub('^-> "', '', line)
                line = re.sub('";\s*$', '', line)
                line = re.sub(r'\[([a-z]+)\+X\]X', r'X-\1', line)
                line = re.sub(r'\[[^\]]*\]', '', line)
                abst_texts.append(line)

    conc_texts = []
    for abst_da, conc_da, abst_text in zip(abst_das, conc_das, abst_texts):
        text = abst_text
        for abst_dai, conc_dai in zip(abst_da.dais, conc_da.dais):
            assert abst_dai.slot == conc_dai.slot
            if abst_dai.value.startswith('X'):
                text = text.replace('X-' + abst_dai.slot, conc_dai.value, 1)
        text = re.sub(r'the The', 'The', text)
        conc_texts.append(text)

    with codecs.open(out_mrs, 'w', 'UTF-8') as fh:
        fh.write("\n".join(conc_da_texts))

    with codecs.open(out_refs, 'w', 'UTF-8') as fh:
        fh.write("\n".join(conc_texts))
Example 3
def interactive_input(das_type='cambridge',
                      delex_slots=set(),
                      delex_slot_names=False,
                      delex_das=False,
                      input_da=True,
                      input_ref=False):

    da = None
    if input_da:
        da = raw_input('Enter DA             : ').decode('utf-8').strip()
        if not da:
            return None
        if das_type == 'text':
            da = [(tok, None)
                  for tok in preprocess_sent(None, da, False, False)]
        else:
            da = DA.parse_cambridge_da(da)
            if delex_das:
                da = da.get_delexicalized(delex_slots)
    ref = None
    if input_ref:
        ref = raw_input('Enter reference      : ').decode('utf-8').strip()
        if not ref:
            return None
        ref = [
            (tok, None)
            for tok in preprocess_sent(da, ref, delex_slots, delex_slot_names)
        ]

    hyp = raw_input('Enter system output 1: ').decode('utf-8').strip()
    if not hyp:
        return None
    hyp = [(tok, None)
           for tok in preprocess_sent(da, hyp, delex_slots, delex_slot_names)]

    hyp2 = raw_input('Enter system output 2: ').decode('utf-8').strip()
    if not hyp2:
        hyp2 = []
    else:
        hyp2 = [
            (tok, None)
            for tok in preprocess_sent(da, hyp2, delex_slots, delex_slot_names)
        ]

    return (da, ref, hyp, hyp2)
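
A possible invocation for a pairwise comparison setting; the slot set is purely illustrative:

# Prompts for a DA, a reference and two system outputs; returns None if the DA,
# the reference or the first system output is left empty.
item = interactive_input(delex_slots={'name', 'area'}, input_ref=True)
if item is not None:
    da, ref, hyp, hyp2 = item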
Example 4
def read_outputs(filename):
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    if isinstance(data.iloc[len(data) - 1]['mr'], float):
        # XXX workaround to a strange bug that sometimes happens -- not sure how to get rid of it,
        # probably an error in Pandas
        print(
            '!!!Strangely need to remove an empty instance from the end of %s'
            % filename)
        data = data[:-1]
    das = [DA.parse_cambridge_da(da) for da in data['mr']]

    # force string data type for empty human references
    data['orig_ref'] = data['orig_ref'].apply(
        lambda x: '' if not isinstance(x, basestring) else x)
    texts_ref = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['orig_ref']]
    texts_hyp = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['system_output']]
    if 'system_output2' not in data:
        data['system_output2'] = [None] * len(data)
    texts_hyp2 = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                  if isinstance(sent, basestring) else None
                  for sent in data['system_output2']]
    inputs = [(da, text_ref, text_hyp, text_hyp2)
              for da, text_ref, text_hyp, text_hyp2 in zip(
                  das, texts_ref, texts_hyp, texts_hyp2)]

    # find out which columns were used for ratings
    target_cols = [
        c[:-len('_system_rating')] for c in data.columns
        if c.endswith('_system_rating')
    ]
    assert target_cols
    # compile data from all these columns
    outputs = {}
    for target_col in target_cols:
        outputs[target_col] = {
            subcol: list(data[target_col + '_' + subcol])
            for subcol in [
                'human_rating_raw', 'human_rating', 'system_rating_raw',
                'system_rating', 'rank_loss', 'rank_ok'
            ]
        }
    return (inputs, outputs)
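
A possible way to consume the return value; the file name is an assumption, and the available target keys depend on which `*_system_rating` columns the TSV actually contains:

inputs, outputs = read_outputs('ratings.tsv')
da, text_ref, text_hyp, text_hyp2 = inputs[0]
for target, cols in outputs.items():
    # each target maps to parallel lists such as 'human_rating' and 'system_rating'
    print('%s: %d ratings' % (target, len(cols['human_rating'])))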
Example 5
def create_fake_data(real_data, columns, score_type='nlg'):
    """Given some real data, create additional fake data, using human references and
    distorting them. Will start from scores provided, or default to best possible score.
    @param real_data: a real data set, as pd.DataFrame
    @param columns: list of columns for the fake data set
    @param score_type: switch between Likert scale 1-6 ('nlg') and HTER ('hter')
    @return: a fake data set, with the given columns, some of them empty
    """
    def target_score(src_score, distort_step):
        if score_type == 'hter':
            return src_score + distort_step
        elif score_type == 'rank':
            return 1.  # ignore scores for ranks
        return max(1, min(4., src_score - distort_step))

    normalize = False
    best_score = 6.
    num_steps = 4
    if score_type == 'hter':
        normalize = True
        best_score = 0.
        num_steps = 5
    elif score_type == 'rank':
        best_score = 1.

    fake_data = pd.DataFrame(index=np.arange(len(real_data) * (num_steps + 1)),
                             columns=columns)
    vocab = {}

    # add references as perfect data items
    for idx, row in enumerate(real_data.itertuples()):
        fake_data.loc[idx, 'orig_ref'] = row.orig_ref
        fake_data.loc[idx, 'system_ref'] = row.orig_ref
        fake_data.loc[idx, 'mr'] = row.mr
        fake_data.loc[idx, 'is_real'] = 0
        for quant in ['naturalness', 'quality', 'informativeness']:
            fake_data.loc[idx, quant] = (getattr(row, quant) if (
                hasattr(row, quant) and getattr(row, quant) is not None
                and not np.isnan(getattr(row, quant))) else best_score)

        for tok in tokenize(row.orig_ref).split(' '):
            vocab[tok] = vocab.get(tok, 0) + 1

    lexicalizer = Lexicalizer(cfg={'mode': 'tokens'})  # default lexicalizer
    vocab = build_vocab(vocab)

    for distort_step in xrange(1, num_steps + 1):
        for idx, row in enumerate(real_data.itertuples(),
                                  start=distort_step * len(real_data)):

            fake_data.loc[idx, 'orig_ref'] = row.orig_ref
            fake_data.loc[idx, 'mr'] = row.mr
            fake_data.loc[idx, 'is_real'] = 0

            # delexicalize data
            da = DA.parse_cambridge_da(row.mr)
            sent, _, lex_instr = delex_sent(da,
                                            tokenize(row.orig_ref).split(' '),
                                            DELEX_SLOTS)
            ref_len = len(sent)
            # distort
            sent = distort_sent(sent, distort_step, vocab)
            # lexicalize again
            sent = lexicalizer._tree_to_sentence([(tok, None) for tok in sent],
                                                 lex_instr)
            fake_data.loc[idx, 'system_ref'] = ' '.join(sent)

            for quant in ['naturalness', 'quality', 'informativeness']:
                score = (getattr(row, quant) if (
                    hasattr(row, quant) and getattr(row, quant) is not None
                    and not np.isnan(getattr(row, quant))) else best_score)
                score = target_score(score, distort_step)
                fake_data.loc[idx, quant] = (score / ref_len) * 100 if normalize else score

    return fake_data
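
For clarity, the score-distortion rule implemented by the nested `target_score()` can be restated as a standalone function (a sketch for illustration only, with the mode passed explicitly):

def target_score_standalone(src_score, distort_step, score_type='nlg'):
    """Illustrative restatement of the nested target_score() above."""
    if score_type == 'hter':
        return src_score + distort_step        # HTER grows with each distortion step
    elif score_type == 'rank':
        return 1.                              # ranking mode ignores the source score
    return max(1, min(4., src_score - distort_step))  # Likert score drops, clipped to [1, 4]

# e.g. target_score_standalone(6., 1) == 4.0, target_score_standalone(6., 3) == 3.0,
#      target_score_standalone(0., 2, 'hter') == 2.0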
Example 6
def read_bagel_data():
    with codecs.open('data/bagel-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/bagel-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse_cambridge_da(mr) for mr in fh.readlines()]
    return mrs, refs
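
A minimal usage sketch, assuming the two data files referenced above are present:

mrs, refs = read_bagel_data()
assert len(mrs) == len(refs)  # parallel lists: one parsed DA per tagged reference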
Example 7
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    turns = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)  # store the non-delexicalized version of the DA

        # delexicalize
        text, da, abst = delex_sent(da, conc, slots_to_abstract, args.slot_names)
        da.sort()  # delexicalization does not keep DAI order, need to sort again

        # store the DA
        text = fix_capitalization(text)
        conc = fix_capitalization(conc)

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for dialogue in data:
            if isinstance(dialogue, dict):
                for turn in dialogue['dial']:
                    da = DA.parse_cambridge_da(turn['S']['dact'])
                    if args.skip_hello and len(da) == 1 and da[0].da_type == 'hello':
                        continue  # skip hello() DAs
                    conc = postprocess_sent(turn['S']['ref'])
                    process_instance(da, conc)
                    turns += 1
            else:
                da = DA.parse_cambridge_da(dialogue[0])
                conc = postprocess_sent(dialogue[1])
                process_instance(da, conc)
                turns += 1

        print 'Processed', turns, 'turns.'
        print '%d different DAs.' % len(da_keys)
        print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das)))

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them up, as Wen does)
        total = float(sum(data_sizes))
        remain = turns
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(ceil(turns * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [turns]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        # create multiple lexicalized references for each instance by relexicalizing sentences
        # with the same DA from the same part
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:

            # group sentences with the same DA
            da_groups = {}
            for da, text, abst in zip(das[0:part_size], texts[0:part_size], absts[0:part_size]):
                da_groups[unicode(da)] = da_groups.get(unicode(da), [])
                da_groups[unicode(da)].append((text, filter_abst(abst, slots_to_abstract)))

            for da_str in da_groups.keys():
                seen = set()
                uniq = []
                for text, abst in da_groups[da_str]:
                    sig = text + "\n" + ' '.join([a.slot + str(a.start) for a in abst])
                    if sig not in seen:
                        seen.add(sig)
                        uniq.append((text, abst))
                da_groups[da_str] = uniq

            # relexicalize all abstract sentences for each DA
            relex = []
            for da, abst in zip(das[0:part_size], absts[0:part_size]):
                relex.append(relexicalize(da_groups[unicode(da)],
                                          filter_abst(abst, slots_to_abstract)))

            with open(part_name + '-ref.txt', 'w') as fh:
                for relex_pars in relex:
                    fh.write("\n".join(relex_pars).encode('utf-8') + "\n\n")

        with open(part_name + '-das.txt', 'w') as fh:
            for da in das[0:part_size]:
                fh.write(unicode(da).encode('utf-8') + "\n")
            del das[0:part_size]

        with open(part_name + '-conc_das.txt', 'w') as fh:
            for conc_da in conc_das[0:part_size]:
                fh.write(unicode(conc_da).encode('utf-8') + "\n")
            del conc_das[0:part_size]

        with open(part_name + '-conc.txt', 'w') as fh:
            for conc in concs[0:part_size]:
                fh.write(conc.encode('utf-8') + "\n")
            del concs[0:part_size]

        with open(part_name + '-abst.txt', 'w') as fh:
            for abst in absts[0:part_size]:
                fh.write("\t".join([unicode(a) for a in abst]).encode('utf-8') + "\n")
            del absts[0:part_size]

        with open(part_name + '-text.txt', 'w') as fh:
            for text in texts[0:part_size]:
                fh.write(text.encode('utf-8') + "\n")
            del texts[0:part_size]
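
A hypothetical way to drive `convert()`; the attribute names match what the function reads, but the values, file names and the use of `argparse.Namespace` instead of the script's own argument parser are assumptions:

from argparse import Namespace

# Illustrative values only: slots, split ratios and file names are made up.
args = Namespace(abstract='name,area,food', slot_names=False, skip_hello=True,
                 multi_ref=True, split='3:1:1', in_file='input.json',
                 out_name='train,devel,test')
convert(args)  # writes the *-das.txt, *-conc_das.txt, *-conc.txt, *-abst.txt and *-text.txt files per part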
Example 8
def read_data(filename,
              target_cols,
              das_type='cambridge',
              delex_slots=set(),
              delex_slot_names=False,
              delex_das=False):
    """Read the input data from a TSV file."""

    refs_cache = {}

    def cached_preprocess_sent(da, sent):
        """we're caching since with generated data, we're likely to parse the same sentence many times."""
        if (da, sent) not in refs_cache:
            refs_cache[(da, sent)] = preprocess_sent(da, sent, delex_slots,
                                                     delex_slot_names)
        return list(refs_cache[(da, sent)])

    log_info("Reading %s..." % filename)
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    log_info("Loaded %d instances." % len(data))

    # force data type to string if the data set doesn't contain human references
    data['orig_ref'] = data['orig_ref'].apply(
        lambda x: '' if not isinstance(x, basestring) else x)
    log_info("Adapted refs data type.")

    if das_type == 'text':  # for MT output classification
        das = [[(tok, None)
                for tok in preprocess_sent(None, sent, False, False)]
               for sent in data['mr']]
    else:
        das = [DA.parse_cambridge_da(da) for da in data['mr']]
    log_info("Parsed DAs.")

    texts_ref = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                 for da, sent in zip(das, data['orig_ref'])]
    log_info("Preprocessed human refs.")
    texts_hyp = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                 for da, sent in zip(das, data['system_ref'])]
    log_info("Preprocessed system outputs.")

    # alternative reference with rating difference / use to compare
    if 'system_ref2' in data.columns:
        texts_hyp2 = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                      if isinstance(sent, basestring) else None
                      for da, sent in zip(das, data['system_ref2'])]
    else:
        texts_hyp2 = [None] * len(texts_hyp)
    log_info("Preprocessed 2nd system outputs.")

    # DA delexicalization must take place after text delexicalization
    if das_type != 'text' and delex_das:
        das = [da.get_delexicalized(delex_slots) for da in das]
    log_info("Delexicalized DAs.")

    # fake data indicator
    if 'is_real' in data.columns:
        real_indics = [0 if indic == 0 else 1 for indic in data['is_real']]
    else:
        real_indics = [1 for _ in xrange(len(data))]
    log_info("Retrieved is_real indications.")

    inputs = [(da, ref, hyp, hyp2, ri) for da, ref, hyp, hyp2, ri in zip(
        das, texts_ref, texts_hyp, texts_hyp2, real_indics)]
    log_info("Built inputs list.")

    targets = np.array(
        data[[target_cols] if not isinstance(target_cols, list) else target_cols],
        dtype=np.float)
    log_info("Built targets list.")

    return inputs, targets
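
A sketch of how this reader might be called; the file name, target columns and slot set are illustrative and must match what the TSV actually contains:

inputs, targets = read_data('train_ratings.tsv', ['naturalness', 'quality'],
                            delex_slots={'name', 'area'}, delex_das=True)
da, ref, hyp, hyp2, is_real = inputs[0]
print(targets.shape)  # (number of instances, number of target columns)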