def process_permutation(permutation):
    """Materialize one synonym permutation of the enclosing `sentence`.

    For each synonym part the corresponding value from `permutation` is
    substituted (empty values, produced by optional synonyms, drop the part),
    then leftover whitespace around removed parts is trimmed.

    NOTE(review): relies on free variables `sentence`, `fp` and `is_synonym`
    from an enclosing scope — this looks like an extracted closure.
    """
    idx = 0

    def reduce_sentence(result, cur):
        # Non-synonym parts are kept as-is (cloned so the template is intact).
        if not is_synonym(cur):
            return fp.append(fp.clone(cur))(result)

        nonlocal idx
        value = permutation[idx]
        idx += 1

        # Check if it's not an empty value (optional synonyms yield '')
        if value:
            return fp.append({
                'type': 'text',
                'value': value,
            })(result)

        return result

    parts = fp.reduce(reduce_sentence)(sentence)

    part_idx = 0

    def reduce_whitespaces_in_part(result, part):
        cur = fp.clone(part)
        nonlocal part_idx

        # First element: trim leading whitespace
        if part_idx == 0:
            cur['value'] = cur['value'].lstrip()

        # Last element or the following one starts with a space.
        # `startswith` instead of indexing [0]: indexing raises IndexError
        # when the next part holds an empty string value.
        if part_idx == (len(parts) - 1) \
                or parts[part_idx + 1]['value'].startswith(' '):
            cur['value'] = cur['value'].rstrip()

        part_idx += 1

        # Drop parts that became empty after trimming
        if not cur['value']:
            return result

        return fp.append(cur)(result)

    # Remove unneeded whitespaces introduced by optional synonyms
    return fp.reduce(reduce_whitespaces_in_part)(parts)
def build_sentence(sentence):
    """Render one training sentence into rasa's example format.

    Concatenates text parts and sampled entity values into a single string
    while recording, for every entity occurrence, its character span and
    canonical (synonym-resolved) value.
    """
    entities = []

    def reduce_sentence(acc, part):
        # Plain text: just concatenate.
        if not utils.is_entity(part):
            return acc + part.get('value')

        raw_name = part.get('value')
        resolved_name = get_real_entity(raw_name)
        sampled = augment.get_entity(resolved_name).next(part.get('variant'))

        entities.append({
            'start': len(acc),
            'end': len(acc) + len(sampled),
            'entity': raw_name,
            # Check if its a synonym here
            'value': synonyms_lookup.get(sampled, sampled),
        })

        return acc + sampled

    return {
        'intent': name,
        'text': fp.reduce(reduce_sentence, '')(sentence),
        'entities': entities,
    }
def build_entity_synonyms(acc, _, name):
    """Collect synonym entries for every value of the entity `name` into `acc`."""

    def reduce_entity(result, value):
        found = augment.get_synonyms(value)

        # Values without synonyms contribute nothing.
        if found:
            result = fp.append({
                'value': value,
                'synonyms': found,
            })(result)

        return result

    entries = fp.reduce(reduce_entity)(augment.get_entity(name).all())

    return fp.append(*entries)(acc)
def snips(chatl, **options):
    """Transform a chatl dataset to a snips representation as per
    https://snips-nlu.readthedocs.io/en/0.19.1/dataset.html

    Extra keyword `options` are merged into the resulting dict (so callers
    can override e.g. the language).
    """
    augment = Augment(chatl)

    def get_entity_type(entity):
        """Resolve the snips type of an entity, prefixing built-in ones."""
        props = entity.get('props', {})
        ent_type = props.get('type') or props.get('snips:type')

        # If the type is not present in the dataset, let's consider it's a
        # built-in one.
        if ent_type and not augment.entities.get(ent_type):
            return SNIPS_PREFIX + ent_type if SNIPS_PREFIX not in ent_type else ent_type

        return ent_type

    def build_entity(acc, entity, name):
        """Accumulate the snips definition of entity `name` into `acc`."""
        ent_type = get_entity_type(entity)

        if ent_type:
            # Built-in snips entities carry an empty definition.
            if SNIPS_PREFIX in ent_type:
                return fp.append({
                    ent_type: {},
                })(acc)

            # It has a type present in the dataset, it should be considered
            # as a slot
            return acc

        use_synonyms = False

        def build_entity_value(ent_name):
            nonlocal use_synonyms
            synonyms = augment.get_synonyms(ent_name)
            # Flip the flag as soon as any value declares synonyms.
            use_synonyms = use_synonyms or len(synonyms) > 0
            return {
                'value': ent_name,
                'synonyms': synonyms,
            }

        values = fp.map(build_entity_value)(augment.get_entity(name).all())

        # Use .get with a default: `props` may be absent (get_entity_type
        # already guards against that), so avoid a KeyError here.
        props = entity.get('props', {})

        return fp.append({
            name: {
                'data': values,
                'automatically_extensible': props.get('extensible', 'true') == 'true',
                'matching_strictness': float(props.get('strictness', '1')),
                'use_synonyms': use_synonyms,
            },
        })(acc)

    def build_sentence_part(part):
        """Convert one sentence part to snips utterance data."""
        part_value = part.get('value')

        if not utils.is_entity(part):
            return {'text': part_value}

        entity = augment.entities.get(part_value)

        # Retrieve the inner type of the entity if defined in the dataset
        ent_type = get_entity_type(entity) or part_value

        # And check if it references another defined entity because if it's
        # true, values will be fetched from here
        referenced_entity = ent_type if augment.entities.get(ent_type) else part_value

        return {
            'entity': ent_type,
            'slot_name': part_value,
            'text': augment.get_entity(referenced_entity).next(part.get('variant')),
        }

    def build_intents(intent):
        return {
            'utterances': fp.map(lambda sentence: {
                'data': fp.map(build_sentence_part)(sentence),
            })(intent.get('data', [])),
        }

    return utils.merge(
        {
            'language': 'en',
            'intents': fp.map(build_intents)(augment.get_intents()),
            'entities': fp.reduce(build_entity)(augment.entities),
        }, options)
def test_it_should_correctly_transform_data(self):
    """Table-driven test generator for the `fp` functional helpers.

    Each entry describes one helper: 'given' is the call under test,
    'with' the input, 'expected' the exact expected output. Cases are
    yielded for a nose-style test runner.
    """
    tests = [
        {
            'it': 'should provide a function which always returns the given value',
            'given': lambda d: fp.always(5)(d),
            'with': 1,
            'expected': 5,
        },
        {
            'it': 'should provide a function to extract a prop from an object',
            'given': lambda o: fp.prop('value')(o),
            'with': {'some': 'thing', 'value': 'five'},
            'expected': 'five',
        },
        {
            'it': 'should provide a function to instantiate a class',
            'given': lambda d: fp.instantiate(SayHello)(d),
            'with': 'jean',
            'expected': SayHello('jean'),
        },
        {
            'it': 'should instantiate a class with additional parameters if any',
            'given': lambda d: fp.instantiate(SayHello, 'other value')(d),
            'with': 'jean',
            'expected': SayHello('jean', 'other value'),
        },
        {
            'it': 'should provide a function to map on an array',
            'given': lambda d: fp.map(lambda s: s.upper())(d),
            'with': ['one', 'two', 'three'],
            'expected': ['ONE', 'TWO', 'THREE'],
        },
        {
            # fp.map on a dict maps over values, preserving keys
            'it': 'should map on object values if given an object',
            'given': lambda d: fp.map(lambda s: s.upper())(d),
            'with': {'a': 'one', 'b': 'two', 'c': 'three'},
            'expected': {'a': 'ONE', 'b': 'TWO', 'c': 'THREE'},
        },
        {
            'it': 'should provide a function to reduce an array',
            'given': lambda d: fp.reduce(lambda p, c: (p.append(c) or p) if c > 5 else p)(d),
            'with': [1, 2, 3, 4, 5, 6, 7],
            'expected': [6, 7],
        },
        {
            # dict reducer receives (accumulator, value, key)
            'it': 'should reduce an object too',
            'given': lambda o: fp.reduce(lambda p, c, key: (p.update({
                key: c,
            }) or p) if c > 5 else p)(o),
            'with': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7},
            'expected': {'f': 6, 'g': 7},
        },
        {
            'it': 'should reduce with a given accumulator',
            'given': lambda d: fp.reduce(lambda p, c: p.update({c: c}) or p, {})(d),
            'with': [1, 2, 3],
            'expected': {1: 1, 2: 2, 3: 3},
        },
        {
            'it': 'should provide a way to pipe functions',
            'given': lambda d: fp.pipe(fp.always('test'), lambda s: s.upper())(d),
            'with': 'something',
            'expected': 'TEST',
        },
        {
            'it': 'should provide a way to flatten an array',
            'given': lambda d: fp.flatten(d),
            'with': [[1, 2], [3, 4]],
            'expected': [1, 2, 3, 4],
        },
        {
            'it': 'should flatten an object too',
            'given': lambda d: fp.flatten(d),
            'with': {'a': [1, 2], 'b': [3, 4]},
            'expected': [1, 2, 3, 4],
        },
        {
            'it': 'should append to an array',
            'given': lambda d: fp.append(3, 4)(d),
            'with': [1, 2],
            'expected': [1, 2, 3, 4],
        },
        {
            # appending dicts to a dict merges their keys
            'it': 'should append to an object',
            'given': lambda d: fp.append({'some': 'value'}, {'else': 'too'})(d),
            'with': {'an': 'object'},
            'expected': {'an': 'object', 'some': 'value', 'else': 'too'},
        },
        {
            'it': 'should filter array elements',
            'given': lambda d: fp.filter(lambda a: (a % 2) == 0)(d),
            'with': [1, 2, 3, 4],
            'expected': [2, 4],
        },
        {
            'it': 'should clone an object',
            'given': lambda o: fp.clone(o),
            'with': {'an': 'object', 'with': {'nested': 'prop'}},
            'expected': {'an': 'object', 'with': {'nested': 'prop'}},
        },
    ]

    # Yield one case per table entry for the generator-style test runner.
    for test in tests:
        yield self.it_should_correctly_transform_data, \
            test['it'], test['given'], test['with'], test['expected']
def rasa(chatl, **options):
    """Convert a chatl dataset to a rasa representation as per
    https://rasa.com/docs/rasa/1.1.4/nlu/training-data-format/
    """
    augment = Augment(chatl, True)

    # Follow an entity's `type` prop to the entity it references, when that
    # referenced entity exists in the dataset; otherwise keep the name as-is.
    def get_real_entity(name):
        entity_type = augment.entities.get(name, {}).get('props', {}).get('type')

        if entity_type and entity_type in augment.entities:
            return entity_type

        return name

    # Fetch the `regex` prop of an entity, or None when absent.
    def get_regex_prop(name):
        return augment.entities.get(name, {}).get('props', {}).get('regex')

    # For rasa, we need a map of synonyms -> value
    synonyms_lookup = fp.reduce(lambda acc, synonyms, value: fp.append(*fp.map(
        lambda s: {s: value})(synonyms))(acc))(augment.synonyms_values)

    # Build one lookup table per non-regex entity; elements come from the
    # referenced (real) entity, the name stays the declared one.
    def build_lookup_table(acc, _, name):
        entity_name = get_real_entity(name)

        # Entity has regex feature, returns now
        if get_regex_prop(entity_name):
            return acc

        return fp.append({
            'name': name,
            'elements': augment.get_entity(entity_name).all(),
        })(acc)

    # Turn every sentence of an intent into a rasa common example with
    # character-offset entity annotations.
    def build_intent_examples(acc, intent, name):
        def build_sentence(sentence):
            entities = []

            def reduce_sentence(result, cur):
                # Plain text parts are concatenated directly.
                if not utils.is_entity(cur):
                    return result + cur.get('value')

                entity_name = get_real_entity(cur.get('value'))
                value = augment.get_entity(entity_name).next(
                    cur.get('variant'))

                nonlocal entities
                entities.append({
                    # `result` so far gives the start offset of this value.
                    'start': len(result),
                    'end': len(result) + len(value),
                    'entity': cur.get('value'),
                    # Check if its a synonym here
                    'value': synonyms_lookup.get(value, value),
                })

                return result + value

            return {
                'intent': name,
                'text': fp.reduce(reduce_sentence, '')(sentence),
                'entities': entities,
            }

        return fp.append(*fp.map(build_sentence)(intent.get('data', [])))(acc)

    # Collect {value, synonyms} entries for every entity value that has
    # declared synonyms.
    def build_entity_synonyms(acc, _, name):
        def reduce_entity(result, cur):
            synonyms = augment.get_synonyms(cur)

            if not synonyms:
                return result

            return fp.append({
                'value': cur,
                'synonyms': synonyms,
            })(result)

        return fp.append(
            *fp.reduce(reduce_entity)(augment.get_entity(name).all()))(acc)

    # Emit a regex feature for entities (or their referenced entity) that
    # define a `regex` prop.
    def build_regex_features(acc, _, name):
        pattern = get_regex_prop(get_real_entity(name))

        if pattern:
            return fp.append({
                'name': name,
                'pattern': pattern,
            })(acc)

        return acc

    return utils.merge(
        {
            'rasa_nlu_data': {
                'common_examples': fp.reduce(build_intent_examples, [])(augment.get_intents()),
                'regex_features': fp.reduce(build_regex_features, [])(augment.entities),
                'lookup_tables': fp.reduce(build_lookup_table, [])(augment.entities),
                'entity_synonyms': fp.reduce(build_entity_synonyms, [])(augment.entities),
            },
        }, options)
def process_sentence_data(acc, sentence):
    """Expand a sentence into every synonym permutation it contains.

    Sentences without synonym parts are appended to `acc` unchanged.
    Otherwise the cartesian product of all synonym values (plus an empty
    entry per optional synonym) is generated and each permutation is
    rendered with whitespace cleaned up.

    NOTE(review): uses `self.get_synonyms` despite the (acc, sentence)
    signature — this looks like a closure extracted from a method.
    """
    sentence_synonyms = fp.filter(is_synonym)(sentence)

    # No synonyms, just returns now
    if not sentence_synonyms:
        return fp.append(sentence)(acc)

    # Get all synonyms values to generate permutations
    # For optional synonyms, add an empty entry.
    def reduce_synonyms(result, synonym_data):
        return fp.append(
            ([''] if synonym_data.get('optional') else [])
            + self.get_synonyms(synonym_data['value']))(result)

    synonyms_data = fp.reduce(reduce_synonyms)(sentence_synonyms)

    def process_permutation(permutation):
        idx = 0

        def reduce_sentence(result, cur):
            # Non-synonym parts are kept as-is (cloned).
            if not is_synonym(cur):
                return fp.append(fp.clone(cur))(result)

            nonlocal idx
            value = permutation[idx]
            idx += 1

            # Check if it's not an empty value
            if value:
                return fp.append({
                    'type': 'text',
                    'value': value,
                })(result)

            return result

        parts = fp.reduce(reduce_sentence)(sentence)

        part_idx = 0

        def reduce_whitespaces_in_part(result, part):
            cur = fp.clone(part)
            nonlocal part_idx

            # First element: trim leading whitespace
            if part_idx == 0:
                cur['value'] = cur['value'].lstrip()

            # Last element or the following one starts with a space.
            # `startswith` instead of indexing [0]: indexing raises
            # IndexError when the next part holds an empty string value.
            if part_idx == (len(parts) - 1) \
                    or parts[part_idx + 1]['value'].startswith(' '):
                cur['value'] = cur['value'].rstrip()

            part_idx += 1

            # Drop parts that became empty after trimming
            if not cur['value']:
                return result

            return fp.append(cur)(result)

        # Remove unneeded whitespaces introduced by optional synonyms
        return fp.reduce(reduce_whitespaces_in_part)(parts)

    return fp.append(
        *fp.map(process_permutation)(list(product(*synonyms_data))))(acc)
def process_intent_data(intent_data):
    """Expand every sentence of an intent, keeping its other properties."""
    expanded_sentences = fp.reduce(process_sentence_data)(
        intent_data.get('data', []))

    return fp.append({
        'data': expanded_sentences,
    })(intent_data)