def test_it_should_merge_complex_objects_correctly(self):
    a = {
        'one': 'value',
        'b': False,
        'arr': [1, 2],
        'with': {
            'nested': {
                'prop': 'here!',
            },
        },
    }
    b = {
        'arr': [3],
        'b': False,
        'another': 'value',
        'with': {
            'nested': {
                'arr': [1, 2],
            },
        },
    }
    c = {
        'arr': [4],
        'another': 'value overloaded',
        'b': True,
        'with': {
            'nested': {
                'arr': [3, 4, 5, 6],
            },
        },
    }

    expect(merge(a, b, c)).to.equal({
        'one': 'value',
        'another': 'value overloaded',
        'b': True,
        'arr': [1, 2, 3, 4],
        'with': {
            'nested': {
                'arr': [1, 2, 3, 4, 5, 6],
                'prop': 'here!',
            },
        },
    })
def main():  # pragma: no cover
    """Main entry point for the program."""
    parser = argparse.ArgumentParser(
        description='Generates training dataset from a simple DSL.')
    parser.add_argument('--version', action='version',
                        version='%(prog)s v' + __version__)
    parser.add_argument('files', type=str, nargs='+',
                        help='One or more DSL files to process')
    parser.add_argument('-a', '--adapter', type=str,
                        help='Name of the adapter to use')
    parser.add_argument('-m', '--merge', type=str,
                        help='Options file to merge with the final result')
    parser.add_argument('--pretty', action='store_true', help='Pretty output')

    args = parser.parse_args(sys.argv[1:])

    data = {}

    for file in args.files:
        with open(file, encoding='utf-8') as handle:
            data = merge(data, parse(handle.read()))

    if args.merge:
        # --merge takes the path of a JSON options file, so read the file
        # contents instead of parsing the argument string itself
        with open(args.merge, encoding='utf-8') as handle:
            options = json.load(handle)
    else:
        options = {}

    if args.adapter:
        data = getattr(adapters, args.adapter)(data, **options)

    print(json.dumps(data, indent=2 if args.pretty else None))
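# Example invocations of the entry point above. The file and adapter names are
# hypothetical, and the `pychatl` command name assumes the packaged console
# script; adjust to however the module is actually exposed:
#
#   $ pychatl forecast.dsl lights.dsl --adapter snips --pretty
#   $ pychatl forecast.dsl --adapter rasa --merge options.json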
def snips(chatl, **options):
    """Transform a chatl dataset to a snips representation as per
    https://snips-nlu.readthedocs.io/en/0.19.1/dataset.html
    """
    augment = Augment(chatl)

    def get_entity_type(entity):
        ent_type = (entity.get('props', {}).get('type')
                    or entity.get('props', {}).get('snips:type'))

        # If the type is not present in the dataset, let's consider it a
        # built-in one
        if ent_type and not augment.entities.get(ent_type):
            return SNIPS_PREFIX + ent_type if SNIPS_PREFIX not in ent_type else ent_type

        return ent_type

    def build_entity(acc, entity, name):
        ent_type = get_entity_type(entity)

        if ent_type:
            if SNIPS_PREFIX in ent_type:
                return fp.append({
                    ent_type: {},
                })(acc)

            # It has a type present in the dataset, so it should be considered
            # as a slot backed by that entity, not a standalone entity
            return acc

        use_synonyms = False

        def build_entity_value(ent_name):
            nonlocal use_synonyms
            synonyms = augment.get_synonyms(ent_name)
            use_synonyms = use_synonyms or len(synonyms) > 0

            return {
                'value': ent_name,
                'synonyms': synonyms,
            }

        values = fp.map(build_entity_value)(augment.get_entity(name).all())

        return fp.append({
            name: {
                'data': values,
                'automatically_extensible':
                    entity['props'].get('extensible', 'true') == 'true',
                'matching_strictness':
                    float(entity['props'].get('strictness', '1')),
                'use_synonyms': use_synonyms,
            },
        })(acc)

    def build_sentence_part(part):
        part_value = part.get('value')

        if not utils.is_entity(part):
            return {'text': part_value}

        entity = augment.entities.get(part_value)

        # Retrieve the inner type of the entity if defined in the dataset
        ent_type = get_entity_type(entity) or part_value

        # And check if it references another defined entity, because if it
        # does, values will be fetched from that entity instead
        referenced_entity = ent_type if augment.entities.get(ent_type) else part_value

        return {
            'entity': ent_type,
            'slot_name': part_value,
            'text': augment.get_entity(referenced_entity).next(part.get('variant')),
        }

    def build_intents(intent):
        return {
            'utterances': fp.map(lambda sentence: {
                'data': fp.map(build_sentence_part)(sentence),
            })(intent.get('data', [])),
        }

    return utils.merge({
        'language': 'en',
        'intents': fp.map(build_intents)(augment.get_intents()),
        'entities': fp.reduce(build_entity)(augment.entities),
    }, options)
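# A minimal usage sketch for the adapter above. The import path and the tiny
# DSL snippet are assumptions for illustration (they mirror the shapes used by
# main() and the tests), not a documented public API.
def _example_snips_usage():
    import json
    from textwrap import dedent

    from pychatl import parse  # assumed package-level export, as used in main()

    dataset = parse(dedent("""\
        %[get_forecast]
          will it rain in @[city]

        @[city]
          paris
          london
        """))

    # Keyword options are merged over the defaults through utils.merge, so
    # `language` here overrides the hardcoded 'en'
    print(json.dumps(snips(dataset, language='fr'), indent=2))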
def test_it_should_merge_multiple_datasets_intelligently(self):
    a = {
        'intents': {
            'get_forecast': {
                'props': {'some': 'prop'},
                'data': [
                    [
                        {'type': 'text', 'value': 'will it rain in '},
                        {'type': 'entity', 'value': 'city', 'variant': None},
                    ],
                ],
            },
        },
        'entities': {
            'city': {
                'variants': {
                    'cityVariant': [
                        {'type': 'text', 'value': 'london'},
                        {'type': 'synonym', 'value': 'new york'},
                    ],
                },
                'data': [
                    {'type': 'text', 'value': 'paris'},
                    {'type': 'text', 'value': 'rouen'},
                    {'type': 'synonym', 'value': 'new york'},
                ],
                'props': {'some': 'entity prop'},
            },
        },
        'synonyms': {
            'new york': {
                'props': {'syn': 'prop'},
                'data': [
                    {'type': 'text', 'value': 'nyc'},
                ],
            },
        },
    }
    b = {
        'intents': {
            'get_forecast': {
                'props': {'other': 'intent prop'},
                'data': [
                    [
                        {'type': 'text', 'value': 'will it snow in '},
                        {'type': 'entity', 'value': 'city', 'variant': None},
                    ],
                ],
            },
        },
        'entities': {
            'city': {
                'props': {'another': 'prop'},
                'variants': {},
                'data': [
                    {'type': 'text', 'value': 'new york'},
                    {'type': 'text', 'value': 'metz'},
                    {'type': 'text', 'value': 'caen'},
                    {'type': 'text', 'value': 'paris'},
                ],
            },
        },
        'synonyms': {},
    }
    c = {
        'intents': {
            'lights_on': {
                'props': {},
                'data': [[{'type': 'text', 'value': 'turn the lights on'}]],
            },
        },
        'entities': {
            'city': {
                'props': {},
                'data': [],
                'variants': {
                    'cityElsewhere': [{'type': 'text', 'value': 'amsterdam'}],
                    'cityVariant': [{'type': 'text', 'value': 'sydney'}],
                },
            },
            'room': {
                'props': {},
                'data': [],
                'variants': {},
            },
        },
        'synonyms': {
            'basement': {
                'props': {},
                'data': [{'type': 'text', 'value': 'cellar'}],
            },
            'new york': {
                'props': {'another': 'prop'},
                'data': [
                    {'type': 'text', 'value': 'ny'},
                    {'type': 'text', 'value': 'nyc'},
                ],
            },
        },
    }

    expect(merge(a, b, c)).to.equal({
        'intents': {
            'get_forecast': {
                'props': {'some': 'prop', 'other': 'intent prop'},
                'data': [
                    [
                        {'type': 'text', 'value': 'will it rain in '},
                        {'type': 'entity', 'value': 'city', 'variant': None},
                    ],
                    [
                        {'type': 'text', 'value': 'will it snow in '},
                        {'type': 'entity', 'value': 'city', 'variant': None},
                    ],
                ],
            },
            'lights_on': {
                'props': {},
                'data': [[{'type': 'text', 'value': 'turn the lights on'}]],
            },
        },
        'entities': {
            'city': {
                'variants': {
                    'cityElsewhere': [{'type': 'text', 'value': 'amsterdam'}],
                    'cityVariant': [
                        {'type': 'text', 'value': 'london'},
                        {'type': 'synonym', 'value': 'new york'},
                        {'type': 'text', 'value': 'sydney'},
                    ],
                },
                'data': [
                    {'type': 'text', 'value': 'paris'},
                    {'type': 'text', 'value': 'rouen'},
                    {'type': 'synonym', 'value': 'new york'},
                    {'type': 'text', 'value': 'new york'},
                    {'type': 'text', 'value': 'metz'},
                    {'type': 'text', 'value': 'caen'},
                ],
                'props': {'some': 'entity prop', 'another': 'prop'},
            },
            'room': {
                'props': {},
                'data': [],
                'variants': {},
            },
        },
        'synonyms': {
            'new york': {
                'props': {'syn': 'prop', 'another': 'prop'},
                'data': [
                    {'type': 'text', 'value': 'nyc'},
                    {'type': 'text', 'value': 'ny'},
                ],
            },
            'basement': {
                'props': {},
                'data': [{'type': 'text', 'value': 'cellar'}],
            },
        },
    })
def rasa(chatl, **options):
    """Convert a chatl dataset to a rasa representation as per
    https://rasa.com/docs/rasa/1.1.4/nlu/training-data-format/
    """
    augment = Augment(chatl, True)

    def get_real_entity(name):
        entity_type = augment.entities.get(name, {}).get('props', {}).get('type')

        if entity_type and entity_type in augment.entities:
            return entity_type

        return name

    def get_regex_prop(name):
        return augment.entities.get(name, {}).get('props', {}).get('regex')

    # For rasa, we need a map of synonym -> canonical value
    synonyms_lookup = fp.reduce(lambda acc, synonyms, value: fp.append(
        *fp.map(lambda s: {s: value})(synonyms))(acc))(augment.synonyms_values)

    def build_lookup_table(acc, _, name):
        entity_name = get_real_entity(name)

        # The entity has the regex feature, so it is not a lookup table
        if get_regex_prop(entity_name):
            return acc

        return fp.append({
            'name': name,
            'elements': augment.get_entity(entity_name).all(),
        })(acc)

    def build_intent_examples(acc, intent, name):
        def build_sentence(sentence):
            entities = []

            def reduce_sentence(result, cur):
                if not utils.is_entity(cur):
                    return result + cur.get('value')

                entity_name = get_real_entity(cur.get('value'))
                value = augment.get_entity(entity_name).next(cur.get('variant'))

                # `entities` is mutated in place, so no nonlocal is needed
                entities.append({
                    'start': len(result),
                    'end': len(result) + len(value),
                    'entity': cur.get('value'),
                    # If the picked value is a synonym, map it back to its
                    # canonical value
                    'value': synonyms_lookup.get(value, value),
                })

                return result + value

            return {
                'intent': name,
                'text': fp.reduce(reduce_sentence, '')(sentence),
                'entities': entities,
            }

        return fp.append(*fp.map(build_sentence)(intent.get('data', [])))(acc)

    def build_entity_synonyms(acc, _, name):
        def reduce_entity(result, cur):
            synonyms = augment.get_synonyms(cur)

            if not synonyms:
                return result

            return fp.append({
                'value': cur,
                'synonyms': synonyms,
            })(result)

        return fp.append(
            *fp.reduce(reduce_entity)(augment.get_entity(name).all()))(acc)

    def build_regex_features(acc, _, name):
        pattern = get_regex_prop(get_real_entity(name))

        if pattern:
            return fp.append({
                'name': name,
                'pattern': pattern,
            })(acc)

        return acc

    return utils.merge({
        'rasa_nlu_data': {
            'common_examples': fp.reduce(build_intent_examples, [])(augment.get_intents()),
            'regex_features': fp.reduce(build_regex_features, [])(augment.entities),
            'lookup_tables': fp.reduce(build_lookup_table, [])(augment.entities),
            'entity_synonyms': fp.reduce(build_entity_synonyms, [])(augment.entities),
        },
    }, options)
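# A companion sketch for the rasa adapter, under the same assumptions as the
# snips example above (the import path and DSL snippet are illustrative only).
def _example_rasa_usage():
    import json
    from textwrap import dedent

    from pychatl import parse  # assumed package-level export

    dataset = parse(dedent("""\
        %[lights_on]
          turn the @[room] lights on

        @[room]
          kitchen
          bedroom
        """))

    # Any keyword options would be merged into the final payload by utils.merge
    print(json.dumps(rasa(dataset), indent=2))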