Ejemplo n.º 1
0
    def test_it_should_merge_complex_objects_correctly(self):
        """Check `merge` on three mappings holding scalars, lists and nested
        dicts.

        The expectation below pins that lists are combined in argument order,
        nested dicts are merged recursively and, for plain values, the last
        argument providing a key wins.
        """
        first = {
            'one': 'value',
            'b': False,
            'arr': [1, 2],
            'with': {'nested': {'prop': 'here!'}},
        }
        second = {
            'arr': [3],
            'b': False,
            'another': 'value',
            'with': {'nested': {'arr': [1, 2]}},
        }
        third = {
            'arr': [4],
            'another': 'value overloaded',
            'b': True,
            'with': {'nested': {'arr': [3, 4, 5, 6]}},
        }

        merged = merge(first, second, third)

        expected = {
            'one': 'value',
            'another': 'value overloaded',
            'b': True,
            'arr': [1, 2, 3, 4],
            'with': {
                'nested': {
                    'arr': [1, 2, 3, 4, 5, 6],
                    'prop': 'here!',
                },
            },
        }
        expect(merged).to.equal(expected)
Ejemplo n.º 2
0
def main():  # pragma: no cover
    """Run the command-line interface.

    Every DSL file given on the command line is read as UTF-8, parsed and
    merged (in order) into a single dataset. When ``--adapter`` is given, the
    dataset is passed to the adapter of that name together with the options
    decoded from ``--merge``. The result is printed as JSON on standard
    output, indented when ``--pretty`` is set.

    ``--merge`` accepts either the path of an existing JSON file or an inline
    JSON document. The decoded object is forwarded to the adapter as keyword
    arguments, so it is only used when an adapter is selected.

    Invalid JSON and unknown adapter names are reported through
    ``parser.error`` (usage message, exit status 2) instead of a traceback.
    """
    # Local import: only needed to tell a file path from inline JSON below.
    import os

    parser = argparse.ArgumentParser(
        description='Generates training dataset from a simple DSL.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s v' + __version__)
    parser.add_argument('files',
                        type=str,
                        nargs='+',
                        help='One or more DSL files to process')
    parser.add_argument('-a',
                        '--adapter',
                        type=str,
                        help='Name of the adapter to use')
    parser.add_argument('-m',
                        '--merge',
                        type=str,
                        help='Options file (or inline JSON string) to merge '
                        'with the final result')
    parser.add_argument('--pretty', action='store_true', help='Pretty output')

    args = parser.parse_args(sys.argv[1:])

    data = {}

    for path in args.files:
        with open(path, encoding='utf-8') as handle:
            data = merge(data, parse(handle.read()))

    options = {}

    if args.merge:
        # The help text promises a file, while earlier versions only accepted
        # inline JSON: support both so existing invocations keep working.
        if os.path.isfile(args.merge):
            with open(args.merge, encoding='utf-8') as handle:
                raw_options = handle.read()
        else:
            raw_options = args.merge

        try:
            options = json.loads(raw_options)
        except ValueError as err:  # json.JSONDecodeError is a ValueError
            parser.error('invalid JSON given to --merge: ' + str(err))

    if args.adapter:
        adapter = getattr(adapters, args.adapter, None)

        if not callable(adapter):
            parser.error('unknown adapter: ' + args.adapter)

        data = adapter(data, **options)

    print(json.dumps(data, indent=2 if args.pretty else None))
Ejemplo n.º 3
0
def snips(chatl, **options):
    """Build a snips dataset out of a chatl dataset.

    Target format:
    https://snips-nlu.readthedocs.io/en/0.19.1/dataset.html

    The generated dataset is combined with `options` through `utils.merge`
    before being returned.
    """
    augment = Augment(chatl)

    def get_entity_type(entity):
        # The type can be declared with either the `type` or `snips:type` prop.
        props = entity.get('props', {})
        ent_type = props.get('type') or props.get('snips:type')

        # No type at all, or a type defined in the dataset: return it untouched.
        if not ent_type or augment.entities.get(ent_type):
            return ent_type

        # The type is not present in the dataset, so it is considered a
        # built-in one and gets the snips prefix unless it already carries it.
        if SNIPS_PREFIX in ent_type:
            return ent_type

        return SNIPS_PREFIX + ent_type

    def build_entity(acc, entity, name):
        ent_type = get_entity_type(entity)

        if ent_type and SNIPS_PREFIX in ent_type:
            # Built-in entities are declared with an empty definition.
            return fp.append({ent_type: {}})(acc)

        if ent_type:
            # The type is present in the dataset: it is treated as a slot and
            # produces no entity of its own.
            return acc

        has_synonyms = False

        def build_entity_value(ent_name):
            nonlocal has_synonyms
            found = augment.get_synonyms(ent_name)

            # Remember whether at least one value comes with synonyms.
            if not has_synonyms and len(found) > 0:
                has_synonyms = True

            return {'value': ent_name, 'synonyms': found}

        # Values are built first: `has_synonyms` is only meaningful afterwards.
        values = fp.map(build_entity_value)(augment.get_entity(name).all())
        props = entity['props']
        definition = {
            'data': values,
            'automatically_extensible':
            props.get('extensible', 'true') == 'true',
            'matching_strictness': float(props.get('strictness', '1')),
            'use_synonyms': has_synonyms,
        }

        return fp.append({name: definition})(acc)

    def build_sentence_part(part):
        part_value = part.get('value')

        if utils.is_entity(part):
            entity = augment.entities.get(part_value)
            # Use the inner type of the entity when the dataset defines one.
            ent_type = get_entity_type(entity) or part_value

            # When that type references another defined entity, values are
            # fetched from the referenced entity instead.
            if augment.entities.get(ent_type):
                referenced_entity = ent_type
            else:
                referenced_entity = part_value

            text = augment.get_entity(referenced_entity).next(
                part.get('variant'))

            return {
                'entity': ent_type,
                'slot_name': part_value,
                'text': text,
            }

        return {'text': part_value}

    def build_intents(intent):
        def build_utterance(sentence):
            return {'data': fp.map(build_sentence_part)(sentence)}

        return {
            'utterances': fp.map(build_utterance)(intent.get('data', [])),
        }

    dataset = {
        'language': 'en',
        'intents': fp.map(build_intents)(augment.get_intents()),
        'entities': fp.reduce(build_entity)(augment.entities),
    }

    return utils.merge(dataset, options)
Ejemplo n.º 4
0
    def test_it_should_merge_multiple_datasets_intelligently(self):
        """Check `merge` on three dataset-shaped mappings (intents, entities
        and synonyms).

        The expectation below pins that props are combined, that sentence
        parts already present are not repeated in merged lists, and that
        intents, entities and synonyms appearing in a single input are kept.
        """

        # Small builders for the sentence parts repeated across the fixtures.
        def text_part(value):
            return {'type': 'text', 'value': value}

        def synonym_part(value):
            return {'type': 'synonym', 'value': value}

        def city_entity_part():
            return {'type': 'entity', 'value': 'city', 'variant': None}

        first = {
            'intents': {
                'get_forecast': {
                    'props': {'some': 'prop'},
                    'data': [
                        [text_part('will it rain in '), city_entity_part()],
                    ],
                },
            },
            'entities': {
                'city': {
                    'variants': {
                        'cityVariant': [
                            text_part('london'),
                            synonym_part('new york'),
                        ],
                    },
                    'data': [
                        text_part('paris'),
                        text_part('rouen'),
                        synonym_part('new york'),
                    ],
                    'props': {'some': 'entity prop'},
                },
            },
            'synonyms': {
                'new york': {
                    'props': {'syn': 'prop'},
                    'data': [text_part('nyc')],
                },
            },
        }

        second = {
            'intents': {
                'get_forecast': {
                    'props': {'other': 'intent prop'},
                    'data': [
                        [text_part('will it snow in '), city_entity_part()],
                    ],
                },
            },
            'entities': {
                'city': {
                    'props': {'another': 'prop'},
                    'variants': {},
                    'data': [
                        text_part('new york'),
                        text_part('metz'),
                        text_part('caen'),
                        text_part('paris'),
                    ],
                },
            },
            'synonyms': {},
        }

        third = {
            'intents': {
                'lights_on': {
                    'props': {},
                    'data': [[text_part('turn the lights on')]],
                },
            },
            'entities': {
                'city': {
                    'props': {},
                    'data': [],
                    'variants': {
                        'cityElsewhere': [text_part('amsterdam')],
                        'cityVariant': [text_part('sydney')],
                    },
                },
                'room': {'props': {}, 'data': [], 'variants': {}},
            },
            'synonyms': {
                'basement': {
                    'props': {},
                    'data': [text_part('cellar')],
                },
                'new york': {
                    'props': {'another': 'prop'},
                    'data': [text_part('ny'), text_part('nyc')],
                },
            },
        }

        merged = merge(first, second, third)

        expected = {
            'intents': {
                'get_forecast': {
                    'props': {'some': 'prop', 'other': 'intent prop'},
                    'data': [
                        [text_part('will it rain in '), city_entity_part()],
                        [text_part('will it snow in '), city_entity_part()],
                    ],
                },
                'lights_on': {
                    'props': {},
                    'data': [[text_part('turn the lights on')]],
                },
            },
            'entities': {
                'city': {
                    'variants': {
                        'cityElsewhere': [text_part('amsterdam')],
                        'cityVariant': [
                            text_part('london'),
                            synonym_part('new york'),
                            text_part('sydney'),
                        ],
                    },
                    'data': [
                        text_part('paris'),
                        text_part('rouen'),
                        synonym_part('new york'),
                        text_part('new york'),
                        text_part('metz'),
                        text_part('caen'),
                    ],
                    'props': {'some': 'entity prop', 'another': 'prop'},
                },
                'room': {'props': {}, 'data': [], 'variants': {}},
            },
            'synonyms': {
                'new york': {
                    'props': {'syn': 'prop', 'another': 'prop'},
                    'data': [text_part('nyc'), text_part('ny')],
                },
                'basement': {
                    'props': {},
                    'data': [text_part('cellar')],
                },
            },
        }
        expect(merged).to.equal(expected)
Ejemplo n.º 5
0
def rasa(chatl, **options):
    """Build a rasa NLU dataset out of a chatl dataset.

    Target format:
    https://rasa.com/docs/rasa/1.1.4/nlu/training-data-format/

    The generated dataset is combined with `options` through `utils.merge`
    before being returned.
    """
    augment = Augment(chatl, True)

    def get_real_entity(name):
        props = augment.entities.get(name, {}).get('props', {})
        entity_type = props.get('type')

        # Only follow the `type` prop when it names an entity of the dataset.
        if not entity_type or entity_type not in augment.entities:
            return name

        return entity_type

    def get_regex_prop(name):
        props = augment.entities.get(name, {}).get('props', {})
        return props.get('regex')

    def index_synonyms(acc, synonyms, value):
        entries = fp.map(lambda s: {s: value})(synonyms)
        return fp.append(*entries)(acc)

    # For rasa, we need a map of synonym -> value.
    synonyms_lookup = fp.reduce(index_synonyms)(augment.synonyms_values)

    def build_lookup_table(acc, _, name):
        entity_name = get_real_entity(name)

        if not get_regex_prop(entity_name):
            table = {
                'name': name,
                'elements': augment.get_entity(entity_name).all(),
            }
            return fp.append(table)(acc)

        # The entity is described by a regex feature: no lookup table for it.
        return acc

    def build_intent_examples(acc, intent, name):
        def build_sentence(sentence):
            entities = []

            def reduce_sentence(result, cur):
                if utils.is_entity(cur):
                    slot_name = cur.get('value')
                    value = augment.get_entity(
                        get_real_entity(slot_name)).next(cur.get('variant'))
                    # The entity starts where the text built so far ends.
                    start = len(result)

                    entities.append({
                        'start': start,
                        'end': start + len(value),
                        'entity': slot_name,
                        # Resolve the value when the picked one is a synonym.
                        'value': synonyms_lookup.get(value, value),
                    })

                    return result + value

                return result + cur.get('value')

            # `entities` is filled while the sentence text is being reduced.
            text = fp.reduce(reduce_sentence, '')(sentence)

            return {
                'intent': name,
                'text': text,
                'entities': entities,
            }

        examples = fp.map(build_sentence)(intent.get('data', []))
        return fp.append(*examples)(acc)

    def build_entity_synonyms(acc, _, name):
        def reduce_entity(result, cur):
            synonyms = augment.get_synonyms(cur)

            if synonyms:
                return fp.append({
                    'value': cur,
                    'synonyms': synonyms,
                })(result)

            # Values without synonyms produce no entry.
            return result

        with_synonyms = fp.reduce(reduce_entity)(
            augment.get_entity(name).all())
        return fp.append(*with_synonyms)(acc)

    def build_regex_features(acc, _, name):
        pattern = get_regex_prop(get_real_entity(name))

        if not pattern:
            return acc

        return fp.append({
            'name': name,
            'pattern': pattern,
        })(acc)

    # The four sections are built in this exact order.
    common_examples = fp.reduce(build_intent_examples,
                                [])(augment.get_intents())
    regex_features = fp.reduce(build_regex_features, [])(augment.entities)
    lookup_tables = fp.reduce(build_lookup_table, [])(augment.entities)
    entity_synonyms = fp.reduce(build_entity_synonyms, [])(augment.entities)

    dataset = {
        'rasa_nlu_data': {
            'common_examples': common_examples,
            'regex_features': regex_features,
            'lookup_tables': lookup_tables,
            'entity_synonyms': entity_synonyms,
        },
    }

    return utils.merge(dataset, options)