def getEntities(entityDir, entitiesNameCheck, NAME_POLICY):
    """Retrieves entity value to entity name mapping from the directory with entity lists

    Every entry returned by os.listdir(entityDir) is opened as an entity list.
    The entity name is derived from the file name without its extension,
    passed through toEntityName together with NAME_POLICY and
    entitiesNameCheck. Inside a file, everything after the first '#' on a
    line is treated as a comment, the rest is lower-cased and split on ';'.

    :param entityDir: directory containing the entity list files
    :param entitiesNameCheck: name-check rules forwarded to toEntityName
    :param NAME_POLICY: naming policy forwarded to toEntityName
    :returns: dict mapping each (stripped, lower-cased) entity value to the
              entity name of the file it was found in; when the same value
              occurs more than once, the last one processed wins
    """
    entities = {}
    for entityFileName in os.listdir(entityDir):
        entityName = toEntityName(NAME_POLICY, entitiesNameCheck,
                                  os.path.splitext(entityFileName)[0])
        with openFile(os.path.join(entityDir, entityFileName), "r") as entityFile:
            for line in entityFile.readlines():
                # remove comments
                line = line.split('#')[0].strip().lower()
                for entity in line.split(';'):
                    entity = entity.strip()
                    # Blank lines, comment-only lines and stray ';' produce
                    # empty items; they must not end up as an '' key.
                    if entity:
                        entities[entity] = entityName
    return entities
    def createUniqueEntityName(self, entity_name):
        """
            Derives an entity name that is not yet present in self._entities.

            entity_name is first normalized by toEntityName using the
            instance name policy and the [['$special', '\\A']] name check.
            If the normalized name is free it is returned as is; otherwise
            the numbers 0..9999 are appended one after another until a free
            name is found.

            :param entity_name: string the unique name should be based on
            :returns unique entity name, or None if every candidate is taken
        """
        # Normalized form of the requested name
        base_name = toEntityName(self._NAME_POLICY, [['$special', '\\A']],
                                 entity_name)
        if base_name not in self._entities:
            return base_name
        # Normalized name is taken - look for the first free numeric suffix
        suffix = 0
        while suffix < 10000:
            candidate = base_name + repr(suffix)
            if candidate not in self._entities:
                return candidate
            suffix += 1
        return None
Ejemplo n.º 3
0
def main(argv):
    """Decompose an entities .json file into one .csv file per entity.

    Entities whose name starts with "sys-" (case-insensitive, after
    stripping) are collected and written, one per line, to
    "system_entities.csv" in the output directory. Every other entity gets
    its own "<entity name>.csv" file, where the name is produced by
    toEntityName. Each value of such an entity becomes one ';'-separated
    row:

    * value with 'synonyms': the value followed by its non-empty synonyms
    * value with 'patterns': the value prefixed with '~' followed by its
      patterns
    * value with neither key: just the value

    :param argv: list of command-line arguments handed to argparse
    """
    parser = argparse.ArgumentParser(
        description=
        'Decompose Bluemix conversation service entities in .json format to entity files in .csv format',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # positional arguments
    parser.add_argument('entities', help='file with entities in .json format')
    parser.add_argument('entitiesDir', help='directory with entities files')
    # optional arguments
    parser.add_argument(
        '-ne',
        '--common_entities_nameCheck',
        action='append',
        nargs=2,
        help=
        "regex and replacement for entity name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase"
    )
    # store_true already defaults to False; the flag is only tested for
    # truthiness below.
    parser.add_argument(
        '-s',
        '--soft',
        required=False,
        help=
        'soft name policy - change intents and entities names without error.',
        action='store_true',
        default=False)
    parser.add_argument('-v',
                        '--verbose',
                        required=False,
                        help='verbosity',
                        action='store_true')
    parser.add_argument('--log',
                        type=str.upper,
                        default=None,
                        choices=list(logging._levelToName.values()))
    args = parser.parse_args(argv)

    # logging is configured only when this module is run as a script
    if __name__ == '__main__':
        setLoggerConfig(args.log, args.verbose)

    NAME_POLICY = 'soft' if args.soft else 'hard'

    with openFile(args.entities, 'r') as entitiesFile:
        entitiesJSON = json.load(entitiesFile)

    systemEntities = []
    # process all entities
    for entityJSON in entitiesJSON:
        # process system entity
        if entityJSON["entity"].strip().lower().startswith("sys-"):
            # issue #82: make entity name check parameter-dependent
            # (system entity names are stored as they are, without name check)
            systemEntities.append(entityJSON["entity"])
            continue

        # process normal entity
        values = []
        for valueJSON in entityJSON["values"]:
            value = []
            hasSynonyms = 'synonyms' in valueJSON
            hasPatterns = 'patterns' in valueJSON
            # A value without 'synonyms' and without 'patterns' is still a
            # value; it is written alone instead of as an empty row.
            if hasSynonyms or not hasPatterns:
                value.append(valueJSON["value"].strip())
                for synonym in valueJSON.get('synonyms', []):
                    synonym = synonym.strip()
                    # empty-string synonyms are ignored when exported from WA json
                    if synonym != '':
                        value.append(synonym)
            # for pattern entities add tilde to the value
            if hasPatterns:
                value.append("~" + valueJSON["value"].strip())
                # add all patterns
                for pattern in valueJSON["patterns"]:
                    value.append(pattern.strip())
            values.append(value)

        # new entity file
        entityFileName = os.path.join(
            args.entitiesDir,
            toEntityName(NAME_POLICY, args.common_entities_nameCheck,
                         entityJSON["entity"])) + ".csv"
        with openFile(entityFileName, "w") as entityFile:
            for value in values:
                entityFile.write(';'.join(value) + "\n")

    # write file with system entities
    with openFile(os.path.join(args.entitiesDir, "system_entities.csv"),
                  'w') as systemEntitiesFile:
        systemEntitiesFile.write(
            "# a special list for the system entities - only one value at each line\n"
        )
        for systemEntity in systemEntities:
            systemEntitiesFile.write(systemEntity + "\n")

    logger.verbose("Entities from file '%s' were successfully extracted\n",
                   args.entities)
Ejemplo n.º 4
0
    # Domain-entity map: one "domain;NONE;entity1;entity2;" line per domain,
    # domains and entities in sorted order.
    if args.domEnt:
        with open(args.domEnt, 'w') as domEntFile:
            for domain in sorted(domEntMap.keys()):
                entities = "NONE;" + "".join(
                    entity + ";" for entity in sorted(domEntMap[domain].keys()))
                domEntFile.write(domain + ";" + entities + "\n")
        if VERBOSE: printf("Domain-entity map '%s' was successfully created\n", args.domEnt)

    # Intent-entity map, same layout as above. It must be guarded by (and
    # report) args.intEnt - the file that is actually opened - not args.domEnt.
    if args.intEnt:
        with open(args.intEnt, 'w') as intEntFile:
            for intent in sorted(intEntMap.keys()):
                entities = "NONE;" + "".join(
                    entity + ";" for entity in sorted(intEntMap[intent].keys()))
                intEntFile.write(intent + ";" + entities + "\n")
        if VERBOSE: printf("Intent-entity map '%s' was successfully created\n", args.intEnt)

    # List of all entity names derived from the file names in entitiesDir,
    # each name written once in order of first appearance.
    if args.list:
        with open(args.list, 'w') as listFile:
            # process entities
            entityNames = []
            for entityFileName in os.listdir(args.entitiesDir):
                entityName = toEntityName(NAME_POLICY, args.common_entities_nameCheck, os.path.splitext(entityFileName)[0])
                if entityName not in entityNames:
                    entityNames.append(entityName)
            for entityName in entityNames:
                listFile.write(entityName + ";\n")
        if VERBOSE: printf("Entities list '%s' was successfully created\n", args.list)
Ejemplo n.º 5
0
def main(argv):
    """Convert NLU tsv sentences into domain-entity and intent-entity mappings.

    When --sentences is given, every non-empty line is split on tabs; the
    second column is turned into an intent name by toIntentName and split at
    the first '_' into a domain part and an intent part, and every
    <entity>...</...> tag found in the third column is recorded for both
    parts. A line with fewer than three columns, or an intent name without
    '_', raises IndexError.

    Outputs (each only if the corresponding option is given):

    * --domEnt: "domain;NONE;entity1;entity2;" lines, sorted
    * --intEnt: "intent;NONE;entity1;entity2;" lines, sorted
    * --list:   "entityName;" lines, one per distinct entity name derived
                from the file names found in entitiesDir

    :param argv: list of command-line arguments handed to argparse
    """
    parser = argparse.ArgumentParser(
        description=
        'convert NLU tsv files into domain-entity and intent-entity mappings.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # positional arguments
    parser.add_argument(
        'entitiesDir',
        help=
        'directory with entities files - all of them will be included in output list if specified'
    )
    # optional arguments
    parser.add_argument(
        '-is',
        '--sentences',
        help=
        '.tsv file in NLU format with tagged entities in example sentences in third column and intent names in second column'
    )
    parser.add_argument(
        '-l',
        '--list',
        required=False,
        help='output file with list of all entities (if it should be generated)'
    )
    parser.add_argument(
        '-d',
        '--domEnt',
        required=False,
        help=
        'output file with domain-entity mapping (if it should be generated)')
    parser.add_argument(
        '-i',
        '--intEnt',
        required=False,
        help=
        'output file with intent-entity mapping (if it should be generated)')
    # NOTE: the backslash in "\\L" is escaped explicitly; the resulting help
    # text is unchanged, but the literal no longer relies on an invalid
    # escape sequence.
    parser.add_argument(
        '-ni',
        '--common_intents_nameCheck',
        action='append',
        nargs=2,
        help=
        "regex and replacement for intent name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase"
    )
    parser.add_argument(
        '-ne',
        '--common_entities_nameCheck',
        action='append',
        nargs=2,
        help=
        "regex and replacement for entity name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase"
    )
    # store_true already defaults to False; the flag is only tested for
    # truthiness below.
    parser.add_argument(
        '-s',
        '--soft',
        required=False,
        help=
        'soft name policy - change intents and entities names without error.',
        action='store_true',
        default=False)
    parser.add_argument('-v',
                        '--verbose',
                        required=False,
                        help='verbosity',
                        action='store_true')
    parser.add_argument('--log',
                        type=str.upper,
                        default=None,
                        choices=list(logging._levelToName.values()))
    args = parser.parse_args(argv)

    # logging is configured only when this module is run as a script
    if __name__ == '__main__':
        setLoggerConfig(args.log, args.verbose)

    NAME_POLICY = 'soft' if args.soft else 'hard'

    # domain / intent part -> {entity: 1}; the inner dicts are used as sets
    domEntMap = defaultdict(dict)
    intEntMap = defaultdict(dict)

    if args.sentences:
        # Captures the tag name of every <tag>text</...> occurrence.
        entityTagPattern = re.compile(r'<([^>]+)>[^<]+<\/[^>]+>')
        with openFile(args.sentences, "r") as sentencesFile:
            for line in sentencesFile.readlines():
                line = line.rstrip()
                if not line:
                    continue
                columns = line.split("\t")
                intentName = toIntentName(NAME_POLICY,
                                          args.common_intents_nameCheck,
                                          columns[1])
                intentText = columns[2]
                intentSplit = intentName.split("_", 1)
                domainPart = intentSplit[0]
                intentPart = intentSplit[1]
                for entity in entityTagPattern.findall(intentText):
                    domEntMap[domainPart][entity] = 1
                    intEntMap[intentPart][entity] = 1

    if args.domEnt:
        with openFile(args.domEnt, 'w') as domEntFile:
            for domain in sorted(domEntMap):
                entities = "NONE;" + "".join(
                    entity + ";" for entity in sorted(domEntMap[domain]))
                domEntFile.write(domain + ";" + entities + "\n")
        logger.debug("Domain-entity map '%s' was successfully created",
                     args.domEnt)

    # Guarded by (and reporting) args.intEnt - the file that is actually
    # written - rather than args.domEnt.
    if args.intEnt:
        with openFile(args.intEnt, 'w') as intEntFile:
            for intent in sorted(intEntMap):
                entities = "NONE;" + "".join(
                    entity + ";" for entity in sorted(intEntMap[intent]))
                intEntFile.write(intent + ";" + entities + "\n")
        logger.debug("Intent-entity map '%s' was successfully created",
                     args.intEnt)

    if args.list:
        with openFile(args.list, 'w') as listFile:
            # process entities: keep each name once, in order of first appearance
            entityNames = []
            seenEntityNames = set()
            for entityFileName in os.listdir(args.entitiesDir):
                entityName = toEntityName(NAME_POLICY,
                                          args.common_entities_nameCheck,
                                          os.path.splitext(entityFileName)[0])
                if entityName not in seenEntityNames:
                    seenEntityNames.add(entityName)
                    entityNames.append(entityName)
            for entityName in entityNames:
                listFile.write(entityName + ";\n")
        logger.debug("Entities list '%s' was successfully created", args.list)
Ejemplo n.º 6
0
                for line in entityFile.readlines():
                    # remove comments
                    line = line.split('#')[0]
                    line = line.rstrip().lower()
                    if line:
                        # create new system entity
                        entityJSON = {}
                        entityJSON['entity'] = line
                        entityJSON['values'] = []
                        entitiesJSON.append(entityJSON)

            # other entities
            else:
                entityName = toEntityName(
                    NAME_POLICY,
                    getattr(config, 'common_entities_nameCheck') if hasattr(
                        config, 'common_entities_nameCheck') else None,
                    entityName)

                # create new entity
                entityJSON = {}
                entityJSON['entity'] = entityName
                valuesJSON = []
                # add all values
                for line in entityFile.readlines():
                    # remove comments
                    line = line.split('#')[0]
                    line = line.strip()
                    if line:
                        rawSynonyms = line.split(';')
                        # strip and lower all items in line
Ejemplo n.º 7
0
        # process normal entity
        else:
            values = []
            # process all entity values
            for valueJSON in entityJSON["values"]:
                value = []
                value.append(valueJSON["value"].strip())
                # add all synonyms
                if 'synonyms' in valueJSON:
                    for synonym in valueJSON['synonyms']:
                        value.append(synonym.strip())
                values.append(value)
            # new entity file
            entityFileName = os.path.join(
                args.entitiesDir,
                toEntityName(NAME_POLICY, args.common_entities_nameCheck,
                             entityJSON["entity"])) + ".csv"
            with open(entityFileName, "w") as entityFile:
                for value in values:
                    entityFile.write(';'.join(value) + "\n")

    # write file with system entities
    with open(os.path.join(args.entitiesDir, "system_entities.csv"),
              'w') as systemEntitiesFile:
        systemEntitiesFile.write(
            "# a special list for the system entities - only one value at each line\n"
        )
        for systemEntity in systemEntities:
            systemEntitiesFile.write(systemEntity + "\n")

    if VERBOSE:
        printf("Entities from file '%s' were successfully extracted\n",