def getEntities(entityDir, entitiesNameCheck, NAME_POLICY):
    """Retrieves entity value to entity name mapping from the directory with entity lists.

    Every file in `entityDir` is read as one entity list: the file name
    without its extension is passed through `toEntityName` to obtain the
    entity name, and each line holds `;`-separated values of that entity.
    Everything after a `#` on a line is treated as a comment and ignored.

    :param entityDir: directory containing the entity list files
    :param entitiesNameCheck: name-check rules forwarded to `toEntityName`
    :param NAME_POLICY: name policy forwarded to `toEntityName`
    :returns dict mapping each lower-cased, stripped entity value to its
        entity name; a value seen again later overwrites the earlier entry
    """
    entities = {}
    for entityFileName in os.listdir(entityDir):
        entityName = toEntityName(NAME_POLICY, entitiesNameCheck,
                                  os.path.splitext(entityFileName)[0])
        with openFile(os.path.join(entityDir, entityFileName), "r") as entityFile:
            for line in entityFile:
                # remove comments
                line = line.split('#')[0].lower()
                for entity in line.split(';'):
                    entity = entity.strip()
                    # blank lines, comment-only lines and a trailing ';' all
                    # yield empty items - those are not entity values and must
                    # not end up as an '' key in the mapping
                    if entity:
                        entities[entity] = entityName
    return entities
def createUniqueEntityName(self, entity_name):
    """Returns an entity name derived from entity_name that is not yet in use.

    The given string is first normalized by `toEntityName` (using this
    object's name policy and the `['$special', '\\A']` rule). If the
    normalized name is not among `self._entities` it is returned as is;
    otherwise the numbers 0..9999 are appended one after another until a
    free name is found.

    :param entity_name: string the entity name should be based on
    :returns unique entity name, or None if all 10000 numbered variants
        are already taken
    """
    base_name = toEntityName(self._NAME_POLICY, [['$special', '\\A']], entity_name)
    if base_name in self._entities:
        # the plain name is taken - walk through numbered variants lazily
        # and pick the first one that is still free
        numbered = (base_name + repr(suffix) for suffix in range(0, 10000))
        return next(
            (candidate for candidate in numbered if candidate not in self._entities),
            None)
    return base_name
def _entityValueToRow(valueJSON):
    """Builds the list of .csv cells written for one entity value."""
    name = valueJSON["value"].strip()
    row = []
    # synonyms entities
    if 'synonyms' in valueJSON:
        row.append(name)
        # empty-string synonyms are ignored when exported from WA json
        row.extend(synonym.strip() for synonym in valueJSON['synonyms']
                   if synonym.strip() != '')
    # for pattern entities add tilde to the value
    if 'patterns' in valueJSON:
        row.append("~" + name)
        row.extend(pattern.strip() for pattern in valueJSON["patterns"])
    if not row:
        # neither 'synonyms' nor 'patterns' key is present: without this
        # fallback the value name would be lost and a blank line written
        row.append(name)
    return row


def main(argv):
    """Decomposes an entities .json file into one .csv file per entity.

    Entities whose name starts with "sys-" (case-insensitive, surrounding
    whitespace ignored) are collected into `system_entities.csv`, one name
    per line. Every other entity gets its own `<entity name>.csv` file in
    the output directory, one line per value with cells joined by ';'.

    :param argv: list of command line arguments (without the program name)
    """
    parser = argparse.ArgumentParser(
        description=
        'Decompose Bluemix conversation service entities in .json format to entity files in .csv format',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # positional arguments
    parser.add_argument('entities', help='file with entities in .json format')
    parser.add_argument('entitiesDir', help='directory with entities files')
    # optional arguments
    parser.add_argument(
        '-ne',
        '--common_entities_nameCheck',
        action='append',
        nargs=2,
        help=
        "regex and replacement for entity name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase"
    )
    parser.add_argument(
        '-s',
        '--soft',
        required=False,
        help=
        'soft name policy - change intents and entities names without error.',
        action='store_true',
        default=False)
    parser.add_argument('-v',
                        '--verbose',
                        required=False,
                        help='verbosity',
                        action='store_true')
    parser.add_argument('--log',
                        type=str.upper,
                        default=None,
                        choices=list(logging._levelToName.values()))
    args = parser.parse_args(argv)

    # logging is only configured when this module is run as a script
    if __name__ == '__main__':
        setLoggerConfig(args.log, args.verbose)

    NAME_POLICY = 'soft' if args.soft else 'hard'

    with openFile(args.entities, 'r') as entitiesFile:
        entitiesJSON = json.load(entitiesFile)

    systemEntities = []
    # process all entities
    for entityJSON in entitiesJSON:
        # process system entity
        if entityJSON["entity"].strip().lower().startswith("sys-"):
            # issue #82: make entity name check parameter-dependent;
            # until then the system entity name is stored unchanged
            systemEntities.append(entityJSON["entity"])
            continue

        # process normal entity - one row per entity value
        values = [_entityValueToRow(valueJSON)
                  for valueJSON in entityJSON["values"]]

        # new entity file
        entityFileName = os.path.join(
            args.entitiesDir,
            toEntityName(NAME_POLICY, args.common_entities_nameCheck,
                         entityJSON["entity"])) + ".csv"
        with openFile(entityFileName, "w") as entityFile:
            for value in values:
                entityFile.write(';'.join(value) + "\n")

    # write file with system entities
    with openFile(os.path.join(args.entitiesDir, "system_entities.csv"),
                  'w') as systemEntitiesFile:
        systemEntitiesFile.write(
            "# a special list for the system entities - only one value at each line\n"
        )
        for systemEntity in systemEntities:
            systemEntitiesFile.write(systemEntity + "\n")

    logger.verbose("Entities from file '%s' were successfully extracted\n",
                   args.entities)
# write the domain-entity map: one "<domain>;NONE;<entity>;...;" line per domain
if args.domEnt:
    with open(args.domEnt, 'w') as domEntFile:
        for domain in sorted(domEntMap.keys()):
            entities="NONE;"
            for entity in sorted(domEntMap[domain].keys()):
                entities += entity + ";"
            domEntFile.write(domain + ";" + entities + "\n")
    if VERBOSE: printf("Domain-entity map '%s' was successfully created\n", args.domEnt)

# write the intent-entity map: one "<intent>;NONE;<entity>;...;" line per intent
# (guarded by - and reporting - args.intEnt, the file that is actually written)
if args.intEnt:
    with open(args.intEnt, 'w') as intEntFile:
        for intent in sorted(intEntMap.keys()):
            entities="NONE;"
            for entity in sorted(intEntMap[intent].keys()):
                entities += entity + ";"
            intEntFile.write(intent + ";" + entities + "\n")
    if VERBOSE: printf("Intent-entity map '%s' was successfully created\n", args.intEnt)

# write the list of all entity names found in the entities directory
if args.list:
    with open(args.list, 'w') as listFile:
        # process entities
        entityNames = []
        for entityFileName in os.listdir(args.entitiesDir):
            entityName = toEntityName(NAME_POLICY, args.common_entities_nameCheck , os.path.splitext(entityFileName)[0])
            # several files may map to the same entity name - list it once
            if entityName not in entityNames:
                entityNames.append(entityName)
        for entityName in entityNames:
            listFile.write(entityName + ";\n")
    if VERBOSE: printf("Entities list '%s' was successfully created\n", args.list)
def _writeEntityMap(fileName, entityMap):
    """Writes one "<key>;NONE;<entity>;...;" line per key, keys and entities sorted."""
    with openFile(fileName, 'w') as mapFile:
        for key in sorted(entityMap):
            entities = "NONE;" + "".join(
                entity + ";" for entity in sorted(entityMap[key]))
            mapFile.write(key + ";" + entities + "\n")


def main(argv):
    """Converts an NLU .tsv file into domain-entity and intent-entity mappings.

    Each non-empty line of the sentences file is split on tabs; the second
    column is the intent name (normalized by `toIntentName`) and the third
    column is the example sentence with entities tagged as `<name>text</name>`.
    The intent name is split at its first '_' into a domain part and an
    intent part, and every tagged entity is recorded for both.

    Depending on the options given, the domain-entity map, the intent-entity
    map and a list of all entity names found in the entities directory are
    written.

    NOTE: a non-empty line with fewer than three tab-separated columns, or an
    intent name without '_', raises IndexError.

    :param argv: list of command line arguments (without the program name)
    """
    parser = argparse.ArgumentParser(
        description=
        'convert NLU tsv files into domain-entity and intent-entity mappings.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # positional arguments
    parser.add_argument(
        'entitiesDir',
        help=
        'directory with entities files - all of them will be included in output list if specified'
    )
    # optional arguments
    parser.add_argument(
        '-is',
        '--sentences',
        help=
        '.tsv file in NLU format with tagged entities in example sentences in third column and intent names in second column'
    )
    parser.add_argument(
        '-l',
        '--list',
        required=False,
        help='output file with list of all entities (if it should be generated)'
    )
    parser.add_argument(
        '-d',
        '--domEnt',
        required=False,
        help=
        'output file with domain-entity mapping (if it should be generated)')
    parser.add_argument(
        '-i',
        '--intEnt',
        required=False,
        help=
        'output file with intent-entity mapping (if it should be generated)')
    parser.add_argument(
        '-ni',
        '--common_intents_nameCheck',
        action='append',
        nargs=2,
        help=
        "regex and replacement for intent name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase"
    )
    parser.add_argument(
        '-ne',
        '--common_entities_nameCheck',
        action='append',
        nargs=2,
        help=
        "regex and replacement for entity name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase"
    )
    parser.add_argument(
        '-s',
        '--soft',
        required=False,
        help=
        'soft name policy - change intents and entities names without error.',
        action='store_true',
        default=False)
    parser.add_argument('-v',
                        '--verbose',
                        required=False,
                        help='verbosity',
                        action='store_true')
    parser.add_argument('--log',
                        type=str.upper,
                        default=None,
                        choices=list(logging._levelToName.values()))
    args = parser.parse_args(argv)

    # logging is only configured when this module is run as a script
    if __name__ == '__main__':
        setLoggerConfig(args.log, args.verbose)

    NAME_POLICY = 'soft' if args.soft else 'hard'

    # maps are used as "sets": domain/intent -> {entity: 1}
    domEntMap = defaultdict(dict)
    intEntMap = defaultdict(dict)

    if args.sentences:
        # raw string keeps the pattern byte-identical without relying on
        # invalid escape sequences in a normal string literal
        entityTag = re.compile(r'<([^>]+)>[^<]+<\/[^>]+>')
        with openFile(args.sentences, "r") as sentencesFile:
            for line in sentencesFile:
                line = line.rstrip()
                if not line:
                    continue
                columns = line.split("\t")
                intentName = toIntentName(NAME_POLICY,
                                          args.common_intents_nameCheck,
                                          columns[1])
                intentText = columns[2]
                intentSplit = intentName.split("_", 1)
                domainPart = intentSplit[0]
                intentPart = intentSplit[1]
                for entity in entityTag.findall(intentText):
                    domEntMap[domainPart][entity] = 1
                    intEntMap[intentPart][entity] = 1

    if args.domEnt:
        _writeEntityMap(args.domEnt, domEntMap)
        logger.debug("Domain-entity map '%s' was successfully created",
                     args.domEnt)

    # the intent-entity map is controlled by its own option, not by --domEnt
    if args.intEnt:
        _writeEntityMap(args.intEnt, intEntMap)
        logger.debug("Intent-entity map '%s' was successfully created",
                     args.intEnt)

    if args.list:
        with openFile(args.list, 'w') as listFile:
            # process entities - keep first-seen order, list each name once
            entityNames = []
            seenNames = set()
            for entityFileName in os.listdir(args.entitiesDir):
                entityName = toEntityName(NAME_POLICY,
                                          args.common_entities_nameCheck,
                                          os.path.splitext(entityFileName)[0])
                if entityName not in seenNames:
                    seenNames.add(entityName)
                    entityNames.append(entityName)
            for entityName in entityNames:
                listFile.write(entityName + ";\n")
        logger.debug("Entities list '%s' was successfully created", args.list)
for line in entityFile.readlines(): # remove comments line = line.split('#')[0] line = line.rstrip().lower() if line: # create new system entity entityJSON = {} entityJSON['entity'] = line entityJSON['values'] = [] entitiesJSON.append(entityJSON) # other entities else: entityName = toEntityName( NAME_POLICY, getattr(config, 'common_entities_nameCheck') if hasattr( config, 'common_entities_nameCheck') else None, entityName) # create new entity entityJSON = {} entityJSON['entity'] = entityName valuesJSON = [] # add all values for line in entityFile.readlines(): # remove comments line = line.split('#')[0] line = line.strip() if line: rawSynonyms = line.split(';') # strip and lower all items in line
# process normal entity else: values = [] # process all entity values for valueJSON in entityJSON["values"]: value = [] value.append(valueJSON["value"].strip()) # add all synonyms if 'synonyms' in valueJSON: for synonym in valueJSON['synonyms']: value.append(synonym.strip()) values.append(value) # new entity file entityFileName = os.path.join( args.entitiesDir, toEntityName(NAME_POLICY, args.common_entities_nameCheck, entityJSON["entity"])) + ".csv" with open(entityFileName, "w") as entityFile: for value in values: entityFile.write(';'.join(value) + "\n") # write file with system entities with open(os.path.join(args.entitiesDir, "system_entities.csv"), 'w') as systemEntitiesFile: systemEntitiesFile.write( "# a special list for the system entities - only one value at each line\n" ) for systemEntity in systemEntities: systemEntitiesFile.write(systemEntity + "\n") if VERBOSE: printf("Entities from file '%s' were successfully extracted\n",