def main(argv):
    """ Split an intents .json file into one .csv file per intent.

    argv is the list of command line arguments to parse. Every example text
    is stripped and lower-cased and written on its own line of the file
    '<intentsDir>/<checked intent name>.csv'.
    """
    argParser = argparse.ArgumentParser(
        description='Decompose Bluemix conversation service intents in .json format to intent files in .csv format',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # positional arguments
    argParser.add_argument('intents', help='file with intents in .json format')
    argParser.add_argument('intentsDir', help='directory with intents files')
    # optional arguments
    argParser.add_argument(
        '-ni', '--common_intents_nameCheck', action='append', nargs=2,
        help="regex and replacement for intent name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase")
    argParser.add_argument(
        '-s', '--soft', required=False,
        help='soft name policy - change intents and entities names without error.',
        action='store_true', default="")
    argParser.add_argument('-v', '--verbose', required=False, help='verbosity', action='store_true')
    argParser.add_argument('--log', type=str.upper, default=None,
                           choices=list(logging._levelToName.values()))
    options = argParser.parse_args(argv)

    # The logger is configured only when this module runs as a script
    if __name__ == '__main__':
        setLoggerConfig(options.log, options.verbose)

    namePolicy = 'soft' if options.soft else 'hard'

    with openFile(options.intents, 'r') as sourceFile:
        for intentRecord in json.load(sourceFile):
            # All sentences are collected before the target file is opened
            sentences = [item["text"].strip().lower() for item in intentRecord["examples"]]
            checkedName = toIntentName(namePolicy, options.common_intents_nameCheck,
                                       intentRecord["intent"])
            targetPath = os.path.join(options.intentsDir, checkedName + ".csv")
            with openFile(targetPath, "w") as targetFile:
                for sentence in sentences:
                    targetFile.write(sentence + "\n")

    logger.verbose("Intents from file '%s' were successfully extracted\n", options.intents)
def convertDialogData(self, dialogData, intents):
    """ Build the 'nodes' XML element holding one 'node' per intent and return it.

    Intents whose data answer generateNodes() with a false value are left out.
    Each node carries a condition ('#' + intent name), an output element and,
    when present, a context element and a goto element.
    """
    rootXml = XML.Element('nodes')
    for intent in intents:
        data = dialogData.getIntentData(intent)
        if data.generateNodes():
            nodeName = toIntentName('soft', None, intent).decode('utf-8')
            intentXml = XML.Element('node', name=nodeName)

            # The condition always starts with exactly one leading '#'
            condition = intent.decode('utf-8')
            if not condition.startswith(u'#'):
                condition = u'#' + condition
            conditionXml = XML.Element('condition')
            conditionXml.text = condition
            intentXml.append(conditionXml)

            outputXml = self._createOutputElement(data.getChannelOutputs(),
                                                  data.getButtons())
            intentXml.append(outputXml)

            if data.getVariables():
                contextXml = self._createContextElement(data.getVariables())
                intentXml.append(contextXml)

            # A goto is emitted only when both target and selector are set
            if data.getJumpToTarget() and data.getJumpToSelector():
                gotoXml = self._createGotoElement(data.getJumpToTarget(),
                                                  data.getJumpToSelector())
                intentXml.append(gotoXml)

            rootXml.append(intentXml)
    return rootXml
def parseXLSXIntoDataBlocks(self, filename):
    """ Load an Excel workbook in T2C format and cut each sheet into data blocks.

    Only the first four columns are read. A run of consecutive rows that
    carry data forms one block; every finished block is handed over to
    __createBlock together with the domain (derived from the file name) and
    the prefix (the sheet title). Returns {} when the file is missing or is
    not a readable spreadsheet, otherwise nothing.
    """
    def cellText(cell):
        # Empty cells and cells commented out with '//' carry no data
        # NOTE(review): a non-text value (e.g. a number) has no startswith - confirm inputs are text only
        content = cell.value
        if content and not content.startswith('//'):
            return content.strip()
        return None

    printf('Processing xlsx file: %s\n', filename)
    if not os.path.exists(filename):
        eprintf('Error: File does not exist: %s\n', filename)
        return {}

    try:
        baseName = os.path.splitext(os.path.split(filename)[1])[0]
        domainName = unicode(toIntentName(NAME_POLICY, None, baseName), 'utf-8')
        workbook = load_workbook(filename=filename, read_only=True)
    except (IOError, BadZipfile):
        eprintf('Error: File does not seem to be a valid Excel spreadsheet: %s\n', filename)
        return {}

    # Every tab of the workbook is processed on its own
    for sheet in workbook.worksheets:
        printf(' Sheet: %s\n', sheet.title)
        prefix = unicode(sheet.title, 'utf-8')
        pendingRows = []
        for row in sheet.iter_rows(max_col=4):
            # All four columns are inspected before the verdict is made
            hasData = [bool(row[column] and row[column].value
                            and not row[column].value.startswith('//'))
                       for column in range(0, 4)]
            # Three slashes in the first cell comment out the whole row
            commentedOut = bool(row[0].value and row[0].value.startswith('///'))

            if any(hasData) and not commentedOut:
                pendingRows.append(tuple(cellText(row[column]) for column in range(0, 4)))
            elif pendingRows:
                # An empty or commented row closes the block collected so far
                self.__createBlock(domainName, prefix, pendingRows)
                pendingRows = []

        # The last block of the sheet has no closing row
        if pendingRows:
            self.__createBlock(domainName, prefix, pendingRows)
def __createBlock(self, domain, prefix, block):
    """ Register one raw data block under the intent name derived from its first cell.

    domain and prefix are combined with the derived name by toIntentName.
    block is a list of row tuples; a leading ':label' row is removed from it
    in place and the label is mapped to the resulting intent name.
    Blocks starting with '@' are passed to __handleEntityBlock instead.
    Blocks without data in the first cell are reported and skipped.
    """
    if not block or not block[0][0]:
        printf(
            'Warning: First cell of the data block does not contain any data. (domain=%s, prefix=%s)\n',
            domain, prefix)
        return

    # Check if there's a label
    label = None
    firstCell = block[0][0]
    if firstCell.startswith(u':') and len(block) > 1:
        label = firstCell[1:]
        if label in self._labelsMap:
            printf(
                'Warning: Found a label that has already been assigned to an intent and will be overwritten. Label: %s\n',
                label)
        # The label row is not part of the block data
        del block[0]
        if not block or not block[0][0]:
            printf(
                'WARNING: First cell of the goto block does not contain any data. (domain=%s, prefix=%s, label=%s)\n',
                domain, prefix, label)
            return
        firstCell = block[0][0]

    # If it's entity block, load the entity
    if firstCell.startswith(u'@'):
        self.__handleEntityBlock(block)
        return

    # Check the intent name
    conditionHasX = Dialog.X_PLACEHOLDER in firstCell
    intentName = firstCell
    if self.__isConditionBlock(firstCell):
        # A one-row condition block has no second row to take the placeholder value from
        if conditionHasX and len(block) > 1 and block[1][0]:
            intentName = re.sub(Dialog.X_PLACEHOLDER, block[1][0], firstCell)
    elif firstCell.startswith(u'#'):
        intentName = firstCell[1:]
    else:
        # Create intent name from first sentence by replacing all spaces with
        # underscores and removing accents, commas and slashes.
        # str.replace is used because the fourth positional argument of re.sub
        # is 'count', so passing re.UNICODE there capped the replacements at 32.
        underscored = unidecode.unidecode(intentName).replace(' ', '_')
        intentName = re.sub("[/,?']", '', underscored)

    # check intent name
    fullIntentName = toIntentName(NAME_POLICY, None, domain, prefix, intentName)
    self._dialogData.getIntentData(fullIntentName, domain)
    self._dataBlocks.append((domain, prefix, fullIntentName, block))
    if label:
        self._labelsMap[label] = fullIntentName.decode('utf-8')
def createUniqueIntentName(self, intent_name):
    """ Derive an intent name that is not yet present in self._intents.

    The given string is first normalized by toIntentName. If the normalized
    name is already taken, the numbers 0..9999 are appended in turn until a
    free name is found.

    :returns unique intent name, or None when every candidate is taken
    """
    # Normalize the string
    base_name = toIntentName(self._NAME_POLICY, [['$special', '\\A']], intent_name)
    if base_name not in self._intents:
        return base_name

    # Try numeric suffixes; candidates are built lazily, one at a time
    candidates = (base_name + str(suffix) for suffix in range(0, 10000))
    return next((name for name in candidates if name not in self._intents), None)
def parseXLSXIntoDataBlocks(self, filename):
    """ Load an Excel workbook in T2C format and cut each sheet into data blocks.

    This is the first pass over a single input file. Only the first four
    columns are read and every kept cell is stripped and passed through
    escape(). A run of consecutive rows that carry data forms one block;
    every finished block is handed over to __createBlock together with the
    domain (derived from the file name) and the prefix (the sheet title).
    Returns {} when the file is missing or is not a readable spreadsheet,
    otherwise nothing.
    """
    def toText(value):
        # unicode() exists on Python 2 only; Python 3 falls back to str()
        try:
            return unicode(value, 'utf-8')
        except NameError:
            return str(value)

    def cellText(cell):
        # Empty cells and cells commented out with '//' carry no data
        # NOTE(review): a non-text value (e.g. a number) has no startswith - confirm inputs are text only
        content = cell.value
        if content and not content.startswith('//'):
            return escape(content.strip())
        return None

    logger.info('Processing xlsx file: %s', filename)
    if not os.path.exists(filename):
        logger.error('File does not exist: %s', filename)
        return {}

    # The domain name comes from the file name and follows the intent naming policy
    try:
        baseName = os.path.splitext(os.path.split(filename)[1])[0]
        domainName = toText(toIntentName(self._NAME_POLICY, None, baseName))
        workbook = load_workbook(filename=filename, read_only=True)
    except (IOError, BadZipfile):
        logger.error('File does not seem to be a valid Excel spreadsheet: %s', filename)
        return {}

    # Every tab of the workbook is processed on its own
    for sheet in workbook.worksheets:
        # The sheet title serves as the prefix
        logger.info(' Sheet: %s', sheet.title)
        prefix = toText(sheet.title)
        pendingRows = []  # each sheet starts with an empty block
        for row in sheet.iter_rows(max_col=4):
            # All four columns are inspected before the verdict is made
            hasData = [bool(row[column] and row[column].value
                            and not row[column].value.startswith('//'))
                       for column in range(0, 4)]
            # Three slashes in the first cell comment out the whole row
            commentedOut = bool(row[0].value and row[0].value.startswith('///'))

            if any(hasData) and not commentedOut:
                pendingRows.append(tuple(cellText(row[column]) for column in range(0, 4)))
            elif pendingRows:
                # An empty or commented row closes the block collected so far
                self.__createBlock(domainName, prefix, pendingRows)
                pendingRows = []

        # The last block of the sheet has no closing row
        if pendingRows:
            self.__createBlock(domainName, prefix, pendingRows)
def main(argv):
    """ Merge all intent files of a directory into one file in NLU .tsv format.

    argv is the list of command line arguments to parse. Every non-empty
    example line is written as '1<TAB><intent name><TAB><example>'. The
    intent name is built from the optional prefix and the file name without
    extension. Optionally a list of intent names (--list) and a domain to
    intents map (--map) are written as well.
    """
    parser = argparse.ArgumentParser(
        description='Converts intents files to one file in NLU tsv format',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # positional arguments
    parser.add_argument('intentsDir', help='directory with intents files - all of them will be included in output file')
    parser.add_argument('output', help='file with output intents in NLU data .tsv format')
    # optional arguments
    parser.add_argument('-e', '--entityDir', required=False, help='directory with lists of entities in csv files (file names = entity names), used to tag those entities in output')
    parser.add_argument('-l', '--list', required=False, help='file with list of all intents (if it should be generated)')
    parser.add_argument('-m', '--map', required=False, help='file with domain to intents map (if it should be generated)')
    parser.add_argument('-p', '--prefix', required=False, help='prefix for all generated intents (if it should be added)')
    # '\\L' spells the same two characters as the former invalid escape '\L'
    parser.add_argument('-ni', '--common_intents_nameCheck', action='append', nargs=2, help="regex and replacement for intent name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase")
    parser.add_argument('-ne', '--common_entities_nameCheck', action='append', nargs=2, help="regex and replacement for entity name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase")
    parser.add_argument('-s', '--soft', required=False, help='soft name policy - change intents and entities names without error.', action='store_true', default="")
    parser.add_argument('-v', '--verbose', required=False, help='verbosity', action='store_true', default="")
    parser.add_argument('--log', type=str.upper, default=None, choices=list(logging._levelToName.values()))
    args = parser.parse_args(argv)

    # The logger is configured only when this module runs as a script
    if __name__ == '__main__':
        setLoggerConfig(args.log, args.verbose)

    NAME_POLICY = 'soft' if args.soft else 'hard'
    PREFIX = toIntentName(NAME_POLICY, args.common_intents_nameCheck, args.prefix)

    if args.entityDir:
        entities = getEntities(args.entityDir, args.common_entities_nameCheck, NAME_POLICY)

    with openFile(args.output, 'w') as outputFile:
        # process intents
        intentNames = []
        for intentFileName in os.listdir(args.intentsDir):
            intentName = toIntentName(NAME_POLICY, args.common_intents_nameCheck, PREFIX,
                                      os.path.splitext(intentFileName)[0])
            if intentName not in intentNames:
                intentNames.append(intentName)
            with open(os.path.join(args.intentsDir, intentFileName), "r") as intentFile:
                for line in intentFile:
                    # remove comments
                    line = line.split('#')[0]
                    if args.entityDir:
                        line = tagEntities(line, entities)
                    # Cutting a comment also cuts the line terminator, which
                    # used to glue the next record onto the same output line.
                    # The terminator is therefore normalized here.
                    line = line.rstrip() if line else ''
                    if line:
                        outputFile.write("1\t" + intentName + "\t" + line + "\n")
    logger.verbose("Intents file '%s' was successfully created", args.output)

    if args.list:
        with openFile(args.list, 'w') as intentsListFile:
            for intentName in intentNames:
                intentsListFile.write(intentName + "\n")
        logger.verbose("Intents list '%s' was successfully created", args.list)

    if args.map:
        domIntMap = {}
        for intentName in intentNames:
            # The part before the first underscore is taken as the domain
            # NOTE(review): a name without '_' raises IndexError here - confirm names always contain a domain part
            intentSplit = intentName.split("_", 1)
            domainPart = intentSplit[0]
            intentPart = intentSplit[1]
            domIntMap.setdefault(domainPart, []).append(intentPart)
        with openFile(args.map, 'w') as intentsMapFile:
            for domainPart, intentParts in domIntMap.items():
                # Output line format: domain;intent1;intent2...
                intentsMapFile.write(domainPart + "".join(";" + part for part in intentParts) + "\n")
        # Report the map file itself (the message used to name args.output)
        logger.verbose("Domain-intent map '%s' was successfully created", args.map)
print('generated_intents parameter is not defined, ignoring') if not hasattr(config, 'common_outputs_intents'): print( 'Outputs_intents parameter is not defined, output will be generated to console.' ) intents = [] pathList = getattr(config, 'common_intents') if hasattr(config, 'common_generated_intents'): pathList = pathList + getattr(config, 'common_generated_intents') filesAtPath = getFilesAtPath(pathList) for intentFileName in filesAtPath: intentName = toIntentName( NAME_POLICY, args.common_intents_nameCheck, os.path.splitext(os.path.basename(intentFileName))[0]) with codecs.open(intentFileName, encoding='utf8') as intentFile: intent = {} intent['intent'] = intentName examples = [] for line in intentFile: # remove comments line = line.split('#')[0] line = line.rstrip().lower() if line and not line in examples: examples.append(line) elif line in examples: printf( 'Example used twice for the intent %s, omitting:%s /n', intentName, line)
required=False, help='verbosity', action='store_true') args = parser.parse_args(sys.argv[1:]) VERBOSE = args.verbose if args.soft: NAME_POLICY = 'soft' else: NAME_POLICY = 'hard' with open(args.intents, 'r') as intentsFile: intentsJSON = json.load(intentsFile) # process all intents for intentJSON in intentsJSON: examples = [] # process all example sentences for exampleJSON in intentJSON["examples"]: examples.append(exampleJSON["text"].strip().lower()) # new intent file intentFileName = os.path.join( args.intentsDir, toIntentName(NAME_POLICY, args.common_intents_nameCheck, intentJSON["intent"]) + ".csv") with open(intentFileName, "w") as intentFile: for example in examples: intentFile.write((example + "\n").encode('utf8')) if VERBOSE: printf("Intents from file '%s' were successfully extracted\n", args.intents)
def main(argv):
    """ Convert intent .csv files into one .json document with all intents.

    argv is the list of command line arguments to parse; they are wrapped in
    a Cfg object. Each input file yields one intent named after the file
    (without extension); its lines are lower-cased, stripped of '#' comments
    and passed through processExample. The result is written to the
    configured output file, or printed when no output file/directory is set.
    """
    argParser = argparse.ArgumentParser(
        description='Converts intent csv files to .json format of Watson Conversation Service',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    argParser.add_argument('-c', '--common_configFilePaths', help='configuaration file', action='append')
    argParser.add_argument('-oc', '--common_output_config', help='output configuration file')
    argParser.add_argument(
        '-ii', '--common_intents',
        help='directory with intent csv files to be processed (all of them will be included in output json)',
        action='append')
    # -gi is functionally equivalent to -ii
    argParser.add_argument(
        '-gi', '--common_generated_intents',
        help='directory with generated intent csv files to be processed (all of them will be included in output json)',
        action='append')
    argParser.add_argument(
        '-od', '--common_outputs_directory', required=False,
        help='directory where the otputs will be stored (outputs is default)')
    argParser.add_argument('-oi', '--common_outputs_intents', help='file with output json with all the intents')
    argParser.add_argument(
        '-ni', '--common_intents_nameCheck', action='append', nargs=2,
        help="regex and replacement for intent name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase")
    argParser.add_argument(
        '-s', '--soft', required=False,
        help='soft name policy - change intents and entities names without error.',
        action='store_true', default="")
    argParser.add_argument('-v', '--verbose', required=False, help='verbosity', action='store_true')
    argParser.add_argument('--log', type=str.upper, default=None,
                           choices=list(logging._levelToName.values()))
    args = argParser.parse_args(argv)

    # The logger is configured only when this module runs as a script
    if __name__ == '__main__':
        setLoggerConfig(args.log, args.verbose)

    config = Cfg(args)
    namePolicy = 'soft' if args.soft else 'hard'
    scriptName = os.path.basename(__file__)
    logger.info('STARTING: ' + scriptName)

    # Report every parameter that is missing from the configuration
    missingNotes = (
        ('common_intents', 'intents parameter is not defined.'),
        ('common_generated_intents', 'generated_intents parameter is not defined, ignoring'),
        ('common_outputs_intents', 'Outputs_intents parameter is not defined, output will be generated to console.'),
    )
    for attribute, note in missingNotes:
        if not hasattr(config, attribute):
            logger.info(note)

    sourceDirs = getattr(config, 'common_intents')
    if hasattr(config, 'common_generated_intents'):
        sourceDirs = sourceDirs + getattr(config, 'common_generated_intents')

    intents = []
    for intentPath in sorted(getFilesAtPath(sourceDirs)):
        fileStem = os.path.splitext(os.path.basename(intentPath))[0]
        intentName = toIntentName(namePolicy, args.common_intents_nameCheck, fileStem)
        examples = []
        with openFile(intentPath, 'r', encoding='utf8') as intentFile:
            for rawLine in intentFile:
                # Drop the comment part, trailing whitespace and letter case
                text = rawLine.split('#')[0].rstrip().lower()
                if not text:
                    continue
                example = processExample(text, intentName, examples)
                if example:
                    examples.append(example)
        intents.append({'intent': intentName, 'examples': examples})

    serialized = json.dumps(intents, indent=4, ensure_ascii=False)
    if hasattr(config, 'common_outputs_directory') and hasattr(config, 'common_outputs_intents'):
        outputDir = getattr(config, 'common_outputs_directory')
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)
            logger.info('Created new output directory ' + outputDir)
        outputPath = os.path.join(outputDir, getattr(config, 'common_outputs_intents'))
        with codecs.open(outputPath, 'w', encoding='utf8') as outputFile:
            outputFile.write(serialized)
    else:
        print(serialized)

    logger.info('FINISHING: ' + scriptName)
parser.add_argument('-v', '--verbose', required=False, help='verbosity', action='store_true') args = parser.parse_args(sys.argv[1:]) VERBOSE = args.verbose if args.soft: NAME_POLICY = 'soft' else: NAME_POLICY = 'hard' domEntMap = defaultdict(dict) intEntMap = defaultdict(dict) if args.sentences: with open(args.sentences, "r") as sentencesFile: for line in sentencesFile.readlines(): line = line.rstrip() if not line: continue intentName = toIntentName(NAME_POLICY, args.common_intents_nameCheck, line.split("\t")[1]) intentText = line.split("\t")[2] intentSplit = intentName.split("_",1) domainPart = intentSplit[0] intentPart = intentSplit[1] for entity in re.findall('<([^>]+)>[^<]+<\/[^>]+>', intentText): domEntMap[domainPart][entity] = 1 intEntMap[intentPart][entity] = 1 if args.domEnt: with open(args.domEnt, 'w') as domEntFile: for domain in sorted(domEntMap.keys()): entities="NONE;" for entity in sorted(domEntMap[domain].keys()): entities += entity + ";" domEntFile.write(domain + ";" + entities + "\n")
def main(argv):
    """ Derive domain-entity and intent-entity mappings from an NLU .tsv file.

    argv is the list of command line arguments to parse. In the sentences
    file the second column holds the intent name and the third column the
    example text with entities tagged as '<entity>value</entity>'. The part
    of the intent name before the first underscore is the domain, the rest
    is the intent. Each requested output (--domEnt, --intEnt, --list) is
    written only when its own option is given.
    """
    parser = argparse.ArgumentParser(
        description='convert NLU tsv files into domain-entity and intent-entity mappings.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # positional arguments
    parser.add_argument('entitiesDir', help='directory with entities files - all of them will be included in output list if specified')
    # optional arguments
    parser.add_argument('-is', '--sentences', help='.tsv file in NLU format with tagged entities in example sentences in third column and intent names in second column')
    parser.add_argument('-l', '--list', required=False, help='output file with list of all entities (if it should be generated)')
    parser.add_argument('-d', '--domEnt', required=False, help='output file with domain-entity mapping (if it should be generated)')
    parser.add_argument('-i', '--intEnt', required=False, help='output file with intent-entity mapping (if it should be generated)')
    # '\\L' spells the same two characters as the former invalid escape '\L'
    parser.add_argument('-ni', '--common_intents_nameCheck', action='append', nargs=2, help="regex and replacement for intent name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase")
    parser.add_argument('-ne', '--common_entities_nameCheck', action='append', nargs=2, help="regex and replacement for entity name check, e.g. '-' '_' for to replace hyphens for underscores or '$special' '\\L' for lowercase")
    parser.add_argument('-s', '--soft', required=False, help='soft name policy - change intents and entities names without error.', action='store_true', default="")
    parser.add_argument('-v', '--verbose', required=False, help='verbosity', action='store_true')
    parser.add_argument('--log', type=str.upper, default=None, choices=list(logging._levelToName.values()))
    args = parser.parse_args(argv)

    # The logger is configured only when this module runs as a script
    if __name__ == '__main__':
        setLoggerConfig(args.log, args.verbose)

    NAME_POLICY = 'soft' if args.soft else 'hard'

    def writeEntityMap(path, entityMap):
        # One line per key: 'key;NONE;entity1;entity2;...', everything sorted
        with openFile(path, 'w') as mapFile:
            for key in sorted(entityMap):
                entities = "NONE;" + "".join(entity + ";" for entity in sorted(entityMap[key]))
                mapFile.write(key + ";" + entities + "\n")

    domEntMap = defaultdict(set)
    intEntMap = defaultdict(set)

    if args.sentences:
        # Opening tag name followed by a non-empty value and any closing tag
        taggedEntity = re.compile(r'<([^>]+)>[^<]+</[^>]+>')
        with openFile(args.sentences, "r") as sentencesFile:
            for line in sentencesFile:
                line = line.rstrip()
                if not line:
                    continue
                # NOTE(review): fewer than three columns, or an intent name without '_',
                # still raises IndexError - confirm the input format guarantees both
                columns = line.split("\t")
                intentName = toIntentName(NAME_POLICY, args.common_intents_nameCheck, columns[1])
                intentText = columns[2]
                intentSplit = intentName.split("_", 1)
                domainPart = intentSplit[0]
                intentPart = intentSplit[1]
                for entity in taggedEntity.findall(intentText):
                    domEntMap[domainPart].add(entity)
                    intEntMap[intentPart].add(entity)

    if args.domEnt:
        writeEntityMap(args.domEnt, domEntMap)
        logger.debug("Domain-entity map '%s' was successfully created", args.domEnt)

    # The intent-entity map depends on its own option; it used to be guarded
    # (and reported) by args.domEnt, which broke every run where only one of
    # the two options was given.
    if args.intEnt:
        writeEntityMap(args.intEnt, intEntMap)
        logger.debug("Intent-entity map '%s' was successfully created", args.intEnt)

    if args.list:
        with openFile(args.list, 'w') as listFile:
            # process entities
            entityNames = []
            for entityFileName in os.listdir(args.entitiesDir):
                entityName = toEntityName(NAME_POLICY, args.common_entities_nameCheck,
                                          os.path.splitext(entityFileName)[0])
                if entityName not in entityNames:
                    entityNames.append(entityName)
            for entityName in entityNames:
                listFile.write(entityName + ";\n")
        logger.debug("Entities list '%s' was successfully created", args.list)
required=False, help= 'soft name policy - change intents and entities names without error.', action='store_true', default="") parser.add_argument('-v', '--verbose', required=False, help='verbosity', action='store_true', default="") args = parser.parse_args(sys.argv[1:]) VERBOSE = args.verbose NAME_POLICY = 'soft' if args.soft else 'hard' PREFIX = toIntentName(NAME_POLICY, args.common_intents_nameCheck, args.prefix) if args.entityDir: entities = getEntities(args.entityDir, NAME_POLICY) with open(args.output, 'w') as outputFile: # process intents intentNames = [] for intentFileName in os.listdir(args.intentsDir): intentName = toIntentName(NAME_POLICY, args.common_intents_nameCheck, PREFIX, os.path.splitext(intentFileName)[0]) if intentName not in intentNames: intentNames.append(intentName) with open(os.path.join(args.intentsDir, intentFileName), "r") as intentFile: