def __init__(self, configDictionary):
    """Initialize the ML brain module.

    Reads storage/dictionary paths from configuration, builds the gensim
    dictionary manager and loads previously persisted brain data.
    Exits the process when a required path is missing.
    """
    super(MLBrain, self).__init__(configDictionary)
    self.profile = {"name": "mlbrain-module", "class": "mlbrain"}
    # Sibling modules; wired up later (not in this constructor).
    self.nlp_module = None
    self.concept_module = None
    self.data_storage_path = utils.getKeyFromSectionInConfiguration(
        'brain', 'data_storage_path', None, configDictionary)
    self.dictionary_data_source_path = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'dictionary_data_source_path', None,
        configDictionary)
    self.dictionary_output_path = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'dictionary_output_path', None, configDictionary)
    # Fatal: without an output path the dictionary file cannot be written.
    # ('is None' instead of '== None' — identity check per PEP 8.)
    if self.dictionary_output_path is None:
        print(
            '**** ERROR: No Dictionary output path defined in bender-training section of config-file.'
        )
        sys.exit(1)
    self.dict_filename = os.path.join(self.dictionary_output_path,
                                      'dictionary.dict')
    self.remove_stop_words = int(
        utils.getKeyFromSectionInConfiguration('bender-training',
                                               'remove_stop_words', '0',
                                               configDictionary))
    self.language = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'data_language_short', 'en', configDictionary)
    self.stop_words = get_stop_words(self.language)
    utils.safe_create_directory(self.dictionary_output_path)
    if self.data_storage_path is None:
        print('**** ERROR: No data storage path specified. Exiting!')
        sys.exit(1)
    self.dictionary_manager = GensimDictionary(
        self.dictionary_data_source_path, self.dict_filename,
        self.remove_stop_words, self.stop_words)
    self._loadData()
def __init__(self, configDictionary):
    """Hunspell-based spelling module.

    Loads the .dic/.aff files named in the 'spelling' config section,
    creates the speller and the sentence tokenizer, then optionally
    trains extra words from a file. Exits on missing configuration.
    """
    super(HunSpelling, self).__init__(configDictionary)
    self.profile = {
        "name": "hunspelling-module",
        "class": "spelling",
        "supported-languages": ["de", "en", "tr"]
    }
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
    self.dict_file = utils.getKeyFromSectionInConfiguration(
        'spelling', 'spelling-dict-file', None, configDictionary)
    if self.dict_file is None:
        print('*** Missing spelling-dict-file in configuration. Exiting.')
        sys.exit(1)
    self.aff_file = utils.getKeyFromSectionInConfiguration(
        'spelling', 'spelling-aff-file', None, configDictionary)
    # BUG FIX: the original re-checked self.dict_file here, so a missing
    # aff file slipped through to the HunSpell constructor.
    if self.aff_file is None:
        print('*** Missing spelling-aff-file in configuration. Exiting.')
        sys.exit(1)
    self.add_words_file = utils.getKeyFromSectionInConfiguration(
        'spelling', 'training-add-words-from-file', None, configDictionary)
    self.speller = hunspell.HunSpell(self.dict_file, self.aff_file)
    # Defensive check; the HunSpell constructor normally raises rather
    # than returning None.
    if self.speller is None:
        print('>>>>>> Could not create speller...')
    tokenizer_language = utils.getKeyFromSectionInConfiguration(
        'spelling', 'tokenizer-language', 'german', configDictionary)
    try:
        self.tokenizer = nltk.data.load(
            'tokenizers/punkt/{0}.pickle'.format(tokenizer_language))
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
        # propagate instead of being swallowed.
        print('>>>>>> Could not load TOKENIZER language file.')
        sys.exit(1)
    if self.add_words_file is not None:
        self.train()
def __init__(self, configDictionary):
    """Configure the spaCy-based similarity module from the
    'similarity' configuration section."""
    super(SpacySIM, self).__init__(configDictionary)
    # Static capability profile advertised to the core.
    self.profile = {
        "name": "spacy-module",
        "class": "nlp-deepnn-analysis",
        "supported_request_types": ["text"],
        "supported-languages": ['de', 'en']
    }
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
    cfg = configDictionary
    max_sim = utils.getKeyFromSectionInConfiguration(
        'similarity', 'max_similarity_to_return', 1, cfg)
    self.maxSim = int(max_sim)
    self.language_model = utils.getKeyFromSectionInConfiguration(
        'similarity', 'language_model', 'de_core_news_sm', cfg)
    lower = utils.getKeyFromSectionInConfiguration(
        'similarity', 'similarity_lower_threshold', 0.5, cfg)
    upper = utils.getKeyFromSectionInConfiguration(
        'similarity', 'similarity_higher_threshold', 0.7, cfg)
    self.simLowerThreshold = float(lower)
    self.simHigherThreshold = float(upper)
    self.configDictionary = configDictionary
    self.update_lock = threading.Lock()
    # No documents loaded yet.
    self.alldocs = None
def __init__(self, configDictionary):
    """Enchant-based spelling module.

    Loads the language dictionary and the punkt sentence tokenizer from
    the 'spelling' config section; optionally trains extra words from a
    file. Exits when the dictionary or tokenizer cannot be loaded.
    """
    super(EnchantSpelling, self).__init__(configDictionary)
    self.profile = {
        "name": "enchantspelling-module",
        "class": "spelling",
        "supported-languages": ["de", "en", "tr"]
    }
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
    self.dict_language = utils.getKeyFromSectionInConfiguration(
        'spelling', 'spelling-language-full', None, configDictionary)
    try:
        self.speller = enchant.Dict(self.dict_language)
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
        # propagate instead of being swallowed.
        print('>>>>>>> Could not load language spelling dictionary ',
              self.dict_language)
        sys.exit(1)
    self.add_words_file = utils.getKeyFromSectionInConfiguration(
        'spelling', 'training-add-words-from-file', None, configDictionary)
    tokenizer_language = utils.getKeyFromSectionInConfiguration(
        'spelling', 'tokenizer-language', 'german', configDictionary)
    try:
        self.tokenizer = nltk.data.load(
            'tokenizers/punkt/{0}.pickle'.format(tokenizer_language))
    except Exception:
        print('>>>>>> Could not load TOKENIZER language file.')
        sys.exit(1)
    if self.add_words_file is not None:
        self.train()
def __init__(self, moduleConfigSection, configDictionary):
    """Word Mover's Distance internal machine-logic module.

    Reads thresholds, instance counts and retraining settings from its
    module config section and prepares empty runtime state; the WMD
    model itself is not loaded here.
    """
    super(WMDLogicMT, self).__init__(moduleConfigSection, configDictionary)
    # NOTE: removed an unused 'global WMD_FILE' declaration — the name
    # was never read or assigned inside this method.
    self.profile = {
        "name": "wmd-logic",
        "class": "internalmachine-logic",
        'accepted-languages': [
            'de', 'en', 'fr', 'tr', 'it', 'nl', 'se', 'no', 'fi', 'pl',
            'cz', 'hu'
        ],
        'accepted-media-types': ['text/utf8'],
        'returned-media-types': ['text/utf8'],
        'requires-original-query': True,
        'returns-response-id': True,
        'always-ask': True
    }
    self.module_config = moduleConfigSection
    self.config_dict = configDictionary
    language = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'data_language_short', 'en', self.config_dict)
    self.remove_stop_words = int(
        utils.getKeyFromSectionInConfiguration('bender-training',
                                               'remove_stop_words', 1,
                                               self.config_dict))
    retraining_interval_mins = int(
        self.module_config.get('retraining_interval_in_minutes', 23))
    # Enforce a 5-minute floor so retraining cannot thrash.
    if retraining_interval_mins < 5:
        retraining_interval_mins = 5
    self.retraining_interval_in_seconds = retraining_interval_mins * 60
    self.stop_words = get_stop_words(language)
    self.higher_threshold = float(
        self.module_config.get('wmd_higher_threshold', 0.7))
    self.lower_threshold = float(
        self.module_config.get('wmd_lower_threshold', 0.5))
    self.num_results = int(self.module_config.get('max_wmd_results', '10'))
    self.num_instances = int(
        self.module_config.get('wmd_num_instances', '10'))
    self.wmd_timeout = int(self.module_config.get('wmd_timeout', '30'))
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
    self.is_master = int(self.module_config.get('is-master', 0))
    self.contribution_factor = int(
        self.module_config.get('contribution-factor', 500))
    # Runtime state, populated later during training / query handling.
    self.wmd_model = None
    self.wmd_instances = []
    self.query_results = {}
    self.process_queues = {}
    self.chunk_size = 0
    self.learning_update_timer = None
    self.learning_lock = threading.Lock()
    self.requires_learning = False
    self.wmd_instances_lock = threading.Lock()
def __init__(self, configDictionary):
    """Training-data converter.

    Reads all 'bender-training' locations and media/format settings from
    the configuration and instantiates the concept and NLP modules needed
    during conversion. Exits when the source-data file is not configured.
    """
    self.configDictionary = configDictionary
    self.train_data_source_file = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'train_data_source_file', None, configDictionary)
    if not self.train_data_source_file:
        print(
            "Config does not contain 'train_data_source_file', please provide one."
        )
        # sys.exit instead of the site-provided exit() builtin, which is
        # intended for interactive use and may be absent under python -S.
        sys.exit(1)
    self.query_media_type = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'query_media_type', None, configDictionary)
    self.response_media_type = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'response_media_type', None, configDictionary)
    self.raw_data_format = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'raw_data_format', None, configDictionary)
    self.train_data_q_media_type = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'train_data_q_media_type', None, configDictionary)
    self.train_data_a_media_type = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'train_data_a_media_type', None, configDictionary)
    self.output_path = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'output_path', None, configDictionary)
    self.train_data_queries_root_dir = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'converted_train_data_q_path', None,
        configDictionary)
    self.train_data_answers_dir = utils.getKeyFromSectionInConfiguration(
        'bender-training', 'converted_train_data_a_path', None,
        configDictionary)
    self.generate_lsi = int(
        utils.getKeyFromSectionInConfiguration('bender-training',
                                               'generate_lsi', 0,
                                               configDictionary))
    concept = utils.getModulenameFromConfiguration(
        'concept', 'modules.concept.mlconcept.MLConcept', configDictionary)
    utils.validate_module_class(concept, MLConcept)
    self.concept = utils.initialize_class(concept, configDictionary)
    nlp = utils.getModulenameFromConfiguration('nlp',
                                               'modules.nlp.mlnlp.MLNLP',
                                               configDictionary)
    utils.validate_module_class(nlp, MLNLP)
    self.nlp = utils.initialize_class(nlp, configDictionary)
    utils.safe_create_directory(self.output_path)
    # (the original assigned self.configDictionary a second time here;
    # the redundant assignment was dropped)
    self.question_file = ''
    self.answers_file = ''
    self.questions = []
    self.answers = []
    self.internalMachineLogics = []
def __init__(self, configDictionary):
    """Open the MongoDB-backed permanent storage using connection
    settings from the 'permanentstorage' configuration section."""
    super(MongoDBStorage, self).__init__(configDictionary)
    db_name = utils.getKeyFromSectionInConfiguration(
        'permanentstorage', 'mongodb_storage_database', 'bender_database',
        configDictionary)
    db_uri = utils.getKeyFromSectionInConfiguration(
        'permanentstorage', 'mongodb_storage_database_uri',
        'mongodb://localhost:27017/', configDictionary)
    self.database_name = db_name
    self.database_uri = db_uri
    self.client = MongoClient(db_uri)
    self.database = self.client[db_name]
    self.entries = self.database['benderentries']
    # We need to create indexes, but this is left for later...
    # self.statements.create_index('ownerID', unique=True)
    # NOTE(review): "verson" looks like a typo for "version", but the key
    # is kept as-is since other code may look it up under this exact name.
    self.profile = {
        "name": "mongodb-storage",
        "class": "permanent-storage",
        "verson": "1.0"
    }
def __init__(self, brain, configDict):
    """Collect training paths and settings from the 'bender-training'
    section and create the output directory layout."""
    self.config_dict = configDict
    section = 'bender-training'
    self.dict_source_dir = utils.getKeyFromSectionInConfiguration(
        section, 'dictionary_data_source_path', None, configDict)
    self.train_data_source_dir = utils.getKeyFromSectionInConfiguration(
        section, 'converted_train_data_q_path', None, configDict)
    self.num_topics = int(
        utils.getKeyFromSectionInConfiguration(section, 'num_topics_lsi',
                                               200, configDict))
    root = utils.getKeyFromSectionInConfiguration(section, 'output_path',
                                                  None, configDict)
    self.output_root_path = root
    # Derived artifact locations, all below the configured output root.
    self.dict_filename = root + '/dictionary/dictionary.dict'
    self.corpus_filename = root + '/corpus/corpus.mm'
    self.tfidf_corpus_filename = root + '/corpus/tfidf.mm'
    self.lsi_filename = root + '/models/lsi.model'
    self.tfidf_model_filename = root + '/models/tfidf.model'
    self.lsi_index_filename = root + '/index/lsi_index'
    self.tfidf_idx_filename = root + '/index/tfidf.index'
    self.doc2id_filename = root + '/ids/word2id.pickle'
    self.id2doc_filename = root + '/ids/id2word.pickle'
    self.language = utils.getKeyFromSectionInConfiguration(
        section, 'data_language_short', 'en', configDict)
    self.remove_stop_words = int(
        utils.getKeyFromSectionInConfiguration(section,
                                               'remove_stop_words', 1,
                                               configDict))
    self.stop_words = get_stop_words(self.language)
    self.brain = brain
    self._create_directories()
def __init__(self, brain, configDict):
    """Resolve all model/index artifact paths from the 'bender-training'
    section, set up retraining bookkeeping, and load persisted models."""
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
    self.config_dict = configDict
    self.lsi_index_mutex = threading.Lock()
    section = 'bender-training'
    self.num_topics = int(
        utils.getKeyFromSectionInConfiguration(section, 'num_topics_lsi',
                                               200, configDict))
    root = utils.getKeyFromSectionInConfiguration(section, 'output_path',
                                                  None, configDict)
    self.output_root_path = root
    # Artifact locations below the configured output root.
    self.corpus_filename = root + '/corpus/corpus.mm'
    self.tfidf_corpus_filename = root + '/corpus/tfidf.mm'
    self.lsi_filename = root + '/models/lsi.model'
    self.tfidf_model_filename = root + '/models/tfidf.model'
    self.lsi_index_filename = root + '/index/lsi_index'
    self.tfidf_idx_filename = root + '/index/tfidf.index'
    self.doc2id_filename = root + '/ids/word2id.pickle'
    self.id2doc_filename = root + '/ids/id2word.pickle'
    self.word2vec_filename = os.path.join(root, 'vectors',
                                          'word2vec.embeddings')
    self.language = utils.getKeyFromSectionInConfiguration(
        section, 'data_language_short', 'en', configDict)
    self.remove_stop_words = int(
        utils.getKeyFromSectionInConfiguration(section,
                                               'remove_stop_words', 1,
                                               configDict))
    self.stop_words = get_stop_words(self.language)
    self.brain = brain
    interval_mins = int(
        utils.getKeyFromSectionInConfiguration(
            section, 'retraining_interval_in_minutes', 30, configDict))
    # 5-minute floor so retraining cannot thrash.
    self.retraining_interval_in_seconds = max(interval_mins, 5) * 60
    self.learning_update_timer = None
    self.learning_lock = threading.Lock()
    self.requires_learning = False
    self._load()
def __init__(self, configDictionary):
    """Initialize the LSI similarity module with result-count and
    threshold settings from the 'similarity' configuration section."""
    super(LSISimilarity, self).__init__(configDictionary)
    # Capability profile advertised to the core.
    self.profile = {
        "name": "lsi-module",
        "class": "latent-semantic-analysis",
        "supported_request_types": ["text"],
        "supported-languages": ["de"]
    }
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
    cfg = configDictionary
    self.maxSim = int(
        utils.getKeyFromSectionInConfiguration('similarity',
                                               'max_similarity_to_return',
                                               1, cfg))
    low = utils.getKeyFromSectionInConfiguration(
        'similarity', 'similarity_lower_threshold', 0.5, cfg)
    high = utils.getKeyFromSectionInConfiguration(
        'similarity', 'similarity_higher_threshold', 0.7, cfg)
    self.simLowerThreshold = float(low)
    self.simHigherThreshold = float(high)
    self.configDictionary = configDictionary
def __init__(self, configDictionary):
    """Session module: word tokenizer, configurable timeout and the
    in-memory session table."""
    super(MLSession, self).__init__(configDictionary)
    self.profile = {
        "name": "mlsession-module",
        "class": "session",
        "session-timeout": 10400
    }
    # FIX: u'\w+' contains the unrecognized escape '\w' (DeprecationWarning
    # on modern Python). u'\\w+' spells the same regex explicitly and stays
    # a unicode literal on both Python 2 and 3.
    self.wordTokenizer = RegexpTokenizer(u'\\w+')
    self.session_timeout = int(
        utils.getKeyFromSectionInConfiguration(
            'session', 'session_timeout', self.profile['session-timeout'],
            configDictionary))
    # session-id -> session data; empty until sessions are created.
    self.sessions = {}
def __init__(self, configDictionary):
    """Tamper-evident request/performance logger.

    Derives a hash key from a built-in secret plus the configured
    'log_hash', remembers the last line of each existing log (presumably
    for hash chaining — confirm against the write path), and opens both
    logs for appending. Exits when 'log_hash' is not configured.
    """
    self.config = configDictionary
    self.key = 'nmxcvjkhsdf98u53429kjhasd901423jkhdsfzcxvmnuitgre4325809cneu3io'
    log_path = utils.getKeyFromSectionInConfiguration(
        'bender-core', 'log_directory', 'logs', configDictionary)
    hash_key = utils.getKeyFromSectionInConfiguration(
        'bender-core', 'log_hash', None, configDictionary)
    self.lock = threading.Lock()
    if hash_key is None:
        logger = logging.getLogger(os.path.basename(sys.argv[0]))
        logger.error(
            'Missing `log_hash` in `bender-core` section of configuration file. Will exit!'
        )
        sys.exit(1)
    new_hash_key = self.key + hash_key
    # FIX: hashlib.sha256 requires bytes on Python 3; passing the str
    # raised TypeError. Encoding explicitly also works on Python 2.
    self.hash_key = hashlib.sha256(new_hash_key.encode('utf-8')).hexdigest()

    def last_line(path):
        # Last line of the file (stripped), or None when the file is
        # missing or empty. 'with' closes the read handle (the original
        # codecs.open(...).readlines() leaked it).
        if not os.path.exists(path):
            return None
        with codecs.open(path, 'r', 'utf-8') as fh:
            entries = fh.readlines()
        return entries[-1].strip() if entries else None

    requests_log_file = os.path.join(log_path, 'requests.log')
    self.last_req_log_entry = last_line(requests_log_file)
    self.requests_log_file = codecs.open(requests_log_file, 'a', 'utf-8')
    p_log_file = os.path.join(log_path, 'performance.log')
    self.last_log_entry = last_line(p_log_file)
    self.performance_log_file = codecs.open(p_log_file, 'a', 'utf-8')
def _loadAnnoyIndex(self):
    """Load the persisted Annoy index and the LSI/TF-IDF models it uses.

    Exits the process when the index or the LSI model is missing (the
    TF-IDF model is optional here). Serialized via self.loading_lock.
    """
    # FIX: hold the lock via 'with' so it is released even if loading
    # raises — the original acquire()/release() pair leaked the lock on
    # any exception in between.
    with self.loading_lock:
        self.dictionary = self.brain.getDictionary()
        self.num_topics_lsi = int(
            utils.getKeyFromSectionInConfiguration('bender-training',
                                                   'num_topics_lsi', 200,
                                                   self.config_dict))
        self.output_path = self.module_config.get('annoy_data_path', '')
        self.accuracy = int(self.module_config.get('accuracy', 500))
        self.max_results = int(self.module_config.get('max_results', 100))
        retraining_iv = int(
            self.module_config.get('retraining_interval_in_minutes', 7))
        self.retraining_interval_in_seconds = retraining_iv * 60
        utils.safe_create_directory(self.output_path)
        self.lsi_vectors_filename = os.path.join(self.output_path,
                                                 LSI_VECTORS_FNAME)
        self.lsi_model_filename = os.path.join(self.output_path,
                                               LSI_MODEL_FNAME)
        self.tfidf_model_filename = os.path.join(self.output_path,
                                                 TFIDF_MODEL_FNAME)
        self.annoy_index_filename = os.path.join(self.output_path,
                                                 ANNOY_OUTPUT_FNAME)
        self.clipped_corpus_filename = os.path.join(self.output_path,
                                                    CLIPPED_CORPUS_FNAME)
        if os.path.exists(self.annoy_index_filename):
            self.mm = gensim.corpora.MmCorpus(self.lsi_vectors_filename)
            num_features, num_docs = self.mm.num_terms, min(
                self.mm.num_docs, MAX_DOCS)
            self.index_annoy = annoy.AnnoyIndex(num_features,
                                                metric='angular')
            self.index_annoy.load(self.annoy_index_filename)
        else:
            print(
                '**** ERROR: Annoy index does not exist. Please train first!')
            sys.exit(1)
        if os.path.exists(self.lsi_model_filename):
            self.lsi_model = gensim.models.LsiModel.load(
                self.lsi_model_filename)
        else:
            print('**** ERROR: Annoy LSI Model missing. Please train first!')
            sys.exit(1)
        # TF-IDF model is optional: its absence is tolerated here.
        if os.path.exists(self.tfidf_model_filename):
            self.tfidf_model = gensim.models.TfidfModel.load(
                self.tfidf_model_filename)
def __init__(self, configDictionary):
    """JSON-file-backed permanent storage.

    Loads existing entries from the configured JSON file, or prepares
    the storage directory on first run.
    """
    super(JSONStorage, self).__init__(configDictionary)
    self.entries = {}
    jsonDatabaseFile = utils.getKeyFromSectionInConfiguration(
        'permanentstorage', 'json_storage_database',
        '/tmp/-json-storage.json', configDictionary)
    self.mutex_lock = threading.Lock()
    self.jsonDataFile = jsonDatabaseFile
    # NOTE(review): "verson" is likely a typo for "version"; kept as-is
    # since other code may look the key up under this exact name.
    self.profile = {
        "name": "json-storage",
        "class": "permanent-storage",
        "verson": "1.0"
    }
    if os.path.isfile(self.jsonDataFile):
        # FIX: 'with' closes the handle — json.load(codecs.open(...))
        # leaked the file object.
        with codecs.open(self.jsonDataFile, 'r', 'utf-8') as fh:
            self.entries = json.load(fh)
    else:
        utils.safe_create_directory(os.path.dirname(self.jsonDataFile))
def __init__(self, configDictionary):
    # Core bootstrap: sets up logging, reads all 'bender-core' settings,
    # resolves every pluggable module class from configuration,
    # instantiates them, and finally wires everything via initForBender.
    self.configuration = configDictionary
    self.security = BenderSecurity()
    # --- Logging setup -----------------------------------------------
    logFile = utils.getKeyFromSectionInConfiguration('bender-core', 'bender_core_logfile', 'logs/bender-core.log', configDictionary)
    utils.safe_create_directory(os.path.dirname(logFile))
    # Root/library log records go to a separate '<logFile>.libs.log'.
    logging.basicConfig(level=logging.WARNING, filename=logFile + '.libs.log', format=LOG_FORMAT)
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
    loggingFileHandler = logging.FileHandler(logFile)
    loggingFileHandler.setLevel(logging.INFO)
    # Buffer up to 128 records in memory before flushing to the file handler.
    loggingMemoryHandler = logging.handlers.MemoryHandler(128, target=loggingFileHandler)
    loggingMemoryHandler.setLevel(logging.INFO)
    loggingFormatter = logging.Formatter(LOG_FORMAT)
    self.logger.addHandler(loggingMemoryHandler)
    loggingFileHandler.setFormatter(loggingFormatter)
    loggingMemoryHandler.setFormatter(loggingFormatter)
    self.logger.setLevel(logging.INFO)
    self.logger.info('###################### STARTING A NEW BENDER INSTANCE #######################')
    self.logger.info('running %s' % ' '.join(sys.argv))
    # --- Core settings (int/float coercion of config values) ---------
    self.interactive = int(utils.getKeyFromSectionInConfiguration('bender-core', 'interactive', 0, configDictionary))
    self.num_results = int(utils.getKeyFromSectionInConfiguration('bender-core', 'num_results', 1, configDictionary))
    self.use_hli = int(utils.getKeyFromSectionInConfiguration('bender-core', 'use_hli', 1, configDictionary))
    self.use_lookup = int(utils.getKeyFromSectionInConfiguration('bender-core', 'use_lookup', 1, configDictionary))
    self.name = utils.getKeyFromSectionInConfiguration('bender-core', 'name', 'Bender', configDictionary)
    self.personality = utils.getKeyFromSectionInConfiguration('bender-core', 'personality', 'mail-responder', configDictionary)
    self.lowerCL = float(utils.getKeyFromSectionInConfiguration('bender-core', 'lowerconfidence_level', 0.85, configDictionary))
    self.higherCL = float(utils.getKeyFromSectionInConfiguration('bender-core', 'higherconfidence_level', 0.95, configDictionary))
    # Default reinforcement timeout is 7 days, expressed in seconds.
    self.reinforcementTimeout = int(utils.getKeyFromSectionInConfiguration('bender-core', 'reinforcement_timeout', 7 * 24 * 60 * 60, configDictionary))
    self.useSimilarity = int(utils.getKeyFromSectionInConfiguration('bender-core', 'use_similarity', 1, configDictionary))
    # --- Resolve configured module class names (with defaults) -------
    transient_storage = utils.getModulenameFromConfiguration('transientstorage', 'modules.storage.transientstorage.MLTransientStorage', configDictionary)
    permanent_storage = utils.getModulenameFromConfiguration('permanentstorage', 'modules.storage.permanentstorage.MLPermanentStorage', configDictionary)
    lookup = utils.getModulenameFromConfiguration('lookup', 'modules.lookup.mllookup.MLLookup', configDictionary)
    session = utils.getModulenameFromConfiguration('session', 'modules.session.mlsession.MLSession', configDictionary)
    nlp = utils.getModulenameFromConfiguration('nlp', 'modules.nlp.mlnlp.MLNLP', configDictionary)
    humanLogic = utils.getModulenameFromConfiguration('humanlogic', 'modules.humanlogic.mlhumanlogic.MLHumanLogic', configDictionary)
    concept = utils.getModulenameFromConfiguration('concept', 'modules.concept.mlconcept.MLConcept', configDictionary)
    stt = utils.getModulenameFromConfiguration('stt', 'modules.speech.mlstt.MLSTT', configDictionary)
    tts = utils.getModulenameFromConfiguration('tts', 'modules.speech.mltts.MLTTS', configDictionary)
    responseProcessor = utils.getModulenameFromConfiguration('response-postprocessor', 'modules.response.mlresponseproc.MLResponseProcessor', configDictionary)
    dataInfusor = utils.getModulenameFromConfiguration('datainfusor', 'modules.datainfusor.mldatainfusor.MLDataInfusor', configDictionary)
    # NOTE(review): the next two defaults start with 'module.' (singular),
    # unlike every other default's 'modules.' — possibly a typo; confirm
    # before relying on these fallbacks.
    similarity = utils.getModulenameFromConfiguration('similarity', 'module.similarity.mlsimilarity.MLSimilarity', configDictionary)
    indexedResponseProc = utils.getModulenameFromConfiguration('indexed-response-processor', 'module.response.mlidxresponseproc.MLIndexedResponseProcessor', configDictionary)
    brain = utils.getModulenameFromConfiguration('brain', 'modules.brain.mlbrain.MLBrain', configDictionary)
    spelling = utils.getModulenameFromConfiguration('spelling', 'modules.spelling.mlspelling.MLSpelling', configDictionary)
    # --- Validate each class against its base, then instantiate ------
    utils.validate_module_class(transient_storage, MLTransientStorage)
    self.transientStorage = utils.initialize_class(transient_storage, configDictionary)
    utils.validate_module_class(permanent_storage, MLPermanentStorage)
    self.permanentStorage = utils.initialize_class(permanent_storage, configDictionary)
    utils.validate_module_class(lookup, MLLookup)
    self.lookup = utils.initialize_class(lookup, configDictionary)
    utils.validate_module_class(session, MLSession)
    self.session = utils.initialize_class(session, configDictionary)
    utils.validate_module_class(nlp, MLNLP)
    self.nlp = utils.initialize_class(nlp, configDictionary)
    utils.validate_module_class(humanLogic, MLHumanLogic)
    self.humanLogic = utils.initialize_class(humanLogic, configDictionary)
    # Class name kept separately alongside the instance.
    self.humanLogicClass = humanLogic
    utils.validate_module_class(concept, MLConcept)
    self.concept = utils.initialize_class(concept, configDictionary)
    utils.validate_module_class(stt, MLSpeechToText)
    self.stt = utils.initialize_class(stt, configDictionary)
    utils.validate_module_class(tts, MLTextToSpeech)
    self.tts = utils.initialize_class(tts, configDictionary)
    utils.validate_module_class(dataInfusor, MLDataInfusor)
    self.dataInfusor = utils.initialize_class(dataInfusor, configDictionary)
    utils.validate_module_class(responseProcessor, MLResponseProcessor)
    self.responseProcessor = utils.initialize_class(responseProcessor, configDictionary)
    utils.validate_module_class(similarity, MLSimilarity)
    self.similarity = utils.initialize_class(similarity, configDictionary)
    utils.validate_module_class(indexedResponseProc, MLIndexedResponseProcessor)
    self.indexedResponseProcessor = utils.initialize_class(indexedResponseProc, configDictionary)
    utils.validate_module_class(brain, MLBrain)
    self.brain = utils.initialize_class(brain, configDictionary)
    utils.validate_module_class(spelling, MLSpelling)
    self.spelling = utils.initialize_class(spelling, configDictionary)
    self.machineLogic = MLMachineLogic(configDictionary)
    # NOTE: Even though we try to instantiate the classes in the right order
    # and try to call their 'initForBender' in the right order, you should NEVER, in your module
    # depend on any of the modules being already initialized in your implementation of these two
    # methods:
    # __init__
    # initForBender
    # Instead, you should request any other module instance only when you actually need them
    # during the processing of data
    #
    # The only exception is MLBrain that relies on concept and nlp being already initialized before
    # itself is called ... and this only during training...
    self.permanentStorage.initForBender(self)
    self.transientStorage.initForBender(self)
    self.nlp.initForBender(self)
    self.concept.initForBender(self)
    self.brain.initForBender(self)
    self.lookup.initForBender(self)
    self.session.initForBender(self)
    self.humanLogic.initForBender(self)
    self.stt.initForBender(self)
    self.tts.initForBender(self)
    self.dataInfusor.initForBender(self)
    self.responseProcessor.initForBender(self)
    self.similarity.initForBender(self)
    self.indexedResponseProcessor.initForBender(self)
    self.machineLogic.initForBender(self)
    self.spelling.initForBender(self)
    # --- Data providers: list section, one module per entry ----------
    self.dataProviders = []
    dataproviders = utils.getSectionFromConfiguration('dataproviders', [ 'modules.dataproviders.mldataprovider.MLDataProvier' ], configDictionary)
    for dp in dataproviders:
        module = dp['module']
        utils.validate_module_class(module, MLDataProvider)
        newInstance = utils.initialize_class_with_config_section(module, dp, configDictionary)
        newInstance.initForBender(self)
        self.dataProviders.append(newInstance)
    # --- Data extractors ---------------------------------------------
    self.dataExtractors = []
    dataextractors = utils.getSectionFromConfiguration('dataextractors', [ 'modules.dataextractors.mldataextrator.MLDataExtractor' ], configDictionary)
    for de in dataextractors:
        module = de['module']
        utils.validate_module_class(module, MLDataExtractor)
        newInstance = utils.initialize_class_with_config_section(module, de, configDictionary)
        newInstance.initForBender(self)
        self.dataExtractors.append(newInstance)
    # --- Internal machine-logic modules (no default list) ------------
    self.internalMachineLogics = []
    internalMLs = utils.getSectionFromConfiguration('i-machinelogic', None, configDictionary)
    for iml in internalMLs:
        module = iml['module']
        utils.validate_module_class(module, MLInternalMachineLogic)
        newInstance = utils.initialize_class_with_config_section(module, iml, configDictionary)
        newInstance.initForBender(self)
        self.internalMachineLogics.append(newInstance)
    # --- Request pre-processors (no default list) --------------------
    self.preprocessors = []
    preprocs = utils.getSectionFromConfiguration('preprocessors', None, configDictionary)
    for preproc in preprocs:
        module = preproc['module']
        utils.validate_module_class(module, MLRequestProcessor)
        newInstance = utils.initialize_class_with_config_section(module, preproc, configDictionary)
        newInstance.initForBender(self)
        self.preprocessors.append(newInstance)
    self.jobData = {}
    self.sessionData = {}
    self.configuration = configDictionary
    self.dataInfusor.setDataExtractorsAndProviders(self.dataExtractors, self.dataProviders)
    self.benderjob_logger = BenderJobLogger(configDictionary)
def create_annoy_index(moduleConfigSection, configDict, dictionary, corpus,
                       in_q, out_q):
    """Build TF-IDF and LSI models from `corpus`, serialize the LSI
    vectors, and construct an Annoy angular-distance index over them.

    Runs as a worker process: puts 'DONE' on `out_q`, waits for an
    acknowledgement on `in_q`, then terminates via os._exit(0).

    Args:
        moduleConfigSection: dict-like module config ('annoy_data_path',
            'accuracy').
        configDict: full configuration (for 'num_topics_lsi').
        dictionary: gensim dictionary used as id2word for the LSI model.
        corpus: gensim bag-of-words corpus to index.
        in_q/out_q: multiprocessing queues for the parent handshake.
    """
    logger = logging.getLogger(os.path.basename(sys.argv[0]))
    module_config = moduleConfigSection
    num_topics_lsi = int(
        utils.getKeyFromSectionInConfiguration('bender-training',
                                               'num_topics_lsi', 200,
                                               configDict))
    output_path = module_config.get('annoy_data_path', '')
    accuracy = int(module_config.get('accuracy', 500))
    # (unused locals 'config_dict' and 'max_results' from the original
    # were removed)
    utils.safe_create_directory(output_path)
    lsi_vectors_file = os.path.join(output_path, LSI_VECTORS_FNAME)
    lsi_model_filename = os.path.join(output_path, LSI_MODEL_FNAME)
    tfidf_vectors_file = os.path.join(output_path, TFIDF_MODEL_FNAME)
    annoy_output_filename = os.path.join(output_path, ANNOY_OUTPUT_FNAME)
    clipped_output_filename = os.path.join(output_path, CLIPPED_CORPUS_FNAME)
    tfidf = gensim.models.TfidfModel(corpus)
    logger.info('Saving Tfidf...')
    tfidf.save(tfidf_vectors_file)
    logger.info('*** START generating LSI...')
    lsi = gensim.models.LsiModel(tfidf[corpus],
                                 id2word=dictionary,
                                 num_topics=num_topics_lsi)
    logger.info('*** DONE generating LSI...')
    lsi.save(lsi_model_filename)
    logger.info('*** SAVED generating LSI...')
    # convert all articles to latent semantic space, store the result as a MatrixMarket file
    # normalize all vectors to unit length, to simulate cossim in libraries that only support euclidean distance
    gensim.corpora.MmCorpus.serialize(
        lsi_vectors_file,
        (gensim.matutils.unitvec(vec) for vec in lsi[tfidf[corpus]]))
    mm = gensim.corpora.MmCorpus(lsi_vectors_file)
    # Clip to at most MAX_DOCS documents, densified for Annoy.
    num_features, num_docs = mm.num_terms, min(mm.num_docs, MAX_DOCS)
    clipped = numpy.empty((num_docs, num_features), dtype=numpy.float32)
    for docno, doc in enumerate(itertools.islice(mm, num_docs)):
        clipped[docno] = gensim.matutils.sparse2full(doc, num_features)
    logger.info('*** Saving clipped corpus as NUMPY...')
    numpy.save(clipped_output_filename, clipped)
    logger.info('*** Generating ANNOY...')
    clipped_corpus = gensim.matutils.Dense2Corpus(clipped,
                                                  documents_columns=False)
    index_annoy = annoy.AnnoyIndex(num_features, metric='angular')
    for i, vec in enumerate(clipped_corpus):
        index_annoy.add_item(
            i,
            list(
                gensim.matutils.sparse2full(vec,
                                            num_features).astype(float)))
    logger.info('*** Building ANNOY...')
    index_annoy.build(accuracy)
    logger.info('*** Saving ANNOY...')
    index_annoy.save(annoy_output_filename)
    out_q.put('DONE')
    out_q.close()
    # Block until the parent acknowledges, then hard-exit: os._exit skips
    # atexit handlers and buffered-IO cleanup, which is intentional for a
    # worker process. (The original bound the ack to an unused 'done'.)
    in_q.get()
    os._exit(0)
    return True  # unreachable (os._exit above); kept from the original
# --- Command-line handling for the trainer entry point -------------------
# ('options' and 'configFile' come from the getopt parsing earlier in the
# file, outside this view.)
verbose = False
dump = False
for opt, arg in options:
    if opt == '-h':
        print('No Help yet')
        sys.exit(0)
    elif opt in ('-v', '--version'):
        print("V1.0beta")
        sys.exit(0)
    elif opt in ('-c', '--config'):
        configFile = arg
    elif opt in ('-V', '--verbose'):
        verbose = True
    elif opt in ('-d', '--dump'):
        dump = True
print("============================ Bender Trainer ==============================")
print(" Bender 2.0a - Copyright (c) 2019 Imdat Solak")
print(" Written by: Imdat Solak ([email protected])")
print("=========================================================================")
# Load the full configuration. NOTE(review): 'dump' is parsed above but
# never used below — confirm whether it was meant to be passed on.
benderConfig = utils.getBenderConfiguration(configFile, verbose, False)
logger = logging.getLogger(os.path.basename(sys.argv[0]))
logFile = utils.getKeyFromSectionInConfiguration('bender-training',
                                                 'train_log_file',
                                                 'logs/bender-train.log',
                                                 benderConfig)
utils.safe_create_directory(os.path.dirname(logFile))
logging.basicConfig(filename=logFile,
                    format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info('**************** NEW TRAINING SESSION STARTED: %s' %
            ' '.join(sys.argv))
# Convert the raw training data and run the training pipeline.
tdc = TrainingDataConverter(benderConfig)
tdc.train()