def setUp(self):
    """
    Set the initial parameters.
    Args:
        data: test file name
        split_module: the split_module instance
    """
    wn_summary_list = APP_ROOT + '/../../Data/wn_summary_list.txt'
    self.input_module = InputFileCython(wn_summary_list)
    self.input_module.input_special_format_file()
class Test_ClassSummary(unittest.TestCase):
    """Test the ClassSummary class."""

    def setUp(self):
        """
        Set the initial parameters.
        Args:
            data: test file name
            split_module: the split_module instance
        """
        wiki_vector_file_name = APP_ROOT + '/../../Data/jawiki_vector/jawiki_vector.txt'
        self.input_module = InputFileCython(wiki_vector_file_name)

    def test_summary_class(self):
        """Test making the summary dict."""
        self.input_module.input_fast_large_file()
        wiki_vector = self.input_module.get_vector()
        wn_summary_list = APP_ROOT + '/../../Data/wn_summary_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()
        file_list = self.input_module.get_file_data()
        count = 0
        class_word_vector = {}
        class_average_vector = {}
        for file in file_list:
            self.input_module = InputFileCython(
                APP_ROOT + "/../../Data/wn_summary/" + file.strip())
            self.input_module.input_special_format_file()
            # The first iteration builds fresh dicts; every later iteration
            # passes the accumulated dicts back in so each class extends them.
            if count == 0:
                class_summary = ClassSummaryCython(
                    file.strip(), self.input_module.get_file_data(), wiki_vector)
            else:
                class_summary = ClassSummaryCython(
                    file.strip(), self.input_module.get_file_data(), wiki_vector,
                    class_word_vector, class_average_vector)
            class_word_vector, class_average_vector = class_summary.summary_class()
            count += 1
            # Write the class average vector straight to the file instead of
            # redirecting sys.stdout.
            with open(APP_ROOT + "/../../Data/test/" + file.strip()
                      + "_vector.txt", 'w') as fo:
                print(class_average_vector[file.strip()], file=fo)
        class_summary_cosine_similarity_cython = ClassSummaryCosineSimilarityCython(
            class_average_vector)
        class_summary_cosine_similarity_cython.summary_class_use_cosine_similarity()
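The test above threads two dicts through every iteration. A minimal sketch of that accumulation pattern, where build_one is a hypothetical stand-in for constructing ClassSummaryCython and calling summary_class() (the real class also receives the wiki vector and the dicts built so far):

# Sketch only; build_one is a hypothetical stand-in, not the real Cython API.
def accumulate_classes(file_names, build_one):
    class_word_vector = {}
    class_average_vector = {}
    for name in file_names:
        words, average_vector = build_one(name)
        class_word_vector[name] = words                  # words describing the class
        class_average_vector[name] = average_vector      # mean of their word vectors
    return class_word_vector, class_average_vector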
class Test_ClassSummaryExclude(unittest.TestCase):
    """Test the ClassSummaryExclude class."""

    def setUp(self):
        """
        Set the initial parameters.
        Args:
            data: test file name
            split_module: the split_module instance
        """
        wn_summary_list = APP_ROOT + '/../../Data/wn_total_summary_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()
        self.class_summary_exclude = ClassSummaryExcludeCython()

    def test_summary_class(self):
        """Test making the summary dict."""
        file_list = self.input_module.get_file_data()
        OUT_PUT_PATH = APP_ROOT + "/../../Data/wn_total_summary_51519_limit05_out_put/"
        for file in file_list:
            self.input_module = InputFileCython(
                APP_ROOT + "/../../Data/wn_total_summary_51519_limit05/"
                + file.strip())
            self.input_module.input_special_format_file()
            self.class_summary_exclude.exclude_data(
                OUT_PUT_PATH, file.strip(), self.input_module.get_file_data())
class Test_ClassSummaryCosineSimilarty(unittest.TestCase):
    """Test the class-summary cosine-similarity flow."""

    def setUp(self):
        """
        Set the initial parameters.
        Args:
            data: test file name
            split_module: the split_module instance
        """
        wn_summary_list = APP_ROOT + '/../../Data/wn_total_summary_51519_limit05_out_put_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()

    def test_summary_class(self):
        """Test making the summary dict."""
        file_list = self.input_module.get_file_data()
        class_word_vector = {}
        for file in file_list:
            self.input_module = InputFileCython(
                APP_ROOT + "/../../Data/wn_total_summary_51519_limit05_out_put/"
                + file.strip())
            self.input_module.input_special_format_file()
            if file.strip() not in class_word_vector:
                word_list = [x.strip() for x in self.input_module.get_file_data()]
                class_word_vector.update(
                    {file.strip().replace("_summary.txt", ""): word_list})
        sqlite_twitter_cython = SqliteTwitterSummaryCython(class_word_vector)
        sqlite_twitter_cython.call_sql()
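The dict handed to SqliteTwitterSummaryCython maps a class name (the file name with the "_summary.txt" suffix stripped) to its word list. An illustration of that shape; the keys and words below are invented examples:

# Illustrative input shape only; the class names and words are invented.
class_word_vector = {
    "00001740-n": ["entity", "thing"],
    "00002137-n": ["abstraction", "concept"],
}
sqlite_twitter_cython = SqliteTwitterSummaryCython(class_word_vector)
sqlite_twitter_cython.call_sql()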
class Test_ClassSummary(unittest.TestCase):
    """Test the ClassSummary class."""

    def setUp(self):
        """
        Set the initial parameters.
        Args:
            data: test file name
            split_module: the split_module instance
        """
        wiki_vector_file_name = APP_ROOT + '/../../Data/jawiki_vector/jawiki_vector.txt'
        self.input_module = InputFileCython(wiki_vector_file_name)

    def test_summary_class(self):
        """Test making the summary dict."""
        self.input_module.input_fast_large_file()
        wiki_vector = self.input_module.get_vector()
        read_file = '13996061-n.txt'
        class_summary = ClassSummary(read_file, wiki_vector)
        class_summary.summary_class()
def test_summary_class(self):
    """Test making the summary dict."""
    file_list = self.input_module.get_file_data()
    wn_average_vector_list = APP_ROOT + '/../../Data/wn_average_vector_list.txt'
    self.input_module = InputFileCython(wn_average_vector_list)
    self.input_module.input_special_format_file()
    vector_file_list = self.input_module.get_file_data()
    class_word_vector = {}
    class_average_vector = {}
    for file in file_list:
        self.input_module = InputFileCython(
            APP_ROOT + "/../../Data/wn_summary/" + file.strip())
        self.input_module.input_special_format_file()
        if file.strip() not in class_word_vector:
            word_list = re.sub(r"[\[\]']", "",
                               self.input_module.get_file_data()[0].strip())
            class_word_vector.update(
                {file.strip().replace(".txt", ""): word_list.split(",")})
    for vector in vector_file_list:
        self.input_module = InputFileCython(
            APP_ROOT + "/../../Data/wn_summary_multi/" + vector.strip())
        self.input_module.input_special_format_file()
        if vector.strip() not in class_average_vector:
            # Collect every float token from the printed-list lines.
            values = []
            for value in self.input_module.get_file_data():
                value = re.sub(r"[\[\]']", "", value.strip())
                values.extend(v for v in value.split(" ") if v != "")
            class_average_vector.update(
                {vector.strip().replace(".txt_vector.txt", ""):
                 list(map(float, values))})
    class_summary_cosine_similarity_cython = ClassSummaryCosineSimilarityCython(
        class_word_vector, class_average_vector)
    class_summary_cosine_similarity_cython.summary_class_use_cosine_similarity()
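The vector files are parsed above by stripping brackets and quotes and splitting on spaces. A compact equivalent helper, assuming each *_vector.txt file stores one printed Python list of floats, possibly wrapped across lines (the format is inferred from the regex above, not confirmed):

import re

def parse_vector_file_lines(lines):
    """Parse a printed float list such as "[0.12 -0.08\n 0.33]" (assumed format)."""
    text = " ".join(re.sub(r"[\[\]']", "", line.strip()) for line in lines)
    return [float(token) for token in text.split() if token]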
class SlackApp():
    """
    Slack call app.
    Prepare the Chainer model and run the command below to start the
    dialogue app.
        Example: python app.py
    """

    def __init__(self, data_model):
        """
        Initial setting.
        :param data_model: the Slack model; it carries most of the parameters
        """
        self.slack_channel = data_model.slack_channel
        self.data = ""
        self.parameter = data_model.parameter_dict
        self.model_name = "../model_word_match/ChainerDialogue"
        self.generation_limit = 200
        """
        We confirm the channel number via
        https://api.slack.com/methods/channels.list
        """
        self.chan = data_model.chan
        self.usr = data_model.user
        self.mecab_dict = data_model.mecab_dict
        self.Mecab = MeCab.Tagger("-Owakati -d %s" % self.mecab_dict)
        XP.set_library(False, 0)
        self.XP = XP
        wn_summary_list = APP_ROOT + '/../Data/wn_total_summary_51519_limit05_out_put_list.txt'
        self.input_module = InputFileCython(wn_summary_list)
        self.input_module.input_special_format_file()
        file_list = self.input_module.get_file_data()
        self.class_word_vector = self.__make_class_word_vector(file_list)
        self.sqlite_twitter_summary = SqliteTwitterSummary(self.class_word_vector)
        self.word_class_dict = self.sqlite_twitter_summary.make_class_word_dict()
        self.word_class = ""
        self.multi_train_execute = ExecuteAttentionDialogue()
        self.elastic_search = GetAnswer()

    def __make_class_word_vector(self, file_list):
        """
        Make the class word vector dict.
        :param file_list: summary file names
        :return: dict mapping class name to its word list
        """
        class_word_vector = {}
        for file in file_list:
            self.input_module = InputFileCython(
                APP_ROOT + "/../Data/wn_total_summary_51519_limit05_out_put/"
                + file.strip())
            self.input_module.input_special_format_file()
            if file.strip() not in class_word_vector:
                word_list = [x.strip() for x in self.input_module.get_file_data()]
                class_word_vector.update(
                    {file.strip().replace("_summary.txt", ""): word_list})
        return class_word_vector

    def call_method(self):
        """
        Slack api call.
        1: read a sentence
        2: the model returns a sentence
        """
        if self.slack_channel.rtm_connect():
            while True:
                self.data = self.slack_channel.rtm_read()
                self.__judge_print()
                time.sleep(1)
        else:
            print("connection Fail")

    def __judge_print(self):
        """
        Judge the Slack trigger for Chainer.
        Example:
            chainer:{your sentence}       -> Chainer returns a sentence
            chainer_train:{your sentence} -> start training
        """
        if len(self.data) >= 1 and "text" in self.data[0]:
            input_text = self.data[0]["text"]
            print(input_text)
            if "chainer:" in input_text:
                # predict
                if "?" in input_text or "?" in input_text:
                    replace_input = re.sub(r"chainer:|\?|?", "",
                                           input_text.strip())
                    self.elastic_search.search_data(replace_input)
                    if len(self.elastic_search.search_result) > 0:
                        hyp_batch = self.elastic_search.search_result[0]
                        print(hyp_batch)
                        if hyp_batch["image"]:
                            word = hyp_batch["image"] + "\n" + hyp_batch["title"] \
                                + "\n" + hyp_batch["abstract"] + "\n" + hyp_batch["url"]
                        else:
                            word = hyp_batch["title"] + "\n" \
                                + hyp_batch["abstract"] + "\n" + hyp_batch["url"]
                    else:
                        word = "No match"
                else:
                    # input sentence
                    src_batch = self.__input_sentence()
                    hyp_batch = self.__predict_sentence(src_batch)
                    word = ''.join(hyp_batch[0]).replace("</s>", "")
                # show the predicted word
                print(self.slack_channel.api_call("chat.postMessage",
                                                  user=self.usr,
                                                  channel=self.chan,
                                                  text=word))
            if "chainer_train" in self.data[0]["text"]:
                self.__setting_parameter()
                self.__multi_train()

    def __multi_train(self):
        """Call multi train."""
        self.multi_train_execute.train_mulit_model()

    def __input_sentence(self):
        """Return the sentence batch for the Chainer prediction."""
        text = self.__mecab_method(self.data[0]["text"].replace("chainer:", ""))
        self.word_class = self.sqlite_twitter_summary.judge_class(
            self.data[0]["text"].replace("chainer:", ""))
        # self.word_class = self.sqlite_twitter_summary.judge_class_wiki_vector(
        #     self.data[0]["text"].replace("chainer:", ""))
        data = [text]
        src_batch = [x + ["</s>"] * (self.generation_limit - len(x) + 1)
                     for x in data]
        return src_batch

    def __predict_sentence(self, src_batch):
        """
        Predict the sentence.
        :param src_batch: the source sentence batch
        :return: hypothesis batch
        """
        self.model_name = "../model_word_match/ChainerDialogue_" + self.word_class
        print(self.word_class)
        dialogue = EncoderDecoderModelAttention(self.parameter)
        src_vocab = Vocabulary.load(self.model_name + '.srcvocab')
        trg_vocab = Vocabulary.load(self.model_name + '.trgvocab')
        model = AttentionDialogue.load_spec(self.model_name + '.spec', self.XP)
        serializers.load_hdf5(self.model_name + '.weights', model)
        hyp_batch = dialogue.forward_implement(src_batch, None, src_vocab,
                                               trg_vocab, model, False,
                                               self.generation_limit)
        print(hyp_batch)
        return hyp_batch

    def __setting_parameter(self):
        """Set each parameter."""
        self.parameter["word2vec"] = self.model_name
        train_path = "../twitter/"
        self.parameter["source"] = train_path + "source_twitter_data.txt"
        self.parameter["target"] = train_path + "replay_twitter_data.txt"

    def __mecab_method(self, text):
        """
        Call the MeCab method.
        :param text: user input text
        :return: wakati-split word list
        """
        mecab_text = self.Mecab.parse(text)
        return mecab_text.split(" ")
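A hedged usage sketch of the class above. SlackData is a hypothetical stand-in for whatever model object supplies the attributes read in __init__ (slack_channel, parameter_dict, chan, user, mecab_dict); the real model class is not shown in this file:

# Hypothetical wiring; SlackData is an assumption, not the confirmed model class.
if __name__ == '__main__':
    data_model = SlackData()   # must expose slack_channel, parameter_dict,
                               # chan, user and mecab_dict (see __init__ above)
    app = SlackApp(data_model)
    app.call_method()          # polls the Slack RTM API once per second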
"""
This script runs the parallel multi-thread command program.
"""
if __name__ == '__main__':
    """
    args
        -r: set the word net link list
            Example: '/../../Data/wn_summary_list.txt'
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--read_word_net_list_file', '-r', default='',
                        help='set word net list file')
    args = parser.parse_args()
    # Word Net file
    wn_summary_split_list = APP_ROOT + "/../../Data/" + args.read_word_net_list_file
    input_module = InputFileCython(wn_summary_split_list)
    input_module.input_special_format_file()
    file_list = input_module.get_file_data()
    # Wiki vector
    wiki_vector_file_name = APP_ROOT + '/../../Data/jawiki_vector/jawiki_vector.txt'
    input_module = InputFileCython(wiki_vector_file_name)
    input_module.input_fast_large_file()
    wiki_vector = input_module.get_vector()
    producer_consumer = ProducerConsumerClassSummary()
    multi_thread_producer_crawl_instance = threading.Thread(
        target=producer_consumer.producer_run, args=(file_list,))
    multi_thread_consumer_crawl_instance = threading.Thread(
        target=producer_consumer.consumer_run, args=(wiki_vector,))
    multi_thread_producer_crawl_instance.start()
    multi_thread_consumer_crawl_instance.start()
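A hedged invocation example; only the -r flag is defined above, and the script file name below is illustrative:

# Example invocation (the script file name is illustrative):
#   python producer_consumer_class_summary.py -r wn_summary_list.txt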
class SqliteTwitterSummary(object):
    """Summarize Twitter data stored in SQLite."""

    def __init__(self, class_word_vector):
        """
        Initial setting.
        Read the MeCab dict path from the yaml config.
        """
        Twitter = namedtuple("Twitter", ["mecab"])
        config_file = "enviroment_twitter.yml"
        with open(config_file, encoding="utf-8") as cf:
            e = yaml.safe_load(cf)
            twitter = Twitter(e["twitter"]["mecab"])
        self.tagger = MeCab.Tagger("-Owakati -d %s" % twitter.mecab)
        conn = sqlite3.connect('./twitter_data.db')
        self.cur = conn.cursor()
        self.class_word_vector = class_word_vector
        self.class_average_vector = {}
        self.class_word_dict = self.make_class_word_dict()
        # self.__initial_setting_vector()

    def __initial_setting_vector(self):
        # Wiki vector dict
        wiki_vector_file_name = APP_ROOT + '/../Data/jawiki_vector/jawiki_vector_delete_first.txt'
        self.input_module = InputFileCython(wiki_vector_file_name)
        self.input_module.input_fast_large_file()
        self.wiki_vector = self.input_module.get_vector()
        # Make the average vector dict
        wiki_average_vector_file_name_list = APP_ROOT + '/../Data/wn_summary_multi_vector_list.txt'
        self.input_module = InputFileCython(wiki_average_vector_file_name_list)
        self.input_module.input_special_format_file()
        summary_vector_file_list = self.input_module.get_file_data()
        for file in summary_vector_file_list:
            read_file = APP_ROOT + "/../Data/wn_summary_multi_vector/" + file
            self.input_module = InputFileCython(read_file)
            self.input_module.input_file_str_list()
            summary_vector = self.input_module.get_file_data()
            class_name = file.replace("_summary.txt_vector.txt", "")
            if class_name not in self.class_average_vector:
                self.class_average_vector.update({class_name: summary_vector})
        self.class_summary = ClassSummary("", self.wiki_vector, "")
        self.cosine_similarity = ClassCosineSimilarity("", "")

    def make_class_word_dict(self):
        """Remake the data format: class name -> {word: 1}."""
        word_class_dict = {}
        for class_name, word_list in self.class_word_vector.items():
            word_dict = {}
            for word in word_list:
                if word not in word_dict:
                    word_dict.update({word: 1})
            if class_name not in word_class_dict:
                word_class_dict.update({class_name: word_dict})
        return word_class_dict

    def call_sql(self):
        """Query SQLite and append each tweet pair to its class files."""
        self.cur.execute("""SELECT source_txt, replay_txt FROM ms_rinna;""")
        file_list = os.listdir("./data_latest/")
        # for file in file_list:
        #     os.remove("./data/" + file)
        for source_txt, replay_txt in self.cur.fetchall():
            class_name = self.judge_class(source_txt, replay_txt)
            # class_name = self.judge_class_wiki_vector(source_txt, replay_txt)
            print(class_name)
            print(source_txt)
            print(replay_txt)
            source_file = open(
                "./data_latest/" + class_name + '_source_twitter_data.txt', 'a')
            replay_file = open(
                "./data_latest/" + class_name + '_replay_twitter_data.txt', 'a')
            source_file.write(self.tagger.parse(source_txt).replace("\n", "") + '\n')
            replay_file.write(self.tagger.parse(replay_txt).replace('\n', '') + '\n')
            source_file.close()
            replay_file.close()

    def judge_class(self, source_txt, replay_txt=""):
        """
        Judge the word class.
        :param source_txt: twitter source text
        :param replay_txt: twitter reply text
        :return: the best matching class
        """
        class_match_rate = {}
        total_text = []
        source_wakati_text = self.__mecab_method(source_txt.strip())
        total_text.extend(source_wakati_text)
        if replay_txt != "":
            replay_wakati_text = self.__mecab_method(replay_txt.strip())
            total_text.extend(replay_wakati_text)
        for class_name in self.class_word_vector.keys():
            word_match_count = self.__match_word_count(total_text, class_name)
            if class_name not in class_match_rate:
                class_match_rate.update({
                    class_name:
                    1.0 * word_match_count / len(self.class_word_dict[class_name])
                })
        if max(class_match_rate.values()) == 0.0:
            return "other"
        else:
            return max(class_match_rate.items(), key=operator.itemgetter(1))[0]

    def judge_class_wiki_vector(self, source_txt, replay_txt=""):
        """
        Judge the word class by the wiki vector.
        :param source_txt: twitter source text
        :param replay_txt: twitter reply text
        :return: the best matching class
        """
        class_match_rate = {}
        total_text = []
        source_wakati_text = self.__mecab_method(source_txt.strip())
        total_text.extend(source_wakati_text)
        if replay_txt != "":
            replay_wakati_text = self.__mecab_method(replay_txt.strip())
            total_text.extend(replay_wakati_text)
        self.class_summary.summary_vector_word_list(total_text)
        summary_vector = self.class_summary.get_average_vector()
        for class_name, average_vector in self.class_average_vector.items():
            class_match_rate.update({
                class_name:
                self.cosine_similarity.cosine_similarity(summary_vector,
                                                         average_vector)
            })
        print(class_match_rate)
        if max(class_match_rate.values()) <= 0.1:
            return "other"
        else:
            return max(class_match_rate.items(), key=operator.itemgetter(1))[0]

    def __mecab_method(self, text):
        """
        Call the MeCab split process and keep only nouns.
        :param text:
        :return: nouns only
        """
        res = self.tagger.parseToNode("".join(text))
        split_noun = []
        while res:
            feature = res.feature.split(",")
            if feature[0] == u"名詞":
                split_noun.append(feature[6])
            res = res.next
        return split_noun

    def __match_word_count(self, total_text, class_name):
        """
        Count the words matching the word class.
        :param total_text: source text and reply text
        :param class_name: the chosen class name
        :return: matching count
        """
        word_match_count = 0
        for word in total_text:
            if word in self.class_word_dict[class_name]:
                word_match_count = word_match_count + 1
        return word_match_count
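A hedged usage sketch of the class above. The class-word mapping is invented for illustration, and the paths the constructor and call_sql() expect (enviroment_twitter.yml, ./twitter_data.db, ./data_latest/) must already exist:

# Illustrative only; the class names and words are invented examples.
class_word_vector = {"sports": ["野球", "サッカー"], "food": ["寿司", "ラーメン"]}
summary = SqliteTwitterSummary(class_word_vector)
print(summary.judge_class("昨日の野球の試合を見ていた"))  # -> "sports" or "other"
summary.call_sql()  # appends wakati-parsed pairs to per-class files in ./data_latest/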
class ProducerConsumerThreadSqlTwitter(object):
    """
    Producer-consumer multi-thread crawling.
    Uses the producer-consumer pattern.
    Reference (Python producer-consumer pattern):
        http://agiliq.com/blog/2013/10/producer-consumer-problem-in-python/
    Multi-thread design pattern.
    """

    def get_file_data(self, file):
        """
        Get the file data.
        :param file: summary word net data
        :return:
        """
        self.input_module = InputFileCython(file)
        self.input_module.input_special_format_file()
        return self.input_module.get_file_data()

    def producer_run(self):
        """Run the producer."""
        file_list = self.get_file_data(
            APP_ROOT + '/../Data/wn_total_summary_51519_limit05_out_put_list.txt')
        class_word_vector = {}
        for file in file_list:
            if file.strip() not in class_word_vector:
                word_list = [x.strip() for x in self.get_file_data(
                    APP_ROOT + "/../Data/wn_total_summary_51519_limit05_out_put/"
                    + file.strip())]
                class_word_vector.update(
                    {file.strip().replace("_summary.txt", ""): word_list})
        global class_queue
        global check_queue
        while True:
            # Only produce the work item if it has not been produced before.
            if class_word_vector not in check_queue.queue:
                try:
                    class_queue.put(class_word_vector)
                    check_queue.put(class_word_vector)
                except queue.Full:
                    print("Queue Full")
                else:
                    print("Produced")
                    time.sleep(random.uniform(0.0, 0.5))

    def consumer_run(self):
        """Run the consumer."""
        global class_queue
        while True:
            try:
                class_word_vector = class_queue.get()
            except queue.Empty:
                print("Queue Empty")
            else:
                print("Consume")
                sqlite_twitter = SqliteTwitterSummaryCython(class_word_vector)
                sqlite_twitter.call_sql()
                class_queue.task_done()
                # For the wait time, I referred to the link below:
                # https://www.w3.org/Protocols/HTTP-NG/http-prob.html
                time.sleep(random.uniform(0.601, 0.602))
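The class above references module-level class_queue and check_queue that are never defined in this snippet. A minimal sketch of the assumed wiring with plain queue.Queue objects (the queue sizes are arbitrary choices, not from the source):

# Assumed module-level setup (not shown above).
import queue
import threading

class_queue = queue.Queue(maxsize=10)   # work items for the consumer
check_queue = queue.Queue(maxsize=10)   # remembers what was already produced

if __name__ == '__main__':
    worker = ProducerConsumerThreadSqlTwitter()
    threading.Thread(target=worker.producer_run, daemon=True).start()
    threading.Thread(target=worker.consumer_run, daemon=True).start()
    class_queue.join()   # block until every item has been task_done()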