def __init__(self, connected_to_internet):
    """ Initializer. Prepares Config object.

    :param connected_to_internet: True - connected to the Internet, False - disconnected
    """
    self.connected_to_internet = connected_to_internet
    self.font_cache = {}
    self.voice_commands_cache = {}
    self.cd_titles = {}
    self.cd_track_names_cache = {}
    self.screensaver_cache = {}
    self.config_class = Config()
    self.config = self.config_class.config
    self.screen_rect = self.config_class.screen_rect
    self.config[LABELS] = self.get_labels()
    self.weather_config = self.get_weather_config()
    self.pygame_screen = self.config_class.pygame_screen
    self.CURRENT_WORKING_DIRECTORY = os.getcwd()
    self.read_storage()
    self.discogs_util = DiscogsUtil(self.k1)
    self.image_util = ImageUtil(self)
    self.file_util = FileUtil(self)
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    self.podcasts_util = None
    self.db_util = None
    self.bluetooth_util = None
def get_refresh_video_barrage(self, cid, row_barrages):
    barrage_file_path = FileUtil.get_barrage_file_path(cid)
    # Check whether the barrage file for this cid exists. If it does not, all of the current
    # row_barrages data will be written to the file; if it does, only the newly added barrage
    # records need to be located.
    barrage_count = 0
    if FileUtil.is_file_exists(barrage_file_path):
        last_barrage_index = -1  # Index in row_barrages of the last barrage stored in the file.
        barrage_count = FileUtil.get_file_line_count(barrage_file_path)
        last_n_barrages = FileUtil.get_file_last_n_line_content(barrage_file_path, 5)
        Logger.print_console_info(u"当前文件的最后n条弹幕:\n" + u"\n".join(last_n_barrages))
        for index in xrange(len(row_barrages) - 1, -1, -1):
            if self.__is_same_barrage(last_n_barrages, row_barrages[index]):
                # Found the position, within the refreshed barrage list, of the last barrage stored in the file.
                last_barrage_index = index
                break
        # No new barrage data.
        if last_barrage_index == (len(row_barrages) - 1):
            row_barrages = []
            Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                      u"\t" + u"弹幕数据没有更新。")
        # Only part of the barrage data is new.
        elif last_barrage_index >= 0:
            Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                      u"\t" + u"有弹幕数据更新:" + u"\t" +
                                      str(len(row_barrages) - last_barrage_index - 1))
            row_barrages = row_barrages[last_barrage_index + 1: len(row_barrages)]
        # The whole barrage list is new.
        elif last_barrage_index == -1:
            Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                                      u"\t" + u"有弹幕数据更新:" + u"\t" + str(len(row_barrages)))
    barrage_count += len(row_barrages)
    Logger.print_console_info(unicode(DateTimeUtil.get_cur_timestamp("%Y-%m-%d %H:%M:%S")) +
                              u" 当前弹幕总条数:" + unicode(barrage_count) + u"\n\n")
    return row_barrages
def __init__(self, connected_to_internet): """ Initializer. Prepares Config object. """ self.connected_to_internet = connected_to_internet self.font_cache = {} self.image_cache = {} self.voice_commands_cache = {} self.cd_titles = {} self.cd_track_names_cache = {} self.screensaver_cache = {} self.image_cache_base64 = {} self.svg_cache = {} self.album_art_url_cache = {} self.config_class = Config() self.config = self.config_class.config self.config[LABELS] = self.get_labels() self.weather_config = self.get_weather_config() self.PYGAME_SCREEN = self.config[PYGAME_SCREEN] self.file_util = FileUtil(self.config) self.CURRENT_WORKING_DIRECTORY = os.getcwd() self.discogs_util = DiscogsUtil() self.COLOR_MAIN = self.color_to_hex(self.config[COLORS][COLOR_BRIGHT]) self.COLOR_ON = self.color_to_hex(self.config[COLORS][COLOR_CONTRAST]) self.COLOR_OFF = self.color_to_hex( self.config[COLORS][COLOR_DARK_LIGHT]) self.COLOR_MUTE = self.color_to_hex(self.config[COLORS][COLOR_MUTE]) if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)): ssl._create_default_https_context = ssl._create_unverified_context
def gen_user_topic_lda_by_time_window(cls, barrage_seg_list, cid):
    dictionary = corpora.Dictionary.load(os.path.join(FileUtil.get_train_model_dir(),
                                                      str(cid) + "-barrage-words.dict"))
    lda_model = models.LdaModel.load(os.path.join(FileUtil.get_train_model_dir(),
                                                  str(cid) + "-barrage-lda.model"))
    time_window_list = TimeWindow.gen_time_window_barrage_info(barrage_seg_list, cid)
    for time_window in time_window_list:
        time_window.gen_user_word_frequency()  # Build the per-user word-frequency info for this time window.
        time_window.gen_user_topic_lda(dictionary, lda_model)  # Build the LDA topic info for the words posted by users in this time window.
    return time_window_list
def segment_barrages(barrages, cid=None, is_corpus=False):
    if os.path.exists(FileUtil.get_word_segment_result_file_path(cid)):
        # If a saved word-segmentation result file exists, read it directly.
        return load_segment_barrages(cid)
    index = 0
    barrage_seg_list = []
    for barrage in barrages:
        barrage_seg = BarrageSeg(barrage.play_timestamp, barrage.sender_id, barrage.row_id, index)
        sentence_seg = __segment_sentence(barrage.content)  # Segmentation result of a single barrage.
        if len(sentence_seg) <= 0:
            # Skip barrages whose words were all filtered out (prevents date spam at the end of a video).
            continue
        barrage_seg.sentence_seg_list = sentence_seg
        barrage_seg_list.append(barrage_seg)
        index += 1
    if is_corpus is False:
        # Write the segmentation result to a test file so it can be inspected.
        __save_segment_word_to_file(barrage_seg_list, cid)
        # Save the segmentation result as JSON for later use in the z-score analysis.
        save_segment_barrages(barrage_seg_list, cid)
        # Use all barrages of video v as the corpus for building the tf-idf model.
        corpus = DictConfig.gen_corpus_info(barrage_seg_list, cid)
        # Train the tf-idf model on the segmented barrages.
        DictConfig.gen_tfidf_model(corpus, cid)
        # Train the LDA model on the segmented barrages.
        DictConfig.gen_lda_model(corpus, cid)
    return barrage_seg_list
def build_window(danmaku_list, window_size, step_length, parse_dict): window_list = [] current_start = 0 current_end = current_start + window_size current_danmaku = [] current_index = 0 if FileUtil.is_file_exists(constants.WINDOW_LOG): os.remove(constants.WINDOW_LOG) while current_start < danmaku_list[-1].videoSecond: logging.info("Building time window " + str(current_index) + "...") for danmaku in danmaku_list: if current_start <= danmaku.videoSecond <= current_end: current_danmaku.append(danmaku) elif danmaku.videoSecond > current_end: break # write_window_log(current_index, current_start, current_end, current_danmaku) time_window = TimeWindow(current_index, current_start, current_end) time_window.buildUsers(danmakuutil.extract_users(current_danmaku)) time_window.buildTSCs(len(current_danmaku)) time_window.buildTSCLength(current_danmaku) # time_window.buildEntropy(current_danmaku, parse_dict) # time_window.buildUserFeature(danmakuutil.extract_user_feature(current_danmaku, parse_dict, "Word-Frequency")) window_list.append(time_window) current_index += 1 current_start += step_length current_danmaku = [] current_end = current_start + window_size return window_list
def gen_tfidf_model(cls, corpus, cid): # let’s initialize a tfidf transformation: logging.debug(u"生成 tfidf 模型!!!") tfidf = models.TfidfModel(corpus) tfidf.save( os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-tfidf.model"))
def __save_segment_word_to_file(barrage_seg_list, cid): # barrage_seg_list -> barrage_seg -> sentence_seg_list -> sentence_seg word_segment_file = os.path.join(FileUtil.get_word_segment_dir(), "test-" + cid + "-seg-result.txt") with codecs.open(word_segment_file, "wb", "utf-8") as output_file: for barrage_seg in barrage_seg_list: for word_seg in barrage_seg.sentence_seg_list: output_file.write(word_seg.word + u"\t" + word_seg.flag + u"\n")
def load_segment_barrages(cid): json_data = [] file_path = FileUtil.get_word_segment_result_file_path(cid) with codecs.open(file_path, "rb", "utf-8") as input_file: for line in input_file: json_data.append(line) json_str = u"".join(json_data) barrage_seg_list_json = json.loads(json_str) barrage_seg_list = BarrageSeg.dict2barrageseglist(barrage_seg_list_json) return barrage_seg_list
def __save_time_window_info_to_file(cls, cid, time_window_list): file_path = os.path.join(FileUtil.get_zscore_dir(), str(cid) + "-time-window-info.txt") with codecs.open(file_path, "wb", "utf-8") as output_file: for time_window in time_window_list: time_window_info = unicode(str(time_window.time_window_index)) + u"\t" \ + DateTimeUtil.format_barrage_play_timestamp(time_window.start_timestamp) + u"\t" \ + DateTimeUtil.format_barrage_play_timestamp(time_window.end_timestamp) + u"\t" \ + unicode(str(time_window.barrage_count)) + u"\t" \ + unicode(str(time_window.valid_barrage_word_count)) + u"\n" output_file.write(time_window_info)
def __save_similarity_matrix_to_local(cls, sim_matrix, time_window_index): matrix_file_name = os.path.join( FileUtil.get_similarity_matrix_dir(), "matrix-" + str(time_window_index) + ".txt") with codecs.open(matrix_file_name, "wb", "utf-8") as output_file: np.savetxt(fname=output_file, X=sim_matrix, fmt="%.2f", delimiter="\t", newline="\n")
def gen_corpus_info(cls, barrage_seg_list, cid):
    # Collect the words of each segmented barrage.
    texts = []
    for barrage_seg in barrage_seg_list:
        text = []
        for word_seg in barrage_seg.sentence_seg_list:
            text.append(word_seg.word)
        texts.append(text)
    # Assign a numeric id to every word in the texts.
    dictionary = corpora.Dictionary(texts)
    # store the dictionary, for future reference
    dictionary.save(os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-words.dict"))
    logging.debug(dictionary.token2id)
    # Build the corpus from the dictionary (each word is represented by its id followed by its count).
    corpus = [dictionary.doc2bow(text) for text in texts]
    # store to disk, for later use
    corpora.MmCorpus.serialize(os.path.join(FileUtil.get_train_model_dir(), str(cid) + '-barrage-corpus.mm'), corpus)
    return corpus
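# Hedged sketch (added for illustration, not part of the original sources): how the artifacts written by
# gen_corpus_info can be reloaded to rebuild and query the per-cid models. corpora.Dictionary.load,
# corpora.MmCorpus, models.TfidfModel and models.LdaModel are standard gensim calls; the cid value and
# the query words are placeholders.
def rebuild_models_from_saved_corpus(cid):
    model_dir = FileUtil.get_train_model_dir()
    dictionary = corpora.Dictionary.load(os.path.join(model_dir, str(cid) + "-barrage-words.dict"))
    corpus = corpora.MmCorpus(os.path.join(model_dir, str(cid) + "-barrage-corpus.mm"))
    tfidf = models.TfidfModel(corpus)             # same call as gen_tfidf_model
    lda = models.LdaModel(corpus, num_topics=10)  # same call as gen_lda_model
    # Query example: weights / topic distribution of a new, already segmented barrage.
    bow = dictionary.doc2bow([u"word1", u"word2"])
    return tfidf[bow], lda[bow]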
def gen_sorted_zscore_file(self, threshold_value=1.0):
    sorted_zscore_file_name = FileUtil.get_cid_from_barrage_file_path(self.zscore_file_path) + "-sorted-zscore.txt"
    with codecs.open(sorted_zscore_file_name, "wb", "utf-8") as output_file:
        for time_window_index, zscore in self.zscore_list:
            if zscore < threshold_value:
                continue
            total_seconds = time_window_index * self.slide_time_interval  # Second at which this time window starts.
            zscore_info = unicode(str(time_window_index)) + u"\t" + unicode(str(zscore)) + u"\t" \
                          + DateTimeUtil.format_barrage_play_timestamp(total_seconds) + u"\n"
            # logger.debug(zscore_info)
            output_file.write(zscore_info)
def build_word2vec_model(barrage_corpus_dirname, barrage_corpus_file_type="txt"): train_sentences = TrainSentences(barrage_corpus_dirname, barrage_corpus_file_type) """ min_count: One of them is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training on those words, so it’s best to ignore them, default 5 size: Another parameter is the size of the NN layers, which correspond to the “degrees” of freedom the training algorithm has, default 100 workers: training parallelization, to speed up training, default = 1 worker = no parallelization """ model = gensim.models.Word2Vec(train_sentences, min_count=5, size=150, workers=multiprocessing.cpu_count()) model.save(os.path.join(FileUtil.get_train_model_dir(), "barrage-corpusword2vec-model.txt"))
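# Hedged usage sketch (added for illustration): loading the model saved by build_word2vec_model and
# querying it, in the same way extend_emotion_dict does elsewhere in this collection. The query word
# is a placeholder; Word2Vec.load and most_similar are standard gensim calls.
if __name__ == "__main__":
    model_path = os.path.join(FileUtil.get_train_model_dir(), "barrage-corpusword2vec-model.txt")
    barrage_model = gensim.models.Word2Vec.load(model_path)
    # Print the nearest neighbours of a (placeholder) word together with their cosine similarity.
    for similar_word, similarity in barrage_model.most_similar(positive=[u"word1"]):
        print similar_word, similarity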
def gen_corpus_words(): barrage_corpus_files = glob.glob(os.path.join(FileUtil.get_corpus_dir(), "*.txt")) file_lists = [barrage_corpus_files[0: 501], barrage_corpus_files[501: 1001], barrage_corpus_files[1001: 1501], barrage_corpus_files[1501: 2001], barrage_corpus_files[2001: 2501], barrage_corpus_files[2501: 3001], barrage_corpus_files[3001: len(barrage_corpus_files)]] pools = Pool(7) file_index = 0 for file_list in file_lists: file_index += 1 pools.apply_async(__gen_corpus_words, args=(file_list, "all-corpus-" + str(file_index) + ".txt")) pools.close() pools.join()
def __save_high_emotion_clips_to_file(self, high_emotion_clips, global_zscore_threshold, left_zscore_threshold, right_zscore_threshould): file_path = os.path.join(FileUtil.get_zscore_dir(), self.cid + "-high-emotion-clips.txt") with codecs.open(file_path, "wb", "utf-8") as output_file: output_file.write(unicode(str(global_zscore_threshold)) + u"\t" + unicode(str(left_zscore_threshold)) + u"\t" + unicode(str(right_zscore_threshould)) + u"\n") for emotion_clip in high_emotion_clips: str_info = u"" for item in emotion_clip: str_info += (unicode(str(item)) + u"\t") str_info = str_info[0: len(str_info) - 1] + u"\n" output_file.write(str_info)
def gen_sorted_barrage_file(barrage_file_path):
    barrages = get_barrage_from_txt_file(barrage_file_path)  # The barrages are already sorted in descending order.
    sorted_file_name = FileUtil.get_cid_from_barrage_file_path(barrage_file_path) + "-sorted.txt"
    with codecs.open(sorted_file_name, "wb", "utf-8") as output_file:
        for barrage in barrages:
            play_time_stamp = unicode(str(float(barrage.play_timestamp)))
            # barrage_str = DateTimeUtil.format_barrage_play_timestamp(play_time_stamp) + u"\t" + play_time_stamp \
            #               + u"\t" + barrage.type + u"\t" + barrage.font_size + u"\t" + barrage.font_color + u"\t" \
            #               + barrage.unix_timestamp + u"\t" + barrage.pool + u"\t" + barrage.sender_id + u"\t" \
            #               + barrage.row_id + u"\t" + barrage.content + u"\n"
            barrage_str = play_time_stamp + u"\t" + barrage.type + u"\t" + barrage.font_size + u"\t" \
                          + barrage.font_color + u"\t" + barrage.unix_timestamp + u"\t" + barrage.pool + u"\t" \
                          + barrage.sender_id + u"\t" + barrage.row_id + u"\t" + barrage.content + u"\n"
            output_file.write(barrage_str)
    return barrages
def build_dicts(cls):
    if not cls.__HAS_LOAD_USER_DICT:  # The user dictionary has not been loaded yet.
        cls.__HAS_LOAD_USER_DICT = True
        # Load the custom barrage dictionary to improve segmentation of barrage-specific words and kaomoji.
        jieba.load_userdict(os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt"))
        logging.debug(u"自定义弹幕词典加载成功!!!")
        # Initialize the stop-word list.
        cls.__init_stopwords()
        # Initialize the replacement-word dictionary.
        cls.__init_replace_words()
        # Initialize the accepted part-of-speech dictionary.
        cls.__init_accept_nominal()
        # Initialize the emoji replacement dictionary.
        cls.__init_emoji_replace_dict()
        # Initialize the rejected punctuation dictionary.
        cls.__init_reject_punctuation_set()
def get_parse_dict(danmaku_list): logging.info("Starting parsing sentences in Danmaku...") parse_dict = dict() jieba.load_userdict(constants.USER_DICT_PATH) emotion_dict_path = os.path.join(FileUtil.get_project_root_path(), "WordSegment", "emotion_dict.txt") emotion_dict = load_emotion_dict(emotion_dict_path) for danmaku in danmaku_list: rowId = danmaku.rowId if danmaku.content is not None: words = wordSegment(emotion_dict, danmaku.content) parse_dict[rowId] = words else: parse_dict[rowId] = None logging.info("parse dictionary has generated!") return parse_dict
def parse_barrage_xml_to_txt(xml_file_path):
    # Read the entire content of the xml file.
    with codecs.open(xml_file_path, "rb", "utf-8") as input_file:
        content = []
        for line in input_file:
            content.append(line)
        content = u"\n".join(content)
    # Fields: play time of the barrage, barrage type, font size, font color, unix timestamp,
    # barrage pool, sender id, barrage row id.
    pattern = re.compile(r'<d p="(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)">(.*?)</d>', re.S)
    barrages = re.findall(pattern, content)
    if len(barrages) <= 0:
        return None
    txt_file_name = FileUtil.get_cid_from_barrage_file_path(xml_file_path) + ".txt"
    with codecs.open(txt_file_name, "wb", "utf-8") as output_file:
        for barrage in barrages:
            output_file.write(u"\t".join(barrage) + u"\n")
    return barrages
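# Hedged example (added for illustration): the shape of the Bilibili-style XML element matched by the
# regular expression in parse_barrage_xml_to_txt, and the tab-separated line written to the txt file.
# The field values are made up; the field order follows the comment above (play time, type, font size,
# font color, unix timestamp, pool, sender id, row id, content).
#
#   <d p="12.538,1,25,16777215,1457493600,0,a1b2c3d4,123456789">example barrage text</d>
#
# becomes the output line:
#
#   12.538\t1\t25\t16777215\t1457493600\t0\ta1b2c3d4\t123456789\texample barrage text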
def load_high_emotion_clips_from_file(cls, cid): file_path = os.path.join(FileUtil.get_zscore_dir(), cid + "-high-emotion-clips-lda.txt") first_line_flag = True high_emotion_clips = [] global_zscore_threshold = 0 left_zscore_threshold = 0 right_zscore_threshould = 0 with codecs.open(file_path, "rb", "utf-8") as input_file: for line in input_file: split_info = line.strip().split("\t") if first_line_flag: first_line_flag = False global_zscore_threshold = split_info[0] left_zscore_threshold = split_info[1] right_zscore_threshould = split_info[2] continue high_emotion_clips.append(split_info) return high_emotion_clips, global_zscore_threshold, left_zscore_threshold, right_zscore_threshould
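# Hedged example (added for illustration): the layout of the "<cid>-high-emotion-clips*.txt" file that
# load_high_emotion_clips_from_file parses, matching what __save_high_emotion_clips_to_file writes.
# The numbers are made up; the first line holds the three z-score thresholds, and every following line
# is one tab-separated high-emotion clip (remaining columns left unspecified here).
#
#   1.0\t0.8\t0.8
#   12\t30\t60\t...
#   47\t150\t190\t...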
def save_barrages_to_local(self, cid, row_barrages, is_corpus=False):
    barrage_count = len(row_barrages)
    if barrage_count <= 0:  # This check is for the barrages that are to be stored in the database.
        return
    barrage_file_path = FileUtil.get_barrage_file_path(cid, is_corpus)
    if is_corpus:
        if barrage_count < 100:  # Videos with fewer than 100 barrages are not used as corpus data.
            return
        row_barrages = self.sort_barrages(row_barrages)
        # To be used as corpus data, the barrages must appear at a rate of at least one every
        # 10 seconds, so that the content stays coherent.
        try:
            total_seconds = float(row_barrages[-1][0].strip())
            if (total_seconds / 10) > barrage_count:
                return
        except Exception as exception:
            print exception
            return
    with codecs.open(barrage_file_path, "ab", "utf-8") as output_file:
        for barrage in row_barrages:
            if barrage is not None:
                output_file.write(u"\t".join(barrage) + u"\n")
def extend_emotion_dict(self): barrage_model = gensim.models.Word2Vec.load(os.path.join(FileUtil.get_train_model_dir(), "barrage-corpusword2vec-model.txt")) standard_word_dict = {} # {word, (category, degree, level)} for category, word_set in self.emotion_dict.items(): for word, emotion_degree, emotion_level in word_set: standard_word_dict[word] = (category, emotion_degree, emotion_level) extend_word_dict = {} # {word, (category, degree, level)} for word, word_info in standard_word_dict.items(): category = word_info[0] emotion_degree = word_info[1] emotion_level = word_info[2] try: similar_word_list = barrage_model.most_similar(positive=[word]) except Exception as exception: logger.info(exception) continue for index in xrange(0, len(similar_word_list)): similar_word, similar = similar_word_list[index] similar_degree = float(emotion_degree) * similar similar_level = int(emotion_level) if (similar_word not in standard_word_dict.keys()) and (similar_word not in extend_word_dict.keys()): extend_word_dict[similar_word] = (category, similar_degree, similar_level) elif similar_word in extend_word_dict.keys(): last_similar_degree = extend_word_dict[similar_word][1] if last_similar_degree < similar_degree: extend_word_dict[similar_word] = (category, similar_degree, similar_level) with codecs.open("extend-emotion-words.txt", "wb", "utf-8") as output_file: for word, word_info in extend_word_dict.items(): category = word_info[0] degree = word_info[1] level = word_info[2] output_file.write(category + u"\t" + word + u"\t" + unicode(str(degree)) + u"\t" + unicode(str(level)) + u"\n")
first_line_flag = True high_emotion_clips = [] global_zscore_threshold = 0 left_zscore_threshold = 0 right_zscore_threshould = 0 with codecs.open(file_path, "rb", "utf-8") as input_file: for line in input_file: split_info = line.strip().split("\t") if first_line_flag: first_line_flag = False global_zscore_threshold = split_info[0] left_zscore_threshold = split_info[1] right_zscore_threshould = split_info[2] continue high_emotion_clips.append(split_info) return high_emotion_clips, global_zscore_threshold, left_zscore_threshold, right_zscore_threshould if __name__ == "__main__": zscore = Zscore("2171229", os.path.join(FileUtil.get_zscore_dir(), "hd-zscore-result-lda.txt"), 30, 10, 4) # zscore.gen_sorted_zscore_file(threshold_value=5) # # zscore.gen_possible_high_emotion_clips() high_emotion_clips = zscore.gen_possible_high_emotion_clips() for emotion_clip in high_emotion_clips: str_info = u"" for item in emotion_clip: str_info += (unicode(str(item)) + u"\t") str_info = str_info[0: len(str_info) - 1] str_info += u"\n" print str_info
def gen_lda_model(cls, corpus, cid): logging.debug(u"生成 lda 模型!!!") lda = models.LdaModel(corpus, num_topics=10) lda.save(os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-lda.model"))
for barrage_seg in barrage_seg_list: corpus_words = u"" if len(barrage_seg.sentence_seg_list) <= 0: continue # 弹幕中的词语有可能全部被替换掉了,没有剩下任何词语。 for word_seg in barrage_seg.sentence_seg_list: corpus_words += (word_seg.word + u"\t") corpus_words = corpus_words[0: len(corpus_words) - 1] + u"\n" output_file.write(corpus_words) # 根据语料库建立 word2vec 模型 # 参数: barrage_corpus_dirname 弹幕语料的路径 # barrage_corpus_file_type 弹幕语料存储的文件类型 def build_word2vec_model(barrage_corpus_dirname, barrage_corpus_file_type="txt"): train_sentences = TrainSentences(barrage_corpus_dirname, barrage_corpus_file_type) """ min_count: One of them is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training on those words, so it’s best to ignore them, default 5 size: Another parameter is the size of the NN layers, which correspond to the “degrees” of freedom the training algorithm has, default 100 workers: training parallelization, to speed up training, default = 1 worker = no parallelization """ model = gensim.models.Word2Vec(train_sentences, min_count=5, size=150, workers=multiprocessing.cpu_count()) model.save(os.path.join(FileUtil.get_train_model_dir(), "barrage-corpusword2vec-model.txt")) if __name__ == "__main__": train_sentences = TrainSentences(FileUtil.get_corpus_dir()) gen_corpus_words()
# 根据语料库信息生成lda模型 @classmethod def gen_lda_model(cls, corpus, cid): logging.debug(u"生成 lda 模型!!!") lda = models.LdaModel(corpus, num_topics=10) lda.save(os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-lda.model")) # 初始化所有的字典信息。 @classmethod def build_dicts(cls): if not cls.__HAS_LOAD_USER_DICT: # 还未加载用户词典 cls.__HAS_LOAD_USER_DICT = True # 载入自定义的弹幕词典,优化弹幕特有词语的切词,以及颜表情的切词 jieba.load_userdict(os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt")) logging.debug(u"自定义弹幕词典加载成功!!!") # 初始化停用词列表 cls.__init_stopwords() # 初始化替换词词典 cls.__init_replace_words() # 初始化接受词性的词典 cls.__init_accept_nominal() # 初始化emoji替换词典 cls.__init_emoji_replace_dict() # 初始化弃用标点符号词典 cls.__init_reject_punctuation_set() if __name__ == "__main__": lda = models.LdaModel.load(os.path.join(FileUtil.get_train_model_dir(), "9-barrage-lda.model"))
def save_segment_barrages(barrage_seg_list, cid): save_file_path = FileUtil.get_word_segment_result_file_path(cid) json_str = json.dumps(barrage_seg_list, default=lambda obj: obj.__dict__) with codecs.open(save_file_path, "wb", "utf-8") as output_file: output_file.write(json_str)
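# Hedged sketch (added for illustration): round-tripping the segmentation result through
# save_segment_barrages and load_segment_barrages defined in this collection. The cid "2171229" is one
# of the ids used elsewhere here; get_barrage_from_txt_file and FileUtil.get_local_data_dir are taken
# from the other snippets.
if __name__ == "__main__":
    cid = "2171229"
    barrages = get_barrage_from_txt_file(os.path.join(FileUtil.get_local_data_dir(), cid + ".txt"))
    barrage_seg_list = segment_barrages(barrages, cid)
    save_segment_barrages(barrage_seg_list, cid)   # writes the per-cid segmentation result as JSON
    reloaded = load_segment_barrages(cid)          # parses the JSON back into BarrageSeg objects
    assert len(reloaded) == len(barrage_seg_list)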
logging.debug(u"生成 lda 模型!!!") lda = models.LdaModel(corpus, num_topics=10) lda.save( os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-lda.model")) # 初始化所有的字典信息。 @classmethod def build_dicts(cls): if not cls.__HAS_LOAD_USER_DICT: # 还未加载用户词典 cls.__HAS_LOAD_USER_DICT = True # 载入自定义的弹幕词典,优化弹幕特有词语的切词,以及颜表情的切词 jieba.load_userdict( os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt")) logging.debug(u"自定义弹幕词典加载成功!!!") # 初始化停用词列表 cls.__init_stopwords() # 初始化替换词词典 cls.__init_replace_words() # 初始化接受词性的词典 cls.__init_accept_nominal() # 初始化emoji替换词典 cls.__init_emoji_replace_dict() # 初始化弃用标点符号词典 cls.__init_reject_punctuation_set() if __name__ == "__main__": lda = models.LdaModel.load( os.path.join(FileUtil.get_train_model_dir(), "9-barrage-lda.model"))
        # Create the barrage object.
        barrage = Barrage(play_timestamp=barrage_timestamp, sender_id=sender_name, content=content)
        barrages.append(barrage)
    # Map every user name to a unique numeric id.
    dictionary = corpora.Dictionary(sender_name_list)
    dictionary.save("live_sender_name.dict")
    # Replace the user names in barrages with the numeric ids generated above.
    for barrage in barrages:
        barrage.sender_id = str(dictionary.token2id[barrage.sender_id])
    return barrages


if __name__ == "__main__":
    # barrages = get_barrage_from_txt_file("../../data/local/9.txt")
    # file_path = FileUtil.get_word_segment_result_file_path("../../data/local/9.txt")
    # barrage_seg_list = wordseg.segment_barrages(barrages)
    # wordseg.save_segment_barrages(file_path, barrage_seg_list)
    # barrage_seg_list = wordseg.load_segment_barrages(file_path)
    # for barrage_seg in barrage_seg_list:
    #     print str(barrage_seg.play_timestamp), u"\t", u"\t".join([seg.word + u"\t" + seg.flag for seg
    #                                                               in barrage_seg.sentence_seg_list])
    gen_sorted_barrage_file(os.path.join(FileUtil.get_local_data_dir(), "2171229.txt"))
    # parse_barrage_xml_to_txt("4547002.xml")
    # barrages = get_barrage_from_live_text_file(os.path.join(FileUtil.get_project_root_path(), "data", "AlphaGo",
    #                                                         "bilibili", "2016-03-09.txt"))
    # for barrage in barrages:
    #     print barrage.play_timestamp, u"\t", barrage.sender_id, u"\t", barrage.content, u"\n"
class DictConfig(object): __HAS_LOAD_USER_DICT = False # 检测是否加载了用户自定义的词典 # 停用词词典信息 __STOP_WORDS = set([]) # 停用词集合信息 # 停用词词典的加载路径,用户可以自定义添加。 __STOP_WORDS_PATH_SET = set([ os.path.join(FileUtil.get_dict_dir(), "stopwords-zh-dict.txt"), os.path.join(FileUtil.get_dict_dir(), "stopwords-en-dict.txt") ]) # 替换词词典信息 # 替换词词典的替换词次序十分重要,所以用了list。例如 !{1,3}这个替换规则就应该在!!!!+这个替换规则后面。 __REPLACE_WORDS = [] __REPLACE_WORDS_PATH_SET = set( [os.path.join(FileUtil.get_dict_dir(), "replace-dict.txt")]) # 替换颜表情词典信息 __REPLACE_EMOJI = {} __REPLACE_EMOJI_PATH_SET = set( [os.path.join(FileUtil.get_dict_dir(), "emoji-dict.txt")]) # 接受词性词典 ---- 现在代码中没有用词性来过滤处理 __ACCEPT_NOMINAL = set([]) __ACCEPT_NOMINAL_PATH_SET = set( [os.path.join(FileUtil.get_dict_dir(), "accept-nominal-dict.txt")]) # 拒绝接受的单个标点符号词典 __REJECT_PUNCTUATION = set([]) __REJECT_PUNCTUATION_PATH_SET = set( [os.path.join(FileUtil.get_dict_dir(), "reject-punctuation-dict.txt")]) # 程度副词词典加载(来自知网数据) __DEGREE_ADVERB = {} __DEGREE_ADVERB_PATH_SET = set( [os.path.join(FileUtil.get_dict_dir(), "degree-adverb-dict.txt")]) # 否定词词典加载 __NEGATIVES = set([]) __NEGATIVES_PATH_SET = set( [os.path.join(FileUtil.get_dict_dir(), "negatives-dict.txt")]) # 情感词典加载 __EMOTION = {} # 情感词典的格式 {情感词类别:(情感词,情感强度,情感极性)} __EMOTION_PATH_SET = set([ os.path.join(FileUtil.get_dict_dir(), "emotion-extend-dict.txt"), os.path.join(FileUtil.get_dict_dir(), "emotion-dict.txt") ]) @classmethod def get_stopwords_set(cls): return cls.__STOP_WORDS @classmethod def get_stopwords_dict_path_set(cls): return cls.__STOP_WORDS_PATH_SET @classmethod def get_replace_words_list(cls): return cls.__REPLACE_WORDS @classmethod def get_accept_nominal_set(cls): return cls.__ACCEPT_NOMINAL @classmethod def get_emoji_replace_dict(cls): return cls.__REPLACE_EMOJI @classmethod def get_reject_punctuation_dict(cls): return cls.__REJECT_PUNCTUATION @classmethod def get_degree_adverb_dict(cls): return cls.__DEGREE_ADVERB @classmethod def get_negatives_set(cls): return cls.__NEGATIVES # 直接加载情感词典 情感词典的格式 {情感词类别:(情感词,情感强度,情感极性)} 供情感分析使用 @classmethod def load_emotion_dict(cls): cls.__EMOTION = {} for emotion_dict_path in cls.__EMOTION_PATH_SET: with codecs.open(emotion_dict_path, "rb", "utf-8") as input_file: for line in input_file: split_info = line.strip().split(u"\t") if len(split_info) < 4: continue category = split_info[0] # 情感词类别 word = split_info[1] # 情感词 degree = split_info[2] # 情感强度 level = split_info[3] # 情感极性 if category not in cls.__EMOTION.keys(): cls.__EMOTION[category] = set([(word, degree, level)]) else: cls.__EMOTION[category].add((word, degree, level)) return cls.__EMOTION # 初始化填充停用词列表信息。 @classmethod def __init_stopwords(cls): if cls.__STOP_WORDS: return cls.__STOP_WORDS = set([" ", "\r", "\n", "\t"]) for stopwords_dict_path in cls.__STOP_WORDS_PATH_SET: with codecs.open(stopwords_dict_path, "rb", "utf-8") as input_file: for line in input_file: stopwords = line.strip() cls.__STOP_WORDS.add(stopwords) logging.debug(u"停用词词典构建完成!!!") @classmethod def __init_replace_words(cls): if cls.__REPLACE_WORDS: return for replace_words_path in cls.__REPLACE_WORDS_PATH_SET: with codecs.open(replace_words_path, "rb", "utf-8") as input_file: for line in input_file: split_info = line.strip().split("\t") word_pattern = split_info[0] replace_word = split_info[1] replace_flag = split_info[ 2] # 替换词的词性,因为今后将会用到 过滤数字 和 无用标点的选项 cls.__REPLACE_WORDS.append( (word_pattern, replace_word, replace_flag)) logging.debug(u"替换词词典构建完成!!!") @classmethod def __init_accept_nominal(cls): if cls.__ACCEPT_NOMINAL: return for accept_nominal_path in 
cls.__ACCEPT_NOMINAL_PATH_SET: with codecs.open(accept_nominal_path, "rb", "utf-8") as input_file: for line in input_file: split_info = line.strip().split("\t") accept_nominal = split_info[0] cls.__ACCEPT_NOMINAL.add(accept_nominal) logging.debug(u"接受词性词典加载成功!!!") @classmethod def __init_emoji_replace_dict(cls): if cls.__REPLACE_EMOJI: return for emoji_dict_path in cls.__REPLACE_EMOJI_PATH_SET: with codecs.open(emoji_dict_path, "rb", "utf-8") as input_file: for line in input_file: split_info = line.strip().split("\t") if len(split_info) < 2: # 一般情况下emoji表情都包含两列,一列为表情,另一列为替换词,这两列必须有;第三列为表情说明,可有可无。 continue # emoji替换词典里的表情是可能重复的,因为表情太复杂来不及检查,这里将会出现以最后一个定义为准。 emoji = split_info[0] replace_word = split_info[1] cls.__REPLACE_EMOJI[emoji] = replace_word logging.debug(u"emoji 替换词典加载完成!!!") # 加载拒绝的单个标点词的词典 @classmethod def __init_reject_punctuation_set(cls): if cls.__REJECT_PUNCTUATION: return for reject_punctuation_path in cls.__REJECT_PUNCTUATION_PATH_SET: with codecs.open(reject_punctuation_path, "rb", "utf-8") as input_file: for line in input_file: punctuation = line.strip() cls.__REJECT_PUNCTUATION.add(punctuation) logging.debug(u"弃用标点符号词典加载完成!!!") # 初始化程度副词词典 @classmethod def load_degree_adverb_dict(cls): cls.__DEGREE_ADVERB = {} for degree_adverb_path in cls.__DEGREE_ADVERB_PATH_SET: with codecs.open(degree_adverb_path, "rb", "utf-8") as input_file: for line in input_file: split_info = line.strip().split("\t") degree_adverb = split_info[0] score = split_info[1] if degree_adverb not in cls.__DEGREE_ADVERB.keys(): cls.__DEGREE_ADVERB[degree_adverb] = float(score) logging.debug(u"程度副词词典加载完成!!!") return cls.__DEGREE_ADVERB # 初始化否定词词典 @classmethod def load_negatives_set(cls): cls.__NEGATIVES = set([]) for negatives_path in cls.__NEGATIVES_PATH_SET: with codecs.open(negatives_path, "rb", "utf-8") as input_file: for line in input_file: negative = line.strip() cls.__NEGATIVES.add(negative) logging.debug(u"否定词词典加载完成!!!") return cls.__NEGATIVES # 将待实验视频v的全体弹幕信息作为语料库,为训练tf-idf模型以及lda模型做准备 # 根据分好词的barrage_seg_list(分好词、过滤好停词),为弹幕中的每一个词语对应一个唯一的编号。 @classmethod def gen_corpus_info(cls, barrage_seg_list, cid): # 获得每条弹幕分好之后的词语 texts = [] for barrage_seg in barrage_seg_list: text = [] for word_seg in barrage_seg.sentence_seg_list: text.append(word_seg.word) texts.append(text) # 为文本中的每一个词语赋予一个数字下标 dictionary = corpora.Dictionary(texts) # store the dictionary, for future reference dictionary.save( os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-words.dict")) logging.debug(dictionary.token2id) # 根据生成的字典,生成语料库信息(语料的词用id表示,后面对应的是count。) corpus = [dictionary.doc2bow(text) for text in texts] # store to disk, for later use corpora.MmCorpus.serialize( os.path.join(FileUtil.get_train_model_dir(), str(cid) + '-barrage-corpus.mm'), corpus) return corpus # 根据语料库corpus生成tf-idf模型 @classmethod def gen_tfidf_model(cls, corpus, cid): # let’s initialize a tfidf transformation: logging.debug(u"生成 tfidf 模型!!!") tfidf = models.TfidfModel(corpus) tfidf.save( os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-tfidf.model")) # 根据语料库信息生成lda模型 @classmethod def gen_lda_model(cls, corpus, cid): logging.debug(u"生成 lda 模型!!!") lda = models.LdaModel(corpus, num_topics=10) lda.save( os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-lda.model")) # 初始化所有的字典信息。 @classmethod def build_dicts(cls): if not cls.__HAS_LOAD_USER_DICT: # 还未加载用户词典 cls.__HAS_LOAD_USER_DICT = True # 载入自定义的弹幕词典,优化弹幕特有词语的切词,以及颜表情的切词 jieba.load_userdict( os.path.join(FileUtil.get_dict_dir(), "barrage-word-dict.txt")) 
logging.debug(u"自定义弹幕词典加载成功!!!") # 初始化停用词列表 cls.__init_stopwords() # 初始化替换词词典 cls.__init_replace_words() # 初始化接受词性的词典 cls.__init_accept_nominal() # 初始化emoji替换词典 cls.__init_emoji_replace_dict() # 初始化弃用标点符号词典 cls.__init_reject_punctuation_set()
parse_dict = dict() jieba.load_userdict(constants.USER_DICT_PATH) emotion_dict_path = os.path.join(FileUtil.get_project_root_path(), "WordSegment", "emotion_dict.txt") emotion_dict = load_emotion_dict(emotion_dict_path) for danmaku in danmaku_list: rowId = danmaku.rowId if danmaku.content is not None: words = wordSegment(emotion_dict, danmaku.content) parse_dict[rowId] = words else: parse_dict[rowId] = None logging.info("parse dictionary has generated!") return parse_dict if __name__ == "__main__": # 测试代码 danmaku_list = getDataSource(constants.DATASOURCE) emotion_dict_path = os.path.join(FileUtil.get_project_root_path(), "WordSegment", "emotion_dict.txt") emotion_dict = load_emotion_dict(emotion_dict_path) for (key, value_set) in emotion_dict.items(): print key, u"\t", u"\t".join(value for value in value_set), u"\n" for danmaku in danmaku_list: if danmaku.content is None: continue words = wordSegment(emotion_dict, danmaku.content) for word in words: print word.content
class EmployeeService: dept = Department() employee = Employee() dept_service = DepartmentService() fileUtil = FileUtil("employee.txt") # # fileUtil.objects = employees # fileUtil.construct_file_headers("ID", "First Name","Last Name") # fileUtil.construct_file() db = DbUtil("employee.db") def __int__(self): print("Default: Employee Service") def check_date_of_birth(self, dob): date_format = "%Y-%m-%d" try: yy, mm, dd = str(dob).split("-") dob_entered = date(int(yy), int(mm), int(dd)) age = self.calculate_age(dob_entered) if age < 24: return False else: return True except ValueError: print(sys.exc_info()[1]) # dob_entered = datetime.datetime.strptime(dob, date_format) def calculate_age(self, dob): today = date.today() return today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day)) def create_employee_table(self): create_table_query = """ CREATE table employee( emp_id varchar(10) primary key, fname varchar(50) not null, lname varchar(50) not null, dob date, dept_id varchar(10), FOREIGN KEY (dept_id) REFERENCES department(dept_id) ); """ try: self.db.execute_query(create_table_query) except: print("Unable to create employee table \n") print(sys.exc_info()[1]) print("\n") def save_employee(self, emp: Employee): dob = emp.dob if self.check_date_of_birth(dob): try: self.db.execute_dynamic_query( "insert into employee (emp_id,fname,lname,dob,dept_id) values (?,?,?,?,?)", emp.emp_id, emp.fname, emp.lname, emp.dob, emp.dept.dept_id) self.db.connection.commit() print("Congrats : Employee - " + emp.fname + " details saved \n") except sqlite3.IntegrityError: print( "Sorry- Unable to save employee details. ID already exists \n" ) except: print( "Sorry- Unable to save employee details. Invalid Values entered \n" ) print(sys.exc_info()) else: print( "Oops !! - Employee - " + emp.fname + " too young to get registered. To register, you must be older than 24 years. 
\n" ) def fetch_all_employees(self): employees = [] query = "select * from employee" try: self.db.execute_query(query) query_result = self.db.fetch_all() for emp in query_result: self.employee = Employee() self.employee.emp_id = emp[0] self.employee.fname = emp[1] self.employee.lname = emp[2] self.employee.dob = emp[3] self.employee.dept = self.dept_service.fetch_department_by_id( emp[4]) employees.append(self.employee) print("Employee Details \n") print("******************************************\n") print("#Id" + "\t \t \t" + "DOB" + "\t\t\t" + "Department" + "\t\t\t" + "fname" + "\t\t\t\t" + "lname" + "\n") print( "**************************************************************************************************** \n" ) for emp in employees: print(emp.emp_id + "\t" + emp.dob + "\t" + emp.dept.dept_name + "\t\t\t\t\t" + emp.fname + " " + emp.lname + "\n") except: print("Unable to fetch all employees \n") print(sys.exc_info()[1]) print("\n") return employees def fetch_employee_by_id(self, eid: str): self.employee = Employee() try: self.db.execute_dynamic_query( "select * from employee where emp_id = ?", eid) query_result = self.db.fetch_one() self.employee.emp_id = query_result[0] self.employee.fname = query_result[1] self.employee.lname = query_result[2] self.employee.dob = query_result[3] self.employee.dept = self.dept_service.fetch_department_by_id( query_result[4]) print("#Id" + "\t \t \t" + "DOB" + "\t\t\t" + "Department" + "\t\t\t" + "fname" + "\t\t\t\t" + "lname" + "\n") print( "**************************************************************************************************** \n" ) print(self.employee.emp_id + "\t" + self.employee.dob + "\t" + self.employee.dept.dept_name + "\t\t\t\t\t" + self.employee.fname + " " + self.employee.lname + "\n") return self.employee except: print("Unable to fetch Employee Id - " + eid + " . Please enter correct employee ID.\n") print(sys.exc_info()[1]) print("\n") def delete_employee(self, eid): try: emp_to_be_deleted = self.fetch_employee_by_id(eid) self.db.execute_dynamic_query( "delete from employee where emp_id=?", eid) print("Successfully Deleted Employee - " + emp_to_be_deleted.fname + "\n") self.fetch_all_employees() except: print("Unable to delete Employee Id - " + eid + ". Check employee Id.\n") print(sys.exc_info()[1]) print("\n") def update_employee(self, emp: Employee): try: emp_result = self.fetch_employee_by_id(emp.emp_id) try: if emp.__eq__(emp_result): # print("Successfully Updated Employee- " + emp.emp_id + "\n") else: date_util = DateUtil() if date_util.check_date_of_birth(emp.dob): self.db.execute_dynamic_query( "update employee set fname = ?, lname =?, dob = ? , dept_id = ? where emp_id=?", emp.fname, emp.lname, emp.dob, emp.dept.dept_id, emp.emp_id) self.db.connection.commit() print("Successfully Updated Employee- " + emp.emp_id + "\n") else: print("Sorry!! Unable to update Employee- " + emp.emp_id + "\n") except AttributeError: if "fname" in str(sys.exc_info()[1]): print("First Name cannot be null") elif "lname" in str(sys.exc_info()[1]): print("Last Name cannot be null") elif "dob" in str(sys.exc_info()[1]): print("Date of birth is not in correct format") except ValueError: print(sys.exc_info()[1]) except: print("Unable to update employee Id - " + str(emp.emp_id) + ". Check employee Id.\n") print(sys.exc_info()) print("\n")
str(cid) + "-barrage-words.dict")) lda_model = models.TfidfModel.load(os.path.join(FileUtil.get_train_model_dir(), str(cid) + "-barrage-lda.model")) time_window_list = TimeWindow.gen_time_window_barrage_info(barrage_seg_list, cid) for time_window in time_window_list: time_window.gen_user_word_frequency() # 产生该时间窗口内的用户词频信息。 time_window.gen_user_topic_lda(dictionary, lda_model) # 产生该时间窗口内的用户所发词语的tfidf权重信息。 return time_window_list if __name__ == "__main__": barrage_file_path = "../../data/local/9.txt" # "../../data/local/9.txt" "../../data/AlphaGo/bilibili/2016-03-09.txt" "../../data/local/2065063.txt" barrages = dataloader.get_barrage_from_txt_file(barrage_file_path) # barrages = dataloader.get_barrage_from_live_text_file(barrage_file_path) cid = FileUtil.get_cid_from_barrage_file_path(barrage_file_path) barrage_seg_list = wordseg.segment_barrages(barrages, cid) # time_window_list = TimeWindow.gen_time_window_barrage_info(barrage_seg_list, cid) # for time_window in time_window_list: # str_info = '' # for barrage_seg in time_window.barrage_or_seg_list: # for sentence_seg in barrage_seg.sentence_seg_list: # str_info += (sentence_seg.word + sentence_seg.flag + u"\t") # print str(time_window.time_window_index), u"\t", str(time_window.start_timestamp), u"\t",\ # str(time_window.end_timestamp), u"\t", str_info # time_window_list = TimeWindow.gen_user_word_frequency_by_time_window(barrage_seg_list) # with codecs.open(FileUtil.get_word_segment_result_file_path(cid), "wb", "utf-8") as output_file: # for time_window in time_window_list: # str_info = str(time_window.time_window_index) + u"\t" # for user_id, word_frequency in time_window.user_word_frequency_dict.items():