# Imports below are an assumption; the original file's import block is not shown.
from glob import glob
from os.path import join
from json import loads  # the project may use a faster JSON parser such as ujson
# WordId, ParentTagger, seg_txt, tofile, DATA_DIR and banned_tag_list are
# project-level helpers assumed to be defined elsewhere in the code base.


class TagWord(object):
    def __init__(self, path):
        print "Loading"
        self.tag2id = WordId()
        self.word2id = WordId()
        self.path = path
        self.parent_tag_finder = ParentTagger()
        print "Loading done"

    def _txt_tag_generator(self):
        """Yield (raw text, tag id list) pairs from the zhihu dump and *.data files."""
        path = self.path
        tag2id = self.tag2id
        data_files = glob(join(path, "*.data"))
        zhihu_data = [join(path, "zhihu")]
        zhihu_data.extend(data_files)
        print "Processing..."
        with open(join(path, "topic_dict")) as g:
            topic_dict = loads(g.read())
        count = 0
        for data_src in zhihu_data:
            print "Processing...", data_src
            with open(data_src) as f:
                for line in f:
                    # Debug throttle: uncomment to stop after ~1000 records.
                    # if count > 1000:
                    #     break
                    # count += 1
                    data = loads(line)
                    if "tags" in data:
                        tags = data["tags"]
                    else:
                        continue
                    tags_processed = []
                    if "zhihu" not in data_src:
                        # For non-zhihu sources, keep only tags that are known
                        # topics and not on the banned list.
                        for tag in tags:
                            if tag in topic_dict and tag not in banned_tag_list:
                                tags_processed.append(tag)
                        if not tags_processed:
                            continue
                        else:
                            tags = tags_processed
                    # print tags
                    # raw_input()
                    # Look up parent tags and append them to the tag list.
                    parent_list = self.parent_tag_finder.get_parent_tag_list_by_list(tags)
                    tags.extend(parent_list)
                    id_list = tag2id.id_list_by_word_list(tags)
                    yield data["txt"], id_list

    def txt_tag_generator(self):
        """Yield (word id list, tag id list) pairs, dropping purely numeric tokens."""
        word2id = self.word2id
        for k, v in self._txt_tag_generator():
            words = [i for i in list(seg_txt(str(k).lower())) if not i.isdigit()]
            yield word2id.id_list_by_word_list(words), v

    def tofile(self):
        """Materialize the generator and dump the vocabularies and pairs to DATA_DIR."""
        word_id2tag_id = list(self.txt_tag_generator())
        path = DATA_DIR
        self.tag2id.tofile(join(path, "tag2id"))
        self.word2id.tofile(join(path, "word2id"))
        tofile(join(path, "word_id2tag_id"), word_id2tag_id)
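# Usage sketch (an assumption, not shown above): point TagWord at the directory
# that holds the "zhihu" dump, the "*.data" files and "topic_dict", then call
# tofile() to write tag2id, word2id and word_id2tag_id under DATA_DIR.
if __name__ == "__main__":
    tag_word = TagWord(DATA_DIR)
    tag_word.tofile()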