Exemple #1
0
 def __init__(self, path):
     """Create empty id mappings and remember the data directory.

     Args:
         path: Stored unchanged on ``self.path`` for later use.
     """
     # Single-argument print(...) behaves the same on Python 2 and 3.
     print("Loading")
     self.tag2id = WordId()
     self.word2id = WordId()
     self.path = path
     self.parent_tag_finder = ParentTagger()
     print("Loading done")
Exemple #2
0
 def __init__(self, path):
     """Set up two fresh ``WordId`` mappings and a ``ParentTagger``.

     Args:
         path: Kept as ``self.path``; not read or validated here.
     """
     # print(...) with one argument is valid in both Python 2 and Python 3.
     print("Loading")
     self.tag2id = WordId()
     self.word2id = WordId()
     self.path = path
     self.parent_tag_finder = ParentTagger()
     print("Loading done")
Exemple #3
0
class TagWord(object):
    """Turn tagged text records into (word-id list, tag-id list) pairs.

    Records are read line by line from ``<path>/zhihu`` and every
    ``<path>/*.data`` file; ``<path>/topic_dict`` supplies the set of
    accepted tags for the non-zhihu sources.
    """

    def __init__(self, path):
        """Create empty id mappings and remember the data directory.

        Args:
            path: Directory containing ``zhihu``, ``*.data`` and
                ``topic_dict``.
        """
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Loading")
        self.tag2id = WordId()
        self.word2id = WordId()
        self.path = path
        self.parent_tag_finder = ParentTagger()
        print("Loading done")

    def _txt_tag_generator(self):
        """Yield ``(txt, tag_id_list)`` for every usable record.

        A record is skipped when it has no ``"tags"`` key. For sources
        whose path does not contain ``"zhihu"``, tags are first reduced to
        those present in ``topic_dict`` and absent from
        ``banned_tag_list``; records left with no tags are skipped too.
        Parent tags reported by ``self.parent_tag_finder`` are appended
        before the tags are converted with ``self.tag2id``.

        Yields:
            tuple: ``(data["txt"], id_list)`` where ``id_list`` is the
            result of ``tag2id.id_list_by_word_list``.
        """
        path = self.path
        tag2id = self.tag2id
        # The zhihu file is always processed first, followed by *.data files.
        sources = [join(path, "zhihu")]
        sources.extend(glob(join(path, "*.data")))

        print("Processing...")
        # Context manager so the handle is released once the dict is parsed.
        with open(join(path, "topic_dict")) as topic_file:
            topic_dict = loads(topic_file.read())

        for data_src in sources:
            print("Processing... %s" % data_src)
            # NOTE(review): substring test on the full path, so a directory
            # named "...zhihu..." disables filtering for every file -- confirm
            # this is intended.
            needs_filter = "zhihu" not in data_src
            with open(data_src) as f:
                for line in f:
                    data = loads(line)
                    if "tags" not in data:
                        continue
                    tags = data["tags"]

                    if needs_filter:
                        tags = [
                            tag for tag in tags
                            if tag in topic_dict and tag not in banned_tag_list
                        ]
                        if not tags:
                            continue

                    # Look up the parent tags and add them to the record's tags.
                    parent_list = self.parent_tag_finder.get_parent_tag_list_by_list(tags)
                    tags.extend(parent_list)
                    yield data["txt"], tag2id.id_list_by_word_list(tags)

    def txt_tag_generator(self):
        """Yield ``(word_id_list, tag_id_list)`` for every usable record.

        The text is lower-cased, segmented with ``seg_txt``, and tokens made
        up only of digits are dropped before conversion with
        ``self.word2id``.
        """
        word2id = self.word2id
        for txt, tag_ids in self._txt_tag_generator():
            words = [w for w in seg_txt(str(txt).lower()) if not w.isdigit()]
            yield word2id.id_list_by_word_list(words), tag_ids

    def tofile(self):
        """Write ``tag2id``, ``word2id`` and ``word_id2tag_id`` under ``DATA_DIR``."""
        # Exhaust the generator before saving the mappings; presumably
        # id_list_by_word_list assigns ids on demand -- TODO confirm.
        word_id2tag_id = list(self.txt_tag_generator())
        path = DATA_DIR
        self.tag2id.tofile(join(path, "tag2id"))
        self.word2id.tofile(join(path, "word2id"))
        tofile(join(path, "word_id2tag_id"), word_id2tag_id)
Exemple #4
0
class TagWord(object):
    """Convert tagged text records into word-id / tag-id list pairs.

    Input comes from ``<path>/zhihu`` plus every ``<path>/*.data`` file,
    one record per line; ``<path>/topic_dict`` lists the tags accepted
    from the non-zhihu sources.
    """

    def __init__(self, path):
        """Set up fresh id mappings and store the data directory.

        Args:
            path: Directory holding ``zhihu``, ``*.data`` and ``topic_dict``.
        """
        # print(...) with one argument is valid in both Python 2 and Python 3.
        print('Loading')
        self.tag2id = WordId()
        self.word2id = WordId()
        self.path = path
        self.parent_tag_finder = ParentTagger()
        print('Loading done')

    def _txt_tag_generator(self):
        """Yield ``(txt, tag_id_list)`` for each record that keeps any tag.

        Records without a ``'tags'`` key are skipped. When the source path
        does not contain ``'zhihu'``, only tags found in ``topic_dict`` and
        not in ``banned_tag_list`` are kept, and a record with none left is
        skipped. Parent tags from ``self.parent_tag_finder`` are appended,
        then the tags are converted through ``self.tag2id``.

        Yields:
            tuple: ``(data['txt'], id_list)`` with ``id_list`` produced by
            ``tag2id.id_list_by_word_list``.
        """
        path = self.path
        tag2id = self.tag2id
        # zhihu comes first; the *.data files follow in glob order.
        sources = [join(path, 'zhihu')]
        sources.extend(glob(join(path, '*.data')))

        print('Processing...')
        # Use a with-block so the topic_dict handle does not stay open.
        with open(join(path, 'topic_dict')) as topic_file:
            topic_dict = loads(topic_file.read())

        for data_src in sources:
            print('Processing... %s' % data_src)
            # NOTE(review): this matches 'zhihu' anywhere in the full path,
            # including the directory part -- verify that is intended.
            needs_filter = 'zhihu' not in data_src
            with open(data_src) as f:
                for line in f:
                    data = loads(line)
                    if 'tags' not in data:
                        continue
                    tags = data['tags']

                    if needs_filter:
                        tags = [
                            tag for tag in tags
                            if tag in topic_dict and tag not in banned_tag_list
                        ]
                        if not tags:
                            continue

                    # Find the parent tags and append them to this record's tags.
                    parent_list = self.parent_tag_finder.get_parent_tag_list_by_list(tags)
                    tags.extend(parent_list)
                    yield data['txt'], tag2id.id_list_by_word_list(tags)

    def txt_tag_generator(self):
        """Yield ``(word_id_list, tag_id_list)`` for each usable record.

        Text is lower-cased and split with ``seg_txt``; all-digit tokens are
        discarded before the words are converted through ``self.word2id``.
        """
        word2id = self.word2id
        for txt, tag_ids in self._txt_tag_generator():
            words = [w for w in seg_txt(str(txt).lower()) if not w.isdigit()]
            yield word2id.id_list_by_word_list(words), tag_ids

    def tofile(self):
        """Save ``tag2id``, ``word2id`` and ``word_id2tag_id`` into ``DATA_DIR``."""
        # The generator is drained before the mappings are written;
        # presumably ids are allocated during iteration -- TODO confirm.
        word_id2tag_id = list(self.txt_tag_generator())
        path = DATA_DIR
        self.tag2id.tofile(join(path, 'tag2id'))
        self.word2id.tofile(join(path, 'word2id'))
        tofile(join(path, 'word_id2tag_id'), word_id2tag_id)