Beispiel #1
0
    def get_sample_meta(self, sample_id):
        """Fetch the stored record for ``sample_id`` and return it decoded.

        The id is stringified to form the ``db_content`` key and the stored
        value is handed to ``decode_sample_meta``.

        Returns:
            Whatever ``decode_sample_meta`` produces, or ``None`` when a
            ``KeyError`` is raised during the lookup/decoding.
        """
        try:
            raw_meta = self.db_content.Get(str(sample_id))
            meta = decode_sample_meta(raw_meta)
        except KeyError:
            meta = None
        return meta
Beispiel #2
0
    def get_categories_useinfo(self):
        """Count how many stored samples reference each category id.

        Returns:
            A pair ``(categories_useinfo, unknown_categories)``.  The first
            dict maps every id obtained from the three category levels to the
            number of samples carrying it (zero when unused).  The second
            dict maps ids found on samples but absent from the first dict to
            their sample count.
        """
        categories = self.get_categories()
        db_content = self.db_content

        # Seed every known id with a zero count.  Each level is iterated
        # through its ``~`` form -- presumably an inverse, id-keyed view;
        # confirm against the categories class.
        categories_useinfo = {}
        for level in (categories.categories_1,
                      categories.categories_2,
                      categories.categories_3):
            for known_id in ~level:
                categories_useinfo[known_id] = 0

        unknown_categories = {}
        for row in db_content.RangeIter():
            # Keys starting with "__" are skipped; they are not sample rows.
            if row[0].startswith("__"):
                continue

            (_, category_id, _, _, _, _, msgext) = decode_sample_meta(row[1])
            # Unpacked only so that a malformed record still raises here.
            (_, _, (_, _, _)) = msgext

            if category_id in categories_useinfo:
                categories_useinfo[category_id] += 1
            else:
                unknown_categories[category_id] = unknown_categories.get(category_id, 0) + 1

        return categories_useinfo, unknown_categories
Beispiel #3
0
    def get_bad_samples(self):
        """Partition all stored samples by the state of their content.

        Returns:
            A triple ``(none_samples, empty_samples, normal_samples)`` of
            ``(sample_id, url)`` lists: content is ``None``, content has
            length zero, and everything else, respectively.
        """
        none_samples, empty_samples, normal_samples = [], [], []

        for row in self.db_content.RangeIter():
            # Keys starting with "__" are skipped; they are not sample rows.
            if row[0].startswith("__"):
                continue
            (sample_id, _, _, _, _, url, msgext) = decode_sample_meta(row[1])
            (_, content, (_, _, _)) = msgext

            if content is None:
                bucket = none_samples
            elif len(content) == 0:
                bucket = empty_samples
            else:
                bucket = normal_samples
            bucket.append((sample_id, url))

        none_count = len(none_samples)
        empty_count = len(empty_samples)
        normal_count = len(normal_samples)
        # The leading figure is the grand total, normal samples included.
        summary = "Get %d bad samples. None: %d Empty: %d Normal: %d" % (none_count + empty_count + normal_count, none_count, empty_count, normal_count)
        logging.debug(Logger.debug(summary))

        return none_samples, empty_samples, normal_samples
Beispiel #4
0
    def refresh_content(self, timeout=None):
        """Re-download the page of every sample whose content is missing.

        Pass 1 scans ``self.samples.db_content`` and collects the samples
        whose content is ``None`` or has length zero.  Pass 2 requests each
        collected URL and rewrites the record with version ``"1"`` and:

        * the response text when the response is OK,
        * ``""`` when the server answered with a non-OK status,
        * ``None`` when the request itself failed.

        Args:
            timeout: forwarded to ``requests.get``.  The default ``None``
                keeps the previous behaviour (no timeout); pass a number of
                seconds to stop one unresponsive host from stalling the run.

        Only failures of the fetch are treated as "Connection failed".
        Errors raised while serialising or storing the record propagate to
        the caller instead of being reported as a connection problem, and
        ``KeyboardInterrupt``/``SystemExit`` are no longer swallowed.
        """
        db_content = self.samples.db_content

        # Pass 1: collect first, write later, so the database is not
        # modified while it is being iterated.
        pending = []
        rowidx = 0
        for row in db_content.RangeIter():
            # Keys starting with "__" are skipped; they are not sample rows.
            if row[0].startswith("__"):
                continue
            (sample_id, category, date, title, key, url, msgext) = decode_sample_meta(row[1])
            (version, content, (cat1, cat2, cat3)) = msgext

            if content is None:
                logging.debug(Logger.debug("content is None: sample_id %d" % (sample_id)))
                pending.append((sample_id, category, date, title, key, url, cat1, cat2, cat3))
            elif len(content) == 0:
                logging.debug(Logger.debug("len(conntent) == 0: sample_id %d" % (sample_id)))
                pending.append((sample_id, category, date, title, key, url, cat1, cat2, cat3))

            if rowidx % 100 == 0:
                logging.debug(Logger.debug("refresh content - %d" % (rowidx)))
            rowidx += 1

        # Pass 2: fetch each page and store the outcome.
        version = "1"
        for (sample_id, category, date, title, key, url, cat1, cat2, cat3) in pending:
            logging.debug(Logger.debug("--------------------------------"))
            logging.debug(Logger.debug("sample_id: %d url:%s" % (sample_id, url)))

            failure = None
            try:
                rsp = requests.get(url, timeout=timeout)
                if rsp.ok:
                    content = rsp.text
                else:
                    content = ""
                    failure = "Get page failed. status: %d sample_id: %d url: %s" % (rsp.status_code, sample_id, url)
            except Exception:
                # Best effort: one unreachable or malformed URL must not
                # abort the whole refresh.  ``Exception`` (rather than a bare
                # ``except``) lets Ctrl-C and interpreter exit through.
                content = None
                failure = "Connection failed. sample_id: %d url: %s" % (sample_id, url)

            msgext = (version, content, (cat1, cat2, cat3))
            sample_data = (sample_id, category, date, title, key, url, msgext)
            db_content.Put(str(sample_id), msgpack.dumps(sample_data))

            if failure is not None:
                logging.warning(Logger.warn(failure))
Beispiel #5
0
    def purge(self):
        """Delete unusable samples from ``self.samples.db_content``.

        Phase 1 removes every sample reported by ``get_bad_samples`` as
        having ``None`` or empty content, and logs the sample count left
        over according to ``get_total_samples``.

        Phase 2 scans the remaining rows and removes every sample whose
        first-level category is not in ``self.main_categories``, logging a
        per-``(cat1, cat2)`` tally first.  The "Remaining" figure logged at
        the end of phase 1 is not adjusted for phase 2.
        """
        samples = self.samples
        db_content = samples.db_content

        # ---- Phase 1: samples without usable content --------------------
        none_samples, empty_samples, _ = samples.get_bad_samples()
        none_ids = [sample_id for (sample_id, _url) in none_samples]

        logging.debug(Logger.debug("Purgging %d samples...." % (len(none_ids))))
        remaining = samples.get_total_samples()

        for sample_id in none_ids:
            db_content.Delete(str(sample_id))
            logging.debug(Logger.debug("Purge None content sample %d" % (sample_id)))
        remaining -= len(none_ids)

        for (sample_id, _url) in empty_samples:
            db_content.Delete(str(sample_id))
            logging.debug(Logger.debug("Purge empty content sample %d" % (sample_id)))
        remaining -= len(empty_samples)

        logging.debug(Logger.debug("Purge Done. Remaining %d samples." % (remaining)))

        # ---- Phase 2: samples filed under an unknown main category ------
        # Ids are collected during the scan and deleted afterwards, so the
        # database is not modified while it is being iterated.
        doomed_ids = []
        invalid_categories = {}
        for row in db_content.RangeIter():
            if row[0].startswith("__"):
                continue
            (sample_id, _, _, _, _, _, msgext) = decode_sample_meta(row[1])
            (_, _, (cat1, cat2, _)) = msgext

            if cat1 in self.main_categories:
                continue
            doomed_ids.append(sample_id)
            pair = (cat1, cat2)
            invalid_categories[pair] = invalid_categories.get(pair, 0) + 1

        for ((cat1, cat2), hits) in invalid_categories.items():
            logging.debug(Logger.debug("<I> <%s:%s::> %d" % (cat1, cat2, hits)))

        logging.debug(Logger.debug("Total invalid class samples %d in %d categories" % (len(doomed_ids), len(invalid_categories))))

        for sample_id in doomed_ids:
            db_content.Delete(str(sample_id))
        logging.debug(Logger.debug("Deleted %d invalid class samples." % (len(doomed_ids))))
Beispiel #6
0
    def query_by_id(self, sample_id):
        """Print a human-readable dump of one sample to stdout.

        First section: the record stored in ``self.db_content`` (id,
        category, key, url, date, title, full content), followed by the
        terms obtained by segmenting the content with the corpus vocabulary.
        Second section: the term map stored for the sample in the sample
        matrix database opened through ``self.tsm``.

        In both sections only terms used more than once are listed.  A
        missing key in either store is reported with a "not found" line
        instead of raising.  The sample matrix database handle is always
        closed.
        """
        # Each print below takes a single parenthesised argument, which
        # prints exactly the same text as the bare print statement form.
        try:
            raw_sample = self.db_content.Get(str(sample_id))
            (_, category, date, title, key, url, msgext) = decode_sample_meta(raw_sample)
            (_version, content, (_cat1, _cat2, _cat3)) = msgext

            print("sample id: %d" % (sample_id))
            print("category: %d" % (category))
            print("key: %s" % (key))
            print("url: %s" % (url))
            print("date: %s" % (date))
            print("title: %s" % (title))
            print("---------------- content ----------------")
            print("%s" % (content))

            total_terms, content_terms = self.corpus.vocabulary.seg_content(content)
            print("sample_terms: %d terms_count: %d" % (total_terms, len(content_terms)))
            for (term_id, uses) in sorted_dict_by_values(content_terms, reverse=True):
                if uses > 1:
                    term_text = self.corpus.vocabulary.get_term_text(term_id)
                    print("%s(%d): %d" % (term_text, term_id, uses))
        except KeyError:
            print("Sample %d not found in db_content." % (sample_id))

        db_sm = self.tsm.open_db_sm()
        try:
            packed_info = db_sm.Get(str(sample_id))
            (_category, _sample_terms, stored_terms) = msgpack.loads(packed_info)
            print("")
            print("---------------- keywords ----------------")
            print("")
            for (term_id, uses) in sorted_dict_by_values(stored_terms, reverse=True):
                if uses > 1:
                    term_text = self.corpus.vocabulary.get_term_text(term_id)
                    print("%s\t%d\t(id:%d)" % (term_text, uses, term_id))
        except KeyError:
            print("Sample %d not found in db_sm." % (sample_id))
        finally:
            self.tsm.close_db(db_sm)
Beispiel #7
0
    def rebuild_categories(self):
        """Recompute the category id of every sample from its category names.

        For each sample row the ``(cat1, cat2, cat3)`` names stored in the
        record are resolved (or registered) through
        ``create_or_get_category_id``; the record is re-serialised with that
        id and version ``"1"``, and the id is also pushed into the sample
        matrix via ``self.tsm.set_sample_category``.

        All record rewrites are queued in one ``leveldb.WriteBatch`` and
        written synchronously after the scan.  Finally the sample matrix and
        the categories are saved, and the categories are printed.
        """
        categories = self.get_categories()
        db_content = self.db_content

        pending_writes = leveldb.WriteBatch()
        for row in db_content.RangeIter():
            # Keys starting with "__" are skipped; they are not sample rows.
            if row[0].startswith("__"):
                continue
            (sample_id, _, date, title, key, url, msgext) = decode_sample_meta(row[1])
            (_, content, (cat1, cat2, cat3)) = msgext

            category_id = categories.create_or_get_category_id(cat1, cat2, cat3)

            refreshed = (sample_id, category_id, date, title, key, url,
                         ("1", content, (cat1, cat2, cat3)))
            pending_writes.Put(str(sample_id), msgpack.dumps(refreshed))

            self.tsm.set_sample_category(sample_id, category_id)

        db_content.Write(pending_writes, sync=True)

        self.tsm.save_sample_matrix(self.tsm.sm_matrix)

        categories.save_categories()
        categories.print_categories()
Beispiel #8
0
def export_samples_to_xls(samples, xls_file):
    """Write every sample in ``samples.db_content`` to an ``.xls`` workbook.

    One sheet named "negative opinions" is produced: a header row followed
    by one row per sample with category, date, the first two category
    names, title, key, url and content.

    ``None`` content is exported as an empty string, and content of
    32768 characters or more is cut to 32767 (presumably the spreadsheet
    cell size limit -- confirm).  Progress is logged every 100 rows.

    Args:
        samples: object exposing ``db_content.RangeIter()``.
        xls_file: destination handed to ``Workbook.save``.
    """
    max_cell_chars = 1024 * 32

    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet("negative opinions")
    headers = ('CATEGORY', 'DATE', 'CAT1', 'CAT2', 'TITLE', 'KEY', 'URL', 'CONTENT')
    for column, caption in enumerate(headers):
        sheet.write(0, column, caption)

    date_style = xlwt.XFStyle()
    date_style.num_format_str = 'YYYY.MM.DD'

    exported = 0
    for row in samples.db_content.RangeIter():
        # Keys starting with "__" are skipped; they are not sample rows.
        if row[0].startswith("__"):
            continue
        (sample_id, category, date, title, key, url, msgext) = decode_sample_meta(row[1])
        (_, content, (cat1, cat2, _)) = msgext

        if content is None:
            content = ""
        if len(content) >= max_cell_chars:
            content = content[:max_cell_chars - 1]

        sheet_row = exported + 1  # row 0 holds the headers
        sheet.write(sheet_row, 0, category)

        (y, m, d, h, mi, s) = date
        sheet.write(sheet_row, 1, datetime(y, m, d, h, mi, s), date_style)

        for column, value in enumerate((cat1, cat2, title, key, url, content), 2):
            sheet.write(sheet_row, column, value)

        if exported % 100 == 0:
            logging.debug(Logger.debug("[%d] %d %s" % (exported, sample_id, title)))
        exported += 1

    workbook.save(xls_file)
Beispiel #9
0
    def query_by_id(self, samples_positive, samples_unlabeled, sample_id):
        """Print a diagnostic dump of one unlabeled sample to stdout.

        First section: the record stored in ``samples_unlabeled.db_content``
        (id, category, key, url, date, title) and every term obtained by
        segmenting its content with ``self.vocabulary``.
        Second section: the term map stored for the sample in the unlabeled
        sample matrix database, each term annotated with the triple returned
        by ``calculate_term_positive_degree`` for the positive and unlabeled
        matrices.

        A missing key in either store is reported with a "not found" line
        instead of raising.  The sample matrix database handle is closed in
        a ``finally`` clause, so it is released even when the keyword
        section fails with something other than ``KeyError``.

        Args:
            samples_positive: sample set whose ``tsm`` supplies the positive
                term statistics.
            samples_unlabeled: sample set that holds the queried sample.
            sample_id: id of the sample to dump.
        """
        tsm_positive = samples_positive.tsm
        tsm_unlabeled = samples_unlabeled.tsm

        # Sensitive-word weights; every entry is currently disabled.
        sensitive_words = {
                ##u"立案":3.0,
                ##u"获刑":3.0,
                ##u"受贿":3.0,
                ##u"有期徒刑":3.0,
                ##u"宣判":3.0,
                ##u"审计":2.0,
                ##u"调查":2.0
                }

        sensitive_terms = self.transform_sensitive_terms(sensitive_words, self.vocabulary)

        # Each print below takes a single parenthesised argument, which
        # prints exactly the same text as the bare print statement form.
        try:
            sample_content = samples_unlabeled.db_content.Get(str(sample_id))

            (_, category, date, title, key, url, msgext) = decode_sample_meta(sample_content)
            (version, content, (cat1, cat2, cat3)) = msgext

            print("sample id: %d" % (sample_id))
            print("category: %d" % (category))
            print("key: %s" % (key))
            print("url: %s" % (url))
            print("date: %s" % (date))
            print("title: %s" % (title))
            print("---------------- content ----------------")

            sample_terms, term_map = self.vocabulary.seg_content(content)
            print("sample_terms: %d terms_count: %d" % (sample_terms, len(term_map)))
            terms_list = sorted_dict_by_values(term_map, reverse=True)
            for (term_id, term_used_in_sample) in terms_list:
                term_text = self.vocabulary.get_term_text(term_id)
                print("%s(%d): %d" % (term_text, term_id, term_used_in_sample))

        except KeyError:
            print("Sample %d not found in db_content." % (sample_id))

        db_sm = samples_unlabeled.tsm.open_db_sm()
        try:
            str_sample_info = db_sm.Get(str(sample_id))
            (category, sample_terms, term_map) = msgpack.loads(str_sample_info)
            print("")
            print("---------------- keywords ----------------")
            print("")
            terms = {}
            for term_id in term_map:
                term_text = self.vocabulary.get_term_text(term_id)
                term_used = term_map[term_id]
                (pd_word, speciality, popularity) = calculate_term_positive_degree(term_id, tsm_positive, tsm_unlabeled, sensitive_terms)
                terms[term_id] = (pd_word, speciality, popularity, term_used, term_text)

            terms_list = sorted_dict_by_values(terms, reverse=True)
            for (term_id, (pd_word, speciality, popularity, term_used, term_text)) in terms_list:
                print("%s\t%d\t[%.6f,%.6f,%.6f]\t(id:%d)" % (term_text, term_used, pd_word, speciality, popularity, term_id))

        except KeyError:
            print("Sample %d not found in db_sm." % (sample_id))

        finally:
            # Release the handle on every exit path, not only on success or
            # KeyError.
            samples_unlabeled.tsm.close_db(db_sm)
Beispiel #10
0
    def fix_categories(self):
        """Rewrite legacy ``(cat1, cat2)`` category names to the current ones.

        Every sample row in ``self.samples.db_content`` whose
        ``(cat1, cat2)`` pair is a key of the table below is stored again
        with the mapped pair; ``cat3`` and all other fields are kept as they
        are.  Each rewrite is logged.  Rows whose pair is not in the table
        are left untouched.
        """
        samples = self.samples
        # (old cat1, old cat2) -> (new cat1, new cat2)
        cat_map = {
            (u"触电伤害", u""): (u"安全生产", u"触电伤害"),
            (u"其他", u"触电伤害"): (u"安全生产", u"触电伤害"),
            (u"其他", u"意外伤害"): (u"安全生产", u"触电伤害"),
            (u"其他(环境污染)", u""): (u"安全生产", u"环境保护"),
            (u"其他(破坏电力设施)", u""): (u"安全生产", u"外力破坏"),
            (u"其他", u"道路破坏"): (u"安全生产", u"外力破坏"),
            (u"其他", u"噪音污染"): (u"安全生产", u"环境保护"),
            (u"其他", u"环境污染"): (u"安全生产", u"环境保护"),
            (u"其他", u"外力破坏"): (u"安全生产", u"外力破坏"),
            (u"其他", u"破坏道路"): (u"安全生产", u"外力破坏"),
            (u"其他(交通安全)", u""): (u"安全生产", u"交通安全"),
            (u"其他", u"肇事逃逸"): (u"安全生产", u"交通安全"),
            (u"其他", u"酒驾致伤"): (u"安全生产", u"交通安全"),
            (u"其他(树障清除)", u""): (u"安全生产", u"隐患治理"),
            (u"其他", u"树障清理"): (u"安全生产", u"隐患治理"),
            (u"其他", u"树障清除"): (u"安全生产", u"隐患治理"),
            (u"雾霾", u""): (u"安全生产", u"隐患治理"),
            (u"安全供电", u""): (u"安全生产", u"违章作业"),
            (u"智能电网", u""): (u"电网建设", u"智能电网"),
            (u"新能源并网", u""): (u"电网建设", u"新能源并网"),
            (u"特高压", u""): (u"经营管理", u"特高压"),
            (u"阶梯电价", u""): (u"电力改革", u"电价调整"),
            (u"农网改造", u""): (u"电力改革", u"农电改制"),
            (u"三集五大", u""): (u"电力改革", u"三集五大"),
            (u"其他(电农体制改革", u""): (u"电力改革", u"农电改制"),
            (u"农电改革", u""): (u"电力改革", u"农电改制"),
            (u"电价调整", u""): (u"电力改革", u"电价调整"),
            (u"工资福利", u""): (u"人资管理", u"工资福利"),
            (u"人资劳务", u""): (u"人资管理", u"人事劳务"),
            (u"人力资源", u""): (u"人资管理", u""),
            (u"(其他)人资管理", u""): (u"人资管理", u""),
            (u"人力资源", u"人事劳务"): (u"人资管理", u"人事劳务"),
            (u"会劳务", u""): (u"人资管理", u"人事劳务"),
            (u"其他", u"劳动纪律"): (u"人资管理", u"劳动纪律"),
            (u"其他", u"打人致伤"): (u"人资管理", u"劳动纪律"),
            (u"同工同酬", u""): (u"人资管理", u"同工同酬"),
            (u"作风建设", u""): (u"党建作风", u""),
            (u"其他", u"作风建设"): (u"党建作风", u""),
            (u"其他", u"舆情宣传"): (u"党建作风", u"新闻宣传"),
            (u"其他", u"新闻宣传"): (u"党建作风", u"新闻宣传"),
            (u"作风建设", u"法律纠纷"): (u"党建作风", u""),
            (u"信访纠纷", u""): (u"党建作风", u"腐败"),
            (u"腐败", u""): (u"党建作风", u"腐败"),
            (u"其他", u"公车私用"): (u"党建作风", u"八项规定"),
            (u"腐  败", u""): (u"党建作风", u"腐败"),
            (u"其他", u"借机敛财"): (u"党建作风", u"腐败"),
            (u"腐败", u"公车购置"): (u"党建作风", u"腐败"),
            (u"农网改造", u"违规收费"): (u"依法治企", u"违规收费"),
            (u"其他", u"强卖"): (u"依法治企", u"违规收费"),
            (u"电费电表", u"违规收费"): (u"依法治企", u"违规收费"),
            (u"其他", u"违规收费"): (u"依法治企", u"违规收费"),
            (u"其他(违规收费)", u""): (u"依法治企", u"违规收费"),
            (u"其他(乱收费)", u""): (u"依法治企", u"违规收费"),
            (u"其他", u"违规建房"): (u"依法治企", u"违规建房"),
            (u"其他", u"违规电器"): (u"依法治企", u"违规供电"),
            (u"其他", u"法律纠纷"): (u"依法治企", u"法律纠纷"),
            (u"其他(法律纠纷)", u""): (u"依法治企", u"法律纠纷"),
            (u"相关利益方", u""): (u"依法治企", u"审计业务"),
            (u"其他", u"财务审计"): (u"依法治企", u"审计业务"),
            (u"法律纠纷", u""): (u"依法治企", u"法律纠纷"),
            (u"电动汽车", u""): (u"业务拓展", u"电动汽车"),
            (u"国际业务", u""): (u"业务拓展", u"国际业务"),
            (u"风电消纳", u""): (u"业务拓展", u"产业"),
            (u"供电服务(三指定)", u""): (u"供电服务", u""),
            (u"智能电表", u""): (u"供电服务", u"电表"),
            (u"上海停电", u""): (u"供电服务", u"停电"),
            (u"其他(意外停电)", u""): (u"供电服务", u"停电"),
            (u"电价", u""): (u"供电服务", u"电价"),
            (u"其他", u"窃电"): (u"供电服务", u"偷电行为"),
            (u"电费电表", u""): (u"供电服务", u"电费"),
            (u"电表电费", u""): (u"供电服务", u"电表"),
            (u"营销服务", u"业务投诉"): (u"供电服务", u"业务投诉"),
            (u"营销服务", u"停电"): (u"供电服务", u"停电"),
            (u"营销服务", u"电费"): (u"供电服务", u"电费"),
            (u"营销服务", u"电价"): (u"供电服务", u"电价"),
            (u"营销服务", u"电表"): (u"供电服务", u"电表"),
        }

        for row in samples.db_content.RangeIter():
            # Keys starting with "__" are skipped; they are not sample rows.
            if row[0].startswith("__"):
                continue
            (sample_id, category, date, title, key, url, msgext) = decode_sample_meta(row[1])
            (version, content, (cat1, cat2, cat3)) = msgext

            remapped = cat_map.get((cat1, cat2))
            if remapped is None:
                continue

            new_cat1, new_cat2 = remapped
            updated_meta = (sample_id, category, date, title, key, url,
                            (version, content, (new_cat1, new_cat2, cat3)))
            samples.db_content.Put(str(sample_id), msgpack.dumps(updated_meta))
            logging.debug(Logger.debug("<%s:%s:%s:> -> <%s:%s:%s:>" % (cat1, cat2, cat3, new_cat1, new_cat2, cat3)))