Beispiel #1
0
 def gen_start_urls():
     """
     Yield the Taobao item-page URL for every id in ITEM_ID_PATH that has no
     matching ``Item`` row yet.

     The file is read line by line and each non-blank line must hold one
     integer item id.

     :raises ValueError: if a non-blank line is not a valid integer
     """
     with open(ITEM_ID_PATH) as file:
         for line in file:
             line = line.strip()
             if not line:
                 # Blank lines (e.g. a trailing empty line) would make int()
                 # raise, so skip them instead of aborting the whole run.
                 continue
             item_id = int(line)
             # EXISTS query: only checks whether the item is already stored.
             query = session.query(Item).filter_by(id=item_id)
             if not session.query(query.exists()).scalar():
                 yield 'https://item.taobao.com/item.htm?id=' + str(item_id)
Beispiel #2
0
def create_train_test(train_pos_path=TRAIN_POS_PATH,
                      train_neg_path=TRAIN_NEG_PATH,
                      test_pos_path=TEST_POS_PATH,
                      test_neg_path=TEST_NEG_PATH):
    """
    Create training and test samples from the reviews returned by
    ``Review.filter_default`` that have non-empty content, keeping the
    number of positive and negative samples equal.

    Reviews for which ``Rate(rate).is_good`` is true are positive, the rest
    negative. Both classes are cut down to the size of the smaller one, then
    split 80% / 20% into train and test and written as UTF-8 text.

    :param train_pos_path: output file for positive training samples
    :param train_neg_path: output file for negative training samples
    :param test_pos_path: output file for positive test samples
    :param test_neg_path: output file for negative test samples
    """
    # Local import so this function does not depend on the module's
    # existing import list.
    from random import sample

    pos, neg = [], []
    for content, rate in Review.filter_default(
            session.query(Review.content,
                          Review.rate).filter(Review.content != '')):
        if Rate(rate).is_good:
            pos.append(content)
        else:
            neg.append(content)

    size = min(len(pos), len(neg))
    size_train = int(size * 0.8)
    # Sample WITHOUT replacement: choices() draws with replacement, which
    # duplicates reviews and lets the same review appear in both the
    # training and the test split.
    pos = sample(pos, size)
    neg = sample(neg, size)

    for data, path in ((pos[:size_train], train_pos_path), (neg[:size_train],
                                                            train_neg_path),
                       (pos[size_train:], test_pos_path), (neg[size_train:],
                                                           test_neg_path)):
        with codecs.open(path, 'w', 'utf-8') as file:
            # NOTE(review): writelines() adds no separators, so reviews are
            # written back to back — confirm the reader expects that.
            file.writelines(data)
Beispiel #3
0
def main():
    """
    Run ``eval_classify`` on every review returned by
    ``Review.filter_default`` and print the running totals and the accuracy
    after every 100 reviews.
    """
    reviews = Review.filter_default(session.query(Review))
    for processed, review in enumerate(reviews, start=1):
        eval_classify(review.content, review.rate)

        # Only report once per full batch of 100 reviews.
        if processed % 100 != 0:
            continue

        print('total_contents =', total_contents)
        print('total_correct =', total_correct)
        print('correct_rate =', total_correct / total_contents)
    def start(self):
        """
        Interactively label reviews as useful / not useful.

        For every item that has at least one review whose ``is_useful`` is
        still ``None``, the rate-over-time plot is drawn and each unlabeled
        review is printed to the console. The method then waits for the
        'y' or 'n' key, stores the answer in ``review.is_useful`` and
        commits it. Returns early as soon as ``self.stop`` is truthy.
        """
        print('认为有用则按Y,认为没用则按N:\n')
        # Keep matplotlib from blocking (interactive mode)
        plt.ion()

        for item in (session.query(Item).filter(
                Item.reviews.any(Review.is_useful.is_(None)))):
            # Draw the review-count-over-time plot
            dates, good_bars, bad_bars = draw_plot.draw_rate_time_plot(
                item.reviews)

            for review in item.reviews:
                # Skip reviews that are already labeled
                if review.is_useful is not None:
                    continue
                # if review.is_default():
                #     review.is_useful = False
                #     session.commit()
                #     continue

                # Show the review
                print('用户信用等级:', review.user_rank)
                try:
                    print('评价:', Rate(review.rate).name)
                except ValueError:
                    # The rate value is not a member of Rate; show it raw
                    print('评价: 未知({})'.format(review.rate))
                print('内容:', review.content)
                if review.appends:
                    print('追评:', review.appends)
                print('有图片' if review.has_photo else '无图片')
                cur_date_bar = original_color = None
                if review.date is not None:
                    print('时间:', review.date.isoformat())
                    # Day offset from dates[0] selects this review's bar;
                    # presumably dates[0] is the first day on the plot —
                    # TODO confirm against draw_rate_time_plot
                    index = (review.date.date() - dates[0]).days
                    cur_date_bar = (good_bars[index]
                                    if review.is_good else bad_bars[index])
                    # Highlight the bar in red, remembering its color so it
                    # can be restored after labeling
                    original_color = cur_date_bar.get_facecolor()
                    cur_date_bar.set_color('r')
                else:
                    print('时间: 未知')
                plt.show()

                # Read whether the review is useful
                self._pressed_key = ''
                # Presumably a key-press handler elsewhere sets
                # self._pressed_key and stops the event loop — verify
                while self._pressed_key not in ('y', 'n'):
                    self._canvas.start_event_loop()
                    if self.stop:
                        return
                print(self._pressed_key)
                print('')
                review.is_useful = self._pressed_key == 'y'
                session.commit()

                # Undo the highlight
                if cur_date_bar is not None:
                    cur_date_bar.set_color(original_color)

            # Clear the axes before the next item's plot
            plt.cla()
Beispiel #5
0
 def start_requests(self):
     """
     Yield one sold-quantity request for every ``Item`` whose ``sold_count``
     is still NULL.

     Each request carries the item's page URL as its ``referer`` header and
     is sent with ``dont_filter=True``.
     """
     url_template = ('https://detailskip.taobao.com/service/getData/1/p1'
                     '/item/detail/sib.htm?itemId={}&sellerId={}&modules'
                     '=soldQuantity&callback=onSibRequestSuccess')
     pending_items = session.query(Item).filter(Item.sold_count.is_(None))

     for item in pending_items:
         url = url_template.format(item.id, item.shop.seller_id)
         referer = 'https://item.taobao.com/item.htm?id=' + str(item.id)
         yield Request(url, dont_filter=True, headers={'referer': referer})
Beispiel #6
0
    def parse(self, response):
        """
        Store the sold and confirmed counts from a sold-quantity response.

        The item id is taken from the ``itemId`` query parameter of
        ``response.url``. Nothing is written when the URL has no item id or
        when no ``Item`` row with that id exists.

        :param response: response with ``text`` and ``url`` attributes
        :raises ValueError: if the response body contains no valid JSON
        :raises KeyError: if the expected keys are missing from the payload
        """
        # Keep only the outermost {...}; this drops anything wrapped around
        # the JSON object (e.g. a callback wrapper).
        text = response.text
        data = json.loads(text[text.find('{'):text.rfind('}') + 1])
        sold_quantity = data['data']['soldQuantity']

        match = re.search(r'itemId=(\d+)', response.url)
        if not match:
            return
        item = session.query(Item).filter_by(id=int(match[1])).first()
        if item is None:
            # first() returns None when no row matches; assigning to it
            # would raise AttributeError.
            return

        item.sold_count = sold_quantity['soldTotalCount']
        item.confirm_count = sold_quantity['confirmGoodsCount']
        session.commit()
Beispiel #7
0
    def parse_shop(self):
        """
        Read seller, shop and item data from the current page and add them
        to the session.

        The data comes from the page's ``g_config.idata`` object. A
        ``Seller`` and a ``Shop`` are added only if no row with the same id
        exists; an ``Item`` is always added. ``self.item_id`` is set to the
        item's id. Nothing is committed here.

        :return: True on success, False if anything failed (the error is
                 logged)
        """
        try:
            # Seller
            data = self.driver.execute_script('return g_config.idata')
            seller_id = int(data['seller']['id'])
            query = session.query(Seller).filter_by(id=seller_id)
            if not session.query(query.exists()).scalar():
                session.add(Seller(id=seller_id,
                                   age=data['seller']['shopAge']))

            # Shop
            shop_id = int(data['shop']['id'])
            query = session.query(Shop).filter_by(id=shop_id)
            if not session.query(query.exists()).scalar():
                session.add(
                    Shop(id=shop_id,
                         url=data['shop']['url'],
                         seller_id=seller_id))

            # Item
            self.item_id = int(data['item']['id'])
            sell_counter = self.driver.find_element_by_css_selector(
                'div.tb-sell-counter a')
            # The two counts are parsed out of the counter's title text;
            # both fall back to 0 when the title does not match.
            match = re.search(r'售出(\d+)件.*?成功(\d+)件',
                              sell_counter.get_attribute('title'))
            session.add(
                Item(id=self.item_id,
                     title=data['item']['title'],
                     shop_id=shop_id,
                     sold_count=int(match[1]) if match is not None else 0,
                     confirm_count=int(match[2]) if match is not None else 0))

        except Exception:
            # Not a bare except: SystemExit and KeyboardInterrupt must
            # propagate so the process can still be stopped.
            self.logger.exception('解析商店时出错:')
            return False

        return True
def create_train_test(train_pos_path=TRAIN_POS_PATH,
                      train_neg_path=TRAIN_NEG_PATH,
                      test_pos_path=TEST_POS_PATH,
                      test_neg_path=TEST_NEG_PATH):
    """
    Create training and test samples from labeled reviews, keeping the
    number of positive and negative samples equal.

    Each sample is a feature list built from one review that has a label
    (``is_useful``) and a known date. Useful reviews are positive, the rest
    negative. Both classes are cut down to the size of the smaller one,
    split 80% / 20% into train and test, and pickled to the given paths.

    :param train_pos_path: output file for positive training samples
    :param train_neg_path: output file for negative training samples
    :param test_pos_path: output file for positive test samples
    :param test_neg_path: output file for negative test samples
    """
    # Local import so this function does not depend on the module's
    # existing import list.
    from random import sample

    pos, neg = [], []
    for item in (session.query(Item).filter(
            Item.reviews.any(Review.is_useful.isnot(None)))):
        diffs = get_diffs(item.reviews)
        if not diffs:
            continue

        # diffs is paired with item.reviews by position.
        for review, diff in zip(item.reviews, diffs):
            if (review.is_useful is None  # not labeled yet
                    or review.date is None  # unknown date
                ):
                continue

            features = [
                review.user_rank,  # user credit rank
                len(review.content) + len(review.appends),  # review length
                review.has_photo,  # whether the review has a photo
                bool(review.appends),  # whether there is a follow-up review
                diff,  # difference in review count
            ]

            if review.is_useful:
                pos.append(features)
            else:
                neg.append(features)

    size = min(len(pos), len(neg))
    size_train = int(size * 0.8)
    # Sample WITHOUT replacement: choices() draws with replacement, which
    # duplicates samples and lets the same sample appear in both the
    # training and the test split.
    pos = sample(pos, size)
    neg = sample(neg, size)

    for data, path in ((pos[:size_train], train_pos_path), (neg[:size_train],
                                                            train_neg_path),
                       (pos[size_train:], test_pos_path), (neg[size_train:],
                                                           test_neg_path)):
        with open(path, 'wb') as file:
            pickle.dump(data, file)
def draw_plot_per_item(draw_func, plots_dir=PLOTS_DIR):
    """
    Draw one plot per item and save it as a PNG file.

    The file is named "<item id> <item title>.png" after passing through
    ``replace_illegal_chars``. Items whose image file already exists are
    skipped.

    :param draw_func: plotting function; called with the item's reviews
    :param plots_dir: directory the images are saved to
    """
    for item in session.query(Item):
        print(item.id, item.title)

        image_name = replace_illegal_chars(
            '{} {}.png'.format(item.id, item.title))
        image_path = '/'.join((plots_dir, image_name))

        # Only draw when the image has not been produced before.
        if not exists(image_path):
            draw_func(item.reviews)
            plt.savefig(image_path)
            plt.cla()