def count_vote_dist():
    db_inst = get_db_inst('AmazonReviews', 'AndroidAPP')
    delta = 2
    x_list = []
    y_list = []
    xx = []
    for i in range(1000):
        x_list.append((i * delta, (i + 1) * delta))
        pass
    for tu in x_list:
        try:
            # y_list.append(math.log(db_inst.find({"total_vote": {"$gt": tu[0], "$lt": tu[1]}}).count(), 10))
            y_list.append(db_inst.find({"total_vote": {"$gte": tu[0], "$lt": tu[1]}}).count())

            xx.append(tu[0])
            print y_list[-1]
        except:
            xx.append(tu[0])
            y_list.append(0)
    # y_list.append(math.log(db_inst.find({"total_vote": {"$gt": x_list[-1][1]}}).count(), 10))
    y_list.append(db_inst.find({"total_vote": {"$gt": x_list[-1][1]}}).count())
    xx.append(xx[-1] + 1)
    res = {"x": x_list, 'y': y_list}
    open('%s/data/amazon_data/%s' % (PROJECT_PATH, 'vote_counts.json'), 'w').write(json.dumps(res))
    # plt.plot(xx, y_list)
    # plt.grid()
    # plt.show()
    sns.distplot(y_list)
    plt.show()
def structure_data():
    """
    将db中的数据进行结构化处理
    :return:
    """
    db_sp = get_db_inst('ProjectNavi', 'ScenicPoins')
    # find_result = db_sp.find({"title": "中国地质博物馆"})
    find_result = db_sp.find()
    for item in find_result:
        title = item['title']
        city = item['city']
        addr = item['address']
        tmp = city + title + addr
        md5 = hashlib.md5(tmp).hexdigest()
        open_time = item['infos'].get('open_time', None)
        structured_open_time = analyze_open_time(open_time)
        visit_time = item['infos'].get('visit_time', None)
        structured_visit_time = analyze_visit_time(visit_time)
        print title
        print open_time, structured_open_time
        print visit_time, structured_visit_time
        structured_data = {'open_time': {"start": structured_open_time[0], 'end': structured_open_time[1]},
                           'visit_time': structured_visit_time}
        item['structured_infos'] = structured_data
        item['md5'] = md5
        try:
            db_sp.update({'title': title, 'city': city}, item)
        except Exception, e:
            print e
# NOTE(review): the two lines below ("Esempio n. 3" / "0") appear to be
# pasted extraction artifacts, not Python — commented out so the file parses.
# Esempio n. 3
# 0
def _create_scenic_point_by_title(title):
    """Build a ScenicPoint instance from the stored document matching *title*.

    :param title: scenic point title used as the lookup key
    :return: a ScenicPoint wrapping the first matching document
    """
    collection = get_db_inst('ProjectNavi', 'ScenicPoins')
    document = collection.find_one({'title': title})
    return ScenicPoint(document)
def handle_amazon_result(fin_path):
    with open(fin_path, 'r') as fin:
        itemlist = []
        for line in fin:
            splits = line.split('\t')
            item_id = splits[0].replace('itemID: ', '')
            total_reviews = eval(splits[1].replace('total reviews: ', ''))
            oprank_errors = eval(splits[2].replace('oprank_errors: ', ''))
            textrank_errors = eval(splits[3].replace('textrank_errors: ', ''))
            sum_oprank_errors = eval(splits[4].replace('sum_oprank_errors: ', ''))
            sum_textrank_errors = eval(splits[5].replace('sum_textrank_errors: ', ''))
            itemlist.append({'item_id': item_id, 'total_reviews': total_reviews, 'oprank_errors': oprank_errors,
                             'textrank_errors': textrank_errors, 'sum_oprank_errors': sum_oprank_errors,
                             'sum_textrank_errors': sum_textrank_errors})
        # sortedlist = sorted(itemlist, cmp=lambda x, y: cmp(x['total_reviews'], y['total_reviews']))
        db_result = get_db_inst('AmazonReviews', 'AndroidAPP_result')
        for item in itemlist:
            db_result.insert({'itemID': item['item_id'], 'total_reviews': item['total_reviews'],
                              'oprank_errors': item['oprank_errors'],
                              'textrank_errors': item['textrank_errors']})
        print 'handled!'
def get_city_scenic_points(city_name):
    """
    从mafengwo爬取指定城市的所有景点信息并存入数据库
    :param city_name:
    :return:
    """
    db_sp = get_db_inst('ProjectNavi', 'ScenicPoins')
    for info in query_by_name(city_name):
        print info['name']
        url = info['url']
        print url
        try:
            scenic_info = get_details(url)
            scenic_info['city'] = city_name
            db_sp.insert(scenic_info)
            api_logger.info('%s %s inserted' % (city_name, scenic_info['title']))
        except Exception, e:
            print e
        t = random.random() * 2
        print 'sleep %s seconds' % t
        time.sleep(t)
def amazon_preprocess(start=0, end=10, label_rate=0.65, min_vote=0):
    """

    :param start:
    :param end:
    :param label_rate:
    :return:
    """
    # prepare train set
    db_inst = get_db_inst('AmazonReviews', 'AndroidAPP')
    # print len(db_inst.distinct('asin'))
    manager_groups = {}
    asin_file = open('%s/process/data/asin.list' % PROJECT_PATH, 'r')
    # for asin in db_inst.distinct('asin'):
    #     asin_file.write('%s\n' % asin)
    lines = asin_file.readlines()
    shuffle(lines)
    # for asin in db_inst.distinct('asin'):
    tlines = lines[start:end]
    review_dicts = {}
    asin_list = []
    for asin in tlines:
        asin = asin.replace('\n', '')
        asin_list.append(asin)
        print 'loading %s' % asin
        # snm.add_node(SentenceNode(splits[4], extra=int(ll)))

        # 计算每个APP下的评论
        a_reviews = []
        max_vote = 0  # 常量
        for find_item in db_inst.find({"asin": asin, 'total_vote': {"$gt": min_vote}}):
            max_vote = max(find_item['total_vote'], max_vote)
            a_reviews.append(find_item)
        # process item reviews VOTE RANK
        review_rank = []
        print '%s has %s reviews' % (asin, len(a_reviews))
        snm = SentenceNodeManager()
        for review in a_reviews:
            alpha_const = 0
            T = float(review['total_vote']) / max_vote
            V = 1 / (1.0 + math.exp(-0.01 * (2 * review['up_vote'] - review['total_vote'])))
            # V = float(review['up_vote']) / review['total_vote']
            vote_rank_value = 2 * (T + alpha_const) * (V + alpha_const) / (T + V + 2 * alpha_const)
            if vote_rank_value >= label_rate:
                snm.add_node(
                    SentenceNode(review['reviewText'].lower(), extra=(int(1), vote_rank_value, review['reviewerID']),
                                 get_pos_func=tag_sents,
                                 get_keywords_func=cal_en_tfidf))
            elif vote_rank_value < label_rate:
                snm.add_node(
                    SentenceNode(review['reviewText'].lower(), extra=(int(0), vote_rank_value, review['reviewerID']),
                                 get_pos_func=tag_sents,
                                 get_keywords_func=cal_en_tfidf))
            review_rank.append((review, vote_rank_value))
        manager_groups[asin] = snm
        review_dicts[asin] = review_rank
        # else:
        #     break
    veclist = []
    sentlist = []
    labellist = []
    tokenlist = []
    nodelist = []
    group_nodelist = []
    print 'start normalizing vecs'
    for pid in manager_groups.keys():
        manager = manager_groups[pid]
        # DBSCANcluster(manager, '%s_DBSCANcluster.json' % pid)
        # APcluster(manager, '%s_APcluster.json' % pid)
        manager.normalize_all_sentnodes(tfidf_func=tag_sents)
        veclist.extend(manager.get_vec_list())
        sentlist.extend(manager.get_sent_list())
        gnodelist = []
        for node in manager.node_list:
            labellist.append(node.extra[0])
            tokenlist.append(node.feature2token())
            nodelist.append(node)
            gnodelist.append(node)
        group_nodelist.append(gnodelist)
    print 'end normalizing vecs'
    return veclist, sentlist, labellist, tokenlist, nodelist, manager_groups
def save_list2mongo(item_list, db_name, collection_name):
    """Bulk-insert *item_list* into the given MongoDB collection.

    :param item_list: list of documents to insert
    :param db_name: target database name
    :param collection_name: target collection name
    :return: None
    """
    collection = get_db_inst(db_name, collection_name)
    collection.insert_many(item_list)