コード例 #1
0
ファイル: analyseresult.py プロジェクト: yuantuo/DataExample
    def start(self):
        """Run the interactive command prompt until the user types "exit".

        Each pass prints the command summary and reads one line with
        ``raw_input``:

        * ``exit`` -- stop prompting and return.
        * ``show`` -- ask a second question, then print the result of
          ``getAllproducts()`` (answer "1") or ``getAllcategories()``
          (answer "2") on a ``Myhbase('crawler')`` instance.
        * ``linear <category name>`` -- pass the text after the first
          whitespace character to ``self.lineargraph``.
        * ``coef`` -- call ``self.coef()``.

        Anything else is ignored and the prompt is shown again.
        """
        while True:
            print('Commands: "show": display all category, "linear <category name>": display linear graph, "exit": exit programme, "coef": Show trening again product stats')
            command = raw_input('Enter input:')

            if command == 'exit':
                return

            if command == 'show':
                choice = raw_input('Enter 1 for products, 2 for categories:')

                # The table wrapper is created before the answer is inspected,
                # so it is opened even when the answer is neither "1" nor "2".
                store = Myhbase('crawler')

                if choice == '1':
                    print(store.getAllproducts())
                elif choice == '2':
                    print(store.getAllcategories())
                continue

            # One match both recognises the command and captures its argument.
            linear = re.match(r'^linear\s(.+)$', command)
            if linear is not None:
                self.lineargraph(linear.group(1))
            elif command == 'coef':
                self.coef()
コード例 #2
0
    def start(self):
        """Prompt for commands in a loop; return once "exit" is entered.

        Supported input (read through ``raw_input``):

        * ``exit`` -- leave the loop.
        * ``show`` -- prompt again and print ``getAllproducts()`` for "1" or
          ``getAllcategories()`` for "2", both called on
          ``Myhbase('crawler')``.
        * ``linear <category name>`` -- hand the captured name to
          ``self.lineargraph``.
        * ``coef`` -- run ``self.coef()``.

        Unrecognised input falls through and the menu is printed again.
        """
        menu = (
            'Commands: "show": display all category, "linear <category name>": display linear graph, "exit": exit programme, "coef": Show trening again product stats'
        )

        running = True
        while running:
            print(menu)
            entered = raw_input('Enter input:')

            if entered == 'exit':
                running = False
            elif entered == 'show':
                answer = raw_input(
                    'Enter 1 for products, 2 for categories:')

                # Opened unconditionally, before the answer is checked.
                crawler_table = Myhbase('crawler')

                if answer == '1':
                    print(crawler_table.getAllproducts())
                elif answer == '2':
                    print(crawler_table.getAllcategories())
            else:
                # A single match recognises the command and captures the name.
                linear_match = re.match(r'^linear\s(.+)$', entered)
                if linear_match is not None:
                    self.lineargraph(linear_match.group(1))
                elif entered == 'coef':
                    self.coef()
コード例 #3
0
ファイル: sparkstream.py プロジェクト: yuantuo/DataExample
class Sparksteam(Myredis, Myhbase):
    """Read a Kafka topic through Spark Streaming and store records in HBase.

    All storage calls go through the ``Myhbase`` and ``Myredis`` helper
    instances created in ``__init__`` (``self.hbase`` / ``self.redis``).
    """

    # NOTE(review): the class inherits from Myredis and Myhbase but never
    # calls their __init__; only the composed helpers are used. The bases are
    # kept so existing isinstance checks keep working -- confirm if needed.

    def __init__(self, zkQuorum, topic, hbtable):
        """Remember the stream settings and create the storage helpers.

        Args:
            zkQuorum: value handed to ``KafkaUtils.createStream`` as the
                ZooKeeper quorum.
            topic: Kafka topic to consume; also decides which save handler
                ``start`` registers.
            hbtable: argument forwarded to ``Myhbase``.
        """
        self.zkQuorum = zkQuorum
        self.topic = topic
        self.hbase = Myhbase(hbtable)
        self.redis = Myredis()

    def start(self):
        """Create the streaming context, wire the handler and block forever.

        ``NOTHS-crawler-topic`` is routed to ``save_crawler_hbase`` and
        ``NOTHS-trends-topic`` to ``save_trends_hbase``. Any other topic gets
        no save handler; its batches are only pretty-printed.
        """
        sc = SparkContext(appName="PythonStreamingNOTHS")
        ssc = StreamingContext(sc, 10)

        kvs = KafkaUtils.createStream(ssc, self.zkQuorum, "spark-streaming-consumer", {self.topic: 1})
        # pprint() registers an output operation and returns None, so it is
        # called for its effect instead of being passed to print().
        kvs.pprint()

        # The topic lives on the instance; the bare name ``topic`` is not
        # defined in this scope and raised NameError.
        if self.topic == 'NOTHS-crawler-topic':
            kvs.foreachRDD(self.save_crawler_hbase)
        elif self.topic == 'NOTHS-trends-topic':
            kvs.foreachRDD(self.save_trends_hbase)

        ssc.start()
        ssc.awaitTermination()

    def save_trends_hbase(self, time, rdd):
        """Store every record of one batch with ``self.hbase.save_trend``.

        Args:
            time: batch time supplied by ``foreachRDD`` (unused).
            rdd: the batch; it is collected before iterating.

        Failures are reported and swallowed so one bad batch does not stop
        the stream.
        """
        try:
            for rec in rdd.collect():
                self.hbase.save_trend(rec)
        except Exception as err:
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit still stop the job.
            print('HBase update Err.', err)

    def save_crawler_hbase(self, time, rdd):
        """Store every record of one batch with ``self.hbase.save_crawler``.

        For records whose first comma-separated field is ``category_link``,
        the third field is also added to Redis when ``isNewCategory`` reports
        it as new.

        Args:
            time: batch time supplied by ``foreachRDD`` (unused).
            rdd: the batch; each record is indexed as ``rec[1]`` to obtain
                the comma-separated payload.

        Failures are reported and swallowed so one bad batch does not stop
        the stream.
        """
        try:
            for rec in rdd.collect():
                self.hbase.save_crawler(rec)

                fields = rec[1].split(',')
                if str(fields[0]) != 'category_link':
                    continue

                # presumably fields[2] is the category name -- TODO confirm
                category = str(fields[2])
                if self.redis.isNewCategory(category):
                    self.redis.addcategory(category)
        except Exception as err:
            print('HBase update Err.', err)
コード例 #4
0
ファイル: sparkstream.py プロジェクト: adview001/DataExample
class Sparksteam(Myredis, Myhbase):
    """Spark Streaming consumer that writes Kafka records to HBase.

    Storage access goes through the ``Myhbase`` and ``Myredis`` helpers held
    on the instance as ``self.hbase`` and ``self.redis``.
    """

    # NOTE(review): Myredis / Myhbase are listed as bases but their __init__
    # is never run; the composed helpers do all the work. The bases are left
    # in place to keep the class's public shape unchanged.

    def __init__(self, zkQuorum, topic, hbtable):
        """Keep the stream settings and build the storage helpers.

        Args:
            zkQuorum: ZooKeeper quorum argument for
                ``KafkaUtils.createStream``.
            topic: Kafka topic to read; ``start`` picks the save handler
                from it.
            hbtable: argument forwarded to ``Myhbase``.
        """
        self.zkQuorum = zkQuorum
        self.topic = topic
        self.hbase = Myhbase(hbtable)
        self.redis = Myredis()

    def start(self):
        """Build the stream, attach the save handler and wait for shutdown.

        The handler is chosen from ``self.topic``: ``NOTHS-crawler-topic``
        uses ``save_crawler_hbase``, ``NOTHS-trends-topic`` uses
        ``save_trends_hbase``. Other topics are only pretty-printed.
        """
        sc = SparkContext(appName="PythonStreamingNOTHS")
        ssc = StreamingContext(sc, 10)

        kvs = KafkaUtils.createStream(ssc, self.zkQuorum,
                                      "spark-streaming-consumer",
                                      {self.topic: 1})
        # pprint() only registers an output operation and returns None;
        # wrapping it in print() just printed "None" once at start-up.
        kvs.pprint()

        # Use the instance attribute: a bare ``topic`` is undefined here and
        # raised NameError before the stream could start.
        if self.topic == 'NOTHS-crawler-topic':
            kvs.foreachRDD(self.save_crawler_hbase)
        elif self.topic == 'NOTHS-trends-topic':
            kvs.foreachRDD(self.save_trends_hbase)

        ssc.start()
        ssc.awaitTermination()

    def save_trends_hbase(self, time, rdd):
        """Pass each record of the batch to ``self.hbase.save_trend``.

        Args:
            time: batch time supplied by ``foreachRDD`` (unused).
            rdd: the batch; collected before iterating.

        Errors are printed and swallowed so the stream keeps running.
        """
        try:
            for rec in rdd.collect():
                self.hbase.save_trend(rec)
        except Exception as err:
            # Narrower than a bare except: KeyboardInterrupt / SystemExit
            # still propagate.
            print('HBase update Err.', err)

    def save_crawler_hbase(self, time, rdd):
        """Pass each record of the batch to ``self.hbase.save_crawler``.

        When the first comma-separated field of ``rec[1]`` is
        ``category_link``, the third field is registered in Redis if
        ``isNewCategory`` says it has not been seen yet.

        Args:
            time: batch time supplied by ``foreachRDD`` (unused).
            rdd: the batch; collected before iterating.

        Errors are printed and swallowed so the stream keeps running.
        """
        try:
            for rec in rdd.collect():
                self.hbase.save_crawler(rec)

                parts = rec[1].split(',')
                if str(parts[0]) == 'category_link':
                    # presumably parts[2] holds the category name -- TODO confirm
                    name = str(parts[2])
                    if self.redis.isNewCategory(name):
                        self.redis.addcategory(name)
        except Exception as err:
            print('HBase update Err.', err)
コード例 #5
0
    def lineargraph(self, category):
        """Plot the trend values of ``category`` with a fitted straight line.

        Rows of the ``trend`` table whose key starts with
        ``category + 'interests'`` are scanned in order. The n-th row
        (counting from 1) becomes the point ``(n, int(row['stats:value']))``.
        A ``LinearRegression`` is fitted to those points and drawn over a
        scatter plot.

        Args:
            category: category name. An empty string makes the call a no-op.

        Returns:
            None. Any failure while reading or plotting is reported with a
            short message instead of being raised.
        """
        # Compare by value: ``is not ''`` tested object identity, which only
        # works by accident of string interning.
        if category == '':
            return

        try:
            hb = Myhbase('trend')

            rowkey = category + 'interests'
            print(rowkey)

            values = [
                int(data['stats:value'])
                for _key, data in hb.table.scan(row_prefix=rowkey)
            ]

            if not values:
                print('not enough data for category', category)
                return

            # scikit-learn expects 2-D inputs, hence the one-element rows.
            X = [[position] for position in range(1, len(values) + 1)]
            Y = [[value] for value in values]

            mdl = LinearRegression().fit(X, Y)
            m = mdl.coef_[0]
            b = mdl.intercept_
            plt.scatter(X, Y, color='blue')

            # Draw the fitted line from x = 0 to x = number of points.
            x_len = len(X)
            plt.plot([0, x_len], [b, m * x_len + b], 'r')
            plt.title(category, fontsize=20)
            plt.xlabel('Time (Weeks)', fontsize=15)
            plt.ylabel('Trending', fontsize=15)
            plt.show()
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still interrupts.
            print('opss something is wrong.')
コード例 #6
0
    def categorytrend(self, category):
        """Collect the stored trend series for ``category``.

        Rows of the ``trend`` table whose key starts with
        ``category + 'interests'`` are scanned in order; the n-th row
        (counting from 1) contributes ``[n]`` to X and
        ``[int(row['stats:value'])]`` to Y.

        Args:
            category: category name.

        Returns:
            A tuple ``(X, Y)`` of equally long lists of one-element lists.
            Both are empty when ``category`` is an empty string or nothing
            matched. If reading fails part-way, the points gathered so far
            are returned after printing ``'err'``.
        """
        print('Get trend data..')

        X = []
        Y = []

        # Compare by value, not identity. An empty category now also returns
        # the (empty) tuple; the original fell off the end and returned None.
        if category == '':
            return (X, Y)

        try:
            hb = Myhbase('trend')
            rowkey = category + 'interests'

            for position, (_key, data) in enumerate(
                    hb.table.scan(row_prefix=rowkey), 1):
                # Convert first so X and Y stay the same length when a value
                # cannot be parsed.
                value = int(data['stats:value'])
                X.append([position])
                Y.append([value])
        except Exception:
            print('err')

        return (X, Y)
コード例 #7
0
    def coef(self):
        """Scatter-plot mean price against trend coefficient per category.

        For every category returned by ``getAllcategories()`` on the
        ``crawler`` table:

        1. Trend rows whose key starts with ``category + 'interests'`` are
           read from the ``trend`` table and a ``LinearRegression`` is fitted
           to the points ``(n, value)``. Categories without trend rows are
           skipped.
        2. Prices of the category's products are parsed and summarised as
           min, max, total, mean, range and item count.

        The collected dictionary is printed. Every category that has both a
        coefficient and price statistics is then drawn as one bubble whose
        area is ``pi * total_item``.

        Returns:
            None.
        """
        hb_crawler = Myhbase('crawler')
        hb_trend = Myhbase('trend')
        data_hash = collections.defaultdict(dict)

        # Drops everything except digits and dots from a price string.
        # Compiled once instead of once per product.
        non_numeric = re.compile(r'[^0-9.]')

        for category in hb_crawler.getAllcategories():
            print(category)

            # Find the trend coefficient.
            rowkey = category + 'interests'
            print(rowkey)

            values = [
                int(data['stats:value'])
                for _key, data in hb_trend.table.scan(row_prefix=rowkey)
            ]
            if not values:
                continue

            # scikit-learn expects 2-D inputs, hence the one-element rows.
            X = [[position] for position in range(1, len(values) + 1)]
            Y = [[value] for value in values]
            mdl = LinearRegression().fit(X, Y)
            data_hash[category]['coef'] = mdl.coef_[0]

            products_in_category = hb_crawler.getCategoryProducts(category)

            prices = []
            for item in products_in_category:
                raw_price = item['product:price']
                if not raw_price:
                    continue
                try:
                    prices.append(float(non_numeric.sub('', raw_price)))
                except ValueError:
                    # Nothing numeric was left (e.g. "free"); skip the item
                    # instead of aborting the whole report.
                    continue

            if not prices:
                # No usable price: keep only the coefficient. The plotting
                # loop below skips such categories.
                continue

            min_price = min(prices)
            max_price = max(prices)
            total_price = sum(prices)
            total_item = len(products_in_category)

            # NOTE(review): the mean divides by ALL products of the category,
            # including those without a usable price -- kept as in the
            # original; confirm this is intended.
            data_hash[category].update({
                'min_price': min_price,
                'max_price': max_price,
                'total_price': total_price,
                'mean_price': total_price / total_item,
                'range_price': max_price - min_price,
                'total_item': total_item,
            })

        print(data_hash)
        x = []
        y = []
        z = []
        # .items() works on Python 2 and 3; iteritems() is Python 2 only.
        for _category, item in data_hash.items():
            if 'mean_price' not in item:
                # Trend data but no priced product: nothing to plot.
                continue

            x.append(item['mean_price'])
            y.append(item['coef'][0])
            z.append(item['total_item'])

        x = np.array(x)
        y = np.array(y)
        z = np.array(z)
        colors = np.random.rand(len(x))
        area = np.pi * z
        plt.scatter(x, y, s=area, c=colors, alpha=0.5)
        plt.title('Product range, price and trending coefficient', fontsize=20)
        plt.xlabel('Averge Price', fontsize=15)
        plt.ylabel('Trend Coefficient', fontsize=15)
        plt.show()
コード例 #8
0
ファイル: sparkstream.py プロジェクト: yuantuo/DataExample
 def __init__(self, zkQuorum, topic, hbtable):
     """Store the stream settings and create the storage helpers.

     ``zkQuorum`` and ``topic`` are kept as given; ``hbtable`` is forwarded
     to ``Myhbase`` and a ``Myredis`` instance is created with no arguments.
     """
     self.zkQuorum, self.topic = zkQuorum, topic

     # Helpers are built in the same order as before: HBase, then Redis.
     hbase_helper = Myhbase(hbtable)
     self.hbase = hbase_helper
     redis_helper = Myredis()
     self.redis = redis_helper
コード例 #9
0
ファイル: sparkstream.py プロジェクト: adview001/DataExample
 def __init__(self, zkQuorum, topic, hbtable):
     """Keep the connection settings and build the HBase / Redis helpers.

     ``hbtable`` is passed to ``Myhbase``; ``Myredis`` takes no arguments.
     The other two parameters are stored unchanged on the instance.
     """
     self.zkQuorum, self.topic = zkQuorum, topic
     self.hbase = Myhbase(hbtable)
     self.redis = Myredis()
コード例 #10
0
ファイル: hbase_test.py プロジェクト: adview001/DataExample
        'category:name': 'mycategory',
        'stats:from': '2016-03-01',
        'stats:to': '2016-03-31',
        'stats:value': '10'
    })

# Seed a trend row for "mycategory" covering 2016-04-01 .. 2016-05-01.
# The row key looks like category name + group name + from-date + to-date
# concatenated without separators -- confirm against the writer code.
# NOTE(review): trend_test_conn is defined outside this excerpt; presumably a
# table handle for the trend test table.
trend_test_conn.put(
    'mycategoryinterests2016-04-012016-05-01', {
        'group:name': 'interests',
        'category:name': 'mycategory',
        'stats:from': '2016-04-01',
        'stats:to': '2016-05-01',
        'stats:value': '20'
    })

# Module-level Myhbase wrapper for the 'crawler_test' table, used by
# test_save_cawler_rec below.
hb_cralwer_test = Myhbase('crawler_test')


def test_save_cawler_rec():
    """Saving a product record stores its name, description and category.

    A comma-joined ``product_link`` record is passed to ``save_crawler`` and
    the row ``testcategoryproduct1`` is then read back and checked.
    """
    record_fields = (
        'product_link', 'http://newadd.com', 'product1', '$10',
        'Im a new product desc from test', 'testcategory',
    )
    hb_cralwer_test.save_crawler([0, ",".join(record_fields)])

    stored = crawler_test_conn.row('testcategoryproduct1')

    # Checked in a fixed order so the first mismatch reported is the same.
    expected_columns = (
        ('product:name', 'product1'),
        ('product:desc', 'Im a new product desc from test'),
        ('category:name', 'testcategory'),
    )
    for column, expected in expected_columns:
        assert stored[column] == expected
コード例 #11
0
ファイル: analyseresult.py プロジェクト: yuantuo/DataExample
    def coef(self):
        """Plot each category's mean price against its trend coefficient.

        Steps, per category from ``getAllcategories()`` on the ``crawler``
        table:

        1. Scan the ``trend`` table for keys starting with
           ``category + 'interests'`` and fit a ``LinearRegression`` to the
           points ``(n, value)``, n counting from 1. A category with no
           trend rows is skipped.
        2. Parse the prices of the category's products and record min, max,
           total, mean, range and the number of products.

        The collected dictionary is printed. Categories that have both a
        coefficient and price statistics are then shown as bubbles with
        area ``pi * total_item``.

        Returns:
            None.
        """
        hb_crawler = Myhbase('crawler')
        hb_trend = Myhbase('trend')
        data_hash = collections.defaultdict(dict)

        # Keeps only digits and dots of a price string. Raw string, compiled
        # once rather than for every product.
        price_cleaner = re.compile(r'[^0-9.]')

        categories = hb_crawler.getAllcategories()

        for category in categories:
            print(category)

            # Find the trend coefficient.
            rowkey = category + 'interests'
            print(rowkey)

            X = []
            Y = []
            for position, (_key, data) in enumerate(
                    hb_trend.table.scan(row_prefix=rowkey), 1):
                X.append([position])
                Y.append([int(data['stats:value'])])

            if not X:
                continue

            mdl = LinearRegression().fit(X, Y)
            data_hash[category]['coef'] = mdl.coef_[0]

            products_in_category = hb_crawler.getCategoryProducts(category)

            min_price = None
            max_price = None
            total_price = 0

            for item in products_in_category:
                if not item['product:price']:
                    continue

                try:
                    item_price = float(
                        price_cleaner.sub('', item['product:price']))
                except ValueError:
                    # No digits left after cleaning; ignore this product
                    # rather than aborting the report.
                    continue

                total_price += item_price

                # Test for None FIRST: comparing a float with None raises
                # TypeError on Python 3.
                if min_price is None or item_price < min_price:
                    min_price = item_price

                if max_price is None or item_price > max_price:
                    max_price = item_price

            if min_price is None:
                # No usable price: only the coefficient is stored and the
                # plotting loop below skips this category.
                continue

            total_item = len(products_in_category)
            stats = data_hash[category]
            stats['min_price'] = min_price
            stats['max_price'] = max_price
            stats['total_price'] = total_price
            # NOTE(review): divides by ALL products, including those without
            # a usable price -- kept as in the original; confirm intent.
            stats['mean_price'] = total_price / total_item
            stats['range_price'] = max_price - min_price
            stats['total_item'] = total_item

        print(data_hash)
        x = []
        y = []
        z = []
        # .items() works on Python 2 and 3; iteritems() is Python 2 only.
        for _category, item in data_hash.items():
            if 'mean_price' not in item:
                # Coefficient without price statistics: would raise KeyError.
                continue

            x.append(item['mean_price'])
            y.append(item['coef'][0])
            z.append(item['total_item'])

        x = np.array(x)
        y = np.array(y)
        z = np.array(z)
        colors = np.random.rand(len(x))
        area = np.pi * z
        plt.scatter(x, y, s=area, c=colors, alpha=0.5)
        plt.title('Product range, price and trending coefficient', fontsize=20)
        plt.xlabel('Averge Price', fontsize=15)
        plt.ylabel('Trend Coefficient', fontsize=15)
        plt.show()