Exemple #1
0
def insert2companyAll():
    """Copy companies that are not yet in ``companyALL`` into that collection.

    Ids present in ``db[companyCol]`` but missing from ``db['companyALL']``
    are looked up in the collection returned by
    ``DBManger.get_lasted_collection`` for the ``company_20...`` name pattern
    and inserted with ``_id`` set to the company id.

    Nothing is written when there are no new companies: inserting an empty
    document list is rejected by the driver.
    """
    from mongo import op

    new_ids = set(db[companyCol].distinct('id'))
    old_ids = set(db['companyALL'].distinct('_id'))
    insert_ids = list(new_ids - old_ids)
    with op.DBManger() as m:
        table = m.get_lasted_collection(
            "51job", filter={"name": {
                "$regex": r"company_20\d\d\d\d\d\d"
            }})
        print(table)
    new_company = [
        {
            '_id': item['id'],
            'name': item['name'],
            'industry': item['industry'],
            'location': item['location'],
            'compkind': item['compkind'],
            'size': item['size'],
        }
        for item in db[table].find({'id': {'$in': insert_ids}})
    ]
    if not new_company:
        print('no new company to insert')
        return
    # NOTE(review): assumes ``db`` is a pymongo Database; ``Collection.insert``
    # is deprecated there in favour of ``insert_many``.
    db['companyALL'].insert_many(new_company)
    print('new company insert successful!')
Exemple #2
0
 def init_start_urls(self):
     """Reset the Redis keys and seed the start-url set with distinct brand seeds.

     Rows (``cate_id``, ``brand_id``) are read from every
     ``jdbrand20...retryN`` collection whose name compares greater than the
     latest ``jdbrand20..._sep`` collection (or from all of them when no
     separator is found), de-duplicated, wrapped in ``Seed(type=0)`` and
     pushed to Redis in batches.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     batch_limit = 1024
     pending = []
     with op.DBManger() as m:
         m.create_db_collection(db_collection=("jingdong", "jdskuid{0}_sep".format(current_date)))
         sep_filter = {"name": {"$regex": r"^jdbrand20\d\d\d\d\d\d_sep$"}}
         last_sep = m.get_lasted_collection("jingdong", filter=sep_filter)
         retry_filter = {"name": {"$regex": r"^jdbrand20\d\d\d\d\d\dretry\d*$"}}
         unique_seeds = set()
         for table in m.list_tables("jingdong", filter=retry_filter):
             # Skip collections that do not come after the separator.
             if last_sep and not table > last_sep:
                 continue
             self.logger.info("valid table : {}".format(table))
             status_clause = {"$or": [{"status": 0}, {"status": -1}]}
             pipeline = [{"$match": {"$and": [{"_status": 0}, status_clause]}}]
             rows = m.read_from(db_collect=("jingdong", table),
                                out_field=("cate_id", "brand_id"),
                                pipeline=pipeline)
             unique_seeds.update(rows)
         for value in unique_seeds:
             pending.append(str(Seed(value=value, type=0)))
             if len(pending) == batch_limit:
                 self.redis.sadd(self.start_urls_redis_key, *pending)
                 pending = []
         if pending:
             self.redis.sadd(self.start_urls_redis_key, *pending)
Exemple #3
0
 def init_clean_price(self):
     """Rebuild ``secoo.CleanListNew`` from every ``List20...`` collection.

     For each pid the (price, self) pair seen last while iterating the
     collections wins. The result is written back with ``_id`` equal to the
     pid and ``_date`` set to ``self.current_date``.
     """
     from mongo import op
     from tqdm import tqdm
     pipeline = [{"$match": {"pid": {"$ne": "null"}}}]
     with op.DBManger() as m:
         latest_by_pid = {}
         m.drop_db_collect(db_collect=("secoo", "CleanListNew"))
         list_filter = {"name": {"$regex": r"List20\d\d\d\d\d\d"}}
         source_tables = m.list_tables("secoo", filter=list_filter)
         for collection in tqdm(source_tables, desc="init_clean_price"):
             rows = m.read_from(db_collect=("secoo", collection),
                                out_field=("pid", "price", "self"),
                                pipeline=pipeline)
             for pid, price, self1 in rows:
                 latest_by_pid[pid] = (price, self1)
         date_tuple_list = [
             (pid, pid, price, self1)
             for pid, (price, self1) in latest_by_pid.items()
         ]
         m.date_tuple_to_db(date_tuple_list=date_tuple_list,
                            db_collect=("secoo", "CleanListNew"),
                            fields_tupe=("_id", "pid", "price", "self"),
                            buffer_size=1024,
                            attach_dict={"_date": self.current_date})
Exemple #4
0
    def __init__(self, **kwargs):
        """Queue batched sku ids for price lookup and compile the parsing regexes.

        Distinct ``skuid`` values are read from the latest
        ``summary_201905_20....`` collection, stripped, and joined in groups
        of 60 with ``%2CJ_`` as separator; each group becomes one ``Seed``
        on ``self.seeds_queue``.

        :param kwargs: forwarded to the parent constructor; ``retries`` is
            required and is passed to every ``Seed``.
        :raises KeyError: if ``retries`` is missing and at least one seed is
            created.
        """
        super(JDPriceMiss, self).__init__(**kwargs)
        self.ua = UserAgent()
        batch_size = 60  # number of sku ids packed into one seed
        with op.DBManger() as m:
            table = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
            skuid_set = set()
            for item in m.read_from(db_collect=("jingdong", table), out_field=("skuid",)):
                skuid_set.add(item[0])
            skuids = [skuid.strip() for skuid in skuid_set]
            # Slicing handles the empty case naturally; the previous manual
            # accumulator was unbound when no sku id had been read.
            for start in range(0, len(skuids), batch_size):
                joined = '%2CJ_'.join(skuids[start:start + batch_size])
                if joined:
                    self.seeds_queue.put(Seed(joined, kwargs["retries"]))

        # NOTE(review): the '.' in several patterns below is unescaped and so
        # matches any character; confirm whether a literal decimal point was
        # intended before tightening them.
        self.block_pattern = re.compile(r'{.*?}')
        self.innerid_pattern = re.compile(r'\d+')
        self.innerprice_pattern = re.compile(r'"\d+.\d+"')
        self.op_pattern = re.compile(r'"op":"(\d+.\d+)"')
        self.p_pattern = re.compile(r'"(\d+.\d+)"')
        self.p2_pattern = re.compile(r'"(-\d+.\d+)"')
        self.p1 = re.compile(r'"id":.*?"}')
        self.id_pattern = re.compile(r'id:"J_(\d+)"')
        self.first_pattern = re.compile(r'([a-zA-Z]*)":')
        self.rid = random.randint(100000000, 999999999)
        self.usrid = str(self.rid)
        self.up_pattern = re.compile('"up":"tpp"')
        self.price_pattern = re.compile(r'^\d+\.\d\d$')
Exemple #5
0
 def init_start_urls(self):
     """Reset the Redis keys and re-seed from rows with ``_status`` 3.

     Distinct (``_seed``, ``_status``) rows of ``self.last_retry_collect``
     are wrapped in ``Seed(type=3)`` and pushed to Redis in batches. When
     there is no such row the process exits with status 0.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     batch_limit = 1024
     pending = []
     with op.DBManger() as m:
         status_filter = [{"$match": {"_status": 3}}]
         rows = m.read_from(db_collect=("jingdong", self.last_retry_collect),
                            out_field=("_seed", "_status"),
                            pipeline=status_filter)
         data_set = collections.DataSet(rows)
         found_any = False
         for seed_value, _status in data_set.distinct():
             found_any = True
             pending.append(str(Seed(value=seed_value, type=3)))
             if len(pending) == batch_limit:
                 self.redis.sadd(self.start_urls_redis_key, *pending)
                 pending = []
         if pending:
             self.redis.sadd(self.start_urls_redis_key, *pending)
         if not found_any:
             # Nothing left to retry: stop the whole process.
             import sys
             sys.exit(0)
Exemple #6
0
 def init_start_urls(self):
     """Reset the Redis keys and seed the start-url set from the JD chaoshi page.

     The page at https://chaoshi.jd.com/ is fetched, every ``navThirdN: [...]``
     array found in it is parsed with ``ast.literal_eval``, and the ``URL`` of
     each child entry (with backslashes removed) becomes a ``Seed`` of type 0
     pushed to Redis in batches.

     The HTTP request carries a timeout so an unresponsive server raises
     instead of blocking this method indefinitely.
     """
     import requests
     from ast import literal_eval

     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     buffer_size = 1024
     with op.DBManger() as m:
         m.create_db_collection(
             db_collection=("jingdong",
                            "jdchaoshi{0}_sep".format(current_date)))
         buffer = []
         request = {
             "url": "https://chaoshi.jd.com/",
             "headers": {
                 'Connection':
                 'close',
                 "Referer":
                 "https://www.jd.com",
                 'User-Agent':
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
             },
             # Seconds; requests waits forever when no timeout is given.
             "timeout": 30,
         }
         nav_pattern = re.compile(r'navThird[1-9]: (\[.*\])')
         src = requests.get(**request).text
         for nav_literal in nav_pattern.findall(src):
             for group in literal_eval(nav_literal):
                 for child in group["children"]:
                     seed = Seed(value=child["URL"].replace("\\", ""), type=0)
                     buffer.append(str(seed))
                     if len(buffer) % buffer_size == 0:
                         self.redis.sadd(self.start_urls_redis_key, *buffer)
                         buffer = []
         if buffer:
             self.redis.sadd(self.start_urls_redis_key, *buffer)
Exemple #7
0
def reader():
    """Queue one job-search task per company id from the latest id library.

    The collection name is taken from ``DBManger.get_lasted_collection`` for
    the ``CompanyIdLib_20...`` pattern; every distinct ``_id`` in it becomes a
    ``{"url", "formdata"}`` task on the module-level ``queue``. Progress is
    printed every 10000 ids and once more when loading is complete.
    """
    global queue
    from mongo import op
    with op.DBManger() as m:
        last_company_ids = m.get_lasted_collection(
            "liepin",
            filter={"name": {
                "$regex": r"CompanyIdLib_20\d\d\d\d\d\d"
            }})
    ids = db[last_company_ids].distinct('_id')
    url = "https://www.liepin.com/company/sojob.json"
    for i, company_id in enumerate(ids):
        if i % 10000 == 0:
            # A bare ``print`` followed by an expression statement outputs
            # nothing in Python 3, so the call is made explicit.
            print(time.strftime('%Y-%m-%d %H:%M:%S Loading...'), i)
        formdata = {
            "ecompIds": company_id,
            "pageSize": "15",
            "curPage": 0,
            "keywords": "",
            "dq": "",
            "deptId": "",
        }
        task = {"url": url, "formdata": formdata}
        queue.put(task)
    maxCount = queue.qsize()
    print(time.strftime('%Y-%m-%d %H:%M:%S Loading complete'), maxCount)
Exemple #8
0
 def clean_price(self):
     """Rotate ``CleanListNew`` to ``CleanListOld`` and rebuild the new list.

     The previous ``CleanListOld`` is dropped, ``CleanListNew`` is renamed to
     take its place, and its rows are merged with the rows of
     ``"List" + self.current_date`` whose pid is not None; rows from the dated
     list override the older ones. The merged result is written to
     ``CleanListNew`` with ``_date`` set to ``self.current_date``.
     """
     from mongo import op
     pipeline = [{"$match": {"pid": {"$ne": None}}}]
     fields = ("pid", "price", "self")
     old_table = ("secoo", "CleanListOld")
     new_table = ("secoo", "CleanListNew")
     with op.DBManger() as m:
         merged = {}
         m.drop_db_collect(db_collect=old_table)
         m.rename_collection(old_db_collection=new_table,
                             new_db_collection=old_table)
         for pid, price, self1 in m.read_from(db_collect=old_table,
                                              out_field=fields):
             merged[pid] = (price, self1)
         todays_rows = m.read_from(
             db_collect=("secoo", "List" + self.current_date),
             out_field=fields,
             pipeline=pipeline)
         for pid, price, self1 in todays_rows:
             merged[pid] = (price, self1)
         date_tuple_list = [
             (pid, pid, price, self1)
             for pid, (price, self1) in merged.items()
         ]
         m.date_tuple_to_db(date_tuple_list=date_tuple_list,
                            db_collect=new_table,
                            fields_tupe=("_id", "pid", "price", "self"),
                            buffer_size=128,
                            attach_dict={"_date": self.current_date},
                            show_pbar=True,
                            pbar_name="clean_price")
 def __init__(self, *args, **kwargs):
     """Initialise the parent and fix the source and output retry collections.

     The source is ``jdprice_miss_out`` and the output (also stored as
     ``self.out_table``) is ``jdprice_miss_out_retry``.
     """
     super(RetryMaster, self).__init__(*args, **kwargs)
     with op.DBManger() as m:
         # The manager is opened as in the original although the names are fixed.
         self.last_retry_collect, self.new_retry_collect = (
             'jdprice_miss_out',
             'jdprice_miss_out_retry',
         )
         collection_pair = (self.last_retry_collect, self.new_retry_collect)
         self.logger.info(collection_pair)
     self.out_table = self.new_retry_collect
Exemple #10
0
    def __init__(self, address, verbose=True):
        """
        Store the server address and start the background action listener.

        :param address: Socket address (Ip address, Port)
        :type address: tuple
        :param verbose: when true, print a message when the listener is
            interrupted
        """
        self.ip, self.port = address
        self.verbose = verbose
        self.state = None
        self.writer = op.DBManger()
        self.lock = threading.Lock()

        def receive_ation(passss):
            # Listens for UDP datagrams on port 10053 and stores each payload
            # in ``self.state`` (the literal b'None' resets it to None).
            try:
                # The context manager closes the socket only if it was
                # actually created; a ``finally: close()`` would raise
                # UnboundLocalError and hide a failure of socket.socket().
                with socket.socket(socket.AF_INET,
                                   socket.SOCK_DGRAM) as action_socket:
                    action_socket.bind((self.ip, 10053))
                    action_socket.settimeout(100000)
                    while True:
                        data, client_address = action_socket.recvfrom(
                            DEFAULT_BUFFER_SIZE)
                        print('change state from {} to {}'.format(
                            self.state, data))
                        self.state = None if data == b'None' else data
            except KeyboardInterrupt:
                if self.verbose:
                    print('\rAction Server Shutdown!')

        thread = threading.Thread(target=receive_ation, args=(None, ))
        thread.start()
Exemple #11
0
 def __init__(self, **kwargs):
     """Queue sku ids whose comment count is among the tracked top values.

     Reads ``skuid`` and the ``comment_<suffix>`` field (suffix = last six
     characters of the latest ``summary_201905_20....`` collection name),
     limited to 100 rows, keeps the first comment count seen per sku id and
     queues a ``Seed(type=0)`` for every sku id whose count is in the set
     returned by ``TopK(1).get_topk()``.
     """
     super(JDPrice, self).__init__(**kwargs)
     self.ua = UserAgent()
     with op.DBManger() as m:
         # Create the temporary separator collection that marks the boundary
         # of this month's task.
         m.create_db_collection(db_collection=("jingdong", "jdcommentdetail{0}_sep".format(current_date)))
         comments_by_sku = {}
         top1000w = TopK(1)
         # Sku ids come from the most recent result collection.
         last_result = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
         comment_field = "comment_{}".format(last_result[-6:])
         pipeline = [
             {"$project": {"skuid": "$skuid", comment_field: "$" + comment_field}},
             {"$limit": 100},
         ]
         rows = m.read_from(db_collect=("jingdong", last_result),
                            out_field=("skuid", comment_field),
                            pipeline=pipeline)
         for item, comments in rows:
             sku = int(item)
             if sku in comments_by_sku:
                 continue
             comment_count = int(comments)
             top1000w.push(comment_count)
             comments_by_sku[sku] = comment_count
         top_counts = set(top1000w.get_topk())
         for sku, comment_count in comments_by_sku.items():
             if comment_count in top_counts:
                 self.seeds_queue.put(Seed(value=sku, type=0))
Exemple #12
0
def run_result():
    """Copy shops that have a name from ``zhaixun_shop`` into ``zhaixun_shop_1``.

    Rows are keyed by ``shopid``, so a later row with the same id replaces an
    earlier one; rows with a falsy ``shop_name`` are skipped.
    """
    with op.DBManger() as m:
        rows = m.read_from(db_collect=("jingdong", "zhaixun_shop"),
                           out_field=("shopid", "shop_name"))
        named_shops = {
            shopid: (shopid, shop_name)
            for shopid, shop_name in rows
            if shop_name
        }
        m.date_tuple_to_db(date_tuple_list=list(named_shops.values()),
                           db_collect=("jingdong", "zhaixun_shop_1"),
                           fields_tupe=("shopid", "shop_name"))
Exemple #13
0
def run_result():
    """Enrich the ``month202007`` rows with brand and shop data.

    Step 1 collects shop/title details per sku id from the
    ``jdskuid20210108retryN`` and ``jdskuid20201214retryN`` collections that
    compare greater than the ``jdskuid20201214_sep`` separator (rows with
    ``_status`` 0 only). Step 2 reads the month collection, attaches the brand
    name from ``jdbrand20210108retry0`` and, when known, the shop details, and
    writes the records to ``month_zx_202007`` in batches of 5000.

    A month row whose ``brand_id`` is None gets ``brand_name`` None instead of
    failing on ``int(None)``.
    """
    with op.DBManger() as m:
        brandid2name = {}
        for brand_id, brand_name in m.read_from(db_collect=("jingdong", "jdbrand20210108retry0"),
                                                out_field=("brand_id", "name")):
            if brand_id:
                brandid2name[int(brand_id)] = brand_name
        skuid_sukid_dict = {}
        # The date-specific patterns below stand in for generic
        # ``jdskuid20...`` patterns.
        last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20201214_sep"}})
        tables = m.list_tables(dbname="jingdong", filter={"name": {"$regex": r"^jdskuid(20210108)retry\d*$"}})
        tables.extend(m.list_tables(dbname="jingdong", filter={"name": {"$regex": r"^jdskuid(20201214)retry\d*$"}}))
        for table in tables:
            if not last_sep or table > last_sep:
                print("step 1: processing {}".format(table), flush=True)
                pipeline = [
                    {
                        "$match": {"_status": 0}
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "cate_id": "$cate_id",
                            "brand_id": "$brand_id",
                            "shopid": "$shopid",
                            "shop_name": "$shop_name",
                            "title": "$title",
                        }
                    },
                ]
                for skuid, cate_id, brand_id, shopid, shop_name, title in m.read_from(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "cate_id", "brand_id", "shopid", "shop_name", "title"),
                        pipeline=pipeline):
                    skuid_sukid_dict[int(skuid)] = {
                        "cate_id": cate_id,
                        "brand_id": "0" if brand_id is None else brand_id,
                        "shopid": shopid,
                        "shop_name": shop_name,
                        "title": title,
                    }

        # Step 2: walk the month result and emit the enriched records.
        buffer = []
        buffer_size = 5000
        last_result = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^month202007$"}})
        print("step 2: processing {}".format(last_result), flush=True)
        out_table = "month_zx_" + "202007"
        for skuid, comments, price, cate_id, brand_id, month in m.read_from(
                db_collect=("jingdong", last_result),
                out_field=("skuid", "comments", "clean_price", "cate_id", "brand_id", "month")):
            sku = int(skuid)
            # int(None) would raise; a missing brand simply has no name.
            brand_name = None if brand_id is None else brandid2name.get(int(brand_id))
            item = skuid_sukid_dict.get(sku)
            if item is None:
                # NOTE(review): cate_id is formatted unconditionally here but
                # guarded in the other branch; kept as in the original.
                tmp = {"skuid": sku, "clean_price": price, "comments": comments,
                       "cate_id": format_cat_id(cate_id), "brand_id": brand_id,
                       "brand_name": brand_name, "month": month}
            else:
                tmp = {"skuid": sku, "clean_price": price, "comments": comments,
                       "cate_id": format_cat_id(cate_id) if cate_id else None, "brand_id": brand_id,
                       "brand_name": brand_name, "month": month,
                       "shopid": item["shopid"], "shop_name": item["shop_name"], "title": item["title"]}
            buffer.append(tmp)
            if len(buffer) >= buffer_size:
                m.insert_many_dict(db_collect=("jingdong", out_table), data_dict_list=buffer)
                buffer = []
        if buffer:
            m.insert_many_dict(db_collect=("jingdong", out_table),
                               data_dict_list=buffer)
Exemple #14
0
def run_result():
    """Collect distinct shops from four ``jdskuid<date>retryN`` runs.

    Rows with ``_status`` 0 are read from the retry collections of the dates
    20210108, 20201214, 20200821 and 20200920 (in that order). Each shop id
    maps to ``(shopid, shop_name)``; when the name is falsy but the id is not,
    the name is stored as None. The result is written to ``zhaixun_shop``.
    """
    with op.DBManger() as m:
        shops = {}
        tables = []
        for run_date in ("20210108", "20201214", "20200821", "20200920"):
            name_regex = r"^jdskuid({})retry\d*$".format(run_date)
            tables.extend(
                m.list_tables(dbname="jingdong",
                              filter={"name": {"$regex": name_regex}}))
        for table in tables:
            print("step 1: processing {}".format(table), flush=True)
            pipeline = [
                {"$match": {"_status": 0}},
                {"$project": {"shopid": "$shopid", "shop_name": "$shop_name"}},
            ]
            rows = m.read_from(db_collect=("jingdong", table),
                               out_field=("shopid", "shop_name"),
                               pipeline=pipeline)
            for shopid, shop_name in rows:
                if not (shop_name or shopid):
                    continue
                shops[shopid] = (shopid, shop_name if shop_name else None)

        # Persist one (shopid, shop_name) row per shop.
        m.date_tuple_to_db(date_tuple_list=list(shops.values()),
                           db_collect=("jingdong", "zhaixun_shop"),
                           fields_tupe=("shopid", "shop_name"))
 def init_start_urls(self):
     """Reset the Redis keys and seed grouped sku ids from ``jdprice_miss_seed``.

     Distinct non-null ``skuid`` values are converted to int, joined in groups
     of 60 with ``%2CJ_`` as separator and wrapped in ``Seed(type=0)``. The
     seeds are pushed to Redis in shuffled batches of 10000.

     An empty seed collection results in no Redis writes; the previous manual
     accumulator was unbound in that case and raised UnboundLocalError.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     with op.DBManger() as m:
         pipeline = [
             {
                 "$match": {
                     "skuid": {
                         "$ne": None
                     }
                 }
             },
         ]
         skuid_set = set()
         for item in m.read_from(db_collect=("jingdong",
                                             "jdprice_miss_seed"),
                                 out_field=("skuid", ),
                                 pipeline=pipeline):
             skuid_set.add(int(item[0]))
         self.logger.info(
             "total new skuid of comment larger than 0 is: {}".format(
                 len(skuid_set)))
         group_size = 60  # sku ids packed into one seed
         skuids = [str(skuid) for skuid in skuid_set]
         buffer = []
         for start in range(0, len(skuids), group_size):
             joined = '%2CJ_'.join(skuids[start:start + group_size])
             buffer.append(str(Seed(value=joined, type=0)))
         buffer_size = 10000
         for start in range(0, len(buffer), buffer_size):
             batch = buffer[start:start + buffer_size]
             random.shuffle(batch)
             self.redis.sadd(self.start_urls_redis_key, *batch)
 def init_start_urls(self):
     """Reset the Redis keys and seed sku ids with a top comment count.

     Reads ``skuid`` and the ``comment_<suffix>`` field (suffix = last six
     characters of the latest ``summary_201905_20....`` collection name),
     keeps the first comment count seen per sku id, and pushes a
     ``Seed(type=0)`` for every sku id whose count is in the set returned by
     ``TopK(1000000).get_topk()``, in shuffled batches of 10000.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     with op.DBManger() as m:
         # Create the temporary separator collection that marks the boundary
         # of this month's task.
         m.create_db_collection(
             db_collection=("jingdong",
                            "jdcommentdetail{0}_sep".format(current_date)))
         comments_by_sku = {}
         top1000w = TopK(1000000)
         # Sku ids come from the most recent result collection.
         name_filter = {"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}}
         last_result = m.get_lasted_collection("jingdong", filter=name_filter)
         comment_field = "comment_{}".format(last_result[-6:])
         pipeline = [
             {"$project": {"skuid": "$skuid", comment_field: "$" + comment_field}},
         ]
         rows = m.read_from(db_collect=("jingdong", last_result),
                            out_field=("skuid", comment_field),
                            pipeline=pipeline)
         for item, comments in rows:
             sku = int(item)
             if sku in comments_by_sku:
                 continue
             comment_count = int(comments)
             top1000w.push(comment_count)
             comments_by_sku[sku] = comment_count
         top_counts = set(top1000w.get_topk())
         pending = []
         batch_limit = 10000
         for sku, comment_count in comments_by_sku.items():
             if comment_count not in top_counts:
                 continue
             pending.append(str(Seed(value=sku, type=0)))
             if len(pending) == batch_limit:
                 random.shuffle(pending)
                 self.redis.sadd(self.start_urls_redis_key, *pending)
                 pending = []
         if pending:
             random.shuffle(pending)
             self.redis.sadd(self.start_urls_redis_key, *pending)
Exemple #17
0
 def __init__(self, *args, **kwargs):
     """Resolve the latest ``jdsearch...retryN`` collection and name the next one.

     The next collection keeps everything up to and including the first
     ``retry`` and increments the number after it; a name without ``retry``
     gets ``retry1`` appended.
     """
     super(RetryMaster, self).__init__(*args, **kwargs)
     with op.DBManger() as m:
         name_filter = {"name": {"$regex": r"^jdsearch20\d\d\d\d\d\dretry\d+$"}}
         self.last_retry_collect = m.get_lasted_collection("jingdong",
                                                           filter=name_filter)
         # partition() splits at the first "retry", like find() did.
         prefix, marker, attempt = self.last_retry_collect.partition("retry")
         if marker:
             self.new_retry_collect = prefix + marker + str(int(attempt) + 1)
         else:
             self.new_retry_collect = self.last_retry_collect + "retry1"
         self.logger.info((self.last_retry_collect, self.new_retry_collect))
Exemple #18
0
 def __init__(self, **kwargs):
     """Queue distinct (cate_id, brand_id, name) seeds and compile page regexes.

     Seeds come from the latest ``brand20......`` collection, restricted to
     rows where ``cate_id``, ``brand_id`` and ``name`` are not None and
     ``_status`` is 0. Each seed is created with 3 retries and type 0.
     """
     super(GetProductId, self).__init__(**kwargs)
     self.retries = 3
     self.proxies = HttpProxy.getHttpProxy()
     self.ua = UserAgent()
     with op.DBManger() as m:
         name_filter = {"name": {"$regex": r"^brand20\d\d\d\d\d\d$"}}
         last_brand_collect = m.get_lasted_collection("jingdong",
                                                      filter=name_filter)
         # One $match stage per required field, then the status filter.
         pipeline = [
             {"$match": {field: {"$ne": None}}}
             for field in ("cate_id", "brand_id", "name")
         ]
         pipeline.append({"$match": {"_status": 0}})
         rows = m.read_from(db_collect=("jingdong", last_brand_collect),
                            out_field=("cate_id", "brand_id", "name"),
                            pipeline=pipeline)
         data_set = collections.DataSet(rows)
         for seed_value in data_set.distinct():
             seed = Seed(value=seed_value, retries=self.retries, type=0)
             self.seeds_queue.put(seed)
     self.first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',")
     self.skuids_pettern = re.compile(r'{.*?"skuId":(\d+).*?}')
     self.totalpage_perttern = re.compile(
         r'<div id="J_topPage"[\s\S]*?<b>\d+</b><em>/</em><i>(\d+)</i>')
Exemple #19
0
 def init_start_urls(self):
     """Reset the Redis keys and seed from ``newCateName`` category ids.

     The pipeline limits the read to a single row; each ``cate_id`` read is
     wrapped in ``Seed(type=0)`` and pushed to Redis in batches.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     batch_limit = 1024
     with op.DBManger() as m:
         pending = []
         single_row = [{"$limit": 1}]
         rows = m.read_from(db_collect=("jingdong", "newCateName"),
                            out_field=("cate_id", ),
                            pipeline=single_row)
         for row in rows:
             pending.append(str(Seed(value=row[0], type=0)))
             if len(pending) == batch_limit:
                 self.redis.sadd(self.start_urls_redis_key, *pending)
                 pending = []
         if pending:
             self.redis.sadd(self.start_urls_redis_key, *pending)
Exemple #20
0
 def __init__(self, current_date, **kwargs):
     """Queue one seed per (pid, price) in ``CleanListNew`` and compile regexes.

     :param current_date: stored as ``self.current_date``.
     :param kwargs: forwarded to the parent constructor; ``retries`` is
         required and is used for every seed and as ``self.seed_retries``.
     """
     super(SecooMonthJob1, self).__init__(**kwargs)
     # NOTE(review): the credential part of this URL template looks redacted
     # in this copy of the file; confirm against the real source.
     self.proxies = ["http://u{}:[email protected]:3128".format(n) for n in range(28)]
     self.ua = UserAgent()
     self.current_date = current_date
     source = ("secoo", "CleanListNew")
     with op.DBManger() as m:
         total = m.count(db_collect=source)
         rows = m.read_from(db_collect=source, out_field=("pid", "price"))
         for pid, price in tqdm(rows, total=total, desc="reading"):
             seed = Seed((pid, price), kwargs["retries"], type=0)
             self.seeds_queue.put(seed)
     self.seed_retries = kwargs["retries"]
     # Reference date 90 days back, formatted as YYYYMMDD.
     self.bench = timeUtil.getdate(-90, format='%Y%m%d')
     self.page_pattern = re.compile(r'totalCurrCommentNum":.*?,')
     self.block_pattern = re.compile(r'{"isShow.*?}')
     self.id_pattern = re.compile(r'"id":\d+')
     self.pid_pattern = re.compile(r'productId":\d+')
     self.time_pattern = re.compile(r'createDate":\d+')
     self.user_pattern = re.compile(r'userName":"******"')
     self.device_pattern = re.compile(r'sourceDevice":".*?"')
Exemple #21
0
 def init_start_urls(self):
     """Reset the Redis keys and seed every distinct sku id of ``self.intable``.

     Sku ids are converted to int, de-duplicated, wrapped in ``Seed(type=0)``
     and pushed to Redis in shuffled batches of 10000.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     with op.DBManger() as m:
         rows = m.read_from(db_collect=("jingdong", self.intable),
                            out_field=("skuid", ))
         unique_skuids = set()
         for row in rows:
             unique_skuids.add(int(row[0]))
         pending = []
         batch_limit = 10000
         for skuid in unique_skuids:
             pending.append(str(Seed(value=skuid, type=0)))
             if len(pending) == batch_limit:
                 random.shuffle(pending)
                 self.redis.sadd(self.start_urls_redis_key, *pending)
                 pending = []
         if pending:
             random.shuffle(pending)
             self.redis.sadd(self.start_urls_redis_key, *pending)
Exemple #22
0
 def __init__(self, filename):
     """Prepare collection names, the company-id source and the Mongo handle.

     :param filename: stored as ``self.filename``.

     Both date-stamped names are derived from a single timestamp so they
     cannot disagree when construction straddles midnight.
     """
     now = time.localtime(time.time())
     # Date-stamped name of the output table.
     self.table_name = time.strftime('company_%Y%m%d', now)
     self.new_company_lib = time.strftime('CompanyIdLib_%Y%m%d', now)
     # Company id library: the latest CompanyIdLib_20... collection.
     from mongo import op
     with op.DBManger() as m:
         last_company_ids = m.get_lasted_collection(
             "liepin",
             filter={"name": {
                 "$regex": r"CompanyIdLib_20\d\d\d\d\d\d"
             }})
     self.company_ids = last_company_ids
     # format() also copes with a missing collection, where string
     # concatenation would raise TypeError.
     print("last_company_ids: {}".format(last_company_ids))
     # Final list of documents to insert into Mongo.
     self.final_list = []
     # liepin_data
     self.filename = filename
     self.db = pymongo.MongoClient('mongodb://192.168.0.13:27017')['liepin']
Exemple #23
0
def run_result():
    """Diagnostic run: compare comment skuids against the skuid table, then start a Spark read.

    First part (Mongo): builds a skuid -> {cate_id, brand_id, ziying} lookup from
    the ``jdskuid(20201214)retry`` tables, then walks the
    ``jdcomment(20201218)retry`` tables counting how many commented skuids are
    present in (``count``) or missing from (``count1``) that lookup, and prints
    both counts with a small sample of each.

    Second part (Spark): configures ``myconf``, creates a ``SparkSession`` and
    begins loading price tables through the Mongo Spark connector.

    NOTE(review): this snippet is cut off in the middle of the Spark read, so the
    rest of the function is not visible here. ``op``, ``myconf``,
    ``SparkSession`` and the ``StructType``/``StructField``/``LongType``/
    ``DoubleType`` names are defined outside this block.
    """
    with op.DBManger() as m:
        # skuid (int) -> {"cate_id", "brand_id", "ziying"}
        skuid_sukid_dict = {}
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdskuid(20\d\d\d\d\d\d)retry\d*$"}}):
        # The generic date patterns above are replaced by one hard-coded date.
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdskuid20201214_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdskuid(20201214)retry\d*$"
                }}):
            # Only tables whose name sorts after the separator collection
            # (or all of them when no separator was found).
            if not last_sep or table > last_sep:
                print("step 3: processing {}".format(table), flush=True)
                pipeline = [
                    {
                        "$match": {
                            "_status": 0
                        }
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "cate_id": "$cate_id",
                            "brand_id": "$brand_id",
                            "ziying": "$ziying",
                        }
                    },
                ]
                for skuid, cate_id, brand_id, ziying in m.read_from(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "cate_id", "brand_id", "ziying"),
                        pipeline=pipeline):
                    # A later row for the same skuid overwrites the earlier one.
                    skuid_sukid_dict[int(skuid)] = {
                        "cate_id": cate_id,
                        "brand_id": "0" if brand_id is None else brand_id,
                        "ziying": ziying
                    }

        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdcomment20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdcomment(20\d\d\d\d\d\d)retry\d*$"}}):
        count = 0  # commented skuids found in skuid_sukid_dict
        count1 = 0  # commented skuids missing from skuid_sukid_dict
        a = set()  # sample of found skuids (collected while count < 50)
        b = set()  # sample of missing skuids (collected while count1 < 50)

        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdcomment20201218_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdcomment(20201218)retry\d*$"
                }}):
            if not last_sep or table > last_sep:
                print("step 4: processing {}".format(table), flush=True)
                # NOTE(review): "$gt" compares against the string "0" here,
                # while the commented-out variant used the integer 0 -- confirm
                # which type the "comment" field is stored as.
                pipeline = [
                    {
                        "$match": {
                            #"$and": [{"_status": 0}, {"comment": {"$gt": 0}}]
                            "$and": [{
                                "_status": 0
                            }, {
                                "comment": {
                                    "$gt": "0"
                                }
                            }]
                        }
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "comment": "$comment",
                        }
                    },
                ]
                for skuid, comments in m.read_from_yield(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "comment"),
                        pipeline=pipeline):
                    if int(skuid) in skuid_sukid_dict:
                        count = count + 1
                        if count < 50:
                            a.add(int(skuid))
                    else:
                        count1 = count1 + 1
                        if count1 < 50:
                            b.add(int(skuid))

        # Report both totals, each followed by its sample skuids.
        print(count, )
        for i, v in enumerate(a):
            print(v)
        print(count1, )
        for i, v in enumerate(b):
            print(v)
    # Spark configuration; `myconf` is not created in this function.
    myconf.setAppName("test").setMaster("local[4]")
    myconf.set('spark.executor.instances', '4')
    myconf.set('spark.driver.memory', '6G')
    #myconf.set('spark.executor.memory','1G')
    myconf.set('spark.executor.cores', '4')
    myconf.set('spark.task.cpus', '4')

    # Specify the spark-package that provides the connector
    myconf.set("spark.jars.packages",
               "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1")
    spark = SparkSession.builder.config(conf=myconf).getOrCreate()
    # Raise the root log4j level to FATAL through the JVM gateway.
    logger = spark._jvm.org.apache.log4j
    logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL)

    # Read using the specified format
    with op.DBManger() as m:
        month = '202102'
        last_month = '202101'
        tables = m.list_tables(
            dbname="jingdong",
            filter={"name": {
                "$regex": r"^jdpricejdprice20210222$"
            }})
        schema = StructType([
            StructField("skuid", LongType(), True),
            StructField("price", DoubleType(), True)
        ])
        # Start from an empty (skuid, price) frame.
        df = spark.createDataFrame([], schema)
        # NOTE(review): the snippet ends inside the statement below.
        for table in tables:
            tmp = spark.read.format("mongo").option(
                "uri", "mongodb://192.168.0.13:27017/jingdong.{}".format(
from mongo import op
from tqdm import tqdm
# with op.DBManger() as m, open("paopaomate_summary","w") as f:
#     pipeline = [
#         {
#             "$lookup": {
#                 "from": "paopaomate_comment",
#                 "localField": "skuid",
#                 "foreignField": "skuid",
#                 "as": "skuid"
#             }
#         },
#         {"$out": "paopaomate_final"}
#     ]
#     m.aggregate(db_collect=("jingdong","paopaomate_summary"), pipeline=pipeline)
# Export script: opens a DB manager together with the output file
# "paopaomate_summary" (write mode).
# NOTE(review): the snippet is cut off inside the final loop header, so how
# `f`, `headers` and `skuids` are used is not visible here.
with op.DBManger() as m, open("paopaomate_summary", "w") as f:
    # str(skuid) -> comment, for every row of jingdong.paopaomate_comment.
    tmp = {}
    for skuid, comment in m.read_from(db_collect=("jingdong",
                                                  "paopaomate_comment"),
                                      out_field=("skuid", "comment")):
        tmp[str(skuid)] = comment

    # Column names: three fixed fields, then one comment_<YYYYMM> column per
    # month from 201905 through 202009.
    headers = [
        "skuid", "brand_id", "price", "comment_201905", "comment_201906",
        "comment_201907", "comment_201908", "comment_201909", "comment_201910",
        "comment_201911", "comment_201912", "comment_202001", "comment_202002",
        "comment_202003", "comment_202004", "comment_202005", "comment_202006",
        "comment_202007", "comment_202008", "comment_202009"
    ]
    skuids = set()
    for item in tqdm(m.read_from(db_collect=("jingdong",
Exemple #26
0
 def init_start_urls(self):
     """Clear the Redis queues and seed the comment crawl with skuids.

     Skuids come from two places: up to 400 rows per
     ``jdskuid20200920retry*`` table (rows with ``_status`` 0 and a non-null
     ``skuid``), and up to 40 rows of the latest ``month20xxxx`` collection.
     The de-duplicated ints are wrapped in ``Seed(value=..., type=0)`` and
     added to ``self.start_urls_redis_key`` in shuffled batches of at most
     10000.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)

     def push(batch):
         # Shuffle in place, then add the whole batch with a single SADD.
         random.shuffle(batch)
         self.redis.sadd(self.start_urls_redis_key, *batch)

     with op.DBManger() as m:
         # Create the temporary collection that marks the boundary of this
         # month's task.
         separator = "jdcomment{0}_sep".format(current_date)
         m.create_db_collection(db_collection=("jingdong", separator))

         skuid_query = [
             {"$match": {"$and": [{"_status": 0},
                                  {"skuid": {"$ne": None}}]}},
             {"$project": {"skuid": "$skuid"}},
             {"$limit": 400},
         ]
         # last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20\d\d\d\d\d\d_sep$"}})
         # for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdskuid(20\d\d\d\d\d\d)retry\d*$"}}):
         last_sep = m.get_lasted_collection(
             "jingdong",
             filter={"name": {"$regex": r"^jdskuid20200920_sep$"}})
         retry_tables = m.list_tables(
             dbname="jingdong",
             filter={"name": {"$regex": r"^jdskuid20200920retry\d*$"}})

         wanted = set()
         for table in retry_tables:
             # Skip tables that do not sort after the separator collection.
             if last_sep and not table > last_sep:
                 continue
             self.logger.info("valid table : {}".format(table))
             wanted.update(
                 int(row[0])
                 for row in m.read_from(db_collect=("jingdong", table),
                                        out_field=("skuid", ),
                                        pipeline=skuid_query))

         # Skuids in the last result.
         last_result = m.get_lasted_collection(
             "jingdong",
             filter={"name": {"$regex": r"^month20\d\d\d\d$"}})
         wanted.update(
             int(row[0])
             for row in m.read_from(db_collect=("jingdong", last_result),
                                    out_field=("skuid", ),
                                    pipeline=[{"$limit": 40}]))

         batch_limit = 10000
         pending = []
         for skuid in wanted:
             pending.append(str(Seed(value=skuid, type=0)))
             if len(pending) == batch_limit:
                 push(pending)
                 pending = []
         if pending:
             push(pending)
Exemple #27
0
def run_result():
    """Merge price, skuid and comment collections into a ``month<YYYYMM>`` table.

    The steps match the progress messages printed below:

    1. Average ``clean_price(item)`` per ``item["id"]`` over the matching
       ``jdprice`` table(s).
    2. Load the latest ``month20xxxx`` collection as last month's result.
    3. Load cate_id / brand_id / ziying per skuid from the ``jdskuid...retry``
       tables.
    4. For every skuid read from the ``jdcomment...retry`` tables, build a
       result row and tag it with a numeric ``type`` recording which sources
       supplied its price and category.
    5. Carry over last-month skuids that step 4 did not touch.
    6. Insert every row that has a ``cate_id`` into ``month<this_month>`` and
       create the ``jdprice<current_date>_sep`` collection.

    Relies on names defined outside this function: ``op``, ``clean_price``,
    ``format_cat_id``, ``timeUtil`` and ``current_date``.
    """
    with op.DBManger() as m:
        # Used for step 1 only; the name is rebound in steps 3 and 4.
        pipeline = [{
            "$match": {
                "_status": 0
            },
        }]
        # skuid (int) -> {"prices": (row_count, price_sum)} while accumulating.
        price_dic = {}
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdprice20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdprice(20\d\d\d\d\d\d)$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdprice20201209_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdprice(20210129)$"
                }}):
            # Only tables whose name sorts after the separator collection
            # (or all of them when no separator was found).
            if not last_sep or table > last_sep:
                print("step 1: processing {}".format(table), flush=True)
                for item in m.read_from(db_collect=("jingdong", table),
                                        pipeline=pipeline):
                    if int(item["id"]) in price_dic:
                        tmp = price_dic[int(item["id"])]
                        tmp["prices"] = (tmp["prices"][0] + 1,
                                         tmp["prices"][1] + clean_price(item))
                    else:
                        price_dic[int(item["id"])] = {
                            "prices": (1, clean_price(item))
                        }
        # Collapse (count, sum) into the mean price, rounded to 2 decimals.
        for skuid in price_dic:
            tmp = price_dic[int(skuid)]
            tmp["clean_price"] = round(tmp["prices"][1] / tmp["prices"][0], 2)
            tmp.pop("prices")
        # NOTE: this is an alias, not a copy. Every key later added through
        # result_dic is also visible through price_dic, so the
        # `in price_dic` tests in step 4 see rows created earlier in step 4.
        result_dic = price_dic

        #skuids in last result
        last_month_skuids = {}
        last_result = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^month20\d\d\d\d$"
            }})
        print("step 2: processing {}".format(last_result), flush=True)
        # Last six characters of the collection name, i.e. the 20xxxx part
        # matched by the pattern above.
        last_month = last_result[-6:]
        for skuid, comments, price, cate_id, brand_id, ziying in m.read_from(
                db_collect=("jingdong", last_result),
                out_field=("skuid", "comments", "clean_price", "cate_id",
                           "brand_id", "ziying")):
            # Rows with a falsy cate_id are ignored.
            if cate_id:
                last_month_skuids[int(skuid)] = {
                    "clean_price": price,
                    "comments": comments,
                    "cate_id": format_cat_id(cate_id),
                    "brand_id": brand_id,
                    "ziying": ziying
                }

        # skuid (int) -> {"cate_id", "brand_id", "ziying"} for this month.
        skuid_sukid_dict = {}
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdskuid(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdskuid20201214_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdskuid(20210108)retry\d*$"
                }}):
            if not last_sep or table > last_sep:
                print("step 3: processing {}".format(table), flush=True)
                pipeline = [
                    {
                        "$match": {
                            "_status": 0
                        }
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "cate_id": "$cate_id",
                            "brand_id": "$brand_id",
                            "ziying": "$ziying",
                        }
                    },
                ]
                for skuid, cate_id, brand_id, ziying in m.read_from(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "cate_id", "brand_id", "ziying"),
                        pipeline=pipeline):
                    # A later row for the same skuid overwrites the earlier one.
                    skuid_sukid_dict[int(skuid)] = {
                        "cate_id": cate_id,
                        "brand_id": "0" if brand_id is None else brand_id,
                        "ziying": ziying
                    }

        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdcomment20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdcomment(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdcomment20201218_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdcomment(20210302)retry\d*$"
                }}):
            if not last_sep or table > last_sep:
                print("step 4: processing {}".format(table), flush=True)
                # NOTE(review): "$gt" compares against the string "0" here,
                # while the commented-out variant used the integer 0 -- confirm
                # which type the "comment" field is stored as.
                pipeline = [
                    {
                        "$match": {
                            #"$and": [{"_status": 0}, {"comment": {"$gt": 0}}]
                            "$and": [{
                                "_status": 0
                            }, {
                                "comment": {
                                    "$gt": "0"
                                }
                            }]
                        }
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "comment": "$comment",
                        }
                    },
                ]
                # "type" values written below:
                #   0 / 1 / 2  skuid is in skuid_sukid_dict; price taken from
                #              price_dic / last month / the 79.90 fallback
                #   3 / 4      skuid is only in last_month_skuids; price taken
                #              from price_dic / last month
                #   5          see the unreachable branch noted below
                #   6          skuid is in neither lookup
                for skuid, comments in m.read_from_yield(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "comment"),
                        pipeline=pipeline):
                    if int(skuid) in skuid_sukid_dict:
                        if int(skuid) in price_dic:
                            price_item = result_dic[int(skuid)]
                            # Same dict on both sides (alias), so this
                            # re-assigns the value to itself.
                            price_item["clean_price"] = price_dic[int(
                                skuid)]["clean_price"]
                            price_item["comments"] = int(comments)
                            price_item["type"] = 0
                        elif int(skuid) in last_month_skuids:
                            last_month_price_item = last_month_skuids[int(
                                skuid)]
                            if int(skuid) not in result_dic:
                                result_dic[int(skuid)] = {}
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = last_month_price_item[
                                "clean_price"]
                            price_item["comments"] = int(comments)
                            price_item["type"] = 1
                        else:
                            result_dic[int(skuid)] = {}
                            price_item = result_dic[int(skuid)]
                            # NOTE(review): 79.90 looks like a placeholder
                            # price for skuids with no known price -- confirm.
                            price_item["clean_price"] = 79.90
                            price_item["comments"] = int(comments)
                            price_item["type"] = 2
                        skuid_sukid_item = skuid_sukid_dict[int(skuid)]
                        price_item["cate_id"] = skuid_sukid_item["cate_id"]
                        price_item["brand_id"] = skuid_sukid_item["brand_id"]
                        price_item["ziying"] = skuid_sukid_item["ziying"]
                    elif int(skuid) in last_month_skuids:
                        if int(skuid) in price_dic:
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = price_dic[int(
                                skuid)]["clean_price"]
                            price_item["comments"] = int(comments)
                            price_item["type"] = 3
                        # Repeats the enclosing elif's test, so it is always
                        # true when reached.
                        elif int(skuid) in last_month_skuids:
                            last_month_price_item = last_month_skuids[int(
                                skuid)]
                            if int(skuid) not in result_dic:
                                result_dic[int(skuid)] = {}
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = last_month_price_item[
                                "clean_price"]
                            price_item["comments"] = int(comments)
                            price_item["type"] = 4
                        else:
                            # Unreachable: the elif above cannot be false here.
                            result_dic[int(skuid)] = {}
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = 79.90
                            price_item["comments"] = int(comments)
                            price_item["type"] = 5
                        last_month_skuids_item = last_month_skuids[int(skuid)]
                        price_item["cate_id"] = last_month_skuids_item[
                            "cate_id"]
                        price_item["brand_id"] = last_month_skuids_item[
                            "brand_id"]
                        price_item["ziying"] = last_month_skuids_item["ziying"]
                    else:
                        # Unknown skuid: any existing row is replaced with
                        # default category / brand / ziying values.
                        result_dic[int(skuid)] = {}
                        price_item = result_dic[int(skuid)]
                        price_item["clean_price"] = 79.90
                        price_item["comments"] = int(comments)
                        price_item["cate_id"] = "0,0,0"
                        price_item["brand_id"] = "0"
                        price_item["ziying"] = "-1"
                        price_item["type"] = 6
        print(
            "step 5: processing skuid in last_month_skuids but not in result_dic",
            flush=True)
        for skuid in last_month_skuids:
            if int(skuid) not in result_dic:
                # type 7: no row yet; last month's price and comment count are
                # copied, the category fields get default values.
                result_dic[int(skuid)] = {}
                price_item = result_dic[int(skuid)]
                price_item["clean_price"] = last_month_skuids[skuid][
                    "clean_price"]
                price_item["comments"] = last_month_skuids[skuid]["comments"]
                price_item["cate_id"] = "0,0,0"
                price_item["brand_id"] = "0"
                price_item["ziying"] = "-1"
                price_item["type"] = 7
            else:
                price_item = result_dic[int(skuid)]
                # type 8: a row from step 1 that step 4 never tagged. All
                # fields, including the clean_price computed in step 1, are
                # overwritten with last month's values.
                if 'type' not in price_item:
                    price_item["clean_price"] = last_month_skuids[skuid][
                        "clean_price"]
                    price_item["comments"] = last_month_skuids[skuid][
                        "comments"]
                    price_item["cate_id"] = last_month_skuids[skuid]["cate_id"]
                    price_item["brand_id"] = last_month_skuids[skuid][
                        "brand_id"]
                    price_item["ziying"] = last_month_skuids[skuid]["ziying"]
                    price_item["type"] = 8

        # presumably the month following last_month -- verify against
        # timeUtil.get_month
        this_month = timeUtil.get_month(deltamonth=1, current_month=last_month)
        out_table = "month" + this_month
        print("step 6: processing writing result to {}".format(out_table),
              flush=True)
        buffer = []
        buffer_size = 5000
        print("result_dic:{}".format(len(result_dic)), flush=True)
        for i, k in enumerate(result_dic):
            result_dic[k]["skuid"] = k
            if "prices" in result_dic[k]:
                result_dic[k].pop("prices")
            result_dic[k]["month"] = this_month
            # Rows without a cate_id are printed instead of being inserted.
            if "cate_id" in result_dic[k]:
                buffer.append(result_dic[k])
            else:
                print(result_dic[k])
            # Flush on every buffer_size-th index; this includes i == 0, so
            # the first flush can hold a single row.
            if i % buffer_size == 0 and buffer:
                m.insert_many_dict(db_collect=("jingdong", out_table),
                                   data_dict_list=buffer)
                buffer = []
        # Insert whatever is left after the last flush.
        if buffer:
            m.insert_many_dict(db_collect=("jingdong", out_table),
                               data_dict_list=buffer)
        # Create the separator collection for this run's date.
        m.create_db_collection(
            db_collection=("jingdong", "jdprice{0}_sep".format(current_date)))
Exemple #28
0
 def __init__(self):
     """Initialise the counters and writer, then start the receiver thread.

     Sets ``self.num`` to 0, ``self.state`` to ``None`` and ``self.writer``
     to a new ``op.DBManger()``, then starts ``self.receive_ation`` in a
     background thread with ``None`` as its single argument.
     """
     self.num = 0
     self.state = None
     # Assign every attribute before the thread exists: started earlier, the
     # thread could run while ``self.writer`` was still unset.
     self.writer = op.DBManger()
     thread.start_new_thread(self.receive_ation, (None, ))
Exemple #29
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from mongo import op
import os
import glob
from multiprocess.tools import timeUtil
# Work from the directory that holds the monthly export file.
os.chdir("/home/u9000/martingale/jd_month/")
current_date = timeUtil.current_time()

# (column index in the tab-separated file, field name) pairs.
# presumably column i is stored under field i -- verify against
# load_file_to_db
_COLUMN_FIELDS = (
    (0, "skuid"),
    (14, "comment"),
    (28, "price"),
    (29, "cate_id"),
    (30, "brand_id"),
    (31, "ziying"),
)

# Load the file "month202006" into jingdong.month202006, attaching
# {"_month": 202006} via attach_dict.
with op.DBManger() as db:
    db.load_file_to_db(
        filename="month202006",
        db_collect=("jingdong", "month202006"),
        sep="\t",
        buffer_size=128,
        column_index_list=[index for index, _ in _COLUMN_FIELDS],
        fields_tupe=tuple(field for _, field in _COLUMN_FIELDS),
        attach_dict={"_month": 202006})
Exemple #30
0
 def init_start_urls(self):
     """Clear the Redis queues and seed the price crawl with new skuids.

     Collects the skuids of rows with ``_status`` 0 and ``comment`` > 0 from
     the ``jdcomment...retry`` tables that sort after the latest
     ``jdcomment..._sep`` collection, then removes every skuid already
     present in the latest ``summary_201905_20xxxx`` collection. The
     remaining skuids are joined in groups of up to 60 with ``'%2CJ_'``,
     each group is wrapped in ``Seed(value=..., type=0)``, and the
     stringified seeds are added to ``self.start_urls_redis_key`` in
     shuffled batches of at most 10000.

     When no new skuid remains, nothing is pushed. The previous version
     raised ``UnboundLocalError`` in that case.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     group_size = 60  # skuids joined into one seed value
     batch_size = 10000  # seeds sent per SADD call
     with op.DBManger() as m:
         m.create_db_collection(
             db_collection=("jingdong",
                            "jdpricemiss{0}_sep".format(current_date)))
         pipeline = [
             {
                 "$match": {
                     "$and": [{
                         "_status": 0
                     }, {
                         "comment": {
                             "$gt": 0
                         }
                     }]
                 }
             },
             {
                 "$project": {
                     "skuid": "$skuid",
                 }
             },
         ]
         skuid_set = set()
         last_sep = m.get_lasted_collection(
             "jingdong",
             filter={"name": {
                 "$regex": r"^jdcomment20\d\d\d\d\d\d_sep"
             }})
         for table in m.list_tables(
                 dbname="jingdong",
                 filter={
                     "name": {
                         "$regex": r"^jdcomment(20\d\d\d\d\d\d)retry\d*$"
                     }
                 }):
             # Only tables whose name sorts after the separator collection
             # (or all of them when no separator was found).
             if not last_sep or table > last_sep:
                 self.logger.info("valid table : {}".format(table))
                 for item in m.read_from(db_collect=("jingdong", table),
                                         out_field=("skuid", ),
                                         pipeline=pipeline):
                     skuid_set.add(int(item[0]))
         # Skuids already present in the last result are not crawled again.
         known_skuids = set()
         last_result = m.get_lasted_collection(
             "jingdong",
             filter={"name": {
                 "$regex": r"^summary_201905_20\d\d\d\d$"
             }})
         for item in m.read_from(db_collect=("jingdong", last_result),
                                 out_field=("skuid", )):
             known_skuids.add(int(item[0]))
         skuid_set = skuid_set - known_skuids
         self.logger.info(
             "total new skuid of comment larger than 0 is: {}".format(
                 len(skuid_set)))

         # One seed per group of up to `group_size` skuids. Slicing yields no
         # groups for an empty set, so no variable is left unbound.
         skuids = [str(skuid) for skuid in skuid_set]
         seeds = [
             str(Seed(value='%2CJ_'.join(skuids[start:start + group_size]),
                      type=0))
             for start in range(0, len(skuids), group_size)
         ]
         for start in range(0, len(seeds), batch_size):
             batch = seeds[start:start + batch_size]
             random.shuffle(batch)
             self.redis.sadd(self.start_urls_redis_key, *batch)