def insert2companyAll():
    """Copy newly-seen companies into the cumulative 'companyALL' collection.

    Diffs the ids present in the current company collection against the ids
    already stored in 'companyALL', looks the new ones up in the most recent
    monthly 51job "company_YYYYMMDD" table, and inserts them keyed by
    '_id' = company id.
    """
    from mongo import op
    new_company = []
    new_ids = set(db[companyCol].distinct('id'))
    old_ids = set(db['companyALL'].distinct('_id'))
    insert_ids = list(new_ids - old_ids)
    with op.DBManger() as m:
        # newest "company_YYYYMMDD" table holds the full company records
        table = m.get_lasted_collection(
            "51job",
            filter={"name": {"$regex": r"company_20\d\d\d\d\d\d"}})
        print(table)
        new_items = db[table].find({'id': {'$in': insert_ids}})
        for item in new_items:
            new_company.append({
                '_id': item['id'],
                'name': item['name'],
                'industry': item['industry'],
                'location': item['location'],
                'compkind': item['compkind'],
                'size': item['size'],
            })
        # Fix: insert_many() replaces the deprecated Collection.insert();
        # guard the empty case because inserting [] raises InvalidOperation.
        if new_company:
            db['companyALL'].insert_many(new_company)
        print('new company insert successful!')
def init_start_urls(self):
    """Seed the redis start-urls set with (cate_id, brand_id) pairs read
    from all jdbrand retry tables newer than the last '_sep' marker.

    NOTE(review): the marker created here is named "jdskuid…_sep" while the
    tables read are "jdbrand…" — looks copy-pasted; confirm intended.
    """
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    buffer = []
    buffer_size = 1024
    with op.DBManger() as m:
        # marker collection: separates this run's tables from older ones
        m.create_db_collection(db_collection=("jingdong", "jdskuid{0}_sep".format(current_date)))
        last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdbrand20\d\d\d\d\d\d_sep$"}})
        seed_set = set()
        for table in m.list_tables("jingdong", filter={"name": {"$regex": r"^jdbrand20\d\d\d\d\d\dretry\d*$"}}):
            # only tables created after the last separator are still pending
            if not last_sep or table > last_sep:
                self.logger.info("valid table : {}".format(table))
                pipeline = [
                    {"$match": {
                        "$and": [{"_status": 0}, {"$or": [{"status": 0}, {"status": -1}]}]
                    }}
                ]
                for seed in m.read_from(db_collect=("jingdong", table),
                                        out_field=("cate_id", "brand_id"),
                                        pipeline=pipeline):
                    seed_set.add(seed)
        for i, seed in enumerate(seed_set):
            seed = Seed(value=seed, type=0)
            buffer.append(str(seed))
            # flush to redis in chunks of buffer_size
            if len(buffer) % buffer_size == 0:
                self.redis.sadd(self.start_urls_redis_key, *buffer)
                buffer = []
        if buffer:
            self.redis.sadd(self.start_urls_redis_key, *buffer)
def init_clean_price(self):
    """Rebuild secoo 'CleanListNew' from every monthly List table.

    Later tables overwrite earlier ones, so each pid ends up with the
    price / self flag from the most recent table mentioning it.
    """
    from mongo import op
    from tqdm import tqdm
    pipeline = [{"$match": {"pid": {"$ne": "null"}}}]
    with op.DBManger() as m:
        latest = {}
        m.drop_db_collect(db_collect=("secoo", "CleanListNew"))
        monthly_tables = m.list_tables(
            "secoo", filter={"name": {"$regex": r"List20\d\d\d\d\d\d"}})
        for collection in tqdm(monthly_tables, desc="init_clean_price"):
            rows = m.read_from(db_collect=("secoo", collection),
                               out_field=("pid", "price", "self"),
                               pipeline=pipeline)
            for pid, price, is_self in rows:
                latest[pid] = (price, is_self)
        # one output row per pid: (_id, pid, price, self)
        rows_out = [(pid, pid, price, is_self)
                    for pid, (price, is_self) in latest.items()]
        m.date_tuple_to_db(date_tuple_list=rows_out,
                           db_collect=("secoo", "CleanListNew"),
                           fields_tupe=("_id", "pid", "price", "self"),
                           buffer_size=1024,
                           attach_dict={"_date": self.current_date})
def __init__(self, **kwargs):
    """Queue price-miss seeds and compile response-parsing regexes.

    Skuids from the latest summary table are joined into groups of 60
    ('%2CJ_'-separated) per request seed.
    """
    super(JDPriceMiss, self).__init__(**kwargs)
    self.ua = UserAgent()
    with op.DBManger() as m:
        table = m.get_lasted_collection(
            "jingdong",
            filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
        skuid_set = set()
        for item in m.read_from(db_collect=("jingdong", table),
                                out_field=("skuid",)):
            skuid_set.add(item[0])
        # Fix: initialise strr so an empty skuid_set no longer raises
        # NameError at the trailing `if strr:` check.
        strr = ''
        for i, seed in enumerate(skuid_set):
            current = seed.strip()
            if i % 60 == 0:
                # flush the previous 60-id batch before starting a new one
                if i != 0:
                    self.seeds_queue.put(Seed(strr, kwargs["retries"]))
                strr = current
            else:
                strr = strr + '%2CJ_' + current
        if strr:
            self.seeds_queue.put(Seed(strr, kwargs["retries"]))
    # regexes used to pull price blocks out of the JD price response
    self.block_pattern = re.compile(r'{.*?}')
    self.innerid_pattern = re.compile(r'\d+')
    self.innerprice_pattern = re.compile(r'"\d+.\d+"')
    self.op_pattern = re.compile(r'"op":"(\d+.\d+)"')
    self.p_pattern = re.compile(r'"(\d+.\d+)"')
    self.p2_pattern = re.compile(r'"(-\d+.\d+)"')
    self.p1 = re.compile(r'"id":.*?"}')
    self.id_pattern = re.compile(r'id:"J_(\d+)"')
    self.first_pattern = re.compile(r'([a-zA-Z]*)":')
    # random pseudo user id sent with requests
    self.rid = random.randint(100000000, 999999999)
    self.usrid = str(self.rid)
    self.up_pattern = re.compile('"up":"tpp"')
    self.price_pattern = re.compile(r'^\d+\.\d\d$')
def init_start_urls(self):
    """Re-seed redis from seeds whose _status is 3 in the last retry
    collection; exit the process if there is nothing left to retry."""
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    buffer = []
    buffer_size = 1024
    with op.DBManger() as m:
        pipeline = [
            {
                "$match": {
                    "_status": 3
                }
            },
        ]
        data_set = collections.DataSet(
            m.read_from(db_collect=("jingdong", self.last_retry_collect),
                        out_field=("_seed", "_status"),
                        pipeline=pipeline))
        should_exit = True
        for i, (seed, status) in enumerate(data_set.distinct()):
            should_exit = False  # at least one seed still needs a retry
            seed = Seed(value=seed, type=3)
            buffer.append(str(seed))
            # flush to redis in chunks of buffer_size
            if len(buffer) % buffer_size == 0:
                self.redis.sadd(self.start_urls_redis_key, *buffer)
                buffer = []
        if buffer:
            self.redis.sadd(self.start_urls_redis_key, *buffer)
        if should_exit:
            # nothing to retry this round: stop the whole process
            import sys
            sys.exit(0)
def init_start_urls(self):
    """Scrape the chaoshi.jd.com landing page's navigation data and seed
    redis with the third-level category URLs."""
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    buffer_size = 1024
    with op.DBManger() as m:
        # marker collection for this run
        m.create_db_collection(
            db_collection=("jingdong", "jdchaoshi{0}_sep".format(current_date)))
        buffer = []
        import requests
        request = {
            "url": "https://chaoshi.jd.com/",
            "headers": {
                'Connection': 'close',
                "Referer": "https://www.jd.com",
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
            }
        }
        # navThird1..navThird9 JS array literals embedded in the page source
        nav_pattern = re.compile(r'navThird[1-9]: (\[.*\])')
        import time
        from ast import literal_eval
        src = requests.get(**request).text
        for i in nav_pattern.findall(src):
            # the JS array literal parses as a Python literal
            for j in literal_eval(i):
                for k in j["children"]:
                    # strip escaping backslashes from the URL
                    seed = Seed(value=k["URL"].replace("\\", ""), type=0)
                    buffer.append(str(seed))
                    if len(buffer) % buffer_size == 0:
                        self.redis.sadd(self.start_urls_redis_key, *buffer)
                        buffer = []
        if buffer:
            self.redis.sadd(self.start_urls_redis_key, *buffer)
def reader(): global queue # ids = db["companyIds"].distinct("_id") from mongo import op with op.DBManger() as m: last_company_ids = m.get_lasted_collection( "liepin", filter={"name": { "$regex": r"CompanyIdLib_20\d\d\d\d\d\d" }}) #ids = db['CompanyIdLib_20200617'].distinct('_id') ids = db[last_company_ids].distinct('_id') url = "https://www.liepin.com/company/sojob.json" for i, id in enumerate(ids): if i % 10000 == 0: print time.strftime('%Y-%m-%d %H:%M:%S Loading...'), i formdata = { "ecompIds": id, "pageSize": "15", "curPage": 0, "keywords": "", "dq": "", "deptId": "", } task = {"url": url, "formdata": formdata} queue.put(task) maxCount = queue.qsize() print time.strftime('%Y-%m-%d %H:%M:%S Loading complete'), maxCount
def clean_price(self):
    """Roll 'CleanListNew' into 'CleanListOld', merge in this month's List
    table, and rebuild 'CleanListNew' with the latest price per pid."""
    from mongo import op
    pipeline = [{"$match": {"pid": {"$ne": None}}}]
    with op.DBManger() as m:
        dic = {}
        m.drop_db_collect(db_collect=("secoo", "CleanListOld"))
        m.rename_collection(old_db_collection=("secoo", "CleanListNew"),
                            new_db_collection=("secoo", "CleanListOld"))
        # start from last run's data ...
        for pid, price, self1 in m.read_from(db_collect=("secoo", "CleanListOld"),
                                             out_field=("pid", "price", "self")):
            dic.update({pid: (price, self1)})
        # ... then overwrite with the current month's scrape
        for pid, price, self1 in m.read_from(
                db_collect=("secoo", "List" + self.current_date),
                out_field=("pid", "price", "self"),
                pipeline=pipeline):
            dic.update({pid: (price, self1)})
        # rows: (_id, pid, price, self)
        date_tuple_list = []
        for k, (p, s) in dic.items():
            date_tuple_list.append((k, k, p, s))
        m.date_tuple_to_db(date_tuple_list=date_tuple_list,
                           db_collect=("secoo", "CleanListNew"),
                           fields_tupe=("_id", "pid", "price", "self"),
                           buffer_size=128,
                           attach_dict={"_date": self.current_date},
                           show_pbar=True,
                           pbar_name="clean_price")
def __init__(self, *args, **kwargs):
    """Hard-wire the retry source/target collection names for this run:
    read misses from 'jdprice_miss_out', write to 'jdprice_miss_out_retry'."""
    super(RetryMaster, self).__init__(*args, **kwargs)
    with op.DBManger() as m:
        self.last_retry_collect = 'jdprice_miss_out'
        self.new_retry_collect = 'jdprice_miss_out_retry'
        self.logger.info((self.last_retry_collect, self.new_retry_collect))
    self.out_table = self.new_retry_collect
def __init__(self, address, verbose=True):
    """Start a UDP action-listener and open a persistent DB writer.

    :param address: Socket address (Ip address, Port)
    :type address: tuple
    :param verbose: print a shutdown message when the listener stops
    """
    self.ip, self.port = address
    self.verbose = verbose
    # current action state; mutated by the background listener thread below
    self.state = None
    self.writer = op.DBManger()
    self.lock = threading.Lock()

    def receive_ation(passss):
        # Background loop: listen on UDP port 10053 and mirror the last
        # received datagram into self.state (b'None' resets it to None).
        try:
            action_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            action_socket.bind((self.ip, 10053))
            action_socket.settimeout(100000)
            while True:
                (data, client_address
                 ) = action_socket.recvfrom(DEFAULT_BUFFER_SIZE)
                print('change state from {} to {}'.format(
                    self.state, data))
                if data == b'None':
                    self.state = None
                else:
                    self.state = data
        except KeyboardInterrupt:
            if self.verbose:
                print('\rAction Server Shutdown!')
        finally:
            action_socket.close()

    thread = threading.Thread(target=receive_ation, args=(None, ))
    thread.start()
def __init__(self, **kwargs):
    """Queue comment-detail seeds for the top-K skuids (by last month's
    comment count) from the newest summary table.

    NOTE(review): TopK(1) and the {"$limit": 100} stage look like
    debug-sized values — confirm before a production run.
    """
    super(JDPrice, self).__init__(**kwargs)
    self.ua = UserAgent()
    with op.DBManger() as m:
        # marker collection separating this month's task (original comment
        # in Chinese: temp table as the boundary of this month's task)
        m.create_db_collection(db_collection=("jingdong", "jdcommentdetail{0}_sep".format(current_date)))
        skuid_set = {}
        top1000w = TopK(1)
        # skuids in last result
        last_result = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
        pipeline = [
            {
                "$project": {
                    "skuid": "$skuid",
                    # last_result[-6:] is the YYYYMM suffix of the table name
                    "comment_{}".format(last_result[-6:]): "$comment_{}".format(last_result[-6:])
                }
            },
            {"$limit": 100}
        ]
        for item, comments in m.read_from(db_collect=("jingdong", last_result),
                                          out_field=("skuid", "comment_{}".format(last_result[-6:])),
                                          pipeline=pipeline):
            if int(item) not in skuid_set:
                top1000w.push(int(comments))
                skuid_set[int(item)] = int(comments)
        top1000w = set(top1000w.get_topk())
        for i, seed in enumerate(skuid_set):
            # only enqueue skuids whose comment count made the top-K cut
            if skuid_set[seed] in top1000w:
                seed = Seed(value=seed, type=0)
                self.seeds_queue.put(seed)
def run_result():
    """Copy (shopid, shop_name) pairs with a non-empty name from
    'zhaixun_shop' into 'zhaixun_shop_1', deduplicated by shopid."""
    with op.DBManger() as m:
        rows = m.read_from(db_collect=("jingdong", "zhaixun_shop"),
                           out_field=("shopid", "shop_name"))
        named_shops = {shop_id: (shop_id, name)
                       for shop_id, name in rows if name}
        m.date_tuple_to_db(date_tuple_list=list(named_shops.values()),
                           db_collect=("jingdong", "zhaixun_shop_1"),
                           fields_tupe=("shopid", "shop_name"))
def run_result():
    """Join brand names and skuid metadata onto the month202007 table and
    write the merged rows to 'month_zx_202007'."""
    with op.DBManger() as m:
        # brand_id -> brand_name lookup from the brand retry table
        brandid2name = {}
        for brand_id, brand_name in m.read_from(db_collect=("jingdong", "jdbrand20210108retry0"),
                                                out_field=("brand_id", "name")):
            if brand_id:
                brandid2name[int(brand_id)] = brand_name
        skuid_sukid_dict = {}
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdskuid(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20201214_sep"}})
        tables = m.list_tables(dbname="jingdong", filter={"name": {"$regex": r"^jdskuid(20210108)retry\d*$"}})
        tables.extend(m.list_tables(dbname="jingdong", filter={"name": {"$regex": r"^jdskuid(20201214)retry\d*$"}}))
        for table in tables:
            # only tables newer than the separator are still pending
            if not last_sep or table > last_sep:
                print("step 1: processing {}".format(table), flush=True)
                pipeline = [
                    {
                        "$match": {"_status": 0}
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "cate_id": "$cate_id",
                            "brand_id": "$brand_id",
                            "shopid": "$shopid",
                            "shop_name": "$shop_name",
                            "title": "$title",
                        }
                    },
                ]
                for skuid, cate_id, brand_id, shopid, shop_name, title in m.read_from(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "cate_id", "brand_id", "shopid", "shop_name", "title"),
                        pipeline=pipeline):
                    skuid_sukid_dict[int(skuid)] = {"cate_id": cate_id,
                                                    "brand_id": "0" if brand_id is None else brand_id,
                                                    "shopid": shopid,
                                                    "shop_name": shop_name,
                                                    "title": title}
        #skuids in last result
        buffer = []
        buffer_size = 5000
        last_result = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^month202007$"}})
        print("step 2: processing {}".format(last_result), flush=True)
        count = 0
        out_table = "month_zx_" + "202007"
        for skuid, comments, price, cate_id, brand_id, month in m.read_from(
                db_collect=("jingdong", last_result),
                out_field=("skuid", "comments", "clean_price", "cate_id", "brand_id", "month")):
            count = count + 1
            if int(skuid) not in skuid_sukid_dict:
                # NOTE(review): int(brand_id) raises if brand_id is None here
                # — presumably month202007 rows always carry one; verify.
                tmp = {"skuid": int(skuid), "clean_price": price,
                       "comments": comments, "cate_id": format_cat_id(cate_id),
                       "brand_id": brand_id, "brand_name": brandid2name.get(int(brand_id)),
                       "month": month}
            else:
                # enrich with shop/title metadata from the jdskuid tables
                item = skuid_sukid_dict[int(skuid)]
                tmp = {"skuid": int(skuid), "clean_price": price,
                       "comments": comments,
                       "cate_id": format_cat_id(cate_id) if cate_id else None,
                       "brand_id": brand_id, "brand_name": brandid2name.get(int(brand_id)),
                       "month": month,
                       "shopid": item["shopid"], "shop_name": item["shop_name"], "title": item["title"]}
            buffer.append(tmp)
            # flush every buffer_size rows
            if count % buffer_size == 0 and buffer:
                m.insert_many_dict(db_collect=("jingdong", out_table),
                                   data_dict_list=buffer)
                buffer = []
        if buffer:
            m.insert_many_dict(db_collect=("jingdong", out_table),
                               data_dict_list=buffer)
def run_result():
    """Collect (shopid, shop_name) pairs from all jdskuid retry tables of
    four scrape dates and write them, deduplicated by shopid, to
    'zhaixun_shop'. Rows with a shopid but no name are kept with None."""
    with op.DBManger() as m:
        shop_dict = {}
        tables = []
        # same regex shape for each scrape date, concatenated identically
        for scrape_date in ("20210108", "20201214", "20200821", "20200920"):
            tables.extend(m.list_tables(
                dbname="jingdong",
                filter={"name": {"$regex": r"^jdskuid(" + scrape_date + r")retry\d*$"}}))
        for table in tables:
            print("step 1: processing {}".format(table), flush=True)
            pipeline = [
                {
                    "$match": {
                        "_status": 0
                    }
                },
                {
                    "$project": {
                        "shopid": "$shopid",
                        "shop_name": "$shop_name",
                    }
                },
            ]
            rows = m.read_from(db_collect=("jingdong", table),
                               out_field=("shopid", "shop_name"),
                               pipeline=pipeline)
            for shop_id, shop_name in rows:
                if shop_name:
                    shop_dict[shop_id] = (shop_id, shop_name)
                elif shop_id:
                    shop_dict[shop_id] = (shop_id, None)
        #skuids in last result
        m.date_tuple_to_db(date_tuple_list=list(shop_dict.values()),
                           db_collect=("jingdong", "zhaixun_shop"),
                           fields_tupe=("shopid", "shop_name"))
def init_start_urls(self):
    """Batch skuids from 'jdprice_miss_seed' into 60-id price-query seeds
    ('%2CJ_'-separated) and push them to redis shuffled in chunks of 10000."""
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    with op.DBManger() as m:
        pipeline = [
            {
                "$match": {
                    "skuid": {
                        "$ne": None
                    }
                }
            },
        ]
        skuid_set = set()
        for item in m.read_from(db_collect=("jingdong", "jdprice_miss_seed"),
                                out_field=("skuid", ),
                                pipeline=pipeline):
            skuid_set.add(int(item[0]))
        self.logger.info(
            "total new skuid of comment larger than 0 is: {}".format(
                len(skuid_set)))
        buffer = []
        # Fix: initialise strr so an empty skuid_set no longer raises
        # NameError at the `if strr:` check below.
        strr = ''
        for i, seed in enumerate(skuid_set):
            seed = str(seed)
            current = seed.strip()
            if i % 60 == 0:
                # flush the previous 60-id batch before starting a new one
                if i != 0:
                    seed = Seed(value=strr, type=0)
                    buffer.append(str(seed))
                strr = current
            else:
                strr = strr + '%2CJ_' + current
        if strr:
            seed = Seed(value=strr, type=0)
            buffer.append(str(seed))
        if buffer:
            buffer1 = []
            buffer_size = 10000
            for i, seed in enumerate(buffer):
                buffer1.append(str(seed))
                if len(buffer1) % buffer_size == 0:
                    random.shuffle(buffer1)
                    self.redis.sadd(self.start_urls_redis_key, *buffer1)
                    buffer1 = []
            if buffer1:
                random.shuffle(buffer1)
                self.redis.sadd(self.start_urls_redis_key, *buffer1)
def init_start_urls(self):
    """Push the top-1M skuids (ranked by last month's comment count) from
    the newest summary table into redis as comment-detail seeds."""
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    with op.DBManger() as m:
        # marker collection separating this month's task (original comment
        # in Chinese: temp table as the boundary of this month's task)
        m.create_db_collection(
            db_collection=("jingdong",
                           "jdcommentdetail{0}_sep".format(current_date)))
        skuid_set = {}
        top1000w = TopK(1000000)
        #skuids in last result
        last_result = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^summary_201905_20\d\d\d\d$"
            }})
        pipeline = [
            {
                "$project": {
                    "skuid": "$skuid",
                    # last_result[-6:] is the YYYYMM suffix of the table name
                    "comment_{}".format(last_result[-6:]):
                    "$comment_{}".format(last_result[-6:])
                }
            },
            #{"$limit": 1000}
        ]
        for item, comments in m.read_from(
                db_collect=("jingdong", last_result),
                out_field=("skuid", "comment_{}".format(last_result[-6:])),
                pipeline=pipeline):
            if int(item) not in skuid_set:
                top1000w.push(int(comments))
                skuid_set[int(item)] = int(comments)
        top1000w = set(top1000w.get_topk())
        buffer = []
        buffer_size = 10000
        for i, seed in enumerate(skuid_set):
            # only skuids whose comment count made the top-K heap
            if skuid_set[seed] in top1000w:
                seed = Seed(value=seed, type=0)
                buffer.append(str(seed))
                if len(buffer) % buffer_size == 0:
                    random.shuffle(buffer)
                    self.redis.sadd(self.start_urls_redis_key, *buffer)
                    buffer = []
        if buffer:
            random.shuffle(buffer)
            self.redis.sadd(self.start_urls_redis_key, *buffer)
def __init__(self, *args, **kwargs):
    """Derive the next jdsearch retry collection name from the newest one:
    '...retryN' becomes '...retryN+1'; a name without a retry suffix gets
    'retry1' appended."""
    super(RetryMaster, self).__init__(*args, **kwargs)
    with op.DBManger() as m:
        self.last_retry_collect = m.get_lasted_collection(
            "jingdong",
            filter={
                "name": {
                    "$regex": r"^jdsearch20\d\d\d\d\d\dretry\d+$"
                }
            })
        marker = self.last_retry_collect.find("retry")
        if marker == -1:
            # no retry suffix yet: start the sequence at retry1
            self.new_retry_collect = self.last_retry_collect + "retry1"
        else:
            # bump the round number after the "retry" marker
            prefix = self.last_retry_collect[:marker + 5]
            next_round = int(self.last_retry_collect[marker + 5:]) + 1
            self.new_retry_collect = prefix + str(next_round)
        self.logger.info((self.last_retry_collect, self.new_retry_collect))
def __init__(self, **kwargs):
    """Queue distinct (cate_id, brand_id, name) search seeds from the latest
    brand table and pre-compile the result-page regexes."""
    super(GetProductId, self).__init__(**kwargs)
    self.retries = 3
    self.proxies = HttpProxy.getHttpProxy()
    self.ua = UserAgent()
    with op.DBManger() as m:
        last_brand_collect = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^brand20\d\d\d\d\d\d$"
            }})
        # keep only fully-populated, unprocessed (_status == 0) rows
        pipeline = [{
            "$match": {
                "cate_id": {
                    "$ne": None
                }
            }
        }, {
            "$match": {
                "brand_id": {
                    "$ne": None
                }
            }
        }, {
            "$match": {
                "name": {
                    "$ne": None
                }
            }
        }, {
            "$match": {
                "_status": 0
            }
        }]
        data_set = collections.DataSet(
            m.read_from(db_collect=("jingdong", last_brand_collect),
                        out_field=("cate_id", "brand_id", "name"),
                        pipeline=pipeline))
        for i, seed in enumerate(data_set.distinct()):
            self.seeds_queue.put(
                Seed(value=seed, retries=self.retries, type=0))
    # regexes for extracting skuids and page counts from search results
    self.first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',")
    self.skuids_pettern = re.compile(r'{.*?"skuId":(\d+).*?}')
    self.totalpage_perttern = re.compile(
        r'<div id="J_topPage"[\s\S]*?<b>\d+</b><em>/</em><i>(\d+)</i>')
def init_start_urls(self):
    """Seed redis with cate_id values read from 'newCateName' (currently
    capped at a single document by the $limit stage)."""
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    batch_limit = 1024
    with op.DBManger() as m:
        pending = []
        pipeline = [{"$limit": 1}]
        rows = m.read_from(db_collect=("jingdong", "newCateName"),
                           out_field=("cate_id", ),
                           pipeline=pipeline)
        for row in rows:
            pending.append(str(Seed(value=row[0], type=0)))
            # flush a full batch to redis
            if len(pending) == batch_limit:
                self.redis.sadd(self.start_urls_redis_key, *pending)
                pending = []
        if pending:
            self.redis.sadd(self.start_urls_redis_key, *pending)
def __init__(self, current_date, **kwargs):
    """Load (pid, price) seeds from secoo 'CleanListNew' and compile the
    comment-page regexes.

    :param current_date: YYYYMMDD string naming this run's tables.
    """
    super(SecooMonthJob1, self).__init__(**kwargs)
    # 28 authenticated proxy endpoints u0..u27
    self.proxies = list(map(lambda x: ("http://u{}:[email protected]:3128".format(x)), range(28)))
    self.ua = UserAgent()
    self.current_date = current_date
    with op.DBManger() as m:
        total = m.count(db_collect=("secoo", "CleanListNew"))
        for pid, price in tqdm(m.read_from(db_collect=("secoo", "CleanListNew"),
                                           out_field=("pid", "price")),
                               total=total,
                               desc="reading"):
            self.seeds_queue.put(Seed((pid, price), kwargs["retries"], type=0))
    self.seed_retries = kwargs["retries"]
    # regexes for parsing the comment JSON-ish response
    self.page_pattern = re.compile(r'totalCurrCommentNum":.*?,')
    self.block_pattern = re.compile(r'{"isShow.*?}')
    # date 90 days back, used as a cutoff benchmark
    self.bench = timeUtil.getdate(-90, format='%Y%m%d')
    self.id_pattern = re.compile(r'"id":\d+')
    self.pid_pattern = re.compile(r'productId":\d+')
    self.time_pattern = re.compile(r'createDate":\d+')
    self.user_pattern = re.compile(r'userName":"******"')
    self.device_pattern = re.compile(r'sourceDevice":".*?"')
def init_start_urls(self):
    """Push every distinct skuid from self.intable into redis as a type-0
    seed, shuffled and flushed in chunks of 10000."""
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    with op.DBManger() as m:
        unique_skuids = set()
        for row in m.read_from(db_collect=("jingdong", self.intable),
                               out_field=("skuid", )):
            unique_skuids.add(int(row[0]))
        chunk = []
        chunk_cap = 10000
        for skuid in unique_skuids:
            chunk.append(str(Seed(value=skuid, type=0)))
            # shuffle-and-flush once a chunk fills up
            if len(chunk) == chunk_cap:
                random.shuffle(chunk)
                self.redis.sadd(self.start_urls_redis_key, *chunk)
                chunk = []
        if chunk:
            random.shuffle(chunk)
            self.redis.sadd(self.start_urls_redis_key, *chunk)
def __init__(self, filename):
    """Prepare table names and DB handles for importing liepin data.

    :param filename: path of the liepin data file to import.
    """
    # today's target table name, e.g. company_20200617
    self.table_name = time.strftime('company_%Y%m%d', time.localtime(time.time()))
    # today's company-id library table name
    self.new_company_lib = time.strftime('CompanyIdLib_%Y%m%d', time.localtime(time.time()))
    # company id library (original comment was in Chinese)
    from mongo import op
    with op.DBManger() as m:
        last_company_ids = m.get_lasted_collection(
            "liepin", filter={"name": {
                "$regex": r"CompanyIdLib_20\d\d\d\d\d\d"
            }})
    #self.company_ids = 'CompanyIdLib_20200602'
    self.company_ids = last_company_ids
    print("last_company_ids: " + last_company_ids)
    # final list of documents to insert into mongo
    self.final_list = []
    # liepin_data
    self.filename = filename
    self.db = pymongo.MongoClient('mongodb://192.168.0.13:27017')['liepin']
def run_result():
    """Diagnostic: count jdcomment rows whose skuid is / is not present in
    the new jdskuid tables, printing up to 50 sample skuids of each kind."""
    with op.DBManger() as m:
        skuid_sukid_dict = {}
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdskuid(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdskuid20201214_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdskuid(20201214)retry\d*$"
                }}):
            # only tables newer than the separator are still pending
            if not last_sep or table > last_sep:
                print("step 3: processing {}".format(table), flush=True)
                pipeline = [
                    {
                        "$match": {
                            "_status": 0
                        }
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "cate_id": "$cate_id",
                            "brand_id": "$brand_id",
                            "ziying": "$ziying",
                        }
                    },
                ]
                for skuid, cate_id, brand_id, ziying in m.read_from(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "cate_id", "brand_id", "ziying"),
                        pipeline=pipeline):
                    skuid_sukid_dict[int(skuid)] = {
                        "cate_id": cate_id,
                        "brand_id": "0" if brand_id is None else brand_id,
                        "ziying": ziying
                    }
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdcomment20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdcomment(20\d\d\d\d\d\d)retry\d*$"}}):
        count = 0   # comment rows whose skuid is known in skuid_sukid_dict
        count1 = 0  # comment rows whose skuid is unknown
        a = set()   # sample of known skuids (first 50)
        b = set()   # sample of unknown skuids (first 50)
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdcomment20201218_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdcomment(20201218)retry\d*$"
                }}):
            if not last_sep or table > last_sep:
                print("step 4: processing {}".format(table), flush=True)
                pipeline = [
                    {
                        "$match": {
                            #"$and": [{"_status": 0}, {"comment": {"$gt": 0}}]
                            "$and": [{
                                "_status": 0
                            }, {
                                "comment": {
                                    "$gt": "0"
                                }
                            }]
                        }
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "comment": "$comment",
                        }
                    },
                ]
                for skuid, comments in m.read_from_yield(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "comment"),
                        pipeline=pipeline):
                    if int(skuid) in skuid_sukid_dict:
                        count = count + 1
                        if count < 50:
                            a.add(int(skuid))
                    else:
                        count1 = count1 + 1
                        if count1 < 50:
                            b.add(int(skuid))
        print(count, )
        for i, v in enumerate(a):
            print(v)
        print(count1, )
        for i, v in enumerate(b):
            print(v)
myconf.setAppName("test").setMaster("local[4]") myconf.set('spark.executor.instances', '4') myconf.set('spark.driver.memory', '6G') #myconf.set('spark.executor.memory','1G') myconf.set('spark.executor.cores', '4') myconf.set('spark.task.cpus', '4') # 指定连接器对应的spark-package myconf.set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1") spark = SparkSession.builder.config(conf=myconf).getOrCreate() logger = spark._jvm.org.apache.log4j logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL) # 使用指定格式读取 with op.DBManger() as m: month = '202102' last_month = '202101' tables = m.list_tables( dbname="jingdong", filter={"name": { "$regex": r"^jdpricejdprice20210222$" }}) schema = StructType([ StructField("skuid", LongType(), True), StructField("price", DoubleType(), True) ]) df = spark.createDataFrame([], schema) for table in tables: tmp = spark.read.format("mongo").option( "uri", "mongodb://192.168.0.13:27017/jingdong.{}".format(
from mongo import op from tqdm import tqdm # with op.DBManger() as m, open("paopaomate_summary","w") as f: # pipeline = [ # { # "$lookup": { # "from": "paopaomate_comment", # "localField": "skuid", # "foreignField": "skuid", # "as": "skuid" # } # }, # {"$out": "paopaomate_final"} # ] # m.aggregate(db_collect=("jingdong","paopaomate_summary"), pipeline=pipeline) with op.DBManger() as m, open("paopaomate_summary", "w") as f: tmp = {} for skuid, comment in m.read_from(db_collect=("jingdong", "paopaomate_comment"), out_field=("skuid", "comment")): tmp[str(skuid)] = comment headers = [ "skuid", "brand_id", "price", "comment_201905", "comment_201906", "comment_201907", "comment_201908", "comment_201909", "comment_201910", "comment_201911", "comment_201912", "comment_202001", "comment_202002", "comment_202003", "comment_202004", "comment_202005", "comment_202006", "comment_202007", "comment_202008", "comment_202009" ] skuids = set() for item in tqdm(m.read_from(db_collect=("jingdong",
def init_start_urls(self):
    """Seed redis with skuids from the new jdskuid tables plus the latest
    month table.

    NOTE(review): both $limit stages (400 and 40) look like debug-sized
    caps — confirm before a production run.
    """
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    with op.DBManger() as m:
        # marker collection separating this month's task (original comment
        # in Chinese: temp table as the boundary of this month's task)
        m.create_db_collection(
            db_collection=("jingdong",
                           "jdcomment{0}_sep".format(current_date)))
        skuid_set = set()
        pipeline = [{
            "$match": {
                "$and": [{
                    "_status": 0
                }, {
                    "skuid": {
                        "$ne": None
                    }
                }]
            }
        }, {
            "$project": {
                "skuid": "$skuid",
            }
        }, {
            "$limit": 400
        }]
        # last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20\d\d\d\d\d\d_sep$"}})
        # for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdskuid(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdskuid20200920_sep$"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdskuid20200920retry\d*$"
                }}):
            # only tables newer than the separator are still pending
            if not last_sep or table > last_sep:
                self.logger.info("valid table : {}".format(table))
                for item in m.read_from(db_collect=("jingdong", table),
                                        out_field=("skuid", ),
                                        pipeline=pipeline):
                    skuid_set.add(int(item[0]))
        #skuids in last result
        pipeline = [{"$limit": 40}]
        last_result = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^month20\d\d\d\d$"
            }})
        for item in m.read_from(db_collect=("jingdong", last_result),
                                out_field=("skuid", ),
                                pipeline=pipeline):
            skuid_set.add(int(item[0]))
        buffer = []
        buffer_size = 10000
        for i, seed in enumerate(skuid_set):
            seed = Seed(value=seed, type=0)
            buffer.append(str(seed))
            # shuffle-and-flush in chunks of buffer_size
            if len(buffer) % buffer_size == 0:
                random.shuffle(buffer)
                self.redis.sadd(self.start_urls_redis_key, *buffer)
                buffer = []
        if buffer:
            random.shuffle(buffer)
            self.redis.sadd(self.start_urls_redis_key, *buffer)
def run_result():
    """Build the monthly 'monthYYYYMM' summary table.

    Steps: (1) average this month's scraped prices per skuid, (2) load last
    month's summary as a fallback, (3) load skuid metadata (cate/brand/
    ziying), (4) join comment counts onto the price data — tagging each row
    with a 'type' code recording which fallback path produced it, (5) carry
    forward last-month rows not seen this month, (6) write everything to
    the new month table and drop a '_sep' marker collection.
    """
    with op.DBManger() as m:
        pipeline = [{
            "$match": {
                "_status": 0
            },
        }]
        price_dic = {}
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdprice20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdprice(20\d\d\d\d\d\d)$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdprice20201209_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdprice(20210129)$"
                }}):
            if not last_sep or table > last_sep:
                print("step 1: processing {}".format(table), flush=True)
                for item in m.read_from(db_collect=("jingdong", table),
                                        pipeline=pipeline):
                    # accumulate (observation count, price sum) per skuid
                    if int(item["id"]) in price_dic:
                        tmp = price_dic[int(item["id"])]
                        tmp["prices"] = (tmp["prices"][0] + 1,
                                         tmp["prices"][1] + clean_price(item))
                    else:
                        price_dic[int(item["id"])] = {
                            "prices": (1, clean_price(item))
                        }
        for skuid in price_dic:
            tmp = price_dic[int(skuid)]
            # mean price over the month, rounded to 2 decimals
            tmp["clean_price"] = round(tmp["prices"][1] / tmp["prices"][0], 2)
            tmp.pop("prices")
        # result_dic aliases price_dic and is extended with fallback rows
        result_dic = price_dic
        #skuids in last result
        last_month_skuids = {}
        last_result = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^month20\d\d\d\d$"
            }})
        print("step 2: processing {}".format(last_result), flush=True)
        last_month = last_result[-6:]
        for skuid, comments, price, cate_id, brand_id, ziying in m.read_from(
                db_collect=("jingdong", last_result),
                out_field=("skuid", "comments", "clean_price", "cate_id",
                           "brand_id", "ziying")):
            if cate_id:
                last_month_skuids[int(skuid)] = {
                    "clean_price": price,
                    "comments": comments,
                    "cate_id": format_cat_id(cate_id),
                    "brand_id": brand_id,
                    "ziying": ziying
                }
        skuid_sukid_dict = {}
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdskuid(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdskuid20201214_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdskuid(20210108)retry\d*$"
                }}):
            if not last_sep or table > last_sep:
                print("step 3: processing {}".format(table), flush=True)
                pipeline = [
                    {
                        "$match": {
                            "_status": 0
                        }
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "cate_id": "$cate_id",
                            "brand_id": "$brand_id",
                            "ziying": "$ziying",
                        }
                    },
                ]
                for skuid, cate_id, brand_id, ziying in m.read_from(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "cate_id", "brand_id", "ziying"),
                        pipeline=pipeline):
                    skuid_sukid_dict[int(skuid)] = {
                        "cate_id": cate_id,
                        "brand_id": "0" if brand_id is None else brand_id,
                        "ziying": ziying
                    }
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdcomment20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdcomment(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdcomment20201218_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdcomment(20210302)retry\d*$"
                }}):
            if not last_sep or table > last_sep:
                print("step 4: processing {}".format(table), flush=True)
                pipeline = [
                    {
                        "$match": {
                            #"$and": [{"_status": 0}, {"comment": {"$gt": 0}}]
                            "$and": [{
                                "_status": 0
                            }, {
                                "comment": {
                                    "$gt": "0"
                                }
                            }]
                        }
                    },
                    {
                        "$project": {
                            "skuid": "$skuid",
                            "comment": "$comment",
                        }
                    },
                ]
                for skuid, comments in m.read_from_yield(
                        db_collect=("jingdong", table),
                        out_field=("skuid", "comment"),
                        pipeline=pipeline):
                    # 'type' codes 0-6 record which data sources supplied the
                    # row: metadata hit + this-month price (0), + last-month
                    # price (1), + default price (2); metadata miss variants
                    # are 3-5; both miss = 6.
                    if int(skuid) in skuid_sukid_dict:
                        if int(skuid) in price_dic:
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = price_dic[int(
                                skuid)]["clean_price"]
                            price_item["comments"] = int(comments)
                            price_item["type"] = 0
                        elif int(skuid) in last_month_skuids:
                            last_month_price_item = last_month_skuids[int(
                                skuid)]
                            if int(skuid) not in result_dic:
                                result_dic[int(skuid)] = {}
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = last_month_price_item[
                                "clean_price"]
                            price_item["comments"] = int(comments)
                            price_item["type"] = 1
                        else:
                            # no price anywhere: fall back to a fixed default
                            result_dic[int(skuid)] = {}
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = 79.90
                            price_item["comments"] = int(comments)
                            price_item["type"] = 2
                        skuid_sukid_item = skuid_sukid_dict[int(skuid)]
                        price_item["cate_id"] = skuid_sukid_item["cate_id"]
                        price_item["brand_id"] = skuid_sukid_item["brand_id"]
                        price_item["ziying"] = skuid_sukid_item["ziying"]
                    elif int(skuid) in last_month_skuids:
                        if int(skuid) in price_dic:
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = price_dic[int(
                                skuid)]["clean_price"]
                            price_item["comments"] = int(comments)
                            price_item["type"] = 3
                        elif int(skuid) in last_month_skuids:
                            last_month_price_item = last_month_skuids[int(
                                skuid)]
                            if int(skuid) not in result_dic:
                                result_dic[int(skuid)] = {}
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = last_month_price_item[
                                "clean_price"]
                            price_item["comments"] = int(comments)
                            price_item["type"] = 4
                        else:
                            # NOTE(review): unreachable — the enclosing elif
                            # already guarantees membership in
                            # last_month_skuids, so type 5 never occurs.
                            result_dic[int(skuid)] = {}
                            price_item = result_dic[int(skuid)]
                            price_item["clean_price"] = 79.90
                            price_item["comments"] = int(comments)
                            price_item["type"] = 5
                        last_month_skuids_item = last_month_skuids[int(skuid)]
                        price_item["cate_id"] = last_month_skuids_item[
                            "cate_id"]
                        price_item["brand_id"] = last_month_skuids_item[
                            "brand_id"]
                        price_item["ziying"] = last_month_skuids_item["ziying"]
                    else:
                        # unknown skuid: default price and placeholder metadata
                        result_dic[int(skuid)] = {}
                        price_item = result_dic[int(skuid)]
                        price_item["clean_price"] = 79.90
                        price_item["comments"] = int(comments)
                        price_item["cate_id"] = "0,0,0"
                        price_item["brand_id"] = "0"
                        price_item["ziying"] = "-1"
                        price_item["type"] = 6
        print(
            "step 5: processing skuid in last_month_skuids but not in result_dic",
            flush=True)
        for skuid in last_month_skuids:
            if int(skuid) not in result_dic:
                # carry last month's row forward unchanged (type 7)
                result_dic[int(skuid)] = {}
                price_item = result_dic[int(skuid)]
                price_item["clean_price"] = last_month_skuids[skuid][
                    "clean_price"]
                price_item["comments"] = last_month_skuids[skuid]["comments"]
                price_item["cate_id"] = "0,0,0"
                price_item["brand_id"] = "0"
                price_item["ziying"] = "-1"
                price_item["type"] = 7
            else:
                price_item = result_dic[int(skuid)]
                # row had a price this month but no comment row (no 'type'
                # yet): backfill everything from last month (type 8)
                if 'type' not in price_item:
                    price_item["clean_price"] = last_month_skuids[skuid][
                        "clean_price"]
                    price_item["comments"] = last_month_skuids[skuid][
                        "comments"]
                    price_item["cate_id"] = last_month_skuids[skuid]["cate_id"]
                    price_item["brand_id"] = last_month_skuids[skuid][
                        "brand_id"]
                    price_item["ziying"] = last_month_skuids[skuid]["ziying"]
                    price_item["type"] = 8
        this_month = timeUtil.get_month(deltamonth=1, current_month=last_month)
        out_table = "month" + this_month
        print("step 6: processing writing result to {}".format(out_table),
              flush=True)
        buffer = []
        buffer_size = 5000
        print("result_dic:{}".format(len(result_dic)), flush=True)
        for i, k in enumerate(result_dic):
            result_dic[k]["skuid"] = k
            if "prices" in result_dic[k]:
                result_dic[k].pop("prices")
            result_dic[k]["month"] = this_month
            # rows without a cate_id are logged instead of written
            if "cate_id" in result_dic[k]:
                buffer.append(result_dic[k])
            else:
                print(result_dic[k])
            if i % buffer_size == 0 and buffer:
                m.insert_many_dict(db_collect=("jingdong", out_table),
                                   data_dict_list=buffer)
                buffer = []
        if buffer:
            m.insert_many_dict(db_collect=("jingdong", out_table),
                               data_dict_list=buffer)
        # drop the separator marker for the next run
        m.create_db_collection(
            db_collection=("jingdong", "jdprice{0}_sep".format(current_date)))
def __init__(self):
    """Start the UDP action-listener thread and open a persistent DB writer.

    NOTE: Python 2 `thread` module API (`start_new_thread`).
    """
    self.num = 0
    # current action state; updated by the background receiver thread
    self.state = None
    thread.start_new_thread(self.receive_ation, (None, ))
    self.writer = op.DBManger()
#!/usr/bin/env python # -*- coding: utf-8 -*- from mongo import op import os import glob from multiprocess.tools import timeUtil os.chdir("/home/u9000/martingale/jd_month/") current_date = timeUtil.current_time() with op.DBManger() as db: db.load_file_to_db(filename="month202006", db_collect=("jingdong", "month202006"), sep="\t", buffer_size=128, column_index_list=[0, 14, 28, 29, 30, 31], fields_tupe=("skuid", "comment", "price", "cate_id", "brand_id", "ziying"), attach_dict={"_month": 202006})
def init_start_urls(self):
    """Seed redis with 60-skuid batched price-miss queries.

    Collects skuids with comments from all new jdcomment retry tables,
    subtracts skuids already covered by the latest summary table, joins the
    remainder in groups of 60 ('%2CJ_'-separated) and pushes the resulting
    seeds to redis shuffled in chunks of 10000.
    """
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    with op.DBManger() as m:
        # marker collection for this run
        m.create_db_collection(
            db_collection=("jingdong",
                           "jdpricemiss{0}_sep".format(current_date)))
        pipeline = [
            {
                "$match": {
                    "$and": [{
                        "_status": 0
                    }, {
                        "comment": {
                            "$gt": 0
                        }
                    }]
                }
            },
            {
                "$project": {
                    "skuid": "$skuid",
                }
            },
        ]
        skuid_set = set()
        last_sep = m.get_lasted_collection(
            "jingdong",
            filter={"name": {
                "$regex": r"^jdcomment20\d\d\d\d\d\d_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={
                    "name": {
                        "$regex": r"^jdcomment(20\d\d\d\d\d\d)retry\d*$"
                    }
                }):
            # only tables newer than the separator are still pending
            if not last_sep or table > last_sep:
                self.logger.info("valid table : {}".format(table))
                for item in m.read_from(db_collect=("jingdong", table),
                                        out_field=("skuid", ),
                                        pipeline=pipeline):
                    skuid_set.add(int(item[0]))
        # skuids already present in the last summary: drop them
        skuid_set1 = set()
        last_result = m.get_lasted_collection(
            "jingdong",
            filter={"name": {
                "$regex": r"^summary_201905_20\d\d\d\d$"
            }})
        for item in m.read_from(db_collect=("jingdong", last_result),
                                out_field=("skuid", )):
            skuid_set1.add(int(item[0]))
        skuid_set = skuid_set - skuid_set1
        self.logger.info(
            "total new skuid of comment larger than 0 is: {}".format(
                len(skuid_set)))
        buffer = []
        # Fix: initialise strr so an empty skuid_set no longer raises
        # NameError at the `if strr:` check below.
        strr = ''
        for i, seed in enumerate(skuid_set):
            seed = str(seed)
            current = seed.strip()
            if i % 60 == 0:
                # flush the previous 60-id batch before starting a new one
                if i != 0:
                    seed = Seed(value=strr, type=0)
                    buffer.append(str(seed))
                strr = current
            else:
                strr = strr + '%2CJ_' + current
        if strr:
            seed = Seed(value=strr, type=0)
            buffer.append(str(seed))
        if buffer:
            buffer1 = []
            buffer_size = 10000
            for i, seed in enumerate(buffer):
                buffer1.append(str(seed))
                if len(buffer1) % buffer_size == 0:
                    random.shuffle(buffer1)
                    self.redis.sadd(self.start_urls_redis_key, *buffer1)
                    buffer1 = []
            if buffer1:
                random.shuffle(buffer1)
                self.redis.sadd(self.start_urls_redis_key, *buffer1)