class MongoSessionFactory(object):
    """Session store backed by a MongoDB collection.

    Each session document carries the session payload under ``data`` plus a
    ``last_access`` timestamp; when *ttl* is given, a TTL index expires idle
    sessions automatically on the server side.
    """

    def __init__(self, database, collection='sessions', ttl=None, **kwargs):
        """
        :param database: MongoDB database name
        :param collection: collection holding session documents
        :param ttl: seconds of inactivity before a session expires (None = never)
        :param kwargs: forwarded verbatim to ``MongoClient`` (host, port, ...)
        """
        self.collection = MongoClient(**kwargs)[database][collection]
        # Indexes are rebuilt from scratch on every startup — presumably so a
        # changed ttl takes effect, since a TTL index's expireAfterSeconds
        # cannot be modified in place.  NOTE(review): this also drops any
        # other index on the collection; confirm that is intended.
        self.collection.drop_indexes()
        if ttl is not None:
            self.collection.ensure_index('last_access', expireAfterSeconds=ttl)

    def load(self, id=None):
        """Return the stored session for *id*, or a fresh empty ``Session``
        when the id is missing, malformed, or unknown."""
        session = Session()
        try:
            doc = self.collection.find_one({'_id': ObjectId(id)})
            if doc is not None:
                session.data = doc['data']
                session.id = id
        except Exception:
            # Best-effort lookup: an invalid ObjectId or a transient DB error
            # simply yields a new anonymous session.  (Fix: the original bare
            # `except:` also swallowed KeyboardInterrupt/SystemExit.)
            pass
        return session

    def save(self, session):
        """Persist *session* (insert or replace) and return its id as a str."""
        doc = {
            'data': session.data,
            'last_access': datetime.utcnow(),
        }
        if session.id is not None:
            doc['_id'] = ObjectId(session.id)
        # save() upserts: inserts when _id is absent (Mongo assigns one),
        # replaces the existing document otherwise.
        self.collection.save(doc)
        return str(doc['_id'])
def make_arch_db():
    """Compute Pfam architecture sets for pIDs 0..60M in parallel and load
    the merged result into the ArchDB_Pfam_071414 MongoDB collection."""
    by = 10000      # chunk size handed to each worker task
    m = 60000000    # total id range to process
    #by = 2000
    #m = 10000
    # Fix: the executor was only shut down on the happy path; the context
    # manager guarantees worker processes are reaped even if a task raises.
    with ProcessPoolExecutor(max_workers=8) as executor:
        e = executor.map(process_range,
                         zip(range(0, m, by), range(by, m + by, by)))
    # __exit__ == shutdown(wait=True): every chunk has finished here, so the
    # result iterator can be drained without blocking on live workers.
    print('done calculating architectures')
    pfam_sets = merge(e)
    print(len(pfam_sets))
    gsave(pfam_sets, 'pfam_sets.pkl.gz')
    # mongodb
    db = MongoClient('wl-cmadmin', 27017).ArchDB_Pfam_071414.ArchDB_Pfam_071414
    # NOTE(review): insert()/ensure_index() are PyMongo 2 APIs (removed in
    # PyMongo 3) — kept for compatibility with the rest of this codebase.
    db.insert(
        map(
            lambda item: {
                '_id': min(item[1]),
                'pID': list(item[1]),
                'Pfam': item[0]
            }, pfam_sets.items()))
    db.ensure_index('pID')
    db.ensure_index('Pfam')
class PyVaultMongoBackend(PyVaultBackend):
    """
    implements a MongoDB backend

    :param uri: a MongoURI that selects the database to store all data under.
    :param collection: select the collection to save the data under
    """

    def __init__(self, uri, collection='store'):
        self._db = MongoClient(uri)
        if isinstance(self._db, MongoClient):
            # Extract the database name from the URI path, dropping any
            # "?options" query string.
            dbname = (uri.rsplit("/", 1)[1]).split("?", 1)[0]
            self._db = getattr(self._db, dbname)
        self._db = getattr(self._db, collection)
        self._db.ensure_index("id", background=True)

    def exists(self):
        """Return True when the store has been initialized (a _meta doc exists)."""
        # Fix: `not meta is None` anti-idiom replaced with `is not None`.
        return self._db.find_one({"id": "_meta"}) is not None

    def create(self):
        """No-op: MongoDB creates databases and collections lazily."""
        pass

    def get_meta(self):
        """Return the metadata document.

        :raises ValueError: when no metadata has been stored yet.
        """
        meta = self._db.find_one({"id": "_meta"})
        if meta is None:
            raise ValueError
        return meta

    def set_meta(self, data):
        """Store the metadata document; it may only be written once.

        :raises ValueError: when metadata already exists.
        """
        if self._db.find_one({"id": "_meta"}) is not None:
            raise ValueError
        data['id'] = "_meta"
        self._db.insert(data)

    def retrieve(self, key):
        """Return the document stored under *key*.

        :raises ValueError: when the key is unknown.
        """
        item = self._db.find_one({"id": key})
        if item is None:
            raise ValueError
        return item

    def store(self, key, data):
        """Insert or replace (*upsert*) the document stored under *key*."""
        data['id'] = key
        self._db.update(
            spec={"id": key},
            document=data,
            upsert=True
        )

    def delete(self, key):
        """Remove the document stored under *key*; silent if absent."""
        self._db.remove({"id": key})
def mongo_clstr(clstr_file, name, host='wl-cmadmin', port=None, ipa=True):
    """Load parsed cluster groups from *clstr_file* into the MongoDB
    collection ``name.name`` (optionally annotating each group via
    ``annotate_group_ipa``) and index the result on 'pID'."""
    #name = "071414_ComPIL_forwardonly_0_7"
    from pymongo import MongoClient
    collection = MongoClient(host=host, port=port)[name][name]
    groups = parse_clstr_compil(clstr_file)
    if ipa:
        groups = map(annotate_group_ipa, groups)
    collection.insert(groups)
    collection.ensure_index('pID')
def make_arch_db():
    """Parallel-compute Pfam architecture sets over 60M protein IDs, persist
    them to a gzip pickle, and bulk-load them into MongoDB."""
    by = 10000   # ids per worker chunk
    m = 60000000 # upper bound of the id range
    #by = 2000
    #m = 10000
    # Fix: wrap the executor in a `with` block so worker processes are always
    # cleaned up, even when process_range raises mid-run.
    with ProcessPoolExecutor(max_workers=8) as executor:
        e = executor.map(process_range, zip(range(0, m, by), range(by, m + by, by)))
    # Leaving the block waits for completion (shutdown(wait=True)), matching
    # the original explicit executor.shutdown() call.
    print('done calculating architectures')
    pfam_sets = merge(e)
    print(len(pfam_sets))
    gsave(pfam_sets, 'pfam_sets.pkl.gz')
    # mongodb — one doc per architecture: smallest member id as _id, full
    # member list under 'pID', the architecture itself under 'Pfam'.
    db = MongoClient('wl-cmadmin', 27017).ArchDB_Pfam_071414.ArchDB_Pfam_071414
    db.insert(map(lambda item: {'_id': min(item[1]),
                                'pID': list(item[1]),
                                'Pfam': item[0]},
                  pfam_sets.items()))
    db.ensure_index('pID')
    db.ensure_index('Pfam')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time

from pymongo import MongoClient
from bottle import Bottle, request, abort, static_file

application = Bottle()
db = MongoClient()['buibui']['danmaku']
db.ensure_index('ts')


@application.route('/buibui/get_danmakus')
def get_danmakus():
    """Return every danmaku strictly newer than the ``ts`` query parameter."""
    since = int(request.params.ts)
    danmakus = list(db.find({'ts': {'$gt': since}}))
    # ObjectId values are not JSON-serializable, so strip them before returning.
    for doc in danmakus:
        doc.pop('_id')
    return {'danmakus': danmakus}


TEXT_MAX = 72

# Expected request-parameter names and their coercion types for posted danmakus.
BUI_PARAMS = {
    'text': str,
    'mode': int,
    'color': str,
    'size': int,
}
#!/usr/bin/env python # -*- coding: utf-8 -*- import json, sys, re from datetime import datetime from pymongo import MongoClient db = MongoClient()["m6forums"]['posts'] re_cp1212 = re.compile(ur'[\x80-\x9F]') def recode(x): if type(x) == unicode and re_cp1212.search(x): x = x.encode('latin1').decode('CP1252', 'ignore') return x.encode('utf8') re_newline = re.compile(r"[\r\n]+") uniline = lambda x: re_newline.sub(" ", x) escaper = lambda x: '"%s"' % x.replace('"', '""') formatt = lambda x: escaper(uniline(recode(x))) db.ensure_index([("thread_title", 1), ("created_at", 1)]) print "permalink,thread,author,text,timestamp,deleted" for t in db.find(sort=[("thread_title", 1), ("created_at", 1)]): print ",".join([formatt(a) if a else "" for a in [t["permalink"],t["thread_title"],t["author"],t["message"],t["created_at"],str(t["deleted"])]])
#!/usr/bin/env python # -*- coding: utf-8 -*- import time from pymongo import MongoClient from bottle import Bottle, request, abort, static_file application = Bottle() db = MongoClient()['buibui']['danmaku'] db.ensure_index('ts') @application.route('/buibui/get_danmakus') def get_danmakus(): ts = int(request.params.ts) qset = list(db.find({'ts': {'$gt': ts}})) for i in qset: del i['_id'] return {'danmakus': qset} BUI_PARAMS = { 'text': str, 'mode': int, 'color': str, 'size': int, } @application.post('/buibui/bui') def bui():
class UserManager(object):
    """Manages user state across a Redis hot cache and a MongoDB cold store.

    Redis holds the live, mutable copy of each user's state; a
    'modified_users' set tracks which entries still need flushing to Mongo
    via dump_users().
    """

    _key_prefix = 'user'          # redis key namespace for user state
    _log_key_prefix = 'user-log'  # redis key namespace for command logs

    def __init__(self, settings):
        """
        Initialize player manager

        :param settings: Settings
        :type settings: dict
        :return:
        """
        self.settings = settings
        self.redis = None
        self.mongo = None
        self.random = Random()
        self.init_redis()
        self.init_mongo()

    def init_redis(self):
        """
        Initialize redis client
        :return:
        """
        settings = self.settings['user_manager']['redis']
        host = settings['host']
        port = settings['port']
        password = settings['password']
        db = settings['db']
        self.redis = UserRedis(host, port, db, password)
        self.redis.init_scripts()

    def init_mongo(self):
        """
        Initialize mongo client
        :return:
        """
        settings = self.settings['user_manager']['mongo']
        host = settings['host']
        port = settings['port']
        db_name = settings['db_name']
        collection = settings['collection']
        self.mongo = MongoClient(host=host, port=port)[db_name][collection]

    def get_lock(self, key):
        """
        :return: RedisLock object
        """
        return RedisLock(self.redis, key)

    # Callable built at class-definition time: maps a user_id to its
    # namespaced redis key (e.g. 'user:<id>' — exact shape depends on key_maker).
    user_key = key_maker(_key_prefix)

    def decode_user_id(self, user_key):
        """
        Extract the user ID back out of a namespaced redis key.

        :param user_key: redis key produced by ``self.user_key``
        :type user_key: str or bytes
        :return: User ID
        :rtype: str
        """
        if isinstance(user_key, bytes):
            user_key = user_key.decode('utf-8')
        # Assumes the key shape '<prefix>:<user_id>' — TODO confirm key_maker.
        return user_key.split(':')[1]

    def get(self, user_id, auto_create=True):
        """
        Get user from redis if exists, otherwise fetch it from storage
        and put it into redis

        :param user_id: User ID or Social ID
        :type user_id: str
        :param auto_create: Auto create user if not exists
        :type auto_create: bool
        :return: User state object
        :rtype: UserState
        """
        user_id = str(user_id)
        user_key = self.user_key(user_id)
        # With a fixed random seed configured, UserState is given no RNG —
        # presumably so it falls back to a deterministic one; verify in UserState.
        if self.settings['user_manager']['fixed_random_seed']:
            random = None
        else:
            random = self.random
        if self.redis.exists(user_key):
            data = self.decode_data(self.redis.get(user_key))
            return UserState(data, random=random)
        else:
            data = self.fetch_or_create(user_id, auto_create=auto_create)
            if data:
                return UserState(data, random=random)
            return None

    def create_user_state(self, user_id):
        """
        Create initial user state

        :param user_id: User ID
        :type user_id: str
        :return: User state
        :rtype: dict
        """
        # Deep copy so per-user mutation never leaks into the shared template.
        state = deepcopy(self.settings['user']['starting_state'])
        state['_id'] = user_id
        state['user_id'] = user_id
        state['registration_time'] = milliseconds()
        state['new_user'] = True
        return state

    def fetch_or_create(self, user_id, auto_create=True):
        """
        Fetch player from database or create new one, if not exists

        :param user_id: User ID
        :type user_id: str
        :param auto_create: Auto create user if not exists
        :type auto_create: bool
        :return: User state
        :rtype: dict
        """
        user_state = self.mongo.find_one({'_id': user_id})
        if not user_state and auto_create:
            user_state = self.create_user_state(user_id)
        if user_state:
            # Warm the redis cache before handing the state back.
            result = self.save(user_id, user_state)
            if result:
                log.debug('User {} saved to redis'.format(user_id))
            else:
                raise ValueError('Cannot save user to redis : {}'.format(user_id))
            return user_state
        return None

    def delete(self, user_id):
        """Remove the user from Mongo, redis, and the modified-users set.

        :return: result of the redis DELETE (first command in the pipeline)
        """
        self.mongo.remove({'_id': user_id})
        pipe = self.redis.pipeline(transaction=True)
        pipe.delete(self.user_key(user_id))
        pipe.srem('modified_users', self.user_key(user_id))
        result = pipe.execute()[0]
        return result

    def save(self, user_id, data):
        """
        Save user data into redis

        :param user_id: User ID
        :type user_id: str
        :param data: User state
        :type data: dict
        :return: indicates if player was saved in redis
        :rtype: bool
        """
        pipe = self.redis.pipeline(transaction=True)
        data['user_id'] = user_id
        # the set command cancels a user's ttl
        pipe.set(self.user_key(user_id), self.encode_data(data))
        # Mark the user dirty so dump_users() flushes it to Mongo later.
        pipe.sadd('modified_users', self.user_key(user_id))
        result = pipe.execute()[0]
        return result

    def encode_data(self, data):
        """
        Encode data to store in redis

        :param data: Decoded user state
        :type data: dict
        :return: Encoded data
        """
        return encoder_function(data)

    def decode_data(self, data):
        """
        Decode data came from redis

        :param data: User state encoded (bytes or str)
        :return: Decoded data
        :rtype: dict
        """
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        return decoder_function(data)

    @property
    def online_users_count(self):
        """
        Count online players in redis

        :return: Database size (players in database)
        :rtype: int
        """
        # NOTE(review): redis KEYS returns a *list* of matching keys, not a
        # count — this looks like it was meant to be len(...); confirm what
        # callers expect before changing it.
        return self.redis.keys(self.user_key("*"))

    def transaction(self, user_id):
        """
        Context manager helper for opening user state.
        Provides lock for user get/set operations.

        :param user_id: User ID
        :type user_id: str
        :return: context manager
        """
        # Generator-based coroutine: `yield from` awaits the lock, so this
        # must be driven via `yield from` / an event loop by the caller.
        lock = self.get_lock(user_id)
        yield from lock.acquire()
        writable_state = self.get(user_id)
        return self._transaction_context(user_id, writable_state, lock)

    @contextlib.contextmanager
    def _transaction_context(self, user_id, writable_state, lock):
        """
        User state transaction context manager

        :param user_id: User ID
        :type user_id: str
        :param writable_state: User state
        :type writable_state: UserState
        :param lock: held RedisLock, released on exit
        """
        try:
            yield writable_state
            # Persist only if the lock is still valid; the lock is released
            # in all cases via finally.
            lock.check_validity_time()
            self.save(user_id, dump_value(writable_state))
        finally:
            lock.release()

    @property
    def unsaved_users_count(self):
        """
        Get unsaved players count

        :return: Unsaved players count
        :rtype: int
        """
        return self.redis.scard('modified_users')

    def dump_users(self, all=False):
        """Flush user states from redis to Mongo.

        :param all: when True dump every user, otherwise only modified ones
        :return: number of users dumped
        """
        db = self.redis
        expire = self.settings['user']['session_ttl']
        if all:
            users = db.get_all_users_script(keys=[expire])
        else:
            users = db.get_modified_users_script(keys=[expire])
        for user_data in users:
            self._dump_user_to_mongo(self.decode_data(user_data))
        return len(users)

    def remove_user_ttls(self):
        # TODO: a script that calls this method
        self.redis.remove_user_ttls_script()

    def _dump_user_to_mongo(self, user_data):
        """
        Dump player to MongoDB

        :param user_data: User state
        :type user_data: dict or UserState
        :return:
        """
        # NOTE(review): ensure_index on every dump is redundant after the
        # first call (and the method is deprecated in PyMongo 3+).
        self.mongo.ensure_index('user_id')
        self.mongo.find_and_modify({'_id': user_data['user_id']},
                                   dump_value(user_data), upsert=True)

    # Key builder for the per-user command log list.
    log_key = key_maker(_log_key_prefix)

    def log_commands(self, user_id, commands, response):
        """Append a command/response pair to the user's capped redis log."""
        settings = self.settings['user_manager']['redis']['log_commands']
        if not settings['enable']:
            return
        if isinstance(commands, str):
            commands = json.loads(commands)
        key = self.log_key(user_id)
        data = {"ts": milliseconds(), "commands": commands, "response": response}
        pipe = self.redis.pipeline(transaction=True)
        pipe.rpush(key, json.dumps(data))
        # Keep only the newest `size` entries and refresh the log's TTL.
        pipe.ltrim(key, -settings['size'], -1)
        pipe.expire(key, settings['ttl'])
        pipe.execute()

    def get_commands_log(self, user_id):
        """Return the user's full command log as a list of decoded dicts."""
        key = self.log_key(user_id)
        data = self.redis.lrange(key, 0, -1)
        return [json.loads(i.decode("utf-8")) for i in data]
class Crawling:
    """Wikipedia company-page crawler.

    Fetches pages through a local proxy, stores parsed infobox/lead data in
    the ``crawling.wikipedia_en`` MongoDB collection, and feeds newly found
    category/company links into redis queues for other workers.
    """

    def __init__(self):
        self.value = set()
        # Only the English template is effective: the original assigned the
        # zh.wikipedia.org template first and immediately overwrote it, so
        # the dead assignment was removed.
        self.base_url = "https://en.wikipedia.org{0}"
        self.client = MongoClient()["crawling"]["wikipedia_en"]
        self.client.ensure_index('company_link', unique=True)
        # Fix: requests proxy-dict keys are bare scheme names ('http'), not
        # 'http:'.  With the old 'http:' key, plain-HTTP requests silently
        # bypassed the proxy.
        self.proxy = {
            "http": "127.0.0.1:1087",
            "https": "127.0.0.1:1087"
        }
        self.list_links = set()   # category pages already queued
        self.comp_links = set()   # company pages already queued

    def get_html(self, url, d=100):
        """Fetch *url* through the proxy, retrying up to *d* times.

        :param url: page URL (zh wiki URLs are rewritten to the zh-cn variant)
        :param d: maximum number of attempts
        :return: page HTML, or "" if every attempt failed
        """
        n = 0
        h = ""
        if "//zh.wikipedia.org" in url:
            # Force the simplified-Chinese variant of zh pages.
            url = url.replace("/wiki/", "/zh-cn/")
        while not h and n < d:
            n += 1
            try:
                h = requests.get(url, proxies=self.proxy, timeout=5).text
                # html.encoding="utf-8"
                # print(html.text)
            except Exception:
                # Best-effort: a network error just counts as a failed attempt.
                h = ""
            if "Wikimedia Error" in h:
                # Server-side error page: discard and back off before retrying.
                h = ""
                time.sleep(5)
        if not h:
            print(url)
        return h

    def get_comp_html(self, h):
        """Parse a company page: extract the infobox rows, lead paragraph and
        bolded keys, then insert one document keyed by the canonical URL."""
        selector = Selector(text=h)
        url = "".join(selector.xpath("""//link[@rel="canonical"]/@href""").extract())
        # Skip pages already stored (company_link has a unique index).
        if self.client.find_one({"company_link": url}):
            return
        info = []
        table_xpath = """//table[@class="infobox vcard"]"""
        name = selector.xpath(table_xpath + "/caption/text()").extract()
        table_nums = len(selector.xpath(table_xpath))
        # Walk every infobox table row, collecting "header|value" pairs.
        for tn in range(1, table_nums + 1):
            info_xpath = table_xpath + "[" + str(tn) + """]/tbody/tr"""
            info_nums = len(selector.xpath(info_xpath))
            for inn in range(1, info_nums + 1):
                info_th = selector.xpath(info_xpath + "[" + str(inn) + """]/th""").xpath("string(.)").extract()
                info_td = selector.xpath(info_xpath + "[" + str(inn) + """]/td""").xpath("string(.)").extract()
                info.append(["|".join(info_th), "|".join(info_td)])
        # Lead paragraph and its bolded terms ("keys").
        profile = selector.xpath("""//div[@class="mw-parser-output"]/p[1]""").xpath("string(.)").extract()
        keys = selector.xpath("""//div[@class="mw-parser-output"]/p[1]/b""").xpath("string(.)").extract()
        result = {
            "name": name,
            "info": info,
            "profile": profile,
            "keys": keys,
            "company_link": url
        }
        self.client.insert_one(result)

    def get_decade_link(self, h):
        """Parse a category listing page and queue its links: subcategories go
        to the DECADE_LINK queue, company pages to the COMP_LINK queue."""
        selector = Selector(text=h)
        links = set(selector.xpath("""//div[@id="mw-pages"]//a/@href""").extract()) | \
                set(selector.xpath("""//div[@id="mw-subcategories"]//a/@href""").extract())
        url = "".join(selector.xpath("""//link[@rel="canonical"]/@href""").extract())
        # Keep queued URLs on the same language edition as the page itself.
        if "https://en.wikipedia" in url:
            base_url = "https://en.wikipedia.org{0}"
        else:
            base_url = "https://zh.wikipedia.org{0}"
        category_link = set(filter(lambda x: "Category" in x, links))
        comp_links = links - category_link
        for k in category_link:
            if k not in self.list_links:
                redis_queue.rpush(DECADE_LINK, base_url.format(k))
                self.list_links.add(k)
        for k in comp_links:
            if k not in self.comp_links:
                redis_queue.rpush(COMP_LINK, base_url.format(k))
                self.comp_links.add(k)
class Crawling:
    """Tianyancha company/brand crawler.

    Scrapes company detail pages and brand pages through a rotating proxy,
    storing parsed records in the ``crawling.tianyancha`` and
    ``crawling.tianyancha_brand`` MongoDB collections, and feeding discovered
    links into redis queues.
    """

    def __init__(self):
        # Fixed browser-like headers; the Cookie carries a captured session.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'TYCID=27a76f60d2b811e88ae7e3d6e0c34190; undefined=27a76f60d2b811e88ae7e3d6e0c34190; ssuid=4413173240; _ga=GA1.2.1501966850.1539854950; _gid=GA1.2.1498667544.1539854950; RTYCID=90afebb407304b419ffba3f34db2cbbf; CT_TYCID=be7a85298d4b4c5c89c7a318933de9c4; aliyungf_tc=AQAAAGY8MkG+sgsARg5pZTp1l4I4f8ag; csrfToken=fkLdGgQ0KdvwrHEM42ohdewc; cloud_token=a095417bd2c54b658e67f3f5ab221407; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1540014672,1540014676,1540014687,1540024935; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1540024935',
            # 'Host': 'www.tianyancha.com',
            'Referer': 'https://www.baidu.com/link?url=OxKpHBAyow0MbfYxvji6ulLNumykoi-9DJbD724GpNyaSTSCNKNlm4G2pLLHmK1eNbv1X1AkdpfkkdLiuY8tsq&wd=&eqid=f03e551e0004a8c7000000035bcac24c',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        self.value = set()
        # URL templates: paginated search and brand detail pages.
        self.base_url = "https://www.tianyancha.com/search/p{0}?key={1}"
        self.brand_url = "https://www.tianyancha.com/brand/{0}"
        self.proxy = self.get_ip()
        self.client = MongoClient()["crawling"]["tianyancha"]
        self.client.ensure_index('company_link', unique=True)
        self.brand_client = MongoClient()["crawling"]["tianyancha_brand"]
        self.brand_client.ensure_index('brand_link', unique=True)

    def update_ip(self):
        """Swap in a fresh proxy; on failure wait 5 minutes and retry.

        NOTE(review): retries by *recursion* — persistent proxy-service
        failure will eventually hit the recursion limit.
        """
        try:
            self.proxy = self.get_ip()
        except Exception as e:
            print(e)
            time.sleep(300)
            self.update_ip()

    def get_ip(self):
        """Fetch one proxy address from the internal proxy service and return
        it as a requests-style proxies dict."""
        ip = requests.get("http://10.0.40.42:5000/proxy", headers=self.headers).text
        p = {
            'http': ip,
            'https': ip,
        }
        return p

    def get_html(self, url, d=100):
        """Fetch *url* through the current proxy, retrying up to *d* times and
        rotating the proxy on errors or captcha interstitials."""
        n = 0
        h = ""
        while not h and n < d:
            n += 1
            try:
                h = requests.get(url, headers=self.headers, proxies=self.proxy, timeout=1).text
                # html.encoding="utf-8"
                # print(html.text)
            except Exception as e:
                h = ""
                self.update_ip()
            # Anti-bot verification page: discard and rotate the proxy.
            if "无法收到短信验证码?点击切换语音验证" in h:
                h = ""
                self.update_ip()
        return h

    def save(self, fn):
        """Write the accumulated ``self.value`` set to model_path/result/fn.

        NOTE(review): the file handle is never closed explicitly; relies on
        CPython refcounting to flush.
        """
        print(model_path + fn)
        open(model_path + "result/" + fn, "w").write("\n".join(self.value))

    def parse_comp_html(self, h):
        """Parse one company detail page into a result dict.

        Throughout this method ``str([i])`` renders as "[1]", "[2]", ... and is
        used as a positional XPath index suffix.
        :param h: raw page HTML
        :return: dict with company metadata, holders, investments, financing,
                 branches, team, etc.
        """
        selector = Selector(text=h)
        url = "".join(
            selector.xpath(
                """//div[@class="company-tabwarp -abs"]/a/@href""").extract())
        # if not url:
        #     return
        # if self.client.find_one({"company_link": url}):
        #     return
        name = selector.xpath("""//div[@class="header"]/h1/text()""").extract()
        update_time = selector.xpath(
            """//span[@class="updatetimeComBox"]/text()""").extract()
        # Key/value "detail" panel: values hide in either a <script> tag or an <a>.
        info_base_xpath = """//div[@class="detail "]/div"""
        info_nums = len(selector.xpath(info_base_xpath).extract())
        info = {}
        for i in range(1, info_nums + 1):
            second_xpath = """//div[@class="detail "]/div""" + str([i]) + """/div"""
            for j in range(1, len(selector.xpath(second_xpath).extract()) + 1):
                info_items = selector.xpath(
                    second_xpath + str([j]) + """/span[3]/script/text()""").extract()
                if not (info_items):
                    info_items = selector.xpath(second_xpath + str([j]) + """/a/text()""").extract()
                if info_items:
                    info_name = selector.xpath(
                        second_xpath + str([j]) + """/span[1]/text()""").extract()
                    info["".join(info_name)] = "".join(info_items).replace("\"", "")
        common_name = selector.xpath(
            """//div[@class="logo-text -l4 -w100"]/span/text()""").extract()
        # Key personnel table: [name..., title...] per row.
        member = []
        member_xpath = """//div[@class="clearfix"]/table/tbody/tr"""
        member_nums = len(selector.xpath(member_xpath).extract())
        for i in range(1, member_nums + 1):
            member_name = selector.xpath(
                member_xpath + str([i]) +
                """//a[@event-name="企业详情-主要人员"]/text()""").extract()
            member_title = selector.xpath(member_xpath + str([i]) + """/td[3]""").xpath(
                "string(.)").extract()
            member.append(member_name + member_title)
        # Shareholders: name, stake percentage, capital, profile link.
        holder_xpath = """//div[@id="_container_holder"]/table/tbody/tr"""
        holder_nums = len(selector.xpath(holder_xpath).extract())
        holder = []
        for i in range(1, holder_nums + 1):
            holder_name = selector.xpath(
                holder_xpath + str([i]) + """//a[@class="link-click"]/text()""").extract()
            holder_rate = selector.xpath(
                holder_xpath + str([i]) +
                """//span[@class="num-investment-rate"]/text()""").extract()
            holder_capital = selector.xpath(
                holder_xpath + str([i]) + """/td[4]/div/span/text()""").extract()
            holder_link = selector.xpath(
                holder_xpath + str([i]) +
                """/td/div/div/a[@class="link-click"]/@href""").extract()
            holder.append(holder_name + holder_rate + holder_capital + holder_link)
        # Outbound investments: investee, stake, legal representative, link.
        invest_xpath = """//div[@id="_container_invest"]/table/tbody/tr"""
        invest_nums = len(selector.xpath(invest_xpath).extract())
        invest = []
        for i in range(1, invest_nums + 1):
            invest_name = selector.xpath(
                invest_xpath + str([i]) +
                """/td[2]//a[@class="link-click"]/text()""").extract()
            invest_rate = selector.xpath(invest_xpath + str([i]) +
                                         """/td[5]/span/text()""").extract()
            invest_representative = selector.xpath(
                invest_xpath + str([i]) + """/td[3]/span[1]/text()""").extract()
            invest_link = selector.xpath(invest_xpath + str([i]) +
                                         """/td[2]//td/a/@href""").extract()
            invest.append(invest_name + invest_rate + invest_representative + invest_link)
        # Financing rounds: date, round, valuation, amount, investors (joined).
        financing_xpath = """//div[@id="_container_rongzi"]/table/tbody/tr"""
        financing_nums = len(selector.xpath(financing_xpath).extract())
        financing = []
        for i in range(1, financing_nums + 1):
            financing_time = selector.xpath(financing_xpath + str([i]) +
                                            """/td[2]/text()""").extract()
            financing_rounds = selector.xpath(financing_xpath + str([i]) +
                                              """/td[3]/text()""").extract()
            financing_valuation = selector.xpath(
                financing_xpath + str([i]) + """/td[4]/text()""").extract()
            financing_investor = selector.xpath(
                financing_xpath + str([i]) + """/td[7]//a/text()""").extract()
            financing_amount = selector.xpath(financing_xpath + str([i]) +
                                              """/td[5]/text()""").extract()
            financing.append(financing_time + financing_rounds +
                             financing_valuation + financing_amount +
                             [",".join(financing_investor)])
        legal_representative = selector.xpath(
            """//div[@class="legal-representative"]//div[@class="name"]"""
        ).xpath("string(.)").extract()
        tags = selector.xpath(
            """//span[@class="tag tag-new-category mr10"]/text()""").extract()
        # Branch offices: name, principal, profile link.
        branch_xpath = """//div[@id="_container_branch"]/table/tbody/tr"""
        branch_nums = len(selector.xpath(branch_xpath).extract())
        branch = []
        for i in range(1, branch_nums + 1):
            branch_name = selector.xpath(
                branch_xpath + str([i]) +
                """/td[2]//a[@class="link-click"]/text()""").extract()
            branch_principal = selector.xpath(branch_xpath + str([i]) +
                                              """/td[3]""").xpath(
                "string(.)").extract()
            branch_link = selector.xpath(branch_xpath + str([i]) +
                                         """/td[2]//td/a/@href""").extract()
            branch.append(branch_name + branch_principal + branch_link)
        other_link = selector.xpath(
            """//div[@class="container-right"]//a[@class="link-hover-click"]/@href"""
        ).extract()
        history_name = selector.xpath(
            """//div[@class="history-content"]/div/text()""").extract()
        # Base-info table cells addressed by fixed row/column positions —
        # fragile against page-layout changes.
        en_name = selector.xpath(
            """//div[@id="_container_baseInfo"]//table[@class="table -striped-col -border-top-none"]/tbody/tr[7]/td[4]/text()"""
        ).extract()
        registration_time = selector.xpath(
            """//div[@id="_container_baseInfo"]//table[@class="table -striped-col -border-top-none"]/tbody/tr[4]/td[2]/span/text()"""
        ).extract()
        size = selector.xpath(
            """//div[@id="_container_baseInfo"]//table[@class="table -striped-col -border-top-none"]/tbody/tr[5]/td[@colspan="2"]/text()"""
        ).extract()
        registered_capital = \
            selector.xpath("""//div[@id="_container_baseInfo"]/table[@class="table"]/tbody/tr[1]/td[2]/div[2]/@title""").extract()
        # The score lives in the chart image's alt text, e.g. "评分 85" → "85".
        score = "".join(
            selector.xpath(
                """//img[@class="sort-chart"]/@alt""").extract()).replace(
            "评分", "")
        # Core team cards: [name, positions, profile] triples.
        team_xpath = """//div[@id="_container_teamMember"]/div/div[@class="card-team"]"""
        team_nums = len(selector.xpath(team_xpath).extract())
        leaders = []
        for tn in range(1, team_nums + 1):
            leader_name = selector.xpath(
                team_xpath + "[" + str(tn) +
                """]/div[@class="left"]/div/div/img/@alt""").extract()
            leader_position = selector.xpath(
                team_xpath + "[" + str(tn) +
                """]/div[@class="right"]/div[@class="title"]/text()"""
            ).extract()
            leader_profile = selector.xpath(
                team_xpath + "[" + str(tn) +
                """]/div[@class="right"]/p/text()""").extract()
            leaders.append([
                "".join(leader_name),
                ",".join(leader_position),
                "\n".join(leader_profile)
            ])
        products_name = selector.xpath(
            """//div[@class="product-list"]/a/span/text()""").extract()
        # Brand ids are embedded in onclick handlers as quoted tokens.
        products_link = list(
            map(
                lambda x: "".join(re.findall("\'([a-z0-9]*)\'", x)),
                selector.xpath(
                    """//div[@class="product-list"]/a/@onclick""").extract()))
        result = {
            "registration_time": "".join(registration_time),
            "registered_capital": "".join(registered_capital),
            "leaders": leaders,
            "en_name": "".join(en_name),
            "history_name": history_name,
            "common_name": "".join(common_name),
            "other_link": other_link,
            "branch": branch,
            "tags": tags,
            "legal_representative": "".join(legal_representative),
            "holder": holder,
            "company_link": url,
            "invest": invest,
            "member": member,
            "name": "".join(name),
            "update_time": "".join(update_time),
            "financing": financing,
            "info": info,
            "score": score,
            "products": products_name,
            "size": "".join(size)
        }
        # self.client.insert_one(result)
        # brand_link = set(selector.xpath("""//div[@class="item"]/a/@href""").extract())
        # for k in brand_link:
        #     redis_queue.rpush(BRAND_LINK, k)
        # for k in products_link:
        #     redis_queue.rpush(BRAND_LINK, self.brand_url.format(k))
        return result

    def parse_link_html(self, h):
        """Parse a search-results page: queue company links to COMP_LINK and
        brand ids (extracted from onclick handlers) to BRAND_LINK."""
        selector = Selector(text=h)
        links = set(
            selector.xpath("""//div[@class="header"]/a/@href""").extract())
        for k in links:
            redis_queue.rpush(COMP_LINK, k)
        brand_link = set(
            map(lambda x: self.brand_url.format(x.split("\'")[1]),
                selector.xpath(
                    """//div[@class="brand"]/@onclick""").extract()))
        for k in brand_link:
            redis_queue.rpush(BRAND_LINK, k)

    def parse_brand_html(self, h):
        """Parse a brand detail page and insert one document keyed by the
        canonical brand URL; skips pages already stored."""
        selector = Selector(text=h)
        # Brand id is the trailing alphanumeric token of the alternate link.
        key = re.findall(
            "[a-z0-9]+$", "".join(
                selector.xpath(
                    """//link[@rel="alternate"]/@href""").extract()))
        url = self.brand_url.format("".join(key))
        if not key:
            return
        if self.brand_client.find_one({"brand_link": url}):
            return
        name = "".join(
            selector.xpath(
                """//div[@class="content"]/div[@class="header"]/div/text()""").
            extract())
        father = "".join(
            selector.xpath(
                """//div[@class="content"]/div[@class="header"]/a/text()""").
            extract())
        tags = selector.xpath("""//div[@class="tags"]/a/text()""").extract()
        info = selector.xpath(
            """//div[@class="infos"]/span/text()""").extract()
        profile = selector.xpath(
            """//div[@class="block-data-group"][1]/div[@class="block-data"]/div/text()"""
        ).extract()
        # Leadership table: "name<TAB>position" strings, one per row.
        leaders_xpath = """//div[@class="block-data-group"][2]/div/div/table/tbody/tr"""
        leaders_nums = len(selector.xpath(leaders_xpath).extract())
        leaders = []
        for n in range(1, leaders_nums + 1):
            person_xpath = leaders_xpath + "[" + str(n) + "]"
            leader_name = "".join(
                selector.xpath(
                    person_xpath +
                    "/td[2]/table/tr/td[1]/div[2]/img/@alt").extract())
            leader_position = "".join(
                selector.xpath(person_xpath + "/td[3]/text()").extract())
            leaders.append(leader_name + "\t" + leader_position)
        result = {
            "brand_link": url,
            "name": name,
            "father": father,
            "tags": tags,
            "info": info,
            "profile": "".join(profile),
            "leaders": leaders
        }
        self.brand_client.insert_one(result)