def get_updated(): db = pymysql.connect(host="192.168.2.99", user="******", password='******', database="spider", port=3306) cursor = db.cursor() sql = "select record_id from spider_sh_related_internet_medicine_msg_server" cursor.execute(sql) db_data = cursor.fetchall() # print(db_data) # exit() data = [i[0] for i in db_data] from pybloom_live import ScalableBloomFilter bloom = ScalableBloomFilter(initial_capacity=10000,error_rate=0.001) for bl in data: bloom.add(bl) return bloom
def blood_filter_pickle(sum):
    # `cursor` is assumed to be an open pymysql cursor defined elsewhere in this module.
    sql = "select * from qcc_source_data"
    cursor.execute(sql)
    data = cursor.fetchall()
    if len(data) == 0:
        return sum
    data = [i[2] for i in data]
    from pybloom_live import ScalableBloomFilter
    bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)
    for bl in data:
        bloom.add(bl)
    # Keep only the rows whose 园区id has not been stored yet.
    sum = [i for i in sum if i['园区id'] not in bloom]
    return sum
def updated(): db = pymysql.connect(host="192.168.2.97", user="******", password='******', database="spider", port=3306) cursor = db.cursor() sql = "select url from spider_2_company_revoke" cursor.execute(sql) db_data = cursor.fetchall() data = [i[0].strip() for i in db_data] bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001) for i in data: bloom.add(i) return bloom
class Spider_related(threading.Thread, Downloader):
    def __init__(self, keyList_queue, writer, contain):
        super(Spider_related, self).__init__()
        self.keyList_queue = keyList_queue
        self.writer = writer
        self.contain = contain
        # A Bloom filter that grows automatically as keywords are added
        self.bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)

    def run(self):
        while True:
            key = self.keyList_queue.get()
            # Skip keywords that have already been crawled
            if key in self.bloom:
                self.keyList_queue.task_done()
                continue
            # Download the page source for this keyword
            source = self.download(key)
            self.bloom.add(key)  # record every crawled keyword so duplicates get filtered out
            # Skip if the downloader returned None
            if source is None:
                # task_done(): queue.join() only unblocks once every queued item has been processed
                self.keyList_queue.task_done()
                continue
            # Parse new keywords out of the page source
            self.parse_keyList(source)
            self.writer.flush()  # flush the keyword file after every page
            self.keyList_queue.task_done()

    def parse_keyList(self, source):
        ele = etree.HTML(source)
        keyList = ele.xpath('//table//tr//th/a/text()')
        for key in keyList:
            for con in self.contain:
                if con in key:
                    if key in self.bloom:
                        break  # already crawled, move on to the next parsed keyword
                    self.writer.write('{}\n'.format(key))
                    self.keyList_queue.put(key)
                    print('new keyword: {}'.format(key))
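# A minimal launcher sketch for Spider_related. The seed keywords, output file, substring
# list and thread count below are illustrative only, and Downloader is assumed to be the
# project's mixin that provides download().
import queue

def start_keyword_spiders():
    keyList_queue = queue.Queue()
    for seed in ['医疗器械', '互联网药品']:      # hypothetical seed keywords
        keyList_queue.put(seed)
    writer = open('keywords.txt', 'a', encoding='utf8')
    contain = ['医', '药']                      # substrings a keyword must contain to be kept
    for _ in range(4):                          # hypothetical worker count
        worker = Spider_related(keyList_queue, writer, contain)
        worker.daemon = True                    # let the process exit once the queue drains
        worker.start()
    keyList_queue.join()                        # unblocks after task_done() for every item
    writer.close()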
def get_updated(): db = pymysql.connect(host="192.168.2.97", user="******", password='******', database="spider", port=3306) cursor = db.cursor() sql = "select license_num from spider_qualification" cursor.execute(sql) db_data = cursor.fetchall() data = [i[0] for i in db_data] from pybloom_live import ScalableBloomFilter bloom = ScalableBloomFilter(initial_capacity=1000000, error_rate=0.001) for bl in data: bloom.add(bl) return bloom
def get_bloomFilter(self, sql):
    bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    db = pymysql.connect("localhost", "root", "123456", "mytest", charset='utf8')
    cursor = db.cursor()
    cursor.execute(sql)
    desc = cursor.description
    # Turn each row into a {column_name: value} dict
    object_dict = [
        dict(zip([col[0] for col in desc], row))
        for row in cursor.fetchall()
    ]
    # print(object_dict)
    cursor.close()
    for d in object_dict:
        # print(d)
        # pybloom_live hashes non-string keys via their str() form, so each
        # whole row dict becomes a single filter entry
        bloom.add(d)
    return bloom
def get_updated(): db = pymysql.connect(host="192.168.2.99", user="******", password='******', database="spider", port=3306) cursor = db.cursor() sql = "select record_num from spider_sh_company_medical_equipment_network_third_party_platform" cursor.execute(sql) db_data = cursor.fetchall() data = [i[0] for i in db_data] from pybloom_live import ScalableBloomFilter bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001) for bl in data: bloom.add(bl) return bloom
class RemoveSameUrlPipLine:
    def __init__(self):
        redis_db = redis.Redis(host='127.0.0.1', port=6379, db=0,
                               decode_responses=True)
        result = redis_db.smembers('spider:url')
        self.sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        for item in result:
            self.sbf.add(item)

    def process_item(self, item, spider):
        if item['link'] in self.sbf:
            raise DropItem("same title in %s" % item['link'])
        else:
            return item
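# A Scrapy item pipeline like RemoveSameUrlPipLine only runs once it is registered in the
# project's settings.py; a minimal sketch, where 'myproject.pipelines' is a placeholder
# module path and 300 an arbitrary priority:
ITEM_PIPELINES = {
    'myproject.pipelines.RemoveSameUrlPipLine': 300,
}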
class IgnoreRequestMiddleware(object):
    """Deduplicate requests by URL."""

    def __init__(self):
        # A Bloom filter that grows automatically as URLs are added
        self.sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)

    def process_request(self, request, spider):
        if not request.url:
            return None
        url_hash = hashlib.md5(request.url.encode("utf8")).hexdigest()
        if url_hash in self.sbf:
            raise IgnoreRequest("Spider : %s, IgnoreRequest : %s" %
                                (spider.name, request.url))
        else:
            self.sbf.add(url_hash)
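# Likewise, the middleware above has to be enabled as a downloader middleware in
# settings.py; a minimal sketch with a placeholder module path and priority:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.IgnoreRequestMiddleware': 543,
}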
def get_bloom(): db = pymysql.connect(host="192.168.2.97", user="******", password='******', database="spider", port=3306) cursor = db.cursor() sql = "select company_num from company_title" cursor.execute(sql) db_data = cursor.fetchall() data = [i[0] for i in db_data] print(data) from pybloom_live import ScalableBloomFilter bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001) for bl in data: bloom.add(bl) db.close() return bloom
def main():
    """Companies that have an ICP/EDI domain filing."""
    db = pymysql.connect(host="192.168.2.99", user="******", password='******',
                         database="spider", port=3306)
    cursor = db.cursor()
    sql = 'select distinct (company_name) from spider_company_icp;'
    cursor.execute(sql)
    domain_data = cursor.fetchall()
    domain_data = [i[0] for i in domain_data]
    # print(domain_data)
    print('ICP domain filings ---- count:{}'.format(len(domain_data)))

    # Companies without an ICP/EDI licence.
    add_value_telecom_lists = model1_filter_data('spider_add_value_telecom_info_hz')
    industry_lists = model1_filter_data('spider_industry_information')
    industry_gov_lists = model1_filter_data('spider_industry_information_gov')
    p = add_value_telecom_lists + industry_lists + industry_gov_lists

    bloom = ScalableBloomFilter(initial_capacity=1000000, error_rate=0.001)
    for i in p:
        bloom.add(i)

    sum = []
    for company in domain_data:
        if company not in bloom:
            sum.append(company)
        else:
            print('has an ICP licence:', company)
    print('ICP domain filing but no ICP licence ----- count:{}'.format(len(sum)))
    print(sum)

    for i in sum:
        medicine = Medicine(company_name=i)
        session.add(medicine)
    session.commit()
    session.close()
def generate_mul_col_bloom(conf, capacity, cursor):
    """
    Build a Bloom filter over a composite (multi-column) key for foreign-key analysis.

    :param conf: configuration object
    :param capacity: initial capacity of the Bloom filter
    :param cursor: cursor positioned on the composite-key values
    :return: the populated Bloom filter
    """
    assert isinstance(conf, Config)
    b = ScalableBloomFilter(initial_capacity=capacity,
                            error_rate=conf.bloom_error_rate)
    while True:
        row = cursor.fetchone()
        if not row:
            break
        # Core idea: for each row of the composite key, strip the values,
        # hash them with get_md5, and add that hash to the filter.
        hash_elem = get_md5([str(elem).rstrip() for elem in row])
        b.add(hash_elem)
    return b
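# To test membership later, a candidate row has to be normalised exactly the same way
# before probing the filter. A minimal sketch that reuses the get_md5 helper assumed by
# generate_mul_col_bloom(); candidate_row stands in for a row from the referencing table.
def composite_key_probably_exists(bloom, candidate_row):
    hash_elem = get_md5([str(elem).rstrip() for elem in candidate_row])
    return hash_elem in bloom   # True may be a false positive; False is definite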
def get_updated(): db = pymysql.connect(host="192.168.2.99", user="******", password='******', database="spider", port=3306) cursor = db.cursor() sql = "select permit_number from spider_add_value_telecom_info" cursor.execute(sql) db_data = cursor.fetchall() # print(db_data) # exit() data = [i[0].strip() for i in db_data] # print(data) # exit() from pybloom_live import ScalableBloomFilter bloom = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001) for bl in data: bloom.add(bl) return bloom
def collection_list_field(list, data, main_field):
    if list:
        bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        for l in list:
            bloom.add(l)
        if data not in bloom:
            for l in list:
                if l[main_field] == data[main_field]:
                    list.remove(l)
                    item = {}
                    for k, v in l.items():
                        item[k] = databaseTool.collection_field(
                            databaseTool.is_field_dict(l, k),
                            databaseTool.is_field_dict(data, k))
                    list.append(item)
        else:
            list.append(data)
    else:
        list.append(data)
    return list
class ConOfAllData(object):
    def __init__(self, site_name):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.crawlSpider
        self.col_url = self.db[site_name + "_url"]
        self.col_content = self.db[site_name + "_content"]
        self.sbf = ScalableBloomFilter(initial_capacity=100)
        for item in self.col_url.find():
            self.sbf.add(item["url"])
        self.insert_url = []
        self.insert_content = []

    def isexist(self, url):
        if url in self.sbf:
            return True
        else:
            self.sbf.add(url)
            self.insert_url.append({"url": url})
            return False

    def insert(self, content):
        if (content['real_url'] is not None and content['title'] is not None
                and content['abstract'] is not None):
            self.insert_content.append(content)

    def end(self):
        if len(self.insert_url) != 0:
            self.col_url.insert_many(self.insert_url)
        if len(self.insert_content) != 0:
            self.col_content.insert_many(self.insert_content)


# if __name__ == "__main__":
#     coad = ConOfAllData("ningxia")
#     coad.isexist("1")
#     coad.isexist('2')
#     coad.isexist("1")
#     coad.insert({"site": "ningxia"})
#     coad.insert({"site": "guangdong"})
#     coad.end()
def delete_duplicate_data(self, file):
    """
    Remove duplicate lines from a text file.
    :param file: path of the file to deduplicate
    :return:
    """
    bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.00000001)
    temp_name = file.replace(".txt", "_temp.txt")
    with open(file, 'r', encoding='utf8') as r_f, \
            open(temp_name, 'a', encoding='utf8') as w_f:
        for line in r_f:
            line_content = line.strip()
            if line_content not in bloom:
                bloom.add(line_content)
                w_f.write(line_content + "\n")
            else:
                print(line_content)
    os.remove(file)
    os.rename(temp_name, file)
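# A possible simplification (not in the original code): ScalableBloomFilter.add() returns
# True when the element was probably seen before, so the membership test and the add can
# be folded into one call. A self-contained sketch:
from pybloom_live import ScalableBloomFilter

def dedupe_lines(lines):
    """Yield each distinct line once, relying on add()'s return value."""
    bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.00000001)
    for line in lines:
        line_content = line.strip()
        if not bloom.add(line_content):   # False means it was newly added
            yield line_content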
class GanjiPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )
        # mongo
        # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/'
        # self.connection = pymongo.MongoClient(uri)
        # self.connection = pymongo.MongoClient(
        #     settings['MONGODB_SERVER'],
        #     settings['MONGODB_PORT']
        # )
        # db = self.connection[settings['MONGODB_DB']]
        # self.collection = db[settings['MONGODB_COLLECTION']]
        #
        # count
        self.mongocounts = 0
        self.counts = 0
        self.CrawlCar_Num = 1000000
        self.settings = settings
        self.add_num = 0
        self.drop_num = 0
        # bloom file
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB']
        self.df_result = pd.DataFrame()
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01)
        # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01)
        #
        # read
        if os.path.exists(dirname):
            if os.path.exists(filename):
                self.fa = open(filename, "a")
            else:
                pathlib.Path(filename).touch()
                self.fa = open(filename, "a")
        else:
            os.makedirs(dirname)
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")
        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        if spider.name in [
                'ganji', 'crawl_jingzhengu', 'xiaozhu', 'hry2car', 'che168',
                'youxin', 'chesupai', 'youxin_master', 'auto51', 'autohome_butie'
        ]:
            valid = True
            i = md5(item['status'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)
            field_list = [
                "carsource", "grab_time", "price1", "mileage", "post_time",
                "sold_date", "city", "registerdate"
            ]
            data = dict()
            for field in field_list:
                data[field] = item[field] if field in item else None
            if returndf:
                self.drop_num += 1
                valid = False
            else:
                pass
            if valid:
                self.fa.writelines(i + '\n')
                # write the data to MySQL
                items = list()
                items.append(item)
                df = pd.DataFrame(items)
                if spider.name in ['test', ]:
                    self.df_result = pd.concat([self.df_result, df])
                    self.mongocounts += 1
                    logging.log(msg=f"add {self.mongocounts} items", level=logging.INFO)
                else:
                    df.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn,
                              if_exists="append", index=False)
                    self.mongocounts += 1
                    logging.log(msg=f"scrapy {self.mongocounts} items", level=logging.INFO)

    def close_spider(self, spider):
        # self.connection.close()
        logging.log(msg=f"drop {self.drop_num} items", level=logging.INFO)
        if spider.name in ['test']:
            self.df_result.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn,
                                  if_exists="append", index=False)
        self.conn.dispose()
class Crawler(object): def __init__(self, site): ''' (Crawler, str) -> Crawler creates a Crawler with a given origin_url ''' self.site = site self.filters = site.referringsitefilter_set.all() self.domain = urlparse(site.url).netloc # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/ # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter self.ignore_filter = ScalableBloomFilter( initial_capacity=10000000, error_rate=0.00001) ignore_filter_dir='../ignore_filter/' if not os.path.exists(ignore_filter_dir): os.makedirs(ignore_filter_dir) self.ignore_filter = ScalableBloomFilter( initial_capacity=10000000, error_rate=0.00001) try: f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+') f.write(self.ignore_filter) except IOError: f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+') f.close() else: if (not(os.path.exists('../ignore_filter/' + self.site.name + '_ignore_file.txt'))): f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+') f.close() with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+', buffering=False) as ignore_filter_file: try: for line in ignore_filter_file: self.ignore_filter.add(line.decode('utf8').rstrip()) except Exception as e: logging.info(str(e)) ignore_filter_file.close() self.visited_count = 0 tmpqueuetmp_dir='../tmpqueue/tmp/' if not os.path.exists(tmpqueuetmp_dir): os.makedirs(tmpqueuetmp_dir) slugified_name = slugify(unicode(site.name)) tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name) if not os.path.exists(tmpqueue_dir): os.makedirs(tmpqueue_dir) self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir) # Initial url if (self.site.is_shallow == False): self.to_visit.put(site.url) else: self.to_visit.put((site.url, str(0))) # Limit self.limit = common.get_config()["crawler"]["limit"] # Specifies how deep the shallow crawler should go; "1" is the lowest option for this self.level = common.get_config()["crawler"]["level"] """ self.probabilistic_n = common.get_config()["crawler"]["n"] self.probabilistic_k = common.get_config()["crawler"]["k"] self.db = psycopg2.connect(host='localhost', database=common.get_config()["crawler"]["postgresql"]["name"], user=common.get_config()["crawler"]["postgresql"]["user"], password=common.get_config()["crawler"]["postgresql"]["password"]) self.cursor = self.db.cursor() self.already_added_urls = set() self.visited_table = "visited_" + str(site.id) self.tovisit_table = "tovisit_" + str(site.id) #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table) #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)") self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table) self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))") #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,)) self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,)) self.db.commit() """ def __iter__(self): return self def next(self): ''' (Crawler) -> newspaper.Article returns the next article in the sequence ''' #standard non-recursive tree iteration with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file: try: current_level = 0 while(True): if (self.limit > 0 and self.visited_count > self.limit): raise StopIteration('Limit reached: {:d}'.format(self.limit)) # if(self.pages_visited > self.probabilistic_n): # raise StopIteration 
# self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1") # row = self.cursor.fetchone() # if(row): # row_id = row[0] # current_url = row[1] # self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,)) # else: # raise StopIteration # if(self._should_skip()): # logging.info(u"skipping {0} randomly".format(current_url)) # continue try: if (self.site.is_shallow): current = self.to_visit.get_nowait() current_url = current[0] current_level = current[1] logging.info(u"Shallow on level {0} {1}".format(current_level, current_url)) else: current_url = self.to_visit.get_nowait() except Empty: self.site.is_shallow = True # On line 26 the site gets set TO DELETE self.to_visit.put((self.site.url, str(0))) self.ignore_filter = ScalableBloomFilter( initial_capacity=10000000, error_rate=0.00001) ignore_filter_file.close() os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt') logging.info("stopped iteration") logging.info(u"{0}".format(self.site.url)) raise ZeroDivisionError logging.info(u"visiting {0}".format(current_url)) self.visited_count += 1 #use newspaper to download and parse the article article = ExplorerArticle(current_url) article.download() if (self.site.is_shallow): if (int(current_level) > self.level): continue # get urls from the article for link in article.get_links(): url = urljoin(current_url, link.href, False) if self.url_in_filter(url, self.filters): logging.info(u"skipping url \"{0}\" because it matches filter".format(url)) continue try: parsed_url = urlparse(url) parsed_as_list = list(parsed_url) if(parsed_url.scheme != u"http" and parsed_url.scheme != u"https"): logging.info(u"skipping url with invalid scheme: {0}".format(url)) continue parsed_as_list[5] = '' url = urlunparse(urlnorm.norm_tuple(*parsed_as_list)) except Exception as e: logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e))) continue if(not parsed_url.netloc.endswith(self.domain)): continue # If the url have been added to ignore list, skip if (url in self.ignore_filter): continue # Ignores the subscribe links for many domains if (u"subscribe" in url or "subscribe" in url and not(u"-subscribe" in url or "-subscribe" or u"subscribe-" in url or "subscribe-")): continue # Append the url to to_visit queue if (self.site.is_shallow): self.to_visit.put((url, str(int(current_level) + 1))) logging.info(u"added {0} to the to_visit as well as the level {1}".format(url, str(int(current_level) + 1))) # Append the url to visited to remove duplicates self.ignore_filter.add(url) ignore_filter_file.write(url.encode('utf8') + "\n") else: self.to_visit.put(url) logging.info(u"added {0} to the to_visit".format(url)) # Append the url to visited to remove duplicates self.ignore_filter.add(url) ignore_filter_file.write(url.encode('utf8') + "\n") # Update the Queue self.to_visit.task_done() return article except StopIteration as e: raise e except ValueError as e: raise ValueError except Exception as e: raise e def url_in_filter(self, url, filters): """ Checks if any of the filters matches the url. Filters can be in regex search or normal string comparison. """ for filt in filters: if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE)) or (not filt.regex and filt.pattern in url)): return True return False
class GanjiPipeline(object): @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) def __init__(self, settings): # mysql self.conn = create_engine( f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8' ) # mongo # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/' # self.connection = pymongo.MongoClient(uri) # self.connection = pymongo.MongoClient( # settings['MONGODB_SERVER'], # settings['MONGODB_PORT'] # ) # db = self.connection[settings['MONGODB_DB']] # self.collection = db[settings['MONGODB_COLLECTION']] # # count self.mongocounts = 0 self.counts = 0 self.CrawlCar_Num = 1000000 self.settings = settings self.add_num = 0 self.drop_num = 0 self.log_dict = { "projectName": "used-car-scrapy", "logProgram": '', "logProgramPath": str(pathlib.Path.cwd()), "logPath": "/home/logs/usedcar_new", "logTime": '', "logMessage": "", "logServer": "192.168.1.241", "logObjectType": "UsedCarPaChong", "logObject": { "field": '', "info": { "dataBaseType": 'mysql', "dataBaseName": '', "tableName": '', "saveStatus": '' } } } # bloom file filename = str(pathlib.Path.cwd()) + '/blm/' + settings[ 'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm' dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] self.df_result = pd.DataFrame() self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01) # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01) # # read if os.path.exists(dirname): if os.path.exists(filename): self.fa = open(filename, "a") else: pathlib.Path(filename).touch() self.fa = open(filename, "a") else: os.makedirs(dirname) pathlib.Path(filename).touch() self.fa = open(filename, "a") with open(filename, "r") as fr: lines = fr.readlines() for line in lines: line = line.strip('\n') self.df.add(line) def open_spider(self, spider): pass def process_item(self, item, spider): if spider.name in ['chesupai']: self.log_dict["logServer"] = "192.168.1.241" if spider.name in ['anxinpai']: self.log_dict["logServer"] = "192.168.1.249" if spider.name in ['crawl_jingzhengu', 'ganji']: self.log_dict["logServer"] = "192.168.1.248" if spider.name in ['xiaozhu', 'hry2car']: self.log_dict["logServer"] = "192.168.1.92" if spider.name in [ 'ganji', 'crawl_jingzhengu', 'xiaozhu', 'hry2car', 'che168', 'youxin', 'chesupai', 'youxin_master', 'auto51' ]: valid = True i = md5(item['statusplus'].encode("utf8")).hexdigest() returndf = self.df.add(i) self.log_dict["logProgram"] = spider.name self.log_dict["logTime"] = item["grab_time"] self.log_dict["logType"] = 'INFO' self.log_dict["logMessage"] = "successful" field_list = [ "carsource", "grab_time", "price1", "mileage", "post_time", "sold_date", "city", "registerdate" ] data = dict() for field in field_list: data[field] = item[field] if field in item else None self.log_dict["logObject"]["field"] = data self.log_dict["logObject"]["field"]["carsource"] = item[ "car_source"] self.log_dict["logObject"]["info"][ "dataBaseName"] = "usedcar_update" self.log_dict["logObject"]["info"][ "tableName"] = spider.name + '_online' if returndf: self.drop_num += 1 valid = False # self.log_dict["logObject"]["info"]["saveStatus"] = "false" self.log_dict["logObject"]["info"]["saveStatus"] = "true" logging.log(msg=json.dumps(self.log_dict, ensure_ascii=False), level=logging.INFO) # raise DropItem("Drop data {0}!".format(item["url"])) else: pass if valid: 
self.fa.writelines(i + '\n') # 数据存入mysql items = list() items.append(item) df = pd.DataFrame(items) if spider.name in [ 'test', ]: self.df_result = pd.concat([self.df_result, df]) self.mongocounts += 1 logging.log( msg= f"add {self.mongocounts} items", level=logging.INFO) else: # self.log_dict["logObject"]["info"]["saveStatus"] = "true" self.log_dict["logObject"]["info"]["saveStatus"] = "false" logging.log(msg=json.dumps(self.log_dict, ensure_ascii=False), level=logging.INFO) df.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn, if_exists="append", index=False) self.mongocounts += 1 logging.log( msg= f"scrapy {self.mongocounts} items", level=logging.INFO) def close_spider(self, spider): # self.connection.close() logging.log( msg=f"drop {self.drop_num} items", level=logging.INFO) if spider.name in ['test']: self.df_result.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn, if_exists="append", index=False) self.conn.dispose()
class AutohomeNewPipeline:
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        # mysql
        self.conn = create_engine(
            f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8'
        )
        # mongo
        self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                              settings['MONGODB_PORT'])
        db = self.connection[settings['MONGODB_DB']]
        website = settings["WEBSITE"]
        self.collection = db[settings['MONGODB_COLLECTION']]
        # count
        self.mysqlcounts = 0
        self.counts = 0
        self.settings = settings
        # bloom file
        self.CrawlCar_Num = 1000000
        filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB'] + '/' + settings['MONGODB_COLLECTION'] + '.blm'
        dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB']
        self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01)
        if os.path.exists(dirname):
            if os.path.exists(filename):
                self.fa = open(filename, "a")
            else:
                pathlib.Path(filename).touch()
                self.fa = open(filename, "a")
        else:
            os.makedirs(dirname)
            pathlib.Path(filename).touch()
            self.fa = open(filename, "a")
        with open(filename, "r") as fr:
            lines = fr.readlines()
            for line in lines:
                line = line.strip('\n')
                self.df.add(line)
        self.counts = 0

    def process_item(self, item, spider):
        # spiders that dedupe on the `status` field and write to MongoDB go here
        if spider.name in ["autohome_dealer", " "]:
            valid = True
            i = md5(item['status'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)
            if returndf:
                valid = False
                raise DropItem("Drop data {0}!".format(item["status"]))
            else:
                self.fa.writelines(i + '\n')
                self.collection.insert(dict(item))
                logging.log(msg="Car added to MongoDB database!", level=logging.INFO)
                self.counts += 1
                logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO)
                return item
        # spiders written to MongoDB without dedup go here
        elif spider.name in ["", " "]:
            self.collection.insert(dict(item))
            logging.log(msg="Car added to MongoDB database!", level=logging.INFO)
            self.counts += 1
            logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO)
            return item
        # spiders that dedupe on `status` and write to MySQL go here
        elif spider.name in ['', ' ']:
            valid = True
            i = md5(item['status'].encode("utf8")).hexdigest()
            returndf = self.df.add(i)
            if returndf:
                valid = False
                raise DropItem("Drop data {0}!".format(item["status"]))
            else:
                self.fa.flush()
                self.fa.writelines(i + '\n')
                self.mysqlcounts += 1
                logging.log(msg=f"scrapy {self.mysqlcounts} items", level=logging.INFO)
                # write the data to MySQL
                items = list()
                items.append(item)
                df = pd.DataFrame(items)
                df.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn,
                          if_exists="append", index=False)
                logging.log(msg=f"add data in mysql", level=logging.INFO)
                return item
        # spiders written to MySQL without dedup go here
        elif spider.name in ['baidu', '']:
            self.mysqlcounts += 1
            logging.log(msg=f"scrapy {self.mysqlcounts} items", level=logging.INFO)
            # write the data to MySQL
            items = list()
            items.append(item)
            df = pd.DataFrame(items)
            df.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn,
                      if_exists="append", index=False)
            logging.log(msg=f"add data in mysql", level=logging.INFO)
            return item

    def close_spider(self, spider):
        self.connection.close()
        self.fa.close()
class WeiboCnSpider: def __init__(self, tasks=2, loop=None): self.tasks = tasks self.loop = loop or asyncio.get_event_loop() self.redis_cookie = RedisCookie() self.redis_cookie_now = src.redis_cookies.RedisCookies() self.redis_job = RedisJob() self.bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) self.weibo_limit = True self.time_current_pattern = re.compile(r'(\d*)分钟前') self.time_today_pattern = re.compile(r'今天\s*(\d*):(\d*)') self.time_year_pattern = re.compile(r'(\d*)月(\d*)日\s*(\d*):(\d*)') self.user_id_pattern = re.compile(r'https://weibo.cn/u/(\d*)') self.weibo_host = 'https://weibo.cn' self.follow_url = self.weibo_host + '/%s/follow' self.redis_job_now = src.redis_cookies.RedisJob() self.fan_url = self.weibo_host + '/%s/fans' self.user_info_url = self.weibo_host + '/%s/info' self.user_tweet_url = self.weibo_host + '/%s' self.user_tweet_url2 = self.weibo_host + '/%s?page=%d' self.user_repost_url = self.weibo_host + '/repost/%s' self.user_repost_url2 = self.weibo_host + '/repost/%s?page=%d' self.tweet_comment_url = self.weibo_host + '/comment/%s' self.tweet_comment_url2 = self.weibo_host + '/comment/%s?page=%d' self.weibo_producer = WeiboProcuder(['localhost:9092'], 'lyzzrO_U') self.search_url = 'https://weibo.cn/search/?pos=search' self.get_search_url = 'https://weibo.cn/search/mblog/?keyword=%s&filter=hasori' self.topic_max_page = 500 self.super_new_fan_max_page = 500 self.topic_url = 'https://m.weibo.cn/api/container/getIndex?containerid=1008086a8d30da45d91717a596df498af918aa_-_feed&page=%d' self.super_new_fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=2311406a8d30da45d91717a596df498af918aa_-_super_newfans&page=%d' async def crawl_follow(self): while True: follow_dict = self.redis_job_now.fetch_job(JobType.follower.value) if follow_dict: try: await self.grab_follow(follow_dict) LOGGER.info('finish %d follow crawl ' % follow_dict['uid']) except TimeoutError as e: print(e) except: LOGGER.error(traceback.format_exc()) sleep(5 * 60) async def grab_follow(self, follow_dict): LOGGER.info('start grab user follow: %s' % str(follow_dict)) html_content = await self.grab_html(follow_dict['url']) follow_html = BeautifulSoup(html_content, "lxml") all_td = follow_html.find_all('td', style=True) follow_id=[] for td in all_td: a = td.find('a').get('href') usr_id_result = self.user_id_pattern.findall(a) if usr_id_result: usr_id = usr_id_result[0] else: usr_id = await self.get_user_id_from_homepage(a) if usr_id not in follow_id: follow_id.append(int(usr_id)) user_follow_dict={} follow_id_key_list = [i for i in range(len(follow_id))] follow_id = dict(zip(follow_id_key_list, follow_id)) user_follow_dict['type'] = 'follow' user_follow_dict['uid'] = follow_dict['uid'] user_follow_dict['fans_id'] = follow_id await self.weibo_producer.send(user_follow_dict, self.follow_url % follow_dict['uid']) if 'page=' not in follow_dict['url']: page_div = follow_html.find(id='pagelist') if page_div: max_page = int(page_div.input.get('value')) if max_page>20: max_page=20 for page in range(2, max_page + 1): await self.redis_job.push_job(JobType.follower.value, {'url': (self.follow_url % follow_dict['uid']) + '?page=' + str(page), 'uid': follow_dict['uid']}) async def crawl_fan(self): while True: fan_dict = self.redis_job_now.fetch_job(JobType.fan.value) if fan_dict: try: await self.grab_fan(fan_dict) LOGGER.info('finish %d fan crawl ' % fan_dict['uid']) except TimeoutError as e: print(e) except: LOGGER.error(traceback.format_exc()) sleep(5 * 60) async def grab_fan(self, 
fan_dict): LOGGER.info('start grab user fan: %s' % str(fan_dict)) html_content = await self.grab_html(fan_dict['url']) fan_html = BeautifulSoup(html_content, "lxml") all_td = fan_html.find_all('td', style=True) fans_id=[] for td in all_td: a = td.find('a').get('href') usr_id_result = self.user_id_pattern.findall(a) if usr_id_result: usr_id = usr_id_result[0] else: usr_id = await self.get_user_id_from_homepage(a) if usr_id not in fans_id: fans_id.append(int(usr_id)) user_fan_dict={} fans_id_key_list = [i for i in range(len(fans_id))] fans_id = dict(zip(fans_id_key_list,fans_id)) user_fan_dict['type'] = 'fan' user_fan_dict['uid'] = fan_dict['uid'] user_fan_dict['fans_id'] = fans_id await self.weibo_producer.send(user_fan_dict, self.fan_url % fan_dict['uid']) if 'page=' not in fan_dict['url']: page_div = fan_html.find(id='pagelist') if page_div: max_page = int(page_div.input.get('value')) if max_page>20: max_page=20 for page in range(2, max_page + 1): await self.redis_job.push_job(JobType.fan.value, {'url': (self.fan_url % fan_dict['uid']) + '?page=' + str(page), 'uid': fan_dict['uid']}) async def crawl_comment(self): while True: comment_job_info = await self.redis_job.fetch_job(JobType.comment.value) if comment_job_info: try: # asyncio.run_coroutine_threadsafe(self.grab_tweet_comments(comment_job_info), self.loop) await self.grab_tweet_comments(comment_job_info) except TimeoutError as e: pass except: LOGGER.error("something error") LOGGER.error(traceback.format_exc()) sleep(5 * 60) async def crawl_repost(self): while True: repost_job_info = await self.redis_job.fetch_job(JobType.repost.value) if repost_job_info: try: await self.grab_tweet_repost(repost_job_info) except TimeoutError as e: pass except: LOGGER.error("something error") LOGGER.error(traceback.format_exc()) sleep(5 * 60) async def crawl_weibo(self): r = re.compile(r'https://weibo.cn/(\d*)\?page=(\d*)') while True: tweet_job_info = await self.redis_job.fetch_job(JobType.tweet.value) if tweet_job_info: m = r.findall(tweet_job_info['url']) if m: page_no = int(m[0][1]) if page_no > 200: LOGGER.info('job passed %s' % str(tweet_job_info)) continue # if 'page=' in tweet_job_info['url']: # LOGGER.info('job passed %s' % str(tweet_job_info)) # continue try: await self.grab_user_tweet(tweet_job_info) except TimeoutError as e: pass except: LOGGER.error(traceback.format_exc()) sleep(5 * 60) async def search(self): while True: search_job_info = await self.redis_job.fetch_job(JobType.search.value) if search_job_info: try: await self.search_tweet(search_job_info) except TimeoutError as e: pass except: LOGGER.error(traceback.format_exc()) sleep(5 * 60) async def crawl_user(self): while True: user_job_info = self.redis_job_now.fetch_job(JobType.user.value) if user_job_info: try: if 'source' in user_job_info: await self.grab_user_info(user_job_info['user_id'], user_job_info['source']) else : await self.grab_user_info(user_job_info['user_id'], user_job_info['source']) # await self.redis_job.push_job(JobType.tweet.value, # {'url': 'https://weibo.cn/' + user_job_info['user_id'], # 'uid': user_job_info['user_id']}) await self.redis_job.push_job(JobType.follower.value, {'url': self.follow_url % user_job_info['user_id'], 'uid': user_job_info['user_id']}) await self.redis_job.push_job(JobType.fan.value, {'url': self.fan_url % user_job_info['user_id'], 'uid': user_job_info['user_id']}) # self.weibo_queue.put({'url': self.user_tweet_url % user_id, 'uid': user_id}) # self.follow_queue.put({'uid': user_id, 'url': self.follow_url % user_id}) except TimeoutError 
as e: pass except: LOGGER.error(traceback.format_exc()) sleep(5 * 60) async def search_tweet(self, search_job_info): html_content = await self.grab_html(search_job_info['url']) result_html = BeautifulSoup(html_content, "lxml") if 'page' not in search_job_info['url']: total_count_str = result_html.find(text=re.compile(r'共\d*条')) print(total_count_str) total_count_result = re.findall(r'共(\d*)条', total_count_str) if total_count_result: total_count = total_count_result[0] total_page = int(total_count) / 10 for page_no in range(2, int(total_page)): await self.redis_job.push_job(JobType.search.value, { 'url': search_job_info['url'] + '&page=' + str(page_no) }) tweet_divs = result_html.find_all(id=True, class_='c') for tweet_div in tweet_divs: tweet = {} nk_div = tweet_div.find('a', class_='nk') if nk_div: nk_url = nk_div.get('href') usr_id_result = self.user_id_pattern.findall(nk_url) if usr_id_result: usr_id = usr_id_result[0] else: usr_id = await self.get_user_id_from_homepage(nk_url) else: usr_id = 'unknown' if tweet_div.find(class_='cmt', string='转发理由:'): # 转发 tweet['flag'] = '转发' parent = tweet_div.find(class_='cmt', string='转发理由:').parent try: comment_href = tweet_div.find_all('div')[-2].find('a', class_='cc').get('href') href = comment_href.split('?')[0] tweet['sourceTid'] = href.split('/')[-1] except Exception: pass text = parent.get_text() fields = text.split('\xa0') content = fields[0][5:] ct_content = parent.find('span', class_='ct').get_text() time_source = ct_content.split('\u6765\u81ea') time = time_source[0] if len(time_source) == 2: source = time_source[1] else: source = 'unknown' other = ';'.join(fields[1:]) else: tweet['flag'] = '原创' text = tweet_div.get_text() ct_content = tweet_div.find('span', class_='ct').get_text() time_source = ct_content.split('\u6765\u81ea') time = time_source[0] if len(time_source) == 2: source = time_source[1] else: source = 'unknown' fields = text.split('\u200b') content = fields[0] other_fields = fields[-1].split('\xa0') other = ';'.join(other_fields[1:]) like = re.findall(u'\u8d5e\[(\d+)\];', other) # 点赞数 transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\];', other) # 转载数 comment = re.findall(u'\u8bc4\u8bba\[(\d+)\];', other) # 评论数 tweet['content'] = content.strip() tweet['id'] = tweet_div.get('id').strip('M_') tweet['time'] = self.get_time(str(time)) tweet['source'] = source tweet['like'] = like[0] if like else -1 tweet['transfer'] = transfer[0] if transfer else -1 tweet['comment'] = comment[0] if comment else -1 tweet['type'] = 'tweet_info' tweet['uid'] = usr_id print(tweet) await self.weibo_producer.send(tweet, search_job_info['url']) await self.redis_job.push_job(JobType.tweet.value, {'url': self.user_tweet_url % tweet['id'], 'uid': usr_id}) await self.redis_job.push_job(JobType.comment.value, {'url': self.tweet_comment_url % tweet['id'], 'tweetId': tweet['id']}) async def grab_user_tweet(self, tweet_job_info): LOGGER.info('start grab tweet: %s' % str(tweet_job_info)) html_content = await self.grab_html(tweet_job_info['url']) user_tweet_html = BeautifulSoup(html_content, "lxml") tweet_divs = user_tweet_html.find_all(id=True, class_='c') for tweet_div in tweet_divs: tweet = {} if tweet_div.find(class_='cmt', string='转发理由:'): # 转发 tweet['flag'] = '转发' parent = tweet_div.find(class_='cmt', string='转发理由:').parent try: comment_href = tweet_div.find_all('div')[-2].find('a', class_='cc').get('href') href = comment_href.split('?')[0] tweet['sourceTid'] = href.split('/')[-1] except Exception: pass text = parent.get_text() fields = text.split('\xa0') content = 
fields[0][5:] ct_content = parent.find('span', class_='ct').get_text() time_source = ct_content.split('\u6765\u81ea') time = time_source[0] if len(time_source) == 2: source = time_source[1] else: source = 'unknown' other = ';'.join(fields[1:]) else: tweet['flag'] = '原创' text = tweet_div.get_text() ct_content = tweet_div.find('span', class_='ct').get_text() time_source = ct_content.split('\u6765\u81ea') time = time_source[0] if len(time_source) == 2: source = time_source[1] else: source = 'unknown' fields = text.split('\u200b') content = fields[0] other_fields = fields[-1].split('\xa0') other = ';'.join(other_fields[1:]) like = re.findall(u'\u8d5e\[(\d+)\];', other) # 点赞数 transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\];', other) # 转载数 comment = re.findall(u'\u8bc4\u8bba\[(\d+)\];', other) # 评论数 tweet['content'] = content.strip() tweet['id'] = tweet_div.get('id') tweet['time'] = self.get_time(str(time)) tweet['source'] = source tweet['like'] = like[0] if like else -1 tweet['transfer'] = transfer[0] if transfer else -1 tweet['comment'] = comment[0] if comment else -1 tweet['type'] = 'tweet_info' tweet['uid'] = tweet_job_info['uid'] await self.weibo_producer.send(tweet, tweet_job_info['url']) # 获取评论 # self.comment_queue.put({'url': self.tweet_comment_url % tweet['id'][2:], # 'tweetId': tweet['id'][2:]}) if 'page=' not in tweet_job_info['url']: page_div = user_tweet_html.find(id='pagelist') if page_div: max_page = int(page_div.input.get('value')) if self.weibo_limit: max_page = max_page if max_page < 500 else 500 for page in range(2, max_page + 1): await self.redis_job.push_job(JobType.tweet.value, {'url': self.user_tweet_url2 % (tweet_job_info['uid'], page), 'uid': tweet_job_info['uid']}) async def grab_user_info(self, user_id, source = 'unknown'): LOGGER.info('start grab user info: %s' % user_id) html_content = await self.grab_html(self.user_info_url % user_id) user_info_html = BeautifulSoup(html_content, "lxml") div_list = list(user_info_html.find_all(class_=['c', 'tip'])) base_info_index, edu_info_index, work_info_index = -1, -1, -1 base_info = '' edu_info = '' work_info = '' tags = '' user_info = {} for index, div in enumerate(div_list): text = div.text if text == u'基本信息': base_info_index = index elif text == u'学习经历': edu_info_index = index elif text == u'工作经历': work_info_index = index if base_info_index != -1: b = div_list[base_info_index + 1] tags = ','.join(map(lambda a: a.get_text(), b.find_all('a'))) base_info = b.get_text(';') if edu_info_index != -1: edu_info = div_list[edu_info_index + 1].get_text(';') if work_info_index != -1: work_info = div_list[work_info_index + 1].get_text(';') base_info += ';' nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', base_info) # 昵称 if nickname: user_info['nickname'] = nickname[0] if nickname else 'unknown' gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', base_info) # 性别 place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', base_info) # 地区(包括省份和城市) signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', base_info) # 个性签名 birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', base_info) # 生日 sex_orientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', base_info) # 性取向 marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', base_info) # 婚姻状况 head_url = user_info_html.find('img', alt='头像') if head_url: user_info['head'] = head_url.get('src') user_info['tags'] = tags user_info['gender'] = gender[0] if gender else 'unknown' user_info['place'] = place[0] if place else 'unknown' user_info['signature'] = signature[0] if signature else 
'unknown' user_info['birthday'] = birthday[0] if birthday else 'unknown' user_info['sexOrientation'] = sex_orientation[0] if sex_orientation else 'unknown' user_info['eduInfo'] = edu_info if edu_info else 'unknown' user_info['marriage'] = marriage[0] if marriage else 'unknown' user_info['workInfo'] = work_info if work_info else 'unknown' user_info['type'] = 'user_info' user_info['id'] = user_id user_info['source'] = source result = await self.grab_view(user_id) user_info.update(result) await self.weibo_producer.send(user_info, self.user_info_url % user_id) async def grab_view(self, user_id): """ 获取用户id的微博数、粉丝数、发布的微博数 :param user_id: 用户id :return: dict """ LOGGER.info('grab user view: %s' % str(user_id)) html_content = await self.grab_html(self.weibo_host + '/' + str(user_id)) home_page_html = BeautifulSoup(html_content, "lxml") v = home_page_html.find('div', class_='tip2') result = {} if v: content = v.get_text(';') else: content = '' tweet_r = re.findall('微博\[(\d+)\];', content) result['tweetNum'] = tweet_r[0] if tweet_r else -1 fans_r = re.findall('粉丝\[(\d+)\];', content) result['fansNum'] = fans_r[0] if fans_r else -1 follow_r = re.findall('关注\[(\d+)\];', content) result['followNum'] = follow_r[0] if follow_r else -1 return result def get_time(self, time_str): current_result = self.time_current_pattern.findall(time_str) time_now = datetime.datetime.now() if current_result: result_time = time_now - datetime.timedelta(minutes=int(current_result[0])) return result_time.strftime('%Y-%m-%d %H:%M:%S') else: current_result = self.time_today_pattern.findall(time_str) if current_result: result_time = datetime.datetime(time_now.year, time_now.month, time_now.day, int(current_result[0][0]), int(current_result[0][0])) return result_time.strftime('%Y-%m-%d %H:%M:%S') else: current_result = self.time_year_pattern.findall(time_str) if current_result: result_time = datetime.datetime(time_now.year, int(current_result[0][0]), int(current_result[0][1]), int(current_result[0][2]), int(current_result[0][3])) return result_time.strftime('%Y-%m-%d %H:%M:%S') else: return time_str @staticmethod async def grab_html2(session, url): with async_timeout.timeout(60): async with session.get(url, verify_ssl=False) as response: return await response.text() @staticmethod def grab_html2_now(session,headers, url, cookies): with session.get(url, cookies=cookies,verify=False) as response: return response.text async def post_grab2(self, session, url, data): with async_timeout.timeout(2 * 60): async with session.post(url=url, data=data, verify_ssl=False) as response: return await response.text() async def post_grab(self, url, data): cookies = await self.redis_cookie.fetch_cookies() LOGGER.info('using cookies' + str(cookies)) async with aiohttp.ClientSession(cookies=cookies['cookies']) as session: return await self.post_grab2(session, url, data) async def grab_html(self, url): cookies = await self.redis_cookie.fetch_cookies() async with aiohttp.ClientSession(cookies=cookies['cookies']) as session: return await self.grab_html2(session, url) def grab_html_now(self, url): cookies = self.redis_cookie_now.fetch_cookies() headers = self.get_header() headers['Upgrade-Insecure-Requests'] = '1' headers['Proxy-Connection'] = 'keep-alive' LOGGER.info('using cookies'+str(cookies)) ok = True while ok: resp_text = requests.get(url=url, cookies=cookies['cookies'], verify=False).text userjson = json.loads(resp_text) # userjson = json.loads(resp_text,'GBK') if userjson['ok'] == 1: ok = False return userjson['data'] @staticmethod def 
get_header(): header = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Host': 'weibo.com', # 'Referer': 'https://weibo.com', 'User-Agent': user_agents.USER_AGENTS[random.randint(0, len(user_agents.USER_AGENTS) - 1)] } return header @staticmethod def find_fm_view_json(html): resp_html = BeautifulSoup(html, 'html.parser') scripts = resp_html.find_all('script') scripts.reverse() fm_view_pattern = re.compile('FM.view\((.*)\)') view_jsons = [] for script in scripts: print(script) r = fm_view_pattern.findall(str(script)) if len(r): view_jsons.append(json.loads(r[0])) return view_jsons async def user_id_in_queue(self, user_id): if user_id and user_id not in self.bloom_filter: # LOGGER.info('%s in user queue.' % user_id) self.bloom_filter.add(user_id) await self.redis_job.push_job(JobType.user.value, {'user_id': user_id}) async def get_user_id_from_homepage(self, home_page): html_content = await self.grab_html(home_page) home_page_html = BeautifulSoup(html_content, "lxml") info_a = home_page_html.find('a', string='资料') # LOGGER.info('get id from home page: %s' % home_page) if info_a: user_id = info_a.get('href').split('/')[1] # LOGGER.info('id got: %s' % user_id) return user_id return 0 async def parse_tweet_content(self, html, job_info): tweet_div = html.find(id='M_', class_='c') if tweet_div: tweet_user_a = tweet_div.find('a') flag = False if tweet_user_a: tweet = {} tweet_user_href = tweet_user_a.get('href') if tweet_user_href.startswith('/u/'): tweet_user_id = tweet_user_href[3:] else: tweet_user_id = await self.get_user_id_from_homepage(self.weibo_host + tweet_user_href) await self.user_id_in_queue(tweet_user_id) if tweet_div.find(class_='cmt', string='转发理由:'): tweet['flag'] = '转发' parent = tweet_div.find(class_='cmt', string='转发理由:').parent try: comment_href = tweet_div.find_all('div')[-2].find('a', class_='cc').get('href') href = comment_href.split('?')[0] tweet['sourceTid'] = href.split('/')[-1] except Exception: pass text = parent.get_text() fields = text.split('\xa0') flag = True content = fields[0][5:] tweet['content'] = content.strip() # ct_content = parent.find('span', class_='ct').get_text() # time_source = ct_content.split('\u6765\u81ea') # # time = time_source[0] # if len(time_source) == 2: # source = time_source[1] # else: # source = 'unknown' # other = ';'.join(fields[1:]) else: tweet_content = tweet_div.find('span', class_='ctt').get_text() tweet['content'] = tweet_content.strip() tweet_details = list( filter(lambda div: div.find(class_='pms'), html.find_all('div', id=False, class_=False))) tweet['sourceTid'] = job_info['parentTid'] if 'parentTid' in job_info \ else tweet['sourceTid'] if flag else '' detail = tweet_details[0].get_text(';').replace('\xa0', '') like = re.findall(u'\u8d5e\[(\d+)\];', detail) # 点赞数 transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\];', detail) # 转载数 comment = re.findall(u'\u8bc4\u8bba\[(\d+)\];', detail) # 评论数 tweet['id'] = job_info['tweetId'] tweet['like'] = like[0] if like else 0 tweet['transfer'] = transfer[0] if transfer else 0 tweet['comment'] = comment[0] if comment else 0 tweet['type'] = 'tweet_info' # if flag: # await self.weibo_producer.send(tweet, job_info['url']) # else: others = tweet_div.find(class_='ct').get_text() if others: others = others.split('\u6765\u81ea') tweet['time'] = self.get_time(others[0]) if len(others) == 2: tweet['source'] = others[1] tweet['uid'] = tweet_user_id await 
self.weibo_producer.send(tweet, job_info['url']) return tweet return None async def grab_tweet_repost(self, repost_job_info): LOGGER.info('start grab tweet repost: %s' % str(repost_job_info)) html_content = await self.grab_html(repost_job_info['url']) tweet_repost_html = BeautifulSoup(html_content, "lxml") repost_divs = tweet_repost_html.find_all(class_='c') for div in repost_divs: span_cc = div.find('span', class_='cc') if span_cc: attitube_a = span_cc.find('a') if attitube_a: href = attitube_a.get('href') if len(href.split('/')) > 2: await self.redis_job.push_job(JobType.comment.value, {'url': self.tweet_comment_url % href.split('/')[2], 'tweetId': href.split('/')[2], 'parentTid': repost_job_info['tweetId']}) await self.redis_job.push_job(JobType.repost.value, {'url': self.user_repost_url % href.split('/')[2], 'tweetId': href.split('/')[2], 'parentTid': repost_job_info['tweetId']}) if 'page=' not in repost_job_info['url']: await self.parse_tweet_content(tweet_repost_html, repost_job_info) page_div = tweet_repost_html.find(id='pagelist') if page_div: max_page = int(page_div.input.get('value')) for page in range(2, max_page + 1): await self.redis_job.push_job(JobType.repost.value, {'url': self.user_repost_url2 % (repost_job_info['tweetId'], page), 'tweetId': repost_job_info['tweetId']}) pass async def grab_tweet_comments(self, comment_job): LOGGER.info('start grab comment: %s' % str(comment_job)) html_content = await self.grab_html(comment_job['url']) comment_html = BeautifulSoup(html_content, "lxml") comment_divs = comment_html.find_all(id=re.compile('C_[\d]'), class_='c') for comment_div in comment_divs: comment_info = {} comment_id = comment_div.get('id') user_a = comment_div.find('a') if user_a: user_href = user_a.get('href') if user_href.startswith('/u/'): user_id = user_href[3:] else: user_id = await self.get_user_id_from_homepage(self.weibo_host + user_href) await self.user_id_in_queue(user_id) comment_info['userId'] = user_id comment_info['content'] = comment_div.find(class_='ctt').get_text() others = comment_div.find(class_='ct').get_text() if others: others = others.split('\u6765\u81ea') comment_info['pubTime'] = self.get_time(others[0]) if len(others) == 2: comment_info['source'] = others[1] comment_info['id'] = comment_id comment_info['tweetId'] = comment_job['tweetId'] comment_info['type'] = 'comment_info' await self.weibo_producer.send(comment_info, comment_job['url']) if 'page=' not in comment_job['url']: await self.parse_tweet_content(comment_html, comment_job) page_div = comment_html.find(id='pagelist') if page_div: max_page = int(page_div.input.get('value')) for page in range(2, max_page + 1): await self.redis_job.push_job(JobType.comment.value, {'url': self.tweet_comment_url2 % (comment_job['tweetId'], page), 'tweetId': comment_job['tweetId']}) async def topic_finding(self): while True: topic_job_info = self.redis_job_now.fetch_job(JobType.topic.value) if topic_job_info: try: print(topic_job_info) LOGGER.info('topic finding') await self.search_topic_user(topic_job_info) except TimeoutError as e: LOGGER.info('topic finding timeout error') pass except: LOGGER.error(traceback.format_exc()) sleep(5 * 60) async def super_fan_finding(self): while True: topic_job_info = self.redis_job_now.fetch_job(JobType.superfan.value) if topic_job_info: try: print(topic_job_info) LOGGER.info('super fan finding') await self.search_super_fan(topic_job_info) except TimeoutError as e: LOGGER.info('super fan finding timeout error') pass except: LOGGER.error(traceback.format_exc()) sleep(5 * 60) 
def push_topic_job_now(self): self.redis_job_now.push_job( JobType.topic.value, {'url': 'https://m.weibo.cn/api/container/getIndex?containerid=1008084d899324c66df69e0248e385a7eccca2_-_feed&page=1'}) async def push_topic_job(self): for page in range(1,self.topic_max_page): await self.redis_job_now.push_job(JobType.topic.value, { 'url': self.topic_url % page}) def push_topic_job_all_now(self): for page in range(1, self.topic_max_page+1): self.redis_job_now.push_job(JobType.topic.value, { 'url': self.topic_url % page}) def push_super_new_fan_job_all_now(self): for page in range(1, self.super_new_fan_max_page+1): self.redis_job_now.push_job(JobType.superfan.value, { 'url': self.super_new_fan_url % page}) def topic_finding_now(self): topic_job_info = self.redis_job_now.fetch_job(JobType.topic.value) if topic_job_info: try: print(topic_job_info) LOGGER.info('topic finding') self.search_topic_user_now(topic_job_info) except TimeoutError as e: LOGGER.info('topic finding timeout error') pass except: LOGGER.error(traceback.format_exc()) sleep(5 * 60) def search_topic_user_now(self, topic_job_info): LOGGER.info('try to grab html: ' + topic_job_info['url']) userjson = self.grab_html_now(topic_job_info['url']) LOGGER.info('succeed to grab html: ' + topic_job_info['url']) for group in userjson['cards']: if 'show_type' in group: for card in group['card_group']: user_id = card['mblog']['user']['id'] self.redis_job_now.push_job(JobType.user.value, {'user_id': user_id, 'source': 'comment'}) async def search_topic_user(self, topic_job_info): LOGGER.info('try to grab html: ' + topic_job_info['url']) html_content = await self.grab_html(topic_job_info['url']) userjson = json.loads(html_content) userjson = userjson['data'] LOGGER.info('succeed to grab html: ' + topic_job_info['url']) for group in userjson['cards']: if 'show_type' in group: for card in group['card_group']: user_id = card['mblog']['user']['id'] await self.redis_job.push_job(JobType.user.value, {'user_id': user_id, 'source': 'comment'}) async def search_super_fan(self, super_fan_job_info): LOGGER.info('try to grab html: ' + super_fan_job_info['url']) html_content = await self.grab_html(super_fan_job_info['url']) userjson = json.loads(html_content, 'GBK') userjson = userjson['data'] LOGGER.info('succeed to grab html: ' + super_fan_job_info['url']) for group in userjson['cards']: for card in group['card_group']: user_id = card['user']['id'] await self.redis_job.push_job(JobType.user.value, {'user_id': user_id, 'source': 'super'}) def start(self, args): LOGGER.info(str(args)) workers = [] if 'f' in args:#关注 workers += [asyncio.Task(self.crawl_follow(), loop=self.loop) for _ in range(self.tasks)] if 'o' in args:#粉丝 workers += [asyncio.Task(self.crawl_fan(), loop=self.loop) for _ in range(self.tasks)] if 'c' in args:#评论 workers += [asyncio.Task(self.crawl_comment(), loop=self.loop) for _ in range(self.tasks)] if 'u' in args:#用户 workers += [asyncio.Task(self.crawl_user(), loop=self.loop) for _ in range(self.tasks)] if 'w' in args:#微博内容 workers += [asyncio.Task(self.crawl_weibo(), loop=self.loop) for _ in range(self.tasks)] if 'r' in args:#转发 workers += [asyncio.Task(self.crawl_repost(), loop=self.loop) for _ in range(self.tasks)] if 's' in args:#搜索 workers += [asyncio.Task(self.search(), loop=self.loop) for _ in range(self.tasks)] if 't' in args:#话题帖子 workers += [asyncio.Task(self.topic_finding(), loop=self.loop) for _ in range(self.tasks)] for _ in range(10): self.topic_finding_now() if 'a' in args:#名人堂 workers += [asyncio.Task(self.super_fan_finding(), 
loop=self.loop) for _ in range(self.tasks)] if 'i' in args: self.push_topic_job_all_now() sleep(5) if 'n' in args: self.push_super_new_fan_job_all_now() sleep(5) if workers: self.loop.run_until_complete(asyncio.wait(workers))
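The repost and comment grabbers above share one pagination pattern: read the max page number out of the `pagelist` form input and push one job per remaining page. A minimal sketch of that step, assuming BeautifulSoup and a `%`-style URL template as in the crawler; the helper name `expand_page_jobs` is mine, not the project's.

from bs4 import BeautifulSoup

def expand_page_jobs(html_content, url_template, tweet_id):
    # Hypothetical helper: read the max page from the 'pagelist' input
    # and build one job dict per remaining page (pages 2..max_page).
    soup = BeautifulSoup(html_content, "lxml")
    page_div = soup.find(id='pagelist')
    if not page_div or not page_div.input:
        return []
    max_page = int(page_div.input.get('value'))
    return [{'url': url_template % (tweet_id, page), 'tweetId': tweet_id}
            for page in range(2, max_page + 1)]

Each returned dict could then be pushed with `redis_job.push_job(JobType.repost.value, job)`, exactly as the crawler above does.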
data_dic = dict()
content = row["content"]
if tab["weidu"] in content:
    count = 0
    for zmxr in tab["zmxr"]:
        if zmxr in content:
            count += 1
            data_dic["weidu"] = tab["weidu"]
            data_dic["sword"] = zmxr
            data_dic["taidu"] = 'good'
            data_dic["variable"] = row["variable"]
            data_dic["row_names"] = row["row_names"]
            data_dic["value"] = row["value"]
            status = row["row_names"] + '_' + tab["weidu"] + '_' + zmxr
            i = md5(status.encode("utf8")).hexdigest()
            returndf = bf.add(i)  # True if this hash was already in the bloom filter
            if not returndf:
                items = list()
                items.append(data_dic)
                save_df = pd.DataFrame(items)
                save_df.to_sql(name='content_luntan_test1', con=conn, if_exists="append", index=False)
                print("-" * 50 + "insert data" + "-" * 50)
            else:
                print("duplicate record!")
    if count == 0:
        for z in tab["zxxr"]:
            if z in content:
                count += 1
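The snippet above dedups on an md5 of a composite key (row name + dimension + matched keyword) rather than on the raw text. A minimal sketch of that keying scheme, assuming pybloom_live; `make_status_key` and the sample values are illustrative, not from the original project.

from hashlib import md5
from pybloom_live import ScalableBloomFilter

def make_status_key(row_name, dimension, keyword):
    # Mirrors the composite key above: row_names + '_' + weidu + '_' + keyword
    status = f"{row_name}_{dimension}_{keyword}"
    return md5(status.encode("utf8")).hexdigest()

bf = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)
key = make_status_key("row_1", "service", "friendly")
if not bf.add(key):       # add() returns False the first time a key is seen
    print("insert data")  # new combination -> persist it
else:
    print("duplicate record!")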
xhr_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Host': 'api.zhihu.com',
    'Access-Control-Request-Method': 'GET',
    'Connection': 'keep-alive'
}

# Load the ids of already-seen lives into id_list and the bloom filter
b = ScalableBloomFilter(10000, 0.001)
id_list = []
with open('live_id.txt', 'r+') as f:
    for line in f.readlines():
        id = line.strip()
        id_list.append(id)
        b.add(id)

# Load the ids of already-seen people into a second bloom filter
bb = ScalableBloomFilter(10000, 0.001)
with open('peoples.txt', 'r+') as f:
    for line in f.readlines():
        id = line.strip()
        if id not in bb:
            bb.add(id)

# Find all the newest lives, deduplicate, and store them in id_list, the filter and the file
tag_list = [
    101, 102, 103, 104, 105, 106, 107, 108, 109,
    201, 202, 203,
    301, 302, 303, 304, 305
]
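The seeding step above (one id per line in a text file, loaded into both a Python list and a bloom filter) can be wrapped in a small helper. A sketch under those assumptions; the function name and the in-loop dedup are mine.

from pybloom_live import ScalableBloomFilter

def load_ids(path, capacity=10000, error_rate=0.001):
    # Read a newline-delimited id file and seed a scalable bloom filter,
    # keeping an ordered list of the distinct ids as well.
    bloom = ScalableBloomFilter(initial_capacity=capacity, error_rate=error_rate)
    ids = []
    with open(path) as f:
        for line in f:
            id_ = line.strip()
            if id_ and id_ not in bloom:
                ids.append(id_)
                bloom.add(id_)
    return ids, bloom

# e.g. id_list, b = load_ids('live_id.txt')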
class ChexiuPipeline(object): @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) def __init__(self, settings): # mongo self.connection = pymongo.MongoClient( settings['MONGODB_SERVER'], settings['MONGODB_PORT'] ) db = self.connection[settings['MONGODB_DB']] website = settings["WEBSITE"] # local_time = time.strftime('%Y-%m-%d', time.localtime()) # if website in ["pcauto_price", "yiche_price", "autohome_price"]: # self.collection = db[settings['MONGODB_COLLECTION'] + '_' + str(local_time)] # else: self.collection = db[settings['MONGODB_COLLECTION']] # bloom file self.CrawlCar_Num = 1000000 filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB'] + '/' + settings[ 'MONGODB_COLLECTION'] + '.blm' dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB'] self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01) if os.path.exists(dirname): if os.path.exists(filename): self.fa = open(filename, "a") else: pathlib.Path(filename).touch() self.fa = open(filename, "a") else: os.makedirs(dirname) pathlib.Path(filename).touch() self.fa = open(filename, "a") with open(filename, "r") as fr: lines = fr.readlines() for line in lines: line = line.strip('\n') self.df.add(line) self.counts = 0 def process_item(self, item, spider): if spider.name in ["chexiuSpider", "chexiu_car"]: valid = True i = md5(item['status'].encode("utf8")).hexdigest() returndf = self.df.add(i) if returndf: valid = False raise DropItem("Drop data {0}!".format(item["status"])) else: self.fa.writelines(i + '\n') self.collection.insert(dict(item)) logging.log(msg="Car added to MongoDB database!", level=logging.INFO) self.counts += 1 logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO) # return item elif spider.name in ["autohome_price_new", "yiche_price", "pcauto_price", "58car_price"]: self.collection.insert(dict(item)) logging.log(msg="Car added to MongoDB database!", level=logging.INFO) self.counts += 1 logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO) else: self.collection.insert(dict(item)) logging.log(msg="Car added to MongoDB database!", level=logging.INFO) self.counts += 1 logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO) def close_spider(self, spider): self.connection.close() # self.fa.close() def dingmessage(self): # 请求的URL,WebHook地址 webhook = "https://oapi.dingtalk.com/robot/send?access_token=633758ccd22b7db4d2e9655488af7d3f5d5e0b2a32c701c80fc3cd57981e73a9" # 构建请求头部 header = { "Content-Type": "application/json", "Charset": "UTF-8" } # 构建请求数据 tex = "-车秀网爬虫结束-" message = { "msgtype": "text", "text": { "content": tex }, "at": { "isAtAll": False } } # 对请求的数据进行json封装 message_json = json.dumps(message) # 发送请求 info = requests.post(url=webhook, data=message_json, headers=header) # 打印返回的结果 print(info.text)
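ChexiuPipeline drops items whose md5-hashed `status` is already in the filter and appends each new hash to a `.blm` file so the filter can be rebuilt on the next run. A distilled, hedged sketch of that `process_item` core; the class name and constructor argument are illustrative, not the project's actual module.

from hashlib import md5
from scrapy.exceptions import DropItem
from pybloom_live import ScalableBloomFilter

class BloomDedupPipeline:
    # Minimal sketch of the dedup logic used by the pipelines above.

    def __init__(self, blm_path):
        self.df = ScalableBloomFilter(initial_capacity=1_000_000, error_rate=0.01)
        self.blm_path = blm_path
        try:
            with open(blm_path) as fr:          # rebuild the filter from previous runs
                for line in fr:
                    self.df.add(line.strip('\n'))
        except FileNotFoundError:
            pass
        self.fa = open(blm_path, "a")           # new hashes are appended here

    def process_item(self, item, spider):
        key = md5(item['status'].encode("utf8")).hexdigest()
        if self.df.add(key):                    # True -> already crawled
            raise DropItem("Drop data {0}!".format(item["status"]))
        self.fa.write(key + '\n')               # persist the hash for the next run
        return item

    def close_spider(self, spider):
        self.fa.close()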
class CarbuisnessNewPipeline(object): @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) def __init__(self, settings): # mongo self.connection = pymongo.MongoClient( settings['MONGODB_SERVER'], settings['MONGODB_PORT'] ) db = self.connection[settings['MONGODB_DB']] self.collection = db[settings['MONGODB_COLLECTION']] self.collectionurllog = db[settings['MONGODB_COLLECTION'] + "_urllog"] # bloom file self.mongocounts = 0 self.counts = 0 self.CrawlCar_Num = 1000000 self.settings = settings # bloom file filename = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm' dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01) # # read if os.path.exists(dirname): if os.path.exists(filename): self.fa = open(filename, "a") else: pathlib.Path(filename).touch() self.fa = open(filename, "a") else: os.makedirs(dirname) pathlib.Path(filename).touch() self.fa = open(filename, "a") with open(filename, "r") as fr: lines = fr.readlines() for line in lines: line = line.strip('\n') self.df.add(line) self.counts = 0 def process_item(self, item, spider): if spider.name in ["autohome_error_new", "jzg_price_master", ]: # if item["newcar_bug_num"] is not None or item["oldcar_bug_num"] is not None or item["oldcar_bug_ratio"] is not None or item["newcar_bug_ratio"] is not None: self.collection.insert(dict(item)) logging.log(msg="Car added to MongoDB database!", level=logging.INFO) self.counts += 1 logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO) else: if spider.name in ['all_location', 'jzg_price', 'jzg_price_sh', 'xiaozhu_modellist', 'xiaozhu_gz', 'autohome_gz', 'jzg_modellist']: # print("*"*100) valid = True i = md5(item['status'].encode("utf8")).hexdigest() returndf = self.df.add(i) if returndf: valid = False raise DropItem("Drop data {0}!".format(item["status"])) else: pass if valid: self.fa.writelines(i + '\n') self.collection.insert(dict(item)) logging.log(msg="Car added to MongoDB database!", level=logging.INFO) self.counts += 1 logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO) if spider.name in ['autohome_url']: pass def close_spider(self, spider): self.connection.close() self.fa.close()
class Agent(Zentropian): def __init__(self, name=None): self.timers = TimerRegistry(callback=self._trigger_frame_handler) super().__init__(name=name) self.states.should_stop = False self.states.running = False self.loop = None # asyncio.get_event_loop() self._spawn_on_start = set() self._seen_frames = ScalableBloomFilter( mode=ScalableBloomFilter.LARGE_SET_GROWTH, error_rate=0.001) @on_state('should_stop') def _on_should_stop(self, state): if state.data.last is False and state.data.value is True: # skip double close self.close() return True async def _run_forever(self): # atexit.register(self.loop.close) if self._spawn_on_start: [self.spawn(coro) for coro in self._spawn_on_start] self._spawn_on_start = None self.emit('*** started', internal=True) self.timers.start_timers(self.spawn) while self.states.should_stop is False: await asyncio.sleep(1) self.emit('*** stopped', internal=True) def _set_asyncio_loop(self, loop=None): if self.loop and loop: raise AssertionError('Agent already has an event loop set.') if loop: self.loop = loop if not self.loop: try: self.loop = asyncio.get_event_loop() except RuntimeError: self.loop = asyncio.new_event_loop() def _trigger_frame_handler(self, frame: Frame, handler: Handler, internal=False): if isinstance(frame, Message) and frame.source == self.name: return if isinstance(frame, Event) and frame.source != self.name and frame.name.startswith('***'): return if frame and frame.id in self._seen_frames: return if not self.apply_filters([handler]): return if frame: self._seen_frames.add(frame.id) payload = [] # type: list if handler.pass_self: payload.append(self) if handler.kind != KINDS.TIMER: payload.append(frame) if handler.run_async: async def return_handler(): ret_val = await handler(*payload) if ret_val: self.handle_return(frame, return_value=ret_val) self.spawn(return_handler()) else: ret_val = handler(*payload) if ret_val: return self.handle_return(frame, return_value=ret_val) def add_handler(self, handler): if handler.kind == KINDS.TIMER: self.timers.add_handler(handler.name, handler) else: super().add_handler(handler) def on_timer(self, interval): def wrapper(handler): name = str(interval) handler_obj = Handler(kind=KINDS.TIMER, name=name, handler=handler) self.timers.add_handler(name, handler_obj) return handler return wrapper @staticmethod def sleep(duration: float): return asyncio.sleep(duration) def start(self, loop=None): self._set_asyncio_loop(loop) self.loop.create_task(self._run_forever()) def run(self): self._set_asyncio_loop() self.loop.run_until_complete(self._run_forever()) def spawn(self, coro): if not self.loop: self._spawn_on_start.add(coro) return return self.loop.create_task(coro) @staticmethod def spawn_in_thread(func, *args, **kwargs): task = threading.Thread(target=func, args=args, kwargs=kwargs) task.start() return task def run_in_thread(self): return self.spawn_in_thread(self.run) def stop(self): self.emit('*** stopping', internal=True) self.states.should_stop = True self.timers.should_stop = True def connect(self, endpoint, *, auth=None, tag='default'): retval = super().connect(endpoint, auth=auth, tag=tag) if not isgeneratorfunction(retval): return self.spawn(retval) def bind(self, endpoint, *, tag='default'): retval = super().bind(endpoint, tag=tag) if not isgeneratorfunction(retval): return self.spawn(retval) def join(self, space, *, tags: Optional[Union[list, str]] = None): retval = super().join(space, tags=tags) if not isgeneratorfunction(retval): return self.spawn(retval) def leave(self, space, *, tags: 
Optional[Union[list, str]] = None): retval = super().leave(space, tags=tags) if not isgeneratorfunction(retval): return self.spawn(retval) def close(self, *, endpoint: Optional[str] = None, tags: Optional[Union[list, str]] = None): """Closes all connections if no endpoint or tags given.""" if endpoint and tags: raise ValueError('Expected either endpoint: {!r} or tags: {!r}.' ''.format(endpoint, tags)) elif endpoint: connections = self._connections.connections_by_endpoint(endpoint) elif tags: connections = self._connections.connections_by_tags(tags) else: connections = self._connections.connections for connection in connections: connection.close()
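The Agent above keeps a LARGE_SET_GROWTH filter (`_seen_frames`) so that a frame id is only dispatched to handlers once. A minimal sketch of that guard in isolation, with frame ids reduced to plain strings; the wrapper class is hypothetical.

from pybloom_live import ScalableBloomFilter

class SeenFrames:
    # Sketch of the _seen_frames guard above.

    def __init__(self):
        self._seen = ScalableBloomFilter(
            mode=ScalableBloomFilter.LARGE_SET_GROWTH, error_rate=0.001)

    def first_time(self, frame_id: str) -> bool:
        if frame_id in self._seen:
            return False          # already handled -> skip
        self._seen.add(frame_id)
        return True

guard = SeenFrames()
assert guard.first_time("frame-1") is True
assert guard.first_time("frame-1") is False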
def antcolony_userV4(token, domain, bloom: ScalableBloomFilter, key: str, dataobject, keylist=None, lamda=20, xhr_headers=XHR_HEADER_WZ): def keyconvert(keys, dict): c = {} for key in dict.keys(): if key in keys: c[key] = dict[key] return c urlhead = create_userV4(domain, token) req = getRequest() people_url = urlhead + paging(0, 5) print(people_url) print('before r') r = req.get(people_url, headers=xhr_headers) print('after r') if int(r.status_code) == 410: dataobject.delone(token) return 0 if int(r.status_code) > 300: raise WrongStatuCode(str(r.status_code) + ': ' + people_url) j = json.loads(r.text) if 'error' in j: raise ErrorInJson(__name__ + ": from url=" + urlhead + '\n msg=' + j['error']) print(j['paging']) try: total = int(j['paging']['totals']) except: total = None if total: print(total) print(round(total / lamda)) for i in range(round(total / lamda) + 1): urll = urlhead + paging(i * lamda, lamda) r = req.get(urll, headers=xhr_headers) # print(r) jc = json.loads(r.text, encoding='utf-8') data = jc['data'] if keylist: data = [keyconvert(keylist, x) for x in data] for i in data: if i[key] not in bloom: dataobject.insert(i) bloom.add(i[key]) print('data import ' + i[key]) else: print('pass') else: i = 0 while True: urll = urlhead + paging(int(i * lamda), lamda) r = req.get(urll, headers=xhr_headers) jc = json.loads(r.text, encoding='utf-8') data = jc['data'] if len(data) == 0: break if keylist: data = [keyconvert(keylist, x) for x in data] for c in data: if c[key] not in bloom: dataobject.insert(c) bloom.add(c[key]) print('data import ' + c[key]) else: print('pass') i += 1
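antcolony_userV4 pages through a JSON endpoint and inserts only records whose key field is not yet in the bloom filter. A hedged sketch of the unbounded-paging branch using requests; the URL construction and the `insert` callback are placeholders, not the project's `paging()`/`dataobject` API. Note that the `encoding` argument the original passes to `json.loads` was removed in Python 3.9, so the sketch relies on `resp.text` (already decoded by requests) instead.

import json
import requests
from pybloom_live import ScalableBloomFilter

def crawl_pages(url_head, key, insert, bloom=None, page_size=20, headers=None):
    # Sketch of the unbounded paging loop above: stop when a page comes back empty.
    bloom = bloom or ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)
    offset = 0
    while True:
        url = f"{url_head}?offset={offset}&limit={page_size}"   # placeholder for paging()
        resp = requests.get(url, headers=headers)
        resp.raise_for_status()
        data = json.loads(resp.text).get('data', [])
        if not data:
            break
        for record in data:
            if record[key] not in bloom:
                insert(record)              # user-supplied persistence callback
                bloom.add(record[key])
        offset += page_size
    return bloom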
class TaochePipeline(object): @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) def __init__(self, settings): # mongo self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT']) db = self.connection[settings['MONGODB_DB']] website = settings["WEBSITE"] # local_time = time.strftime('%Y-%m-%d', time.localtime()) # if website in ["pcauto_price", "yiche_price", "autohome_price"]: # self.collection = db[settings['MONGODB_COLLECTION'] + '_' + str(local_time)] # else: self.collection = db[settings['MONGODB_COLLECTION']] # bloom file self.CrawlCar_Num = 1000000 filename = str(pathlib.Path.cwd()) + '/blm/' + settings[ 'MONGODB_DB'] + '/' + settings['MONGODB_COLLECTION'] + '.blm' dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MONGODB_DB'] self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01) if os.path.exists(dirname): if os.path.exists(filename): self.fa = open(filename, "a") else: pathlib.Path(filename).touch() self.fa = open(filename, "a") else: os.makedirs(dirname) pathlib.Path(filename).touch() self.fa = open(filename, "a") with open(filename, "r") as fr: lines = fr.readlines() for line in lines: line = line.strip('\n') self.df.add(line) self.counts = 0 def process_item(self, item, spider): if spider.name in ["taoche_car", "taoche_gz"]: valid = True i = md5(item['status'].encode("utf8")).hexdigest() returndf = self.df.add(i) if returndf: valid = False raise DropItem("Drop data {0}!".format(item["status"])) else: self.fa.writelines(i + '\n') self.collection.insert(dict(item)) logging.log(msg="Car added to MongoDB database!", level=logging.INFO) self.counts += 1 logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO) # return item elif spider.name in [ "autohome_price_new", "yiche_price", "pcauto_price", "58car_price" ]: self.collection.insert(dict(item)) logging.log(msg="Car added to MongoDB database!", level=logging.INFO) self.counts += 1 logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO) else: self.collection.insert(dict(item)) logging.log(msg="Car added to MongoDB database!", level=logging.INFO) self.counts += 1 logging.log(msg="scrapy " + str(self.counts) + " items", level=logging.INFO) def close_spider(self, spider): self.connection.close() self.fa.close()
# Initialise a crawling dataset connection
print(colored('Initialising wikipedia crawling collection...', 'cyan'))
crawl_collection = init_crawl_collection()

# Iterate through the crawling database
n = 0
print(colored('Iterating over crawling database...', 'cyan'))
bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
for topic, sentence in iter_topic(crawl_collection, args['start']):
    # Clean topic string
    topic = topic.replace("'", '').replace('\n', '')

    # Check whether the number of processed topics exceeds the limit
    if topic not in bf:
        bf.add(topic)
        if len(bf) > args['limit']:
            print(colored('[Topics limit reached] ... BYE', 'cyan'))
            sys.exit(0)

    # Break the sentence into knowledge nodes
    pos = TextStructure.pos_tag(sentence)
    kb_nodes = patterns.capture(pos)

    # Clean up each of the nodes
    # a) Remove stopwords
    # b) Remove duplicates
    # c) Ensure supported encoding
    kb_nodes = ensure_viable(kb_nodes, stopwords)

    if args['verbose']:
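The topic limit above works because a ScalableBloomFilter tracks how many elements have been added, so `len(bf)` doubles as a count of unique topics seen so far (within the filter's false-positive rate). A tiny illustrative check:

from pybloom_live import ScalableBloomFilter

bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
for topic in ["Alan Turing", "Lambda calculus", "Alan Turing"]:
    if topic not in bf:
        bf.add(topic)
print(len(bf))   # 2 -- duplicates do not increase the count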
class YongdaPipeline(object): @classmethod def from_crawler(cls, crawler): # 获取配置中的时间片个数,默认为12个,1分钟 idle_number = crawler.settings.getint('IDLE_NUMBER', 6) # 实例化扩展对象 ext = cls(crawler.settings, idle_number, crawler) # 将扩展对象连接到信号, 将signals.spider_idle 与 spider_idle() 方法关联起来。 crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle) return ext def __init__(self, settings, idle_number, crawler): # mysql self.conn = create_engine( f'mysql+pymysql://{settings["MYSQL_USER"]}:{settings["MYSQL_PWD"]}@{settings["MYSQL_SERVER"]}:{settings["MYSQL_PORT"]}/{settings["MYSQL_DB"]}?charset=utf8' ) # mongo # uri = f'mongodb://{settings["MONGODB_USER"]}:{settings["MONGODB_PWD"]}@{settings["MONGODB_SERVER"]}:{settings["MONGODB_PORT"]}/' # self.connection = pymongo.MongoClient(uri) # self.connection = pymongo.MongoClient( # settings['MONGODB_SERVER'], # settings['MONGODB_PORT'] # ) # db = self.connection[settings['MONGODB_DB']] # self.collection = db[settings['MONGODB_COLLECTION']] # count self.mongocounts = 0 self.counts = 0 self.CrawlCar_Num = 1000000 self.settings = settings # redis 信号 self.crawler = crawler self.idle_number = idle_number self.idle_list = [] self.idle_count = 0 # bloom file filename = str(pathlib.Path.cwd()) + '/blm/' + settings[ 'MYSQL_DB'] + '/' + settings['MYSQL_TABLE'] + '.blm' dirname = str(pathlib.Path.cwd()) + '/blm/' + settings['MYSQL_DB'] # scrapy date self.start_date = None self.end_date = None self.scrapy_date = f'{self.start_date} - {self.end_date}' # dataframe self.df_result = pd.DataFrame() # 布隆过滤 self.df = ScalableBloomFilter(initial_capacity=self.CrawlCar_Num, error_rate=0.01) # self.df = BloomFilter(capacity=self.CrawlCar_Num, error_rate=0.01) # read if os.path.exists(dirname): if os.path.exists(filename): self.fa = open(filename, "a") else: pathlib.Path(filename).touch() self.fa = open(filename, "a") else: os.makedirs(dirname) pathlib.Path(filename).touch() self.fa = open(filename, "a") with open(filename, "r") as fr: lines = fr.readlines() for line in lines: line = line.strip('\n') self.df.add(line) def open_spider(self, spider): self.start_date = time.strftime('%Y-%m-%d %X', time.localtime()) def process_item(self, item, spider): if spider.name == '': valid = True i = md5(item[''].encode("utf8")).hexdigest() returndf = self.df.add(i) if returndf: valid = False raise DropItem("Drop data {0}!".format(item["statusplus"])) else: pass if valid: self.fa.writelines(i + '\n') # 数据存入mysql items = list() items.append(item) df = pd.DataFrame(items) df.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn, if_exists="append", index=False) logging.log( msg= f"scrapy {self.mongocounts} items", level=logging.INFO) self.mongocounts += 1 # if spider.name == '': # self.df_result = pd.concat([self.df_result, df]) # self.mongocounts += 1 # logging.log(msg=f"add {self.mongocounts} items", level=logging.INFO) # else: # df.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn, if_exists="append", index=False) # self.mongocounts += 1 # logging.log(msg=f"scrapy {self.mongocounts} items", level=logging.INFO) def close_spider(self, spider): # self.connection.close() # if spider.name == '': # self.df_result.to_sql(name=self.settings['MYSQL_TABLE'], con=self.conn, if_exists="append", index=False) self.conn.dispose() self.end_date = time.strftime('%Y-%m-%d %X', time.localtime()) self.scrapy_date = f'{self.start_date} - {self.end_date}' logging.info(self.scrapy_date) def spider_idle(self, spider): self.idle_count += 1 # 空闲计数 self.idle_list.append(time.time()) # 每次触发 
spider_idle fires, record the trigger timestamp idle_list_len = len(self.idle_list) # number of consecutive idle triggers so far print(self.scrapy_date) logging.info(self.scrapy_date) # print(idle_list_len) # print(self.idle_count) # print(self.idle_list[-1] - self.idle_list[-2]) # if the gap between this trigger and the previous one is more than 5 seconds, redis still holds keys, so reset the streak if idle_list_len > 2 and not (1 < (self.idle_list[-1] - self.idle_list[-2]) < 6): self.idle_list = [self.idle_list[-1]] self.idle_count = 1 elif idle_list_len == self.idle_number + 1: # after roughly a minute of idling, record the end time self.end_date = time.strftime('%Y-%m-%d %X', time.localtime()) self.scrapy_date = f'{self.start_date} - {self.end_date}' self.start_date = time.strftime('%Y-%m-%d %X', time.localtime()) print(self.scrapy_date) print("*" * 100) elif idle_list_len > self.idle_number + 12: # after a further stretch of idling, reset the start time self.start_date = time.strftime('%Y-%m-%d %X', time.localtime()) self.idle_count = 0
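The spider_idle handler above decides whether consecutive idle signals belong to the same idle streak by looking at the gap between the last two timestamps. A small sketch of that check in isolation, assuming the same 1-6 second window; the function name is mine.

import time

def same_idle_streak(timestamps, min_gap=1, max_gap=6):
    # Two idle signals belong to the same streak only if the gap between
    # the last two triggers stays inside (min_gap, max_gap) seconds.
    if len(timestamps) < 2:
        return True
    gap = timestamps[-1] - timestamps[-2]
    return min_gap < gap < max_gap

now = time.time()
print(same_idle_streak([now - 3, now]))    # True: ~3s apart, still the same idle streak
print(same_idle_streak([now - 60, now]))   # False: a long gap means real work happened in between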