def push_log(self, spider_name, stats):
    self.connect_kafka()
    stats['start_time'] = time.mktime(stats['start_time'].timetuple())
    stats['finish_time'] = time.mktime(stats['finish_time'].timetuple())
    stats['spider'] = spider_name
    try:
        self.kafka.send_messages(KAFKA_LOG_STATS_TOPIC, *[json.dumps(stats)])
    except FailedPayloadsError as e:
        logging.error(e)
        logging.info(stats)
    del stats['spider']
    # MongoDB does not allow dots in field names, so drop such keys.
    for key in stats.keys():
        if '.' in key:
            del stats[key]
    stat_spider_old = self.collection_stats.find_one({"spider": spider_name})
    if stat_spider_old:
        stat_spider_new = stat_spider_old
    else:
        stat_spider_new = {'spider': spider_name, 'stats': []}
    # Keep a rolling window of the last 7 crawl stats.
    if len(stat_spider_new['stats']) < 7:
        stat_spider_new['stats'].append(stats)
    else:
        stat_spider_new['stats'].pop(0)
        stat_spider_new['stats'].append(stats)
    stat_spider_new['last_history'] = stats
    self.collection_stats.update({"spider": spider_name},
                                 {"$set": stat_spider_new},
                                 upsert=True, multi=False)

def createDb(name, user="", passwd="", ram=100, replica=0,
             server="http://localhost:8091/pools/default/buckets"):
    """ Create a new bucket by using the system curl command """
    # curl -X POST -u username:password -d name=newbucket -d ramQuotaMB=100
    #   -d authType=none -d replicaNumber=1 -d proxyPort=11216
    #   http://localhost:8091/pools/default/buckets
    command = "curl -X POST -u %s:%s -d name=%s -d ramQuotaMB=%s -d authType=sasl " \
              "-d replicaNumber=%s %s" \
              % (user, passwd, name, ram, replica, server)
    import commands  # Python 2 only; subprocess replaces this in Python 3
    _, output = commands.getstatusoutput(command)
    lines = output.split("\n")
    # With curl's progress output captured, fewer than 4 lines means no
    # JSON error body came back, which Couchbase returns on success.
    if len(lines) < 4:
        logging.info("Create new bucket: %s" % name)
        return True
    response = json.loads(lines[3])
    if 'errors' in response:
        logging.error(response)
        return False
    logging.info("Create new bucket: %s" % name)
    return True

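# A minimal usage sketch for createDb, assuming a local Couchbase node; the
# credentials and bucket name below are placeholders, not values from the
# original code.
if createDb("products", user="Administrator", passwd="password", ram=256,
            replica=1):
    logging.info("Bucket is ready for writes")
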
def generateSpider(spider_name, over_write):
    logging.info('generateSpider')
    commands = [
        config.PYTHON,
        'generator/runner.py',
        'generator/generate.py',
        '--mongo_server', config.MONGO_SERVER,
        '--spider_name', spider_name,
        '--spider_template', 'generator/spider_template.django.py',
        '--output_spider_py',
        'scraper/spiders/' + getFileNameFromSpiderName(spider_name),
        '--over_write', str(over_write),
        '--storage_spider_template', 'generator/storage_spider_template.django.py',
        '--output_storage_py',
        'scraper/storage_spiders/' + getFileNameFromSpiderName(spider_name),
    ]
    logging.info(" ".join(commands))
    msg = execute(commands)
    return msg

def copy_couchdb_to_couchbase(fromDb, toDb, batch_size=10000):
    batch = {}
    cnt = 0
    for doc in couch_util.get_pager(fromDb):
        # Couchbase docs do not carry CouchDB revision ids.
        del doc['_rev']
        batch[doc['_id']] = doc
        if len(batch) > batch_size:
            try:
                toDb.upsert_multi(batch)
                cnt += len(batch)
            except TemporaryFailError:
                logging.warning("Connection timeout. Try to break and update batch")
                for key, value in batch.items():
                    toDb.upsert(key, value)
                    cnt += 1
            batch = {}
            logging.info("Copied %s docs" % cnt)
    if len(batch) > 0:
        try:
            toDb.upsert_multi(batch)
            cnt += len(batch)
        except TemporaryFailError:
            logging.warning("Connection timeout. Try to break and update batch")
            for key, value in batch.items():
                toDb.upsert(key, value)
                cnt += 1
        logging.info("Copied %s docs" % cnt)
    print "Done"

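# A hedged sketch of how fromDb and toDb might be constructed for
# copy_couchdb_to_couchbase, assuming the python-couchdb client and the
# Couchbase Python SDK 2.x implied by upsert_multi(); the hosts and database
# names below are placeholders.
import couchdb
from couchbase.bucket import Bucket

fromDb = couchdb.Server("http://localhost:5984")["products"]
toDb = Bucket("couchbase://localhost/products")
copy_couchdb_to_couchbase(fromDb, toDb, batch_size=10000)
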
def close_all_connect(self):
    if self.kafka:
        self.kafka.stop()
        self.kafka.client.close()
    logging.info("Closed all connections.")

def delete(db, docid):
    try:
        db.delete(db[docid])
        logging.info("Doc %s is deleted!" % docid)
    except Exception:
        logging.warning("Cannot delete doc %s in %s" % (docid, db.name))

def createOrMergeBatch(db, doc_batch):
    """
    Create new docs or merge with existing ones in batch.
    Input is a list of couchdb.Document objects.
    """
    assert type(doc_batch) == list, "Bad input %s" % type(doc_batch)
    try:
        responses = db.update(doc_batch)
    except Exception:
        # Break the batch down if a single bulk request fails.
        logging.warning("Error with doc batch of size %s. Try to break it down"
                        % len(doc_batch))
        responses = []
        for doc in doc_batch:
            responses.extend(db.update([doc]))
    for (success, docid, rev_or_exc), doc in zip(responses, doc_batch):
        if not success:
            assert type(rev_or_exc) == ResourceConflict
            if docid == doc["_id"]:
                continue  # same doc, updated twice
            logging.info("Merging doc %s with %s" % (doc["_id"], docid))
            newDoc = db[docid]
            if mergeDoc(newDoc, doc):
                db[docid] = newDoc

def deleteDocsByIds(db, docids):
    oks = db.remove_multi(docids, quiet=True)
    key_not_found = 0
    for docid in docids:
        # 0x0D is libcouchbase's KEY_ENOENT status code.
        if oks[docid].rc == 0xD:
            key_not_found += 1
            logging.warning("Not found key %s to delete" % docid)
    logging.info("Deleted %d docs" % (len(docids) - key_not_found))

def checkJsonGreater(url, xpath, expectValue):
    logging.info("checkJsonGreater %s with %s" % (expectValue, url))
    value = xtractFromUrl(url, xpath)
    if value is None:
        return False
    try:
        return int(value) > expectValue
    except (TypeError, ValueError):
        logging.warning('%s is not int', value)
        return False

def checkContentContain(url, expectContent):
    logging.info("checkContentContain %s with %s" % (expectContent, url))
    try:
        response = requests.get(url)
        content = response.text.lower()
        return content.find(expectContent.lower()) >= 0
    except Exception:
        logging.error('Cannot load content from %s', url)
        return False

def get_requests(self, url):
    try:
        req = requests.get(url, headers=HDR, allow_redirects=False, timeout=60)
        return req
    except Exception as e:
        logging.exception(e)
        logging.info("No content fetched from %s", url)
        return None

def get_requests(url):
    HDR = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'
    }
    try:
        req = requests.get(url, headers=HDR, allow_redirects=False, timeout=20)
        return req
    except Exception as e:
        logging.exception(e)
        logging.info("No content fetched from %s", url)
        return None

def tailLog(spider_name, number_lines, files=None):
    logging.info('tailLog')
    if files:
        logFile = config.LOG_DIR + "/" + spider_name + '_from_' + files + '.log'
    else:
        logFile = config.LOG_DIR + "/logs_crawl/" + spider_name + '.log'
    commands = ['tail', '-' + str(number_lines), logFile]
    msg = execute(commands)
    return msg

def parse_item(self, response, url=None):
    origin_url = None
    if url is None:
        if self.from_url_file is None:
            self.parse(response)
        hxs = Selector(response)
        expired = self.setExpiredItemsBaseOnStatus(response.url, response.status)
        origin_url = response.url
        url = self.add_tracking_code(response.url)
    else:
        origin_url = self.remove_tracking_code(url)
        req = self.get_requests(origin_url)
        if req:
            expired = self.setExpiredItemsBaseOnStatus(origin_url, req.status_code)
            if not expired:
                hxs = Selector(text=req.content)
            else:
                return None
        else:
            return None
    if not expired:
        logging.info('======================>EXTRACT %s', origin_url)
        item = Product()
        item['source'] = self.name
        item['origin_url'] = origin_url
        item['url'] = self.add_tracking_code(origin_url)
        item['timestamp'] = time.time()
        for prop, xpath in self.xpath_dict.items():
            if not xpath.strip():
                continue
            try:
                item[prop] = cleanText(hxs.xpath(xpath).extract())
            except KeyError:
                continue
            # Drop properties whose xpath matched nothing.
            if not hxs.xpath(xpath).extract() or hxs.xpath(xpath).extract() == "":
                del item[prop]
            # Keep raw markup (minus scripts) for rich-text fields.
            if prop == "description":
                item["description"] = hxs.xpath(
                    xpath + "/node()[not(self::script)]").extract()
            if prop == "property":
                item['property'] = hxs.xpath(
                    xpath + "/node()[not(self::script)]").extract()
        item = test_data.process_data(item)
        item = self.check_item(item)
        if item is not None and self.isValid(item):
            return item
        return None
    elif url is not None:
        return None

def delete(spider_name, _type):
    if _type == "dir":
        commands = 'rm -rf ' + config.LOG_DIR + 'crawls/' + spider_name
    elif _type == "py":
        commands = ('rm -rf ' + config.CRAWL_DIR + 'scraper/spiders/'
                    + getFileNameFromSpiderName(spider_name))
    elif _type == "all":
        delete_spider_mongo(spider_name)
        commands = 'rm -rf ' + config.LOG_DIR + 'crawls/' + spider_name
        commands += (' && rm -rf ' + config.CRAWL_DIR + 'scraper/spiders/'
                     + getFileNameFromSpiderName(spider_name))
    # commands is a plain string here; logging " ".join(commands) would
    # space out every character.
    logging.info(commands)
    msg = os.system(commands)
    return msg

def deleteDocsByIds(db, docids):
    docBatch = []
    for doc in getDocsByIds(db, docids):
        if doc:
            # CouchDB deletes docs that carry the _deleted flag on update.
            doc["_deleted"] = True
            docBatch.append(doc)
    responses = db.update(docBatch)
    deleteCount = 0
    for (success, docid, _) in responses:
        if not success:
            logging.info("Failed to delete doc %s", docid)
        else:
            deleteCount += 1
    logging.info("Deleted %d docs", deleteCount)

def parse_item_and_links(self, response):
    item = self.parse_item(response)
    if item:
        logging.info("Link item: %s", response.url)
        yield item
    else:
        logging.info("Not link item: %s", response.url)
    for rule in self.rules:
        if not rule.link_extractor:
            continue
        links = rule.link_extractor.extract_links(response)
        for link in links:
            if link.url.startswith("http"):
                yield self.request(link.url, callback=self.parse_item_and_links)

def copy_couchbase_to_kafka(fromDb, kafka_producer, kafka_topic, batch_size=1000):
    batch = []
    for doc in get_pager(fromDb):
        batch.append(doc)
        if len(batch) > batch_size:
            kafka_producer.send_messages(kafka_topic,
                                         *[json.dumps(msg) for msg in batch])
            batch = []
    if len(batch) > 0:
        kafka_producer.send_messages(kafka_topic,
                                     *[json.dumps(msg) for msg in batch])
    logging.info("Saving Couchbase Bucket %s -> Kafka %s. Done!"
                 % (fromDb.bucket, kafka_topic))

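# A hedged sketch of the wiring copy_couchbase_to_kafka expects. The
# send_messages(topic, *msgs) call matches the old kafka-python
# SimpleProducer API, so something along these lines is assumed; the broker
# address, bucket and topic names are placeholders.
from kafka import KafkaClient, SimpleProducer
from couchbase.bucket import Bucket

producer = SimpleProducer(KafkaClient("localhost:9092"))
copy_couchbase_to_kafka(Bucket("couchbase://localhost/products"),
                        producer, "raw_docs")
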
def test_process(self):
    try:
        res = HttpRequest(self.status, self.http_method, self.url,
                          self.params).http_request()
    except Exception as e:
        test_result = 'Error'
        logging.exception(e)
        print("{0},Error msg:{1}".format(test_result, e))
        raise e
    else:
        Actual_Result = res.json()
        try:
            # Compare each expected key's value against the actual response.
            for key in self.ExpectedResult.keys():
                self.assertTrue(self.ExpectedResult[key] == Actual_Result[key])
            test_result = 'Pass'
            logging.info(test_result)
        except Exception as e:
            # Failure means expected and actual values differ for a shared key.
            test_result = 'Fail'
            logging.info(
                "{0},ExpectedResult is not equal to ActualResult, wrong msg({1}):[{2}]!=[{3}]"
                .format(test_result, key, self.ExpectedResult[key], Actual_Result[key]))
            print(
                "Test Fail,ExpectedResult is not equal to ActualResult, wrong msg({0}):[{1}]!=[{2}]"
                .format(key, self.ExpectedResult[key], Actual_Result[key]))
            raise e
        finally:
            # Write the result back to the Excel sheet; the actual result is
            # JSON, so serialize it to a string (ensure_ascii=False keeps
            # non-ASCII characters readable).
            ManageExcel().write_back(
                self.sheet_name, self.case_id + 1, test_result,
                json.dumps(Actual_Result, ensure_ascii=False))

def getUrls(source, sincedays, beforedays, missing, start=0, limit=1000):
    logging.info("Get data...")
    url_query = source_api
    if source is not None:
        url_query += "&source=" + source
    if sincedays is not None:
        url_query += "&sincedays=" + str(sincedays)
    if beforedays is not None:
        url_query += "&beforedays=" + str(beforedays)
    if missing is not None:
        url_query += "&missing=" + missing
    urls = []
    while True:
        # Build the paging parameters per request instead of appending them
        # to url_query, which would stack start/limit pairs every iteration.
        page_query = url_query + '&start=' + str(start) + '&limit=' + str(limit)
        urls.extend(json.loads(requests.get(page_query).text)['hits'])
        start += limit
        if start >= DEFAULT_LIMIT:
            break
    logging.info("Done!")
    return urls

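# For illustration, a call like getUrls("tiki", 7, None, "price") (values
# hypothetical) pages through queries of the form
#   source_api + "&source=tiki&sincedays=7&missing=price&start=0&limit=1000"
# until start reaches DEFAULT_LIMIT.
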
def getAllDocIdsBySource(es_server, source, limit=10000):
    base_url = es_server + "?q=source:%s&_source=false" % source
    start = 0
    docids = []
    cnt = 0
    while True:
        url = base_url + "&from=%d&size=%d" % (start, limit)
        data = json.loads(requests.get(url).text)
        if 'hits' in data:
            for doc in data['hits']['hits']:
                docids.append(doc['_id'])
                cnt += 1
            logging.info("Get %d docids", cnt)
            # A short page means we have reached the end of the results.
            if len(data['hits']['hits']) < limit:
                break
        else:
            break
        start += limit
    return docids

def cache_images(image_urls):
    images_not_cached, images_cached = download_images(image_urls)
    cnt = 0
    images = {}
    images['base_url'] = config.BASE_URL
    images['images'] = []
    images_id = {}
    for image in images_not_cached:
        image_cached = {}
        width, height = image['image'].size
        # Skip images that are too small to be useful.
        if width < MIN_WIDTH or height < MIN_HEIGHT:
            continue
        try:
            image_full, path_full = create_path_image(image['image'], image['name'])
        except Exception:
            continue
        save_to_s3(image_full[1], path_full)
        images_id[image['name']] = image['url']
        image_cached['image_id'] = image['name']
        image_cached['thumbs_type'] = []
        # Generate and upload every configured thumbnail size.
        for thum_id, size in IMAGES_THUMBS.iteritems():
            thum_img, path = create_path_image(image['image'], image['name'], thum_id)
            save_to_s3(thum_img[1], path)
            image_cached['thumbs_type'].append(thum_id)
        if image_cached:
            images['images'].append(image_cached)
        if images_id:
            r.mset(images_id)
        cnt += 1
    logging.info("Cached %s images" % cnt)
    # Images that were already cached only need their metadata re-listed.
    for image_id in images_cached:
        image_cached = {}
        image_cached['image_id'] = image_id
        image_cached['thumbs_type'] = ['big', 'normal', 'small']
        images['images'].append(image_cached)
    return images

def startSpider(spider_name, crawler_server):
    if spider_name is not None:
        if spider_name in spiders_running:
            logging.error("Spider name \"%s\" running", spider_name)
            spider = collection.find_one({'doc.spider': spider_name})
            logging.info("Info spider: %s", spider['crawler_status'])
            return crawler_server
        try:
            requests.get('http://' + crawler_server['name_server'] +
                         '.localhost:' + SERVER_PORT +
                         '/crawler/startcrawl?spider=' + spider_name)
            crawler_server = set_free_thread(crawler_server)
            logging.info("Started spider \"%s\" successfully at \"%s\"!",
                         spider_name, crawler_server['name_server'])
        except (Timeout, ConnectionError):
            crawler_server['status'] = False
            logging.error("Start spider \"%s\" failed at \"%s\"!",
                          spider_name, crawler_server['name_server'])
    else:
        logging.error("Spider is None!")
    return crawler_server

def mergeDoc(existing_doc, new_doc):
    """
    Merge new_doc into existing_doc.
    Returns True/False depending on whether existing_doc was modified.
    """
    records = existing_doc.setdefault("records", [])
    if 'records' not in new_doc:
        return False
    isModified = False
    for new_record in new_doc['records']:
        if new_record not in records:
            records.append(new_record)
            isModified = True
    logging.info("# merged records %d" % len(records))
    # Merge images; .get() guards against docs without the field.
    images = existing_doc.setdefault("images", [])
    for new_image in new_doc.get('images', []):
        if new_image not in images:
            images.append(new_image)
            isModified = True
    logging.info("# merged images %d" % len(images))
    # Merge sources (the incoming field is named 'source').
    sources = existing_doc.setdefault("sources", [])
    for new_source in new_doc.get('source', []):
        if new_source not in sources:
            sources.append(new_source)
            isModified = True
    logging.info("# merged sources %d" % len(sources))
    return isModified

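# A small hedged example of mergeDoc deduplicating while it merges; all
# field values below are made up for illustration.
existing = {"records": [{"price": 10}], "images": ["a.jpg"], "sources": ["siteA"]}
incoming = {"records": [{"price": 10}, {"price": 12}],
            "images": ["a.jpg", "b.jpg"],
            "source": ["siteB"]}
if mergeDoc(existing, incoming):
    # existing now holds records [{"price": 10}, {"price": 12}],
    # images ["a.jpg", "b.jpg"] and sources ["siteA", "siteB"].
    logging.info("Merged doc: %s", existing)
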
def buildListCrawlSpiders():
    validSpiders = getAllValidSpiders()
    list_spiders = []
    now = time.time()
    spider = {}
    logging.info("=====>Please wait, we're building spider list...")
    try:
        for spi in validSpiders:
            try:
                spider['name'] = spi['doc']['spider']
                crawled_time = (float(now) -
                                spi['crawler_status']['last_stop_time']) / 3600
                spider['score'] = calculateSpiderScore(
                    spi['crawler_status'].get('priority', 1),
                    crawled_time,
                    spi['crawler_status'].get('items', 1000),
                    spi['crawler_status']['status'])
                list_spiders.append(spider)
                spider = {}
            except KeyError:
                continue
    # The original bare "except A, B:" form would only catch A and bind it
    # to the name B; catching both types needs a parenthesized tuple.
    except (NetworkTimeout, ServerSelectionTimeoutError):
        sleep(60)
    return list_spiders

def startCrawl(spider_name):
    logging.info('startCrawl')
    logFile = config.LOG_DIR + 'logs_crawl/' + spider_name + '.log'
    jobdir_crawl = config.LOG_DIR + 'crawls/' + spider_name
    # Kill any process already crawling this spider.
    commands = ('ps -ef | grep python | grep ' + spider_name +
                ' | grep -v grep | awk \'{print $2}\' | xargs kill -2')
    logging.info(commands)
    os.system(commands)
    # Start the crawl with a persistent job directory so it can resume.
    commands = ('scrapy crawl ' + spider_name + ' -s JOBDIR=' + jobdir_crawl +
                ' &> ' + logFile + ' &')
    logging.info(commands)
    os.system(commands)
    return '/crawler/taillog?spider=' + spider_name

def process_source_docs(source, command_type, batch_size=1000):
    docids = getAllDocIdsBySource(ES_SERVER, source)
    batch = []
    cnt = 0
    for docid in docids:
        batch.append(docid)
        cnt += 1
        if batch_size <= len(batch):
            response = send_command(COMMAND_API, command_type, batch)
            if response.status_code != 200:
                raise Exception("Error when executing command %s" % command_type)
            logging.info("Sent to %s %d docs", command_type, cnt)
            batch = []
            time.sleep(5)
    if len(batch) > 0:
        send_command(COMMAND_API, command_type, batch)
        logging.info("Sent to %s %d docs", command_type, cnt)
    logging.info("Done %s %d docs from source %s", command_type, cnt, source)

def startServiceMaster():
    logging.info("Start master-service!")
    list_crawler_servers = getCrawlerServers()
    list_spiders = buildListCrawlSpiders()
    count_start = 0
    if list_spiders is not None and len(list_spiders) > 0:
        for spider in list_spiders:
            crawler_server, list_crawler_servers = chooseCrawlerServer(
                list_crawler_servers)
            if crawler_server is not None:
                crawler_server = startSpider(spider['name'], crawler_server)
                list_crawler_servers = updateCrawlerServers(
                    crawler_server, list_crawler_servers)
                count_start += 1
            else:
                logging.info("=======> All crawler servers were full!")
                break
        logging.info("Started %d spiders." % count_start)
    else:
        logging.error("No result from MongoDb!")

def remove_dir_crawl(self, spider_name):
    commands = 'rm -rf ' + config.LOG_DIR + 'crawls/' + spider_name
    # commands is a string, so log it directly rather than " ".join(commands).
    logging.info(commands)
    os.system(commands)

def check(self, value):
    self.cnt += 1
    if self.cnt % value == 0:
        logging.info('Processed %s', self.cnt)

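# A minimal self-contained sketch of how check() might be used: a tiny host
# class (the name Progress is hypothetical) that owns the cnt counter and
# logs every `value` processed items.
class Progress(object):
    def __init__(self):
        self.cnt = 0

    def check(self, value):
        self.cnt += 1
        if self.cnt % value == 0:
            logging.info('Processed %s', self.cnt)

p = Progress()
for _ in range(3000):
    p.check(1000)  # logs "Processed 1000", "Processed 2000", "Processed 3000"
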
def __pruning_bfs(self, lambd, max_neighbors, queue_size):
    """
    Pruned breadth-first search.
    :param lambd: weight balancing path score against language-model score
    :param max_neighbors: number of best-scoring successors to expand
    :param queue_size: upper bound on the BFS queue
    :return: list of found paths
    """
    # Paths found so far.
    results = []
    # Start and stop nodes.
    start = (self.start + self.sep + self.start, 0)
    stop = (self.stop + self.sep + self.stop, 0)
    queue = Queue.Queue(queue_size)
    # Enqueue the start node.
    queue.put([start])
    while not queue.empty():
        # Dequeue a partial phrase; its last element is the current node.
        phrase = queue.get()
        node = phrase[len(phrase) - 1]
        if stop == node:
            # Reached the final node; only keep sentences of 8+ words.
            if len(phrase) >= 8:
                results.append(phrase)
            continue
        # Render the current phrase as a string.
        str_phrase = ''
        for nodeflag, num in phrase:
            str_phrase += nodeflag.split(self.sep)[0] + ' '
        logging.info('results size[%d] queue size[%d] phrase[%s]',
                     len(results), queue.qsize(), str_phrase)
        # Successors adjacent to the current node.
        pos_neighbors = self.graph.neighbors(node)
        # Combined score per successor (path score plus language-model score).
        neighbor_weight = {}
        for pos_neighbor in pos_neighbors:
            # Weight of the edge between the two nodes.
            edge_weight = self.graph.get_edge_data(node, pos_neighbor)['weight']
            if edge_weight == 0:
                continue
        # Language-model fluency of the phrase extended with this successor.
            fluency_weight = self.grammar_scorer.cal_fluency(
                str_phrase + pos_neighbor[0].split(self.sep)[0])
            # Combined score: inverse edge weight plus length-normalized fluency.
            general_score = 1 / edge_weight + lambd * fluency_weight / (
                len(re.split('\s+', str_phrase)) + 1)
            logging.info("lambd[%f] general score[%f] edge weight[%f] fluency weight[%f]",
                         lambd, general_score, edge_weight, fluency_weight)
            neighbor_weight[pos_neighbor] = general_score
        # Sort successors by combined score, highest first
        # (a heap could be used here to improve performance).
        sort_neighbor_weight = sorted(neighbor_weight.iteritems(),
                                      key=lambda nw: nw[1], reverse=True)
        # Enqueue up to max_neighbors of the best-scoring successors.
        for i in range(min(max_neighbors, len(sort_neighbor_weight))):
            if queue.full():
                break
            new_phrase = phrase + [sort_neighbor_weight[i][0]]
            queue.put(new_phrase)
    return results