def _process_muse(muse):
    """
    Processes a Muse's tweets, creating Tweet objects
    and saving them to the db.
    """
    username = muse.username
    logger.info("Collecting tweets for %s..." % username)
    try:
        tweets = twitter.tweets(username=username)
    except TweepError:
        return []

    new_tweets = []
    for tweet in tweets:
        data = {"body": tweet["body"], "tid": tweet["tid"], "username": username}
        t = Tweet(**data)
        try:
            t.save()
            new_tweets.append(tweet)
        except (NotUniqueError, DuplicateKeyError, OperationError):
            # Duplicate tweet
            pass
    return new_tweets
def process_task(taskId):
    with app.app_context():
        try:
            task = db.session.query(Task).filter(Task.id == taskId).one()
            task.status = 'deploying'
            db.session.commit()
            steps = Step.query.filter(Step.taskId == taskId).all()
            returncode = 0
            for step in steps:
                returncode = worker.process(step.content, step.id)
                logger.info("returncode for step %s is %s" % (step.id, returncode))
                step.log = worker.logs.get(step.id, '')
                logger.info(worker.logs)
                # dict.has_key() was removed in Python 3; use the `in` operator.
                if step.id in worker.logs:
                    worker.logs.pop(step.id)
                if returncode != 0:
                    step.status = 'failed'
                else:
                    step.status = 'success'
                db.session.commit()
                if step.status == 'failed':
                    break
            if returncode != 0:
                task.status = 'failed'
            else:
                task.status = 'success'
            db.session.commit()
            # del process_tasks[task.id]
        except Exception:
            # Log taskId here: `task` may be unbound if the initial query failed.
            logger.error('error while processing task %s' % taskId)
            logger.error(traceback.format_exc())
def setup_redis():
    if not redis.exists(switch_key):
        ret = redis.hmset(switch_key, {oj: 1 for oj in SUPPORT_OJ})
        if ret:
            logger.info('setup switch key success')
    else:
        log_spider_status()
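# A small companion sketch (assumed, not in the source): reading the switch
# hash that setup_redis() seeds, to decide whether a given OJ's spider should run.
def spider_enabled(oj):
    value = redis.hget(switch_key, oj)
    return value is not None and int(value) == 1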
def run_scripts(info, message):
    os.environ['PLX_MESSAGE'] = message
    for x in info:
        try:
            os.environ['PLX_%s' % x.upper()] = '%s' % info[x]
        except Exception:
            os.environ['PLX_%s' % x.upper()] = 'n/a'
            logger.warning("unable to set env variable for PLX_%s, setting to 'n/a'" % (x.upper()))
            continue
        #logger.warning("unable to set env variable for PLX_%s = %s" % (x.upper(), info[x]))

    for script in config.PP_SCRIPTS:
        if not os.path.exists(script):
            logger.warning("%s does not exist", script)
            continue
        logger.info("executing script: [%s]" % script)
        p = Popen([script], stdout=PIPE, stderr=None)
        if config.PP_SCRIPTS_LOGGING:
            for line in p.stdout:
                log_lines(script, line)
        p.communicate()
        if p.returncode == 0:
            logger.info("script executed successfully: [%s]" % script)
        else:
            logger.warning("script: [%s] failed with code: %s" % (script, p.returncode))
def post(self):
    config = Config.objects[0]
    form = self.form(request.form, obj=config)
    if form.validate():
        form.populate_obj(config)

        # Need to reload the brain.
        if form.ramble.data != brain.MKV.ramble or form.spasm.data != brain.MKV.spasm:
            logger.info('Brain config value changed!')
            brain.MKV.ramble = form.ramble.data
            brain.MKV.spasm = form.spasm.data

        # If the ngram size has changed,
        # the brain needs to be retrained.
        if form.ngram_size.data != brain.MKV.n:
            logger.info('Brain ngram size changed! Retraining...')
            brain.MKV.n = form.ngram_size.data
            brain.retrain()

        config.save()
        flash('I will change my ways.')
        return redirect(url_for('config_api'))
    return redirect(url_for('config_api'))
def spider_init():
    for oj, oj_queue in spider_factory.items():
        spider_name = settings.SUPPORT_OJ[oj]
        spider_class = getattr(sys.modules['app.spiders.' + spider_name], spider_name)
        while oj_queue.qsize() < oj_queue.maxsize:
            oj_queue.put(spider_class())
        logger.info('[{}] queue INIT OK => size {}'.format(spider_name, oj_queue.qsize()))
def fetch_cookie(self):
    if self.cookie:
        return True
    response = yield self.load_page(self.index_url)
    if not response:
        return False
    self.cookie = response.headers['Set-Cookie']
    logger.info('{} fetch cookie success'.format(self.TAG))
    return True
def spider_runner():
    while True:
        cur = yield account_queue.get()
        logger.info('{} from queue, start working'.format(cur))
        yield gen.sleep(5)
        cur.set_normal()
        cur.save()
        logger.info('{} work done'.format(cur))
        account_queue.task_done()
def account_producer():
    logger.info('queue start working')
    while True:
        cur = account.get_available_account()
        if cur:
            yield account_queue.put(cur)
            logger.info('{} put into queue, queue size {}'.format(cur, account_queue.qsize()))
        else:
            yield gen.sleep(10)
def tweet(text):
    try:
        api.update_status(text)
    except TweepError as err:
        # Assume we may have violated some rate limit
        # and forget about it. Check the string representation:
        # `'403' in err` on the exception object would raise a TypeError.
        if '403' in str(err):
            logger.info('403 error when trying to tweet. Possibly hit a rate limit.')
        else:
            raise err
def send_notification(message):
    logger.info(u"sending notification mail: %s" % message)
    msg = Message("plexivity notification", recipients=[config.MAIL_RECIPIENT], sender=config.MAIL_FROM)
    msg.body = message
    if mail.send(msg):
        logger.info(u"Notification mail successfully sent")
        return True
    else:
        logger.error(u"unable to send mail notification")
        return False
def send_notification(message):
    logger.info(u"sending notification to Pushbullet: %s" % message)
    args = {"type": "note", "title": message, "body": message}
    status = requests.post("https://api.pushbullet.com/v2/pushes",
                           auth=(config.PUSHBULLET_KEY, ""), data=args)
    if status.ok:
        logger.info(u"Notification to Pushbullet successfully sent: %s" % status.content)
        return True
    else:
        logger.error(u"unable to send notification to pushbullet: %s" % status.content)
        return False
def fetch_cookie(self):
    if self.cookie:
        return True
    response = yield self.load_page(self.index_url)
    if not response:
        return False
    self.cookie = response.headers['Set-Cookie']
    self.cookie = self.cookie.split(';')[0] + '; username=Raychat;'
    logger.info('{} fetch cookie success'.format(self.TAG))
    return True
def send_notification(message):
    logger.info(u"sending notification to Pushover: %s" % message)
    args = {"token": config.PUSHOVER_TOKEN, "user": config.PUSHOVER_USER, "message": message}
    status = requests.post("https://api.pushover.net/1/messages.json", data=args)
    if status.ok and status.json()["status"] == 1:
        logger.info(u"Notification to Pushover successfully sent with response %s" % status.content)
        return True
    else:
        logger.error(u"Unable to send notification to pushover: %s" % status.content)
        return False
def send_notification(message):
    logger.info(u"sending notification to Boxcar: %s" % message)
    args = {'notification[long_message]': message,
            'notification[title]': "plexivity",
            'notification[sound]': 'bird-1',
            'user_credentials': config.BOXCAR_TOKEN}
    status = requests.post("https://new.boxcar.io/api/notifications", data=args, timeout=2)
    if status.ok:
        logger.info(u"Notification to Boxcar successfully sent: %s" % status.content)
        return True
    else:
        logger.error(u"unable to send notification to boxcar %s" % status.content)
        return False
def retrain():
    """
    Retrains the Markov generator
    on the documents in the database.
    """
    MKV.reset()

    tweets = [tweet.body for tweet in Tweet.objects.all()]
    logger.info("Training on %s tweets" % len(tweets))
    MKV.train(tweets)

    docs = [doc.body for doc in Doc.objects.all()]
    logger.info("Training on %s docs" % len(docs))
    MKV.train(docs)
def test_crack_captcha(count):
    output = crack_captcha_cnn()
    saver = tf.train.Saver()
    correct = 0
    for i in range(count):
        text, image = gen_captcha()
        image = convert2gray(image)
        image = image.flatten() / 255
        predict_text = crack_captcha(image, output, saver)
        if text.lower() == predict_text.lower():
            correct = correct + 1
            logger.info("correct: {} predicted: {}".format(text, predict_text))
        logger.info('total ' + str(i) + " correct " + str(correct) + ' accuracy: ' + str(correct / (i + 1)))
def _consider_retweets(tweets):
    """
    Retweets if positive classification
    is above THRESHOLD. 0 = neg, 1 = pos
    """
    logger.info("Considering retweeting...")
    num_retweeted = 0
    retweet_threshold = config().retweet_threshold

    # Filter out protected tweets.
    candidates = [tweet for tweet in tweets if not tweet["protected"] and not tweet["retweeted"]]
    txts = _get_tweet_texts(candidates)
    if txts:
        for idx, doc_probs in enumerate(CLS.classify(txts)):
            if num_retweeted >= config().max_retweets:
                logger.info("Hit maximum retweet limit, stopping for now.")
                break
            if doc_probs[1] > retweet_threshold:
                logger.info("Classified as %s retweetable, above %s threshold, retweeting..."
                            % (doc_probs[1], retweet_threshold))
                twitter.retweet(candidates[idx]["tid"])
                num_retweeted += 1
            else:
                logger.info("Classified as %s retweetable, below %s threshold, not retweeting..."
                            % (doc_probs[1], retweet_threshold))
def __init__(self, cookie, clear=True):
    super(Order, self).__init__(cookie)
    self.status = {1: "sold", 2: "bought", 3: "breeding income", 4: "breeding expense"}
    self.txnStatus = {0: "pending on chain", 1: "pending on chain", 2: "success", 3: "failed", 4: "failed"}
    if clear:
        # Clear all existing records before querying and saving fresh data.
        logger.info('force refresh: clearing all order records')
        mongo.order_collection.delete_many({})
        # Clear the accumulated micro-point totals as well.
        logger.info('force refresh: clearing micro-point total records')
        mongo.calculus_collection.delete_many({})
def _automatic_login(self):
    self.driver.get('https://mail.yahoo.com/d/folders/1')
    self.driver.implicitly_wait(3)
    time.sleep(5)
    # check the login.
    if self.driver.current_url.startswith('https://login.yahoo.com'):
        self.loggedin = False
        logger.info('{} (automatic login...).'.format(self.profile.email))
        email = self.driver.find_element_by_id("login-username")
        email.clear()
        email.send_keys(self.profile.email)
        email.send_keys(Keys.RETURN)
        self.driver.implicitly_wait(3)
        time.sleep(3)
        password = self.driver.find_element_by_id('login-passwd')
        password.clear()
        password.send_keys(self.profile.password)
        time.sleep(1)
        # Click login
        try:
            password.send_keys(Keys.RETURN)
        except StaleElementReferenceException:
            self.driver.find_element_by_id('login-signin').click()
        self.driver.implicitly_wait(2)
        time.sleep(2)
        self.driver.get('https://mail.yahoo.com/d/folders/1')
        time.sleep(2)
        # check the login status.
        try:
            wait = WebDriverWait(self.driver, 30, poll_frequency=0.05)
            wait.until(EC.url_contains('https://mail.yahoo.com/d/folders'))
        except TimeoutException:
            logger.warning('{} (May need a manual login).'.format(self.profile.email))
            raise exceptions.CantLogin()
        # report that we are logged in :D
        self.loggedin = True
    else:
        # report that we are logged in :D
        self.loggedin = True
def get_save_pets(self, rare_degree):
    # Baidu currently allows querying at most 200 pages.
    max_page_no = 200
    for page_no in range(max_page_no):
        page_no = page_no + 1
        logger.info('page {0}, {1} dogs'.format(page_no, self.rare_degree_dic[rare_degree]))
        # Dogs listed for sale on the market.
        pets_on_sale = self.get_pets_on_sale(page_no, rare_degree)
        # Dogs listed for breeding on the market.
        pets_on_breed = self.get_pets_on_breed(page_no, rare_degree)
        # Merge the sale and breeding lists.
        pets = pets_on_sale + pets_on_breed
        for pet in pets:
            self.query_save_pet_and_ancestors(pet['petId'])
def create_int_id():
    index = 1
    count = mongo.attributes.find({}, no_cursor_timeout=True).sort('_id', pymongo.ASCENDING).count()
    cursor = mongo.attributes.find({}, no_cursor_timeout=True).sort('_id', pymongo.ASCENDING)
    for doc in cursor:
        mongo.attributes.update_one({'_id': doc['_id']}, {'$set': {'intId': index}}, upsert=False)
        logger.info('{0} distinct attributes in total, {1} processed'.format(count, index))
        index = index + 1
    cursor.close()
def update_my_pets(self):
    my_pets.delete_many({'user': self.user.name})
    page_size = 10
    total = self.get_pets_count()
    pages = total // page_size if total % page_size == 0 else (total // page_size + 1)
    index = 0
    for page_no in range(pages):
        page_no = page_no + 1
        pets = self.get_pets(page_no, page_size, pages, total)
        for pet in pets:
            pet_info = self.get_pet_info_on_market(pet['petId'])
            self.save_my_pet(pet_info)
            index = index + 1
            logger.info('saved dog #{0}: {1}'.format(index, pet_info['petId']))
def get(self, pet_id):
    """
    Fetch a captcha.
    :param pet_id: dog ID
    :return: the captcha image, base64-encoded
    """
    url = 'https://pet-chain.baidu.com/data/captcha/gen'
    headers = self.headers_template
    headers['Referer'] = ('https://pet-chain.baidu.com/chain/detail?channel=market&petId='
                          + pet_id + '&validCode=')
    data = {"requestId": int(time.time() * 1000), "appId": 1, "tpl": ""}
    r = requests.post(url, headers=headers, data=json.dumps(data))
    response = json.loads(r.content)
    logger.info(response)
    return response['data']['img']
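# A minimal usage sketch for the method above (assumed, not from the source:
# `captcha` stands in for an instance of the owning class, and '12345' is a
# hypothetical pet ID). It writes the base64 payload to an image file,
# stripping a possible data-URI prefix first.
import base64

img_b64 = captcha.get('12345')
payload = img_b64.split(',')[-1]  # drop a 'data:image/...;base64,' prefix if present
with open('captcha.jpg', 'wb') as f:
    f.write(base64.b64decode(payload))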
def query_save_pet_and_ancestors(self, pet_id):
    if self.pet_exist(pet_id):
        return
    info = self.get_pet_info_on_market(pet_id)
    self.save_pet(info)
    self.save_update_attributes(info['attributes'])
    if info['father']:
        logger.info('father: {0}'.format(info['father']['petId']))
        self.query_save_pet_and_ancestors(info['father']['petId'])
        logger.info('mother: {0}'.format(info['mother']['petId']))
        self.query_save_pet_and_ancestors(info['mother']['petId'])
def get_save_pet_svg_criteria(self, rare_degree):
    # Baidu currently allows querying at most 200 pages.
    max_page_no = 200
    sample_count = 0
    for page_no in range(max_page_no):
        page_no = page_no + 1
        logger.info('page {0}, {1} dogs'.format(page_no, self.rare_degree_dic[rare_degree]))
        # Dogs listed for sale on the market.
        pets_on_sale = self.get_pets_on_sale(page_no, rare_degree)
        # Dogs listed for breeding on the market.
        pets_on_breed = self.get_pets_on_breed(page_no, rare_degree)
        # Merge the sale and breeding lists.
        pets = pets_on_sale + pets_on_breed
        for pet in pets:
            pet_id = pet['petId']
            logger.info('sample {0}: {1}'.format(sample_count, pet_id))
            sample_count = sample_count + 1
            info = self.get_pet_info_on_market(pet_id)
            svg_xml = self.svg.get_pet_svg(pet_id)
            # print(svg_xml)
            svg_json = xmltodict.parse(svg_xml)
            # Body shape (attribute keys are left in Chinese: they must match the API data).
            body_shape = self.svg.get_body_shape(svg_json)
            self.save_update_attribute(info['attributes'], '体型', body_shape)
            # Body color
            body_color = self.svg.get_body_color(svg_json)
            self.save_update_attribute(info['attributes'], '身体色', body_color)
            # Mouth
            nose_mouth = self.svg.get_nose_mouth(svg_json)
            self.save_update_attribute(info['attributes'], '嘴巴', nose_mouth)
            # Pattern
            pattern = self.svg.get_pattern(svg_json)
            self.save_update_attribute(info['attributes'], '花纹', pattern)
            # Pattern color
            pattern_color = self.svg.get_pattern_color(svg_json)
            self.save_update_attribute(info['attributes'], '花纹色', pattern_color)
            # Belly color
            tummy_color = self.svg.get_tummy_color(svg_json)
            self.save_update_attribute(info['attributes'], '肚皮色', tummy_color)
            # Eye color
            eye_color = self.svg.get_eye_color(svg_json)
            self.save_update_attribute(info['attributes'], '眼睛色', eye_color)
            # Eye shape
            eye_shape = self.svg.get_eye_shape(svg_json)
            self.save_update_attribute(info['attributes'], '眼睛', eye_shape)
            time.sleep(5)
def prepare_parents(self, father_rare_num, father_price, mother_rare_num):
    while True:
        try:
            father, mother = self.get_parents(father_rare_num, mother_rare_num)
            if not father or not mother:
                logger.warn('no parents satisfy the criteria, retrying in one minute')
                time.sleep(60)
                continue
            # Father not yet listed for breeding: put him on the shelf.
            if father['shelfStatus'] == 0:
                logger.info('father {0} is not listed for breeding, putting him on the shelf'.format(father['petId']))
                shelf = Shelf(self.cookie)
                shelf.shelf(father['petId'], father_price)
                # Wait 3 minutes to avoid the error: "exclusive share, purchasable after 3 minutes".
                time.sleep(3 * 60)
            # Father currently for sale: take him off sale, then relist for breeding.
            elif father['shelfStatus'] == 1:
                logger.info('father {0} is on sale; taking him off, relisting for breeding in three minutes'.format(father['petId']))
                sale = Sale(self.cookie)
                sale.unsale(father['petId'])
                # Relist after 3 minutes to avoid shelving on and off too frequently.
                time.sleep(3 * 60)
                logger.info('listing father {0} for breeding'.format(father['petId']))
                shelf = Shelf(self.cookie)
                shelf.shelf(father['petId'], father_price)
            # Mother currently for sale: take her off sale.
            if mother['shelfStatus'] == 1:
                logger.info('mother {0} is on sale, taking her off the shelf'.format(mother['petId']))
                sale = Sale(self.cookie)
                sale.unsale(mother['petId'])
            # Mother listed for breeding: take her off the shelf.
            elif mother['shelfStatus'] == 2:
                logger.info('mother {0} is listed for breeding, taking her off the shelf'.format(mother['petId']))
                shelf = Shelf(self.cookie)
                shelf.off_shelf(mother['petId'])
            # Re-fetch both parents so the returned info is up to date.
            father = self.get_pet_info_on_market(father['petId'])
            mother = self.get_pet_info_on_market(mother['petId'])
            return (father, mother)
        except Exception:
            traceback.print_exc()
def data_pool_consumer():
    logger.info('[DataPool] consumer start working for process data')
    while True:
        while DataPool.empty():
            yield gen.sleep(10)
        new_data = yield DataPool.get()
        if new_data['type'] == DataType.Submit:
            if submit.create_submit(new_data):
                logger.info('[DataPool] success new status for <{} {} {}>'.format(
                    new_data['account'].oj_name, new_data['run_id'], new_data['account'].nickname))
        elif new_data['type'] == DataType.Code:
            if not submit.update_code(new_data):
                yield DataPool.put(new_data)
        DataPool.task_done()
def get_captcha_and_seed(self):
    url = 'https://pet-chain.baidu.com/data/captcha/gen'
    headers = self.headers_template
    headers['Referer'] = 'https://pet-chain.baidu.com/chain/chooseMyDog?appId=1&tpl='
    data = {
        "requestId": int(time.time() * 1000),
        "appId": 1,
        "tpl": ""
    }
    r = requests.post(url, headers=headers, data=json.dumps(data))
    response = json.loads(r.content)
    if response['errorNo'] != '00':
        logger.info('failed to fetch captcha: {0}'.format(response['errorMsg']))
        return None, None
    return response['data']['seed'], response['data']['img']
def test_vec_text():
    vec = text2vec("Fdfd")
    logger.info(vec)
    text = vec2text(vec)
    logger.info(text)  # Fdfd

    vec = text2vec("Fdhd")
    logger.info(vec)
    text = vec2text(vec)
    logger.info(text)  # Fdhd
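# For context, a minimal sketch of one-hot text2vec/vec2text helpers consistent
# with the round-trip test above (assumed, not from the source; the real charset
# and captcha length may differ):
import numpy as np

CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
CAPTCHA_LEN = 4

def text2vec(text):
    # One-hot encode each character into its own len(CHARSET)-wide slot.
    vec = np.zeros(CAPTCHA_LEN * len(CHARSET))
    for i, c in enumerate(text):
        vec[i * len(CHARSET) + CHARSET.index(c)] = 1
    return vec

def vec2text(vec):
    # Invert the encoding by taking the argmax per character slot.
    idxs = np.argmax(vec.reshape(CAPTCHA_LEN, len(CHARSET)), axis=1)
    return ''.join(CHARSET[i] for i in idxs)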
def login(self):
    if self.has_login:
        return True
    post_body = parse.urlencode({
        'username': '******',
        'userpass': '******',
        'login': '******'
    })
    response = yield self.fetch(self.login_url, method=HttpMethod.POST,
                                headers={'cookie': self.cookie}, body=post_body)
    code = response.code
    page = response.body.decode('gb2312')
    if (code != 200 and code != 302) or page.find('Sign Out') == -1:
        return False
    logger.info('{} login success'.format(self.TAG))
    self.has_login = True
    return True
def save_pet(self, pet_info):
    pet = {
        'id': pet_info['id'],
        'petId': pet_info['petId'],
        'generation': pet_info['generation'],
        'rareDegree': pet_info['rareDegree'],
        'rareAmount': self.get_rare_amount(pet_info['attributes']),
        'fatherId': pet_info['father']['petId'] if pet_info['father'] else None,
        # Guard on 'mother' here, not 'father': the original checked the wrong key.
        'motherId': pet_info['mother']['petId'] if pet_info['mother'] else None,
        'bgColor': pet_info['bgColor'],
        'petUrl': pet_info['petUrl'],
        'attributes': pet_info['attributes'],
    }
    mongo.pets.insert(pet)
    logger.info('saved dog: {0}'.format(pet_info['petId']))
def verify_product_data(self):
    self.wait_for_element_appears(*ProductPageLocators.PRODUCT_TITLE)
    # print(self.find_element(*ProductPageLocators.PRODUCT_TITLE).text)
    # print(self.find_element(*ProductPageLocators.PRODUCT_SHORT_DESCRIPTION).text)
    logger.info(f'current product: {self.find_element(*ProductPageLocators.PRODUCT_TITLE).text}')
    assert self.products_titles() != '', \
        f"Error. Title should be present for all products. Current title: {self.products_titles()}"
    assert self.product_price() > 0, \
        f"Error. Price should be more than 0. Current price {self.product_price()}"
    description = self.find_element(*ProductPageLocators.PRODUCT_SHORT_DESCRIPTION).text
    assert description != '', \
        f"Error. Description should be present for all products. Current description: {description}"
    self.verify_image_is_loaded(self.find_element(*ProductPageLocators.PRODUCT_IMAGE))
def __init__(self, cookie):
    self.cookie = cookie
    self.correct_amount_file = 'correct_amount.txt'
    self.fail_amount_file = 'fail_amount.txt'
    self.correct_captcha_folder = './data/recognized_captcha/correct/'
    self.fail_captcha_folder = './data/recognized_captcha/fail/'
    self.total = 0
    self.correct = 0
    self.fail = 0
    self.crack = Crack()
    if os.path.exists(self.correct_amount_file):
        file = open(self.correct_amount_file, 'r')
        amount = file.read()
        logger.info('current correct captcha count: {}'.format(amount))
        file.close()
        self.correct_captcha_amount = int(amount)
    else:
        self.correct_captcha_amount = 0
    if os.path.exists(self.fail_amount_file):
        file = open(self.fail_amount_file, 'r')
        amount = file.read()
        logger.info('current failed captcha count: {}'.format(amount))
        file.close()
        self.fail_captcha_amount = int(amount)
    else:
        self.fail_captcha_amount = 0
    if not os.path.exists(self.correct_captcha_folder):
        os.makedirs(self.correct_captcha_folder)
        logger.info('created folder: ' + self.correct_captcha_folder)
    if not os.path.exists(self.fail_captcha_folder):
        os.makedirs(self.fail_captcha_folder)
        logger.info('created folder: ' + self.fail_captcha_folder)
    self.headers_template = {
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6',
        'Content-Type': 'application/json',
        'Cookie': self.cookie,
        'Host': 'pet-chain.baidu.com',
        'Origin': 'https://pet-chain.baidu.com',
        'Referer': 'https://pet-chain.baidu.com/chain/dogMarket?appId=1&tpl=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
def login(self):
    if self.has_login:
        return True
    post_body = parse.urlencode({
        'username': self.account.nickname,
        'userpass': self.account.password,
        'login': '******'
    })
    response = yield self.fetch(self.login_url, method=HttpMethod.POST,
                                headers={'Cookie': self.cookie}, body=post_body)
    code = response.code
    page = response.body.decode('gb2312')
    if (code != 200 and code != 302) or page.find('Sign Out') == -1:
        return False
    logger.info('{} {} login success'.format(self.TAG, self.account))
    self.has_login = True
    return True
def get_paused(session_id):
    logger.info("getting paused time for %s" % session_id)
    result = db.session.query(models.Processed).filter(
        models.Processed.session_id == session_id).first()
    total = result.paused_counter
    if result.paused and not result.stopped:
        if total:
            total = datetime.timedelta(seconds=total)
            total += datetime.datetime.now() - result.paused
        else:
            total = datetime.datetime.now() - result.paused
    if not total:
        total = 0
    elif type(total) == datetime.timedelta:
        total = total.total_seconds()
    return int(total)
def create(self, pet_id, price):
    logger.info('creating sell order for dog {0} at price {1}'.format(pet_id, price))
    url = 'https://pet-chain.baidu.com/data/market/sale/shelf/create'
    headers = self.headers_template
    headers['Referer'] = ('https://pet-chain.baidu.com/chain/detail?channel=center&petId='
                          + pet_id + '&appId=1&tpl=')
    data = {
        "petId": pet_id,
        "amount": price,
        "requestId": int(time.time() * 1000),
        "appId": 1,
        "tpl": "",
    }
    r = requests.post(url, headers=headers, data=json.dumps(data))
    response = json.loads(r.content)
    if response['errorNo'] != '00':
        logger.fail('failed to create order: {0}'.format(response['errorMsg']))
    return response
def db_copy(name, local_to_remote=True):
    if local_to_remote:
        src_client = MongoClient()
        src_db = src_client['LaiCiGou']
        src_coll = src_db[name]
        des_db = mongo.db
        des_coll = des_db[name]
    else:
        src_db = mongo.db
        src_coll = src_db[name]
        des_client = MongoClient()
        des_db = des_client['LaiCiGou']
        des_coll = des_db[name]
    last_process_id = get_last_process_id('db_copy')
    if last_process_id:
        # A checkpoint exists: resume from where the last run stopped.
        logger.suc('resuming; last processed id was {0}'.format(last_process_id))
        query = {'_id': {'$gt': ObjectId(last_process_id)}}
        total = src_coll.find(query).sort('_id', pymongo.ASCENDING).count()
        # no_cursor_timeout=True avoids "pymongo.errors.CursorNotFound:
        # Cursor not found, cursor id: xxxxxxxxx" on long-running iterations.
        cursor = src_coll.find(query, no_cursor_timeout=True).sort('_id', pymongo.ASCENDING)
    else:
        # No checkpoint: clear the data and start from scratch.
        logger.warn('no previous checkpoint, clearing data and starting over')
        mongo.breed_info.drop()
        total = src_coll.find({}).sort('_id', pymongo.ASCENDING).count()
        # no_cursor_timeout=True avoids "pymongo.errors.CursorNotFound" on long iterations.
        cursor = src_coll.find({}, no_cursor_timeout=True).sort('_id', pymongo.ASCENDING)
    index = 0
    for doc in cursor:
        index = index + 1
        des_coll.insert(doc)
        insert_update_last_process_id('db_copy', doc['_id'])
        if index % 100 == 0:
            logger.info('{0} documents in total, {1} migrated'.format(total, index))
    logger.info('{0} documents in total, {1} migrated'.format(total, index))
    cursor.close()
def run_train():
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        opt = tf.train.AdamOptimizer(1e-4)
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            # xrange is Python 2 only; range works on both.
            for i in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('tower_%d' % i) as scope:
                        loss = tower_loss(scope, keep_prob=0.5)
                        tf.get_variable_scope().reuse_variables()
                        grads = opt.compute_gradients(loss)
                        tower_grads.append(grads)
        grads = average_gradients(tower_grads)
        train_op = opt.apply_gradients(grads)
        saver = tf.train.Saver(tf.global_variables())
        init = tf.global_variables_initializer()
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=True))
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)
        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                logger.info(format_str % (datetime.now(), step, loss_value,
                                          examples_per_sec, sec_per_batch))
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                logger.info('%s Saving in %s' % (datetime.now(), FLAGS.checkpoint))
                saver.save(sess, FLAGS.checkpoint, global_step=step)
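# For context, a minimal sketch of the average_gradients() helper used above
# (assumed here, not shown in the source; it follows the standard TF1
# multi-GPU pattern of averaging per-variable gradients across towers):
def average_gradients(tower_grads):
    average_grads = []
    # tower_grads is a list (one entry per GPU) of lists of (gradient, variable)
    # pairs; zip(*tower_grads) groups the pairs for the same variable across towers.
    for grad_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # All towers share variables, so any tower's variable handle works.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads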
def count_breed_data():
    results = {}
    index = 0
    total1 = mongo.pet_collection.find().count()
    total = mongo.pet_collection.find({'fatherId': {'$ne': None}}).count()
    # no_cursor_timeout=True avoids "pymongo.errors.CursorNotFound" on long iterations.
    cursor = mongo.pet_collection.find({'fatherId': {'$ne': None}}, no_cursor_timeout=True)
    for pet in cursor:
        index = index + 1
        father = mongo.pet_collection.find_one({'petId': pet['fatherId']})
        mother = mongo.pet_collection.find_one({'petId': pet['motherId']})
        if not father or not mother:
            continue
        key = '{0}-{1}'.format(father['rareAmount'], mother['rareAmount'])
        rare_amount = str(pet['rareAmount'])
        if rare_amount in results:
            has_key = False
            for p in results[rare_amount]:
                if key in p:
                    p[key] = p[key] + 1
                    has_key = True
                    break
            if not has_key:
                results[rare_amount].append({key: 1})
        else:
            results[rare_amount] = [{key: 1}]
        if index % 100 == 0:
            logger.info('{0} dogs in total, {1} processed'.format(total, index))
        if index % 10000 == 0:
            new_results = copy.deepcopy(results)
            new_results['no'] = index / 10000
            mongo.breed_prob_collection.insert(new_results)
    cursor.close()
    new_results = copy.deepcopy(results)
    new_results['no'] = index / 10000
    mongo.breed_prob_collection.insert(new_results)
def create_attributes_int_ids():
    last_process_id = get_last_process_id('create_attributes_int_ids')
    if last_process_id:
        # A checkpoint exists: resume from where the last run stopped.
        logger.suc('resuming; last processed id was {0}'.format(last_process_id))
        query = {'_id': {'$gt': ObjectId(last_process_id)}}
        total = mongo.pets.find(query).sort('_id', pymongo.ASCENDING).count()
        cursor = mongo.pets.find(query, no_cursor_timeout=True).sort('_id', pymongo.ASCENDING)
    else:
        # No checkpoint: clear the data and start from scratch.
        logger.warn('no previous checkpoint, clearing data and starting over')
        total = mongo.pets.find({}).sort('_id', pymongo.ASCENDING).count()
        cursor = mongo.pets.find({}, no_cursor_timeout=True).sort('_id', pymongo.ASCENDING)
    index = 0
    for pet in cursor:
        attributes = pet['attributes']
        aIds = list()
        for attribute in attributes:
            doc = mongo.attributes.find_one(attribute)
            aIds.append(doc['intId'])
        mongo.pets.update_one({'_id': pet['_id']}, {'$set': {'aIds': aIds}}, upsert=False)
        insert_update_last_process_id('create_attributes_int_ids', pet['_id'])
        logger.info('{0} dogs in total, {1} processed'.format(total, index))
        index = index + 1
    logger.info('{0} dogs in total, {1} processed'.format(total, index))
    cursor.close()
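# For reference, a minimal sketch of the checkpoint helpers used by db_copy()
# and create_attributes_int_ids() above (assumed, not shown in the source;
# the collection name 'process_records' is hypothetical):
def get_last_process_id(job_name):
    # Return the last processed _id for this job, or None on a fresh start.
    doc = mongo.db['process_records'].find_one({'job': job_name})
    return doc['last_id'] if doc else None

def insert_update_last_process_id(job_name, last_id):
    # Upsert the checkpoint so an interrupted run can resume later.
    mongo.db['process_records'].update_one(
        {'job': job_name},
        {'$set': {'last_id': str(last_id)}},
        upsert=True)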
def open_product_category_by_clicking_on_active_banner(self, open_in_new_window=False):
    self.hover_over_banner()
    link_name_css = f'a[href*="{TestData.PRODUCT_CATEGORY_PARTIAL_LINK}"]'
    link_to_category = self.CURRENT_ACTIVE_BANNER.find_element_by_css_selector(link_name_css)
    link_to_category_text = link_to_category.get_attribute("href")
    category_name_from_link = link_to_category_text.strip('/').split('/')[-1]
    if open_in_new_window:
        self.open_link_in_new_tab(link_to_category_text)
        self.switch_to_new_window()
    else:
        link_to_category.click()
    expected_title = TestData.PRODUCT_CATEGORIES[category_name_from_link]
    self.wait_title_contains(expected_title)
    logger.info(f'Open page {link_to_category_text}. Current title: {self.get_title()}')
def db_copy(name):
    src_client = MongoClient()
    src_db = src_client['lai_ci_gou']
    src_coll = src_db[name]
    des_db = mongo.db
    des_coll = des_db[name]
    index = 0
    total = src_coll.find().count()
    # no_cursor_timeout=True avoids "pymongo.errors.CursorNotFound" on long iterations.
    cursor = src_coll.find(no_cursor_timeout=True)
    for document in cursor:
        index = index + 1
        des_coll.insert(document)
        if index % 100 == 0:
            logger.info('{0} documents in total, {1} migrated'.format(total, index))
    logger.info('{0} documents in total, {1} migrated'.format(total, index))
    cursor.close()
def check_attributes(self, info):
    for exist_type in self.types:
        flag = True
        for attribute_name in self.attributes_names:
            exist_value = self.get_attribute_value(exist_type['attributes'], attribute_name)
            value = self.get_attribute_value(info['attributes'], attribute_name)
            if exist_value != value:
                flag = False
                break
        if flag:
            exist_type['petIds'].append(info['petId'])
            logger.info('a dog of the same type already exists')
            return
    logger.info('no dog of the same type exists')
    t = {'petIds': [info['petId']], 'attributes': info['attributes']}
    self.types.append(t)
def sign(self):
    url = 'https://pet-chain.baidu.com/data/user/sign'
    headers = self.headers_template
    headers['Referer'] = 'https://pet-chain.baidu.com/chain/personal?appId=1&tpl='
    data = {
        "requestId": int(time.time() * 1000),
        "appId": 1,
        "tpl": "",
    }
    r = requests.post(url, headers=headers, data=json.dumps(data))
    response = json.loads(r.content)
    if response['errorNo'] != '00':
        logger.fail('sign-in failed: {0}'.format(response['errorMsg']))
        # Bail out: error responses carry no 'data' payload.
        return
    info = response['data']
    if info['isSigned']:
        logger.info('signed in: earned {0} micro points, sign-in count {1}, {2} in total'.format(
            info['signAmount'], info['totalSignTimes'], info['totalSignAmount']))
def convert_to_dataframe(self, data_object=None, date_column=None, output=True):
    """ convert loaded dict to dataframe """
    data = data_object if data_object else self._items_dict
    dt_columns = date_column if date_column else ['year', 'month', 'day']
    self._invoices_df = pd.DataFrame.from_dict(data)
    self._invoices_df['invoice_date'] = pd.to_datetime(self._invoices_df[dt_columns].apply(
        lambda row: '-'.join(row.values.astype(str)), axis=1))
    ## sort by date and reset the index
    self._invoices_df.sort_values(by='invoice_date', inplace=True)
    self._invoices_df.reset_index(drop=True, inplace=True)
    logger.info(msg=("{} items converted to time series".format(len(self._invoices_df))))
    if output:
        return self._invoices_df
    return self
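# A minimal usage sketch for the method above (assumed; `InvoiceLoader` stands
# in for whatever class owns convert_to_dataframe() in the source):
loader = InvoiceLoader()
items = {'year': [2019, 2019, 2020],
         'month': [11, 12, 1],
         'day': [5, 17, 3],
         'amount': [120.0, 80.5, 200.0]}
df = loader.convert_to_dataframe(data_object=items)
# df now carries an 'invoice_date' datetime column and is sorted by date.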
def login(self):
    if self.has_login:
        return True
    post_body = parse.urlencode({
        'username': '******',
        'password': '******',
        'remember': 'on'
    })
    headers = {
        'Host': 'bestcoder.hdu.edu.cn',
        'Cookie': self.cookie
    }
    response = yield self.fetch(self.login_url, method=HttpMethod.POST,
                                headers=headers, body=post_body)
    code = response.code
    page = response.body.decode('gb2312')
    if code != 200 and code != 302 or page.find('Logout') == -1:
        return False
    self.has_login = True
    logger.info('{} login success {}'.format(self.TAG, self.account))
    return True
def login(self):
    if self.has_login:
        return True
    post_body = parse.urlencode({
        'user_id1': 'Raychat',
        'password1': '63005610',
        'B1': 'login',
        'url': '/'
    })
    headers = {
        'Cookie': self.cookie
    }
    response = yield self.fetch(self.login_url, method=HttpMethod.POST,
                                body=post_body, headers=headers)
    code = response.code
    page = response.body.decode()
    if code != 200 and code != 302 or page.find('Log Out') == -1:
        return False
    self.has_login = True
    logger.info('{} login success {}'.format(self.TAG, self.account))
    return True
def login(self):
    if self.has_login:
        return True
    post_body = parse.urlencode({
        'username': '******',
        'password': '******'
    })
    headers = {
        'Host': 'acm.hust.edu.cn',
        'Origin': 'http://acm.hust.edu.cn',
        'Referer': 'http://acm.hust.edu.cn/vjudge/index'
    }
    response = yield self.fetch(self.login_url, method=HttpMethod.POST,
                                body=post_body, headers=headers)
    code = response.code
    res = response.body.decode()
    if code != 200 and code != 302 or res != 'success':
        return False
    self.cookie = response.headers['Set-Cookie']
    self.has_login = True
    logger.info('{} login success {}'.format(self.TAG, self.account))
    return True
def send_notification(message):
    auth = tweepy.OAuthHandler("T4NRPcEtUrCEU58FesRmRtkdW",
                               "zmpbytgPpSbro6RZcXsKgYQoz24zLH3vYZHOHAAs5j33P4eoRg")
    auth.set_access_token(config.TWITTER_ACCESS_TOKEN, config.TWITTER_ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    try:
        api.auth.get_username()
    except Exception:
        logger.error(u"check your twitter credentials!")
        return False
    logger.info(u"sending notification to twitter: %s" % message)
    if config.TWITTER_USE_DM:
        status = api.send_direct_message(user=config.TWITTER_DM_USER, text=message)
    else:
        status = api.update_status(status=message)
    if status:
        logger.info(u"Notification to twitter successfully sent: %s" % status.text)
        return True
    else:
        logger.error(u"unable to send twitter notification: %s" % status)
        return False
def ponder():
    """
    Fetch tweets from the Muses
    and memorize them; i.e. train
    classifier or Markov on them.
    """
    logger.info("Pondering new twitter data...")

    # Each of these are just a list
    # of tweets as strings.
    pos = []
    neg = []

    # For the good muses...
    for muse in Muse.objects(negative=False):
        pos += _process_muse(muse)

    # For the evil muses...
    for muse in Muse.objects(negative=True):
        neg += _process_muse(muse)

    # Extract the tweet contents into lists.
    pos_txts = _get_tweet_texts(pos)
    neg_txts = _get_tweet_texts(neg)

    # Construct training matrix and labels.
    labels = [1 for i in range(len(pos_txts))] + [0 for i in range(len(neg_txts))]

    # See if there's anything to retweet.
    _consider_retweets(pos)

    # Combine the new tweets.
    new_tweets = pos_txts + neg_txts

    # Update the classifier and markov.
    logger.info("Collected %s new tweets, training..." % len(new_tweets))
    CLS.train(new_tweets, labels)
    MKV.train(pos_txts)
def consider():
    """
    Decide whether or not to act (tweet).
    """
    logger.info("Considering tweeting...")
    roll = random.random()
    chance = config().chance_to_act
    if roll < chance:
        logger.info("Rolled %s, chance to act is %s, tweeting." % (roll, chance))
        twitter.tweet(MKV.generate())
    else:
        logger.info("Rolled %s, chance to act is %s, NOT tweeting." % (roll, chance))
def get_unnotified():
    logger.info(u"getting unnotified entries from database")
    result = db.session.query(models.Processed).filter(
        db.or_(models.Processed.notified == None, models.Processed.notified != 1)).all()
    return result
def task():
    p = plex.Server(config.PMS_HOST, config.PMS_PORT)
    live = p.currentlyPlaying()
    started = get_started()
    playing = dict()
    recentlyAdded = p.recentlyAdded()
    if len(recentlyAdded):
        logger.debug("processing recently added media")
        for x in recentlyAdded:
            check = db.session.query(models.RecentlyAdded).filter(
                models.RecentlyAdded.item_id == x.get("ratingKey")).first()
            if check:
                logger.debug("already notified for recently added '%s'" % check.title)
                continue
            # "episode" was misspelled "epsiode" in the original, so that branch never matched.
            if x.get("type") == "season" or x.get("type") == "episode":
                fullseason = p.episodes(x.get("ratingKey"))
                for ep in fullseason:
                    if x.get("addedAt") == ep.get("addedAt"):
                        xml = p.getInfo(ep.get("ratingKey")).find("Video")
            else:
                xml = p.getInfo(x.get('ratingKey')).find("Video")
            if not xml:
                logger.error("error loading xml for recently added entry")
                continue
            info = info_from_xml(xml, "recentlyadded", 1, 1, 0)
            info["added"] = datetime.datetime.fromtimestamp(float(x.get("addedAt"))).strftime("%Y-%m-%d %H:%M")
            if notify(info):
                logger.info(u"adding %s to recently added table" % info["title"])
                new = models.RecentlyAdded()
                new.item_id = x.get("ratingKey")
                new.time = datetime.datetime.now()
                new.filename = xml.find("Media").find("Part").get("file")
                new.title = info["title"]
                new.debug = "%s" % info
                db.session.merge(new)
                db.session.commit()
    else:
        logger.debug("nothing was recently added")

    if live and not len(live):
        logger.debug("seems like nothing is currently played")
    for session in live:
        #logger.debug(session.tostring())
        userID = session.find('User').get('id')
        if not userID:
            userID = "Local"
        db_key = "%(id)s_%(key)s_%(userid)s" % {
            "id": session.get('sessionKey'),
            "key": session.get('key'),
            "userid": userID
        }
        playing[db_key] = 1
    logger.debug(playing)

    did_unnotify = 0
    un_done = get_unnotified()
    if un_done:
        logger.debug("processing unnotified entries from database")
        for k in un_done:
            start_epoch = k.time
            stop_epoch = k.stopped
            if not stop_epoch:
                stop_epoch = datetime.datetime.now()
            ntype = "stop"
            if k.session_id in playing:
                ntype = "start"
            paused = get_paused(k.session_id)
            info = info_from_xml(k.xml, ntype, start_epoch, stop_epoch, paused)
            logger.debug(info)
            logger.debug("sending notification for: %s : %s" % (info["user"], info["orig_title_ep"]))
            #TODO: fix this.... for now just dont notify again!
            if notify(info):
                k.notified = 1
                # make sure we have a stop time if we are not playing this anymore!
                if ntype == "stop":
                    k.stopped = stop_epoch
                    k.progress = int(info["percent_complete"])
                db.session.commit()
                set_notified(k.session_id)
            did_unnotify = 1
    else:
        logger.info("nothing found to send (new) notifications for")
        did_unnotify = 1

    ## notify stopped
    ## redo this! currently everything started is set to stopped?
    if did_unnotify:
        logger.info("processing recently started entries from db and checking for stopped")
        #started = get_started()
        for k in started:
            logger.debug("checking if %s is still in playing list" % k.session_id)
            if not k.session_id in playing:
                logger.debug("%s is stopped!" % k.session_id)
                start_epoch = k.time
                stop_epoch = datetime.datetime.now()
                xml = ET.fromstring(k.xml)
                xml.find("Player").set('state', 'stopped')
                process_update(xml, k.session_id)
                paused = get_sec_paused(k.session_id)
                info = info_from_xml(k.xml, "stop", start_epoch, stop_epoch, paused)
                k.stopped = datetime.datetime.now()
                k.paused = None
                k.notified = 0
                #set_stopped(started[k.session_id, stop_epoch)
                #https://github.com/ljunkie/plexWatch/blob/master/plexWatch.pl#L552
                info["decoded"] = 1
                if notify(info):
                    k.notified = 1
                k.progress = info['percent_complete']
                db.session.merge(k)
                db.session.commit()

    ## notify start/now playing
    logger.debug("processing live content")
    was_started = dict()
    for k in live:
        if k.get('librarySectionID') in config.EXCLUDE_SECTIONS:
            logger.info("Watching something from section: %s which is in EXCLUDE_SECTIONS: %s"
                        % (k.get('librarySectionID'), config.EXCLUDE_SECTIONS))
            continue
        if k.get('type') == "clip":
            logger.info("Skipping video clips like trailers, specials, scenes, interviews etc...")
            continue
        start_epoch = datetime.datetime.now()
        stop_epoch = None  # not stopped yet
        xml_string = ET.tostring(k)
        info = info_from_xml(k, "start", start_epoch, stop_epoch, 0)
        info["decoded"] = 1
        # logger.debug(info)
        userID = info["userID"]
        if not userID:
            userID = "Local"
        db_key = "%(id)s_%(key)s_%(userid)s" % {
            "id": k.get('sessionKey'),
            "key": k.get('key'),
            "userid": userID
        }
        logger.debug("plex returned a live element: %s " % db_key)

        ## ignore content that has already been notified
        #TODO: get_started should return a dict accessible by db_key
        # so we can check: if x in started: check for change, if not mark as started now
        # first go through all started stuff and check for status change
        if started:
            logger.debug("we still have not-stopped entries in our database, checking for matches")
            for x in started:
                logger.debug("checking if db entry '%s' is in live content" % x.session_id)
                state_change = False
                if x.session_id == db_key:
                    logger.debug("that was a match! check for status changes")
                    # already in database, only check for status changes!
                    state_change = process_update(k, db_key)
                    was_started[db_key] = x
                    if state_change:
                        info["ntype"] = state_change
                        logger.debug("%s: %s: state changed [%s] notify called"
                                     % (info["user"], info["title"], info["state"]))
                        notify(info)
        else:
            logger.debug("all entries in our database have been set to stopped")

        # also check if there is an element in the db which may be a resumed play
        # from up to 24 hours ago
        if not db_key in was_started:
            logger.debug("trying to search for similar plays which stopped in the last 24 hours")
            view_offset = k.get("viewOffset")
            max_time = datetime.datetime.now() - datetime.timedelta(hours=24)
            like_what = "%" + k.get('key') + "_" + userID
            restarted = db.session.query(models.Processed).filter(
                models.Processed.session_id.like(like_what)).filter(
                models.Processed.time > max_time).filter(
                models.Processed.view_offset <= view_offset).filter(
                models.Processed.stopped != None).first()
            if restarted:
                logger.debug("seems like someone resumed a stopped play, updating db key from %s to %s"
                             % (restarted.session_id, db_key))
                restarted.session_id = db_key
                restarted.stopped = None
                db.session.commit()
                state_change = process_update(k, db_key)
                was_started[db_key] = restarted
                info["ntype"] = "resume"
                notify(info)
            else:
                # if still not processed by now, it's a new play!
                logger.debug("we got those entries which already were in the database: %s " % was_started)
                logger.info("seems like this is a new entry: %s" % db_key)
                # unnotified: insert into db and notify
                process_start(xml_string, db_key, info)
                if notify(info):
                    set_notified(db_key)
def notify(info):
    if "orig_user" in info and info["orig_user"] in config.EXCLUDE_USERS:
        logger.info("'%s' is set as an EXCLUDE_USER, i'm not sending a notification!" % info["orig_user"])
        return True

    # notify all providers with the given stuff...
    # Initialize message so a KeyError below doesn't leave it unbound.
    message = False
    if info["ntype"] == "recentlyadded" and config.NOTIFY_RECENTLYADDED:
        try:
            message = config.RECENTLYADDED_MESSAGE % info
        except KeyError:
            logger.error("Unable to map info to your recently added notification string. Please check your settings!")
    elif info["ntype"] == "start" and config.NOTIFY_START:
        try:
            message = config.START_MESSAGE % info
        except KeyError:
            logger.error("Unable to map info to your start notification string. Please check your settings!")
    elif info["ntype"] == "stop" and config.NOTIFY_STOP:
        try:
            message = config.STOP_MESSAGE % info
        except KeyError:
            logger.error("Unable to map info to your stop notification string. Please check your settings!")
    elif info["ntype"] == "pause" and config.NOTIFY_PAUSE:
        try:
            message = config.PAUSE_MESSAGE % info
        except KeyError:
            logger.error("Unable to map info to your pause notification string. Please check your settings!")
    elif info["ntype"] == "resume" and config.NOTIFY_RESUME:
        try:
            message = config.RESUME_MESSAGE % info
        except KeyError:
            logger.error("Unable to map info to your resume notification string. Please check your settings!")
    elif info["ntype"] == "test":
        message = "plexivity notification test"

    status = False
    if message and config.USE_PPSCRIPTS:
        from app.providers import scripts
        scripts.run_scripts(info, message)

    if message:
        # only log notify args if it actually calls notify!
        logger.debug("notify called with args: %s" % info)
        if config.NOTIFY_PUSHOVER:
            from app.providers import pushover
            status = pushover.send_notification(message)
        if config.NOTIFY_PUSHBULLET:
            from app.providers import pushbullet
            status = pushbullet.send_notification(message)
        if config.NOTIFY_MAIL:
            from app.providers import mail
            status = mail.send_notification(message)
        if config.NOTIFY_BOXCAR:
            from app.providers import boxcar
            status = boxcar.send_notification(message)
        if config.NOTIFY_TWITTER:
            from app.providers import twitter
            status = twitter.send_notification(message)
        return status
    return False
import settings
import time
from tornado import ioloop
from app import account_producer, main, spider_init
from app.logger import setup_logger, logger
from app.redis_client import setup_redis

if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() is the replacement.
    start = time.perf_counter()
    setup_logger(settings.log_level, settings.log_dir)
    logger.info('[ACM-Spider] initializing ......')
    setup_redis()
    # spider_init()
    io_loop = ioloop.IOLoop().current()
    # io_loop.spawn_callback(account_producer)
    io_loop.run_sync(main)
    print('used time {}'.format(time.perf_counter() - start))
def get_started():
    logger.info(u"getting recently started entries from database")
    result = db.session.query(models.Processed).filter(
        models.Processed.time != None).filter(models.Processed.stopped == None).all()
    return result