def query_video_list(request):
    """
    Query video list by title <-- generated regex

    :param request: HTTP request; GET params: ``name`` (space-separated
        keywords), ``n`` (page size, default 30), ``p`` (page index,
        default 1), ``like`` ("yes" restricts to liked videos).
    :return: list of matching video_info documents, newest first.
    """
    query_keyword = request.GET["name"]
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    like_video = get_request_with_default(request, "like", "no") == "yes"
    if like_video:
        like_condition = {"$eq": True}
    else:
        # Unrated docs may lack "like" or hold False; $ne None matches both.
        like_condition = {"$ne": None}
    assert page_size >= 1
    assert page_index >= 1
    # FIX: escape each keyword so user input such as "c++" or "a.b" cannot
    # inject regex metacharacters (or make the server-side regex invalid).
    import re
    pattern = "(%s)" % "|".join(
        re.escape(word) for word in query_keyword.split(" "))
    with MongoDBDatabase("website_pron") as mgdb:
        query_res = mgdb.get_collection("video_info").find({
            "name": {"$regex": pattern},
            "like": like_condition
        }).sort("_id", -1).skip((page_index - 1) * page_size).limit(page_size)
        # Materialize inside the connection context so the cursor is valid.
        return list(query_res)
def main(self):
    """Worker loop: claim URLs from the queue, scrape them, enqueue related URLs.

    Drives a simple work-queue state machine over four MongoDB collections:
    ``*_queue`` (waiting), ``*_running`` (in flight), ``*_deleted``
    (URLs whose fetch raised a "deleted" HTTP error), ``*_storage``
    (finished documents).  Exits after ``failed_time_limit`` consecutive
    empty polls of the queue.
    """
    # Progress marker; presumably read by a monitoring thread — confirm.
    self.last_op = "Initialized."
    with MongoDBDatabase(self.db_name) as mongoDB:
        failed_times = 0  # consecutive empty-queue polls
        waitingColl = mongoDB.get_collection(self.coll_prefix + "_queue")
        runningColl = mongoDB.get_collection(self.coll_prefix + "_running")
        deletedColl = mongoDB.get_collection(self.coll_prefix + "_deleted")
        storageColl = mongoDB.get_collection(self.coll_prefix + "_storage")
        # Collections checked before re-queueing a related URL (dedup set).
        coll_list = (waitingColl, deletedColl, runningColl, storageColl)
        while True:
            if failed_times >= failed_time_limit:
                print("Failed too much, ended: " + self.name)
                break
            # Atomically claim one task; None means the queue is empty.
            task = waitingColl.find_one_and_delete({})
            self.last_op = "Got a task."
            if task is None:
                failed_times += 1
                print("Failed: " + self.name)
                self.last_op = "Fail sleeping."
                time.sleep(2)
            else:
                url = task["_id"]
                try:
                    # Mark in-flight so other workers skip this URL.
                    runningColl.insert_one({"_id": url})
                    if storageColl.find_one({"_id": url}, {"_id": 1}) is None:
                        self.last_op = "Querying."
                        doc_storage = query_url(url)
                        self.last_op = "Queried."
                        # "related" is queued separately, not stored.
                        relatedURLs = doc_storage.pop("related")
                        runningColl.delete_one({"_id": url})
                        storageColl.insert_one(doc_storage)
                        self.last_op = "Inserted, appending."
                        # NOTE(review): this loop rebinds ``task`` (the claimed
                        # queue doc) to each related URL string.
                        for task in relatedURLs:
                            # Strip the query string so one page is queued once.
                            condition_doc = {"_id": task.split("?")[0]}
                            if all((coll.find_one(condition_doc,
                                                  {"_id": 1}) is None
                                    for coll in coll_list)):
                                try:
                                    waitingColl.insert_one(condition_doc)
                                except:
                                    # Presumably a duplicate-key race with
                                    # another worker; deliberately best-effort.
                                    pass
                    else:
                        # Already stored by someone else; drop this task.
                        print("DUMP : %s" % url)
                    failed_times = 0
                except Exception as ex:
                    if deleted_error(ex):
                        print("Found a HTTP Error URL:" + url)
                        try:
                            runningColl.delete_one({"_id": url})
                            # Remember dead URLs so they are never re-queued.
                            deletedColl.insert_one({"_id": url})
                        except:
                            print("Error to process deleted url.")
                    else:
                        # Transient failure: put the URL back in the queue.
                        print("Error to process: %s" % url)
                        print(traceback.format_exc())
                        waitingColl.insert_one({"_id": url})
                        runningColl.delete_one({"_id": url})
                        failed_times += 1
def get_images_info(request):
    """
    Fetch one image-page document by its index.

    :param request: HTTP request; GET param "id" is the page index.
    :return: the matching images_info_ahash_weed document (None if absent).
    """
    page_id = int(request.GET["id"])
    with MongoDBDatabase("website_pron") as mgdb:
        collection = mgdb.get_collection("images_info_ahash_weed")
        return collection.find_one({"_id": page_id})
def set_novel_like(request):
    """
    Set a novel's "like" flag from a POST request.

    :param request: HTTP POST with "id" (novel _id) and "like" ("true"/"false").
    :return: {"status": "success"} unconditionally.
    """
    target_id = int(request.POST['id'])
    liked = request.POST["like"] == "true"
    with MongoDBDatabase("website_pron") as mongo_conn:
        mongo_conn.get_collection("novels").update_one(
            {"_id": target_id}, {"$set": {"like": liked}})
    return {"status": "success"}
def query_video(requset):
    """
    Query video info by id.

    :param requset: HTTP request (name kept as-is: it is the public
        parameter name, apparently a typo for "request").
    :return: the video_info document whose _id equals the "id" GET param.
    """
    target_id = int(requset.GET["id"])
    with MongoDBDatabase("website_pron") as mgdb:
        cursor = mgdb.get_collection("video_info").find(
            {"_id": {"$eq": target_id}})
        # next() raises StopIteration when nothing matches, exactly like
        # the original cursor.next() call.
        return next(cursor)
def get_novel(request):
    """
    Fetch a novel's metadata plus its full text.

    :param request: HTTP request; GET param "id" is the novel index.
    :return: the novel document with "novel_text" filled in from disk.
    """
    novel_id = int(request.GET["id"])
    with MongoDBDatabase("website_pron") as mgdb:
        document = mgdb.get_collection("novels").find_one(
            {"_id": {"$eq": novel_id}})
        assert document is not None
        # The novel body lives on disk, not in MongoDB.
        document["novel_text"] = read_raw(local_novel_path_gen % novel_id)
        return document
def query_novel_by_condition(request):
    """
    Query a page of novels using a caller-supplied JSON filter.

    SECURITY NOTE(review): "condition" is deserialized and passed verbatim
    to MongoDB find(), so the client controls the entire filter (including
    operators such as $where). Acceptable only on a trusted/internal
    endpoint — confirm.

    :param request: HTTP request; GET params "condition" (JSON filter),
        "n" (page size, default 30), "p" (page index, default 1).
    :return: matching novel documents, sorted by words_count descending.
    """
    filter_doc = json.loads(request.GET["condition"])
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    assert page_size >= 1
    assert page_index >= 1
    skip_count = (page_index - 1) * page_size
    with MongoDBDatabase("website_pron") as mgdb:
        cursor = (mgdb.get_collection("novels")
                  .find(filter_doc)
                  .sort("words_count", -1)
                  .skip(skip_count)
                  .limit(page_size))
        return list(cursor)
def run(self):
    """Scraper worker loop: claim URLs from the queue, parse the page,
    store metadata, and enqueue related video links.

    Uses three MongoDB collections as a work queue: ``*_queue`` (waiting),
    ``*_running`` (in flight), ``*_storage`` (done).  Stops after
    ``failed_time_limit`` consecutive empty polls.
    """
    with MongoDBDatabase(self.db_name) as mongoDB:
        failed_times = 0  # consecutive empty-queue polls
        waitingColl = mongoDB.get_collection(self.coll_prefix + "_queue")
        runningColl = mongoDB.get_collection(self.coll_prefix + "_running")
        storageColl = mongoDB.get_collection(self.coll_prefix + "_storage")
        # Collections checked before re-queueing a related URL (dedup set).
        coll_list = (waitingColl, runningColl, storageColl)
        while True:
            if failed_times >= failed_time_limit:
                break
            # Atomically claim one task; None means the queue is empty.
            task = waitingColl.find_one_and_delete({})
            if task is None:
                failed_times += 1
                time.sleep(2)
            else:
                url = task["_id"]
                try:
                    # Mark in-flight so other workers skip this URL.
                    runningColl.insert_one({"_id": url})
                    _, sp = getSoup(url)
                    relatedURLs = getRelatedVideosLink(sp)
                    if storageColl.find_one({"_id": url}, {"_id": 1}) is None:
                        # Scrape all metadata fields from the parsed page.
                        doc_storage = {
                            "_id": url,
                            "title": getTitleFromSoup(sp),
                            "label": getCategories(sp),
                            "rate": getRating(sp),
                            "duration": getTime(sp),
                            "preview": getPreviewImageList(sp),
                            "preview_all": getAllPreviewImageList(sp)
                        }
                        storageColl.insert_one(doc_storage)
                        runningColl.delete_one({"_id": url})
                        print("%s done." % url)
                        # NOTE(review): rebinds ``task`` to each related URL.
                        for task in relatedURLs:
                            # Strip the query string so one page is queued once.
                            condition_doc = {"_id": task.split("?")[0]}
                            if all((coll.find_one(condition_doc,
                                                  {"_id": 1}) is None
                                    for coll in coll_list)):
                                waitingColl.insert_one(condition_doc)
                    else:
                        # Already stored elsewhere; drop this task.
                        print("%s dumped" % url)
                    failed_times = 0
                except:
                    # Any failure: put the URL back in the queue and retry.
                    # NOTE(review): the URL is left in *_running here; the
                    # startup script appears to drain that collection.
                    print("%s download error, excepted." % url)
                    waitingColl.insert_one({"_id": url})
                    failed_times += 1
def query_images_list(request):
    """
    Query images list via title<--generated regex

    :param request: HTTP request; GET params "key_words" (space-separated
        keywords), "n" (page size, default 30), "p" (page index, default 1).
    :return: list of matching images_info documents, newest first.
    """
    query_keyword = request.GET["key_words"]
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    assert page_size >= 1
    assert page_index >= 1
    # FIX: escape keywords before building the alternation so regex
    # metacharacters in user input cannot break or widen the query.
    import re
    pattern = "(%s)" % "|".join(
        re.escape(word) for word in query_keyword.split(" "))
    with MongoDBDatabase("website_pron") as mgdb:
        query_res = mgdb.get_collection("images_info").find({
            "title": {"$regex": pattern}
        }).sort("_id", -1).skip((page_index - 1) * page_size).limit(page_size)
        # Materialize inside the connection context so the cursor is valid.
        return list(query_res)
def remove_novel(request):
    """
    Delete a novel document and move its text file to the trash folder.

    :param request: HTTP POST with "id" (novel index).
    :return: {"status": "success"} if the document was deleted,
        {"status": "error"} otherwise.
    """
    novel_id = int(request.POST["id"])
    with MongoDBDatabase("website_pron") as mgdb:
        collection = mgdb.get_collection("novels")
        condition = {"_id": {"$eq": novel_id}}
        doc = collection.find_one(condition)
        assert doc is not None, "query failed"
        removed = collection.delete_one(condition).deleted_count
        # Keep the text file, renamed by title, in the trash directory.
        source_file = local_novel_path_gen % novel_id
        shutil.move(source_file, trash_novel_path_gen % doc["title"])
        return {"status": "success"} if removed > 0 else {"status": "error"}
def set_images_like(request):
    """
    Set images like or discard

    :param request: HTTP POST with "id" (image page _id) and "like"
        ("true"/"false").
    :return: {"status": "success"} when a document with the given id
        exists, {"status": "error"} otherwise.
    """
    video_id = int(request.POST['id'])
    is_like = request.POST["like"] == "true"
    with MongoDBDatabase("website_pron") as mongo_conn:
        collection = mongo_conn.get_collection("images_info")
        condition = {"_id": {"$eq": video_id}}
        # FIX: the original read .modified_count into a variable named
        # matched_count, so re-sending an unchanged value (e.g. liking an
        # already-liked image) reported "error" even though the document
        # exists. matched_count reflects whether the id was found, which
        # is what the variable name and the status contract intend.
        matched_count = collection.update_one(condition, {
            "$set": {
                "like": is_like
            }
        }).matched_count
        if matched_count > 0:
            return {"status": "success"}
        else:
            return {"status": "error"}
def migrate():
    """
    Copy every images_info document into images_info_ahash_weed, uploading
    each local image file to weed-fs and replacing the document's file list
    with weed fid-based paths.
    """
    with MongoDBDatabase("website_pron") as mgdb:
        source_coll = mgdb.get_collection("images_info")
        target_coll = mgdb.get_collection("images_info_ahash_weed")
        total_size = source_coll.count()
        for index, doc in enumerate(source_coll.find({})):
            # FIX: typo "prcoessing" -> "processing" in the progress line.
            print("{}/{} is processing.\r".format(index, total_size))
            image_list_index = doc["_id"]
            image_dir = local_image_list_path % {
                "page_index": int(image_list_index)
            }
            # Upload every file in the page directory; failed uploads are
            # skipped (insert_image_to_weed returns None on error).
            # Renamed loop variable so it no longer shadows builtin `file`.
            hash_list = [
                hash_res["weed_fid"].replace(",", "/") + "/" +
                hash_res["_id"] + "." + hash_res["file_type"]
                for hash_res in (
                    insert_image_to_weed(os.path.join(image_dir, file_name))
                    for file_name in os.listdir(image_dir))
                if hash_res is not None
            ]
            insert_doc = doc  # alias: mutating doc in place is fine here
            insert_doc["image_list"] = hash_list
            target_coll.insert_one(insert_doc)
def remove_video(request):
    """
    Remove video from mongoDB and hard-drive

    Backs up the metadata document as JSON, deletes the DB record, moves
    the video file to the trash path, and removes its shortcut image.

    :param request: HTTP POST with "id" (video _id).
    :return: {"status": "success"} on completion.
    :raises Exception: "Given id not found" when no document matches.
    """
    remove_id = int(request.POST['id'])
    with MongoDBDatabase("website_pron") as mongo_conn:
        collection = mongo_conn.get_collection("video_info")
        condition = {"_id": {"$eq": remove_id}}
        # FIX: find_one replaces the deprecated cursor.count()/next() pair
        # (Cursor.count was removed in PyMongo 4); same lookup, same
        # exception contract when the id is missing.
        doc = collection.find_one(condition)
        if doc is None:
            raise Exception("Given id not found")
        # Keep a JSON backup of the metadata before deleting anything.
        with open(trash_video_info % remove_id, "w") as fp:
            json.dump(doc, fp)
        collection.delete_one(condition)
        shutil.move(video_saving_path % remove_id,
                    trash_video_file % remove_id)
        os.remove(shortcuts_saving_path % remove_id)
    return {"status": "success"}
def get_images_info(request):
    """
    Fetch an image-page document and attach the URLs of its local files.

    :param request: HTTP request; GET param "id" is the page index.
    :return: the images_info document with an added "images" URL list.
    """
    page_id = int(request.GET["id"])
    images_dir = local_image_list_path % {"page_index": page_id}
    # Build a URL for every directory entry accepted by filter_images.
    image_list = [
        image_url_template % {
            "page_index": page_id,
            "image_filename": entry
        }
        for entry in os.listdir(images_dir) if filter_images(entry)
    ]
    with MongoDBDatabase("website_pron") as mgdb:
        doc = mgdb.get_collection("images_info").find_one({"_id": page_id})
        doc["images"] = image_list
        return doc
def remove_images(request):
    """
    Delete an image-page document and move its directory to the trash.

    A copy of the metadata is written as _info.json inside the trashed
    directory for possible manual restore.

    :param request: HTTP POST with "id" (image page index).
    :return: {"status": "success"} if the document was deleted,
        {"status": "error"} otherwise.
    """
    page_id = int(request.POST["id"])
    with MongoDBDatabase("website_pron") as mgdb:
        collection = mgdb.get_collection("images_info")
        condition = {"_id": {"$eq": page_id}}
        doc = collection.find_one(condition)
        assert doc is not None, "query failed"
        removed = collection.delete_one(condition).deleted_count
        source_dir = local_image_list_path % {"page_index": page_id}
        target_dir = trash_image_path_gen % {"page_index": page_id}
        shutil.move(source_dir, target_dir)
        with open(os.path.join(target_dir, "_info.json"), "w") as fp:
            json.dump(doc, fp)
        return {"status": "success"} if removed > 0 else {"status": "error"}
def query_novel_by_title(request):
    """
    Query novel list by title <-- generated regex

    :param request: HTTP request; GET params "query" (space-separated
        keywords), "block" (JSON list of novel_type values to exclude),
        "n" (page size, default 30), "p" (page index, default 1).
    :return: matching novels sorted by words_count descending.
    """
    query_keyword = request.GET["query"]
    block_settings = json.loads(
        get_request_with_default(request, "block", "[]"))
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    assert page_size >= 1
    assert page_index >= 1
    # FIX: escape keywords before building the alternation so regex
    # metacharacters in user input cannot break or widen the query.
    import re
    pattern = "(%s)" % "|".join(
        re.escape(word) for word in query_keyword.split(" "))
    with MongoDBDatabase("website_pron") as mgdb:
        query_res = mgdb.get_collection("novels").find({
            "title": {"$regex": pattern},
            "novel_type": {"$nin": block_settings}
        }).sort("words_count", -1).skip(
            (page_index - 1) * page_size).limit(page_size)
        # Materialize inside the connection context so the cursor is valid.
        return list(query_res)
def get_nearby_images(request):
    """
    Query next/last images block info by given id and keywords

    :param request: HTTP request; GET params "id" (current page _id),
        "key_words" (optional space-separated keywords), "dir"
        ("next"/anything else for previous).
    :return: the nearest matching images_info document in the requested
        direction (raises StopIteration if none exists, as before).
    """
    current_id = int(request.GET["id"])
    keywords = get_request_with_default(request, "key_words", "")
    forward = request.GET["dir"].upper() == "NEXT"
    if forward:
        query_dir_condition = {"$gt": current_id}
        sort_flip = 1
    else:
        query_dir_condition = {"$lt": current_id}
        sort_flip = -1
    # FIX: escape keywords so regex metacharacters in user input cannot
    # break the query. An empty keyword string still yields "()" which
    # matches everything, preserving the original default behavior.
    import re
    pattern = "(%s)" % "|".join(
        re.escape(word) for word in keywords.split(" "))
    with MongoDBDatabase("website_pron") as mgdb:
        cursor = mgdb.get_collection("images_info").find({
            "title": {"$regex": pattern},
            "_id": query_dir_condition
        }).sort("_id", sort_flip)
        return next(cursor)
def insert_image_to_weed(file: str,
                         remove_after_insert: bool = False,
                         silence: bool = True):
    """
    Upload an image to weed-fs, deduplicating by content hash.

    Looks the file's hash up in the image_hash_pool collection; if already
    present, returns the stored record instead of re-uploading.

    NOTE(review): ``remove_after_insert`` and ``silence`` are currently
    unused — kept for interface compatibility; confirm intent.

    :param file: path to the image file.
    :return: dict with "_id" (hash), "weed_fid" and "file_type", or None
        on any failure (best-effort contract).
    """
    try:
        file_hash = hash_algorithm(file)
        file_type = re_find_tail.findall(file)[0]
        with MongoDBDatabase("website_pron") as mongodb:
            coll = mongodb.get_collection("image_hash_pool")
            return_data = coll.find_one({"_id": file_hash})
            if return_data is not None:
                # Already uploaded: reuse the stored weed fid.
                return return_data
            weed_fs = WeedFS("192.168.1.103")
            file_id = weed_fs.upload_file(file)
            insert_info = {
                "_id": file_hash,
                "weed_fid": file_id,
                "file_type": file_type
            }
            coll.insert_one(insert_info)
            return insert_info
    except Exception:
        # FIX: narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate; the None-on-failure contract is unchanged.
        print(traceback.format_exc(), file=sys.stderr)
        return None
deletedColl.insert_one({"_id": url}) except: print("Error to process deleted url.") else: print("Error to process: %s" % url) print(traceback.format_exc()) waitingColl.insert_one({"_id": url}) runningColl.delete_one({"_id": url}) failed_times += 1 if __name__ == '__main__': dbName = "spider" collPrefix = "xhamster" with MongoDBDatabase(dbName) as mgdb: for res in mgdb.get_collection(collPrefix + "_running").find({}): try: mgdb.get_collection(collPrefix + "_queue").insert(res) except: pass mgdb.get_collection(collPrefix + "_running").drop() with MongoDBCollection(dbName, collPrefix + "_queue") as coll: if coll.count() <= 0: for i in range(3): for url in get_top_urls(i + 1): try: coll.insert({"_id": url}) except: print("Error while inserting " + url)