Ejemplo n.º 1
0
def query_video_list(request):
    """
    Query one page of videos whose name contains any of the given keywords.

    Values read from the request:

    * ``request.GET["name"]`` -- whitespace separated keywords (required)
    * ``n`` -- page size, fetched with default ``"30"``
    * ``p`` -- 1-based page index, fetched with default ``"1"``
    * ``like`` -- ``"yes"`` restricts the result to documents whose
      ``like`` field equals ``True``; any other value only requires
      ``like`` to be different from ``None``

    :param request: HTTP request
    :return: list of matching documents sorted by ``_id`` descending
    """
    # Local import keeps this handler self-contained.
    import re

    query_keyword = request.GET["name"]
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    like_video = get_request_with_default(request, "like", "no") == "yes"
    like_condition = {"$eq": True} if like_video else {"$ne": None}

    assert page_size >= 1
    assert page_index >= 1

    # split() without arguments drops empty tokens, so repeated spaces no
    # longer create an empty alternative that matches every document, and
    # re.escape() keeps user-typed metacharacters literal.
    name_pattern = "(%s)" % "|".join(
        re.escape(word) for word in query_keyword.split())

    with MongoDBDatabase("website_pron") as mgdb:
        query_res = mgdb.get_collection("video_info").find({
            "name": {"$regex": name_pattern},
            "like": like_condition,
        }).sort("_id", -1).skip((page_index - 1) * page_size).limit(page_size)
        return list(query_res)
Ejemplo n.º 2
0
 def main(self):
     """
     Run the crawl worker loop until too many consecutive failures occur.

     Each iteration pops one URL from the ``<prefix>_queue`` collection and
     records it in ``<prefix>_running`` while it is processed.  If the URL
     is not yet in ``<prefix>_storage`` the result of ``query_url`` is
     stored there and every related URL (query string stripped) that is
     absent from all four collections is put on the queue.  URLs for which
     ``deleted_error`` is true are moved to ``<prefix>_deleted``; any other
     failure puts the URL back on the queue.

     ``self.last_op`` is updated with a short status string along the way.
     The loop ends once the consecutive failure counter reaches
     ``failed_time_limit``.
     """
     self.last_op = "Initialized."
     with MongoDBDatabase(self.db_name) as mongoDB:
         failed_times = 0
         waitingColl = mongoDB.get_collection(self.coll_prefix + "_queue")
         runningColl = mongoDB.get_collection(self.coll_prefix + "_running")
         deletedColl = mongoDB.get_collection(self.coll_prefix + "_deleted")
         storageColl = mongoDB.get_collection(self.coll_prefix + "_storage")
         coll_list = (waitingColl, deletedColl, runningColl, storageColl)
         while True:
             if failed_times >= failed_time_limit:
                 print("Failed too much, ended: " + self.name)
                 break
             task = waitingColl.find_one_and_delete({})
             self.last_op = "Got a task."
             if task is None:
                 # Empty queue counts as a failure; back off before retrying.
                 failed_times += 1
                 print("Failed: " + self.name)
                 self.last_op = "Fail sleeping."
                 time.sleep(2)
             else:
                 url = task["_id"]
                 try:
                     runningColl.insert_one({"_id": url})
                     if storageColl.find_one({"_id": url},
                                             {"_id": 1}) is None:
                         self.last_op = "Querying."
                         doc_storage = query_url(url)
                         self.last_op = "Queried."
                         relatedURLs = doc_storage.pop("related")
                         runningColl.delete_one({"_id": url})
                         storageColl.insert_one(doc_storage)
                         self.last_op = "Inserted, appending."
                         for related_url in relatedURLs:
                             condition_doc = {
                                 "_id": related_url.split("?")[0]
                             }
                             if all((coll.find_one(condition_doc,
                                                   {"_id": 1}) is None
                                     for coll in coll_list)):
                                 try:
                                     waitingColl.insert_one(condition_doc)
                                 except Exception:
                                     # Best effort: presumably the URL was
                                     # queued concurrently; skip it.
                                     pass
                     else:
                         print("DUMP  : %s" % url)
                         # Already stored: clear the in-progress marker so
                         # the URL does not linger in the running set.
                         runningColl.delete_one({"_id": url})
                     failed_times = 0
                 except Exception as ex:
                     if deleted_error(ex):
                         print("Found a HTTP Error URL:" + url)
                         try:
                             runningColl.delete_one({"_id": url})
                             deletedColl.insert_one({"_id": url})
                         except Exception:
                             print("Error to process deleted url.")
                     else:
                         print("Error to process: %s" % url)
                         print(traceback.format_exc())
                         # Put the URL back so that it is retried later.
                         waitingColl.insert_one({"_id": url})
                         runningColl.delete_one({"_id": url})
                         failed_times += 1
Ejemplo n.º 3
0
def get_images_info(request):
    """
    Fetch the image-list document stored under the requested id.

    :param request: HTTP request; ``request.GET["id"]`` is parsed as an int
    :return: result of ``find_one`` on the ``images_info_ahash_weed``
        collection for that ``_id``
    """
    wanted_id = int(request.GET["id"])
    with MongoDBDatabase("website_pron") as database:
        weed_images = database.get_collection("images_info_ahash_weed")
        return weed_images.find_one({"_id": wanted_id})
Ejemplo n.º 4
0
def set_novel_like(request):
    """
    Store the like flag of a novel.

    :param request: HTTP request; ``POST["id"]`` is parsed as an int and
        ``POST["like"]`` counts as liked only when it equals ``"true"``
    :return: ``{"status": "success"}``; the update result is not inspected
    """
    target_id = int(request.POST['id'])
    liked = request.POST["like"] == "true"
    with MongoDBDatabase("website_pron") as database:
        novels = database.get_collection("novels")
        change = {"$set": {"like": liked}}
        novels.update_one({"_id": target_id}, change)
        return {"status": "success"}
Ejemplo n.º 5
0
def query_video(requset):
    """
    Fetch a single video document by its numeric id.

    :param requset: HTTP request (spelling kept for compatibility);
        ``GET["id"]`` is parsed as an int
    :return: first document of ``video_info`` whose ``_id`` equals the id;
        an empty result is not handled here
    """
    wanted_id = int(requset.GET["id"])
    with MongoDBDatabase("website_pron") as database:
        videos = database.get_collection("video_info")
        matches = videos.find({"_id": {"$eq": wanted_id}})
        return matches.next()
Ejemplo n.º 6
0
def get_novel(request):
    """
    Load a novel's database record together with its text.

    :param request: HTTP request; ``GET["id"]`` is parsed as an int
    :return: the ``novels`` document for that id with an extra
        ``novel_text`` key holding what ``read_raw`` returns for the
        novel's local path
    """
    novel_id = int(request.GET["id"])
    with MongoDBDatabase("website_pron") as database:
        novels = database.get_collection("novels")
        record = novels.find_one({"_id": {"$eq": novel_id}})
        # A missing record is treated as a programming error.
        assert record is not None
        text_path = local_novel_path_gen % novel_id
        record["novel_text"] = read_raw(text_path)
        return record
Ejemplo n.º 7
0
def query_novel_by_condition(request):
    """
    Query one page of novels using a JSON encoded filter document.

    :param request: HTTP request; ``GET["condition"]`` is decoded with
        ``json.loads`` and handed to ``find`` unchanged, ``n`` is the page
        size (default ``"30"``) and ``p`` the 1-based page index
        (default ``"1"``)
    :return: list of matching documents sorted by ``words_count``
        descending
    """
    # NOTE(review): the filter comes straight from the client and is not
    # restricted in any way -- confirm that this is intended.
    condition = json.loads(request.GET["condition"])
    per_page = int(get_request_with_default(request, "n", "30"))
    page_no = int(get_request_with_default(request, "p", "1"))
    assert per_page >= 1
    assert page_no >= 1
    offset = (page_no - 1) * per_page
    with MongoDBDatabase("website_pron") as database:
        cursor = database.get_collection("novels").find(condition)
        cursor = cursor.sort("words_count", -1).skip(offset).limit(per_page)
        return list(cursor)
Ejemplo n.º 8
0
 def run(self):
     """
     Run the crawl worker loop until too many consecutive failures occur.

     Each iteration pops one URL from the ``<prefix>_queue`` collection,
     marks it in ``<prefix>_running``, downloads the page with ``getSoup``
     and, when the URL is not yet in ``<prefix>_storage``, stores the
     extracted fields and queues every related URL (query string stripped)
     that is absent from all three collections.  On any error the URL is
     put back on the queue.

     The loop ends once the consecutive failure counter reaches
     ``failed_time_limit``.
     """
     with MongoDBDatabase(self.db_name) as mongoDB:
         failed_times = 0
         waitingColl = mongoDB.get_collection(self.coll_prefix + "_queue")
         runningColl = mongoDB.get_collection(self.coll_prefix + "_running")
         storageColl = mongoDB.get_collection(self.coll_prefix + "_storage")
         coll_list = (waitingColl, runningColl, storageColl)
         while True:
             if failed_times >= failed_time_limit:
                 break
             task = waitingColl.find_one_and_delete({})
             if task is None:
                 # Empty queue counts as a failure; back off before retrying.
                 failed_times += 1
                 time.sleep(2)
             else:
                 url = task["_id"]
                 try:
                     runningColl.insert_one({"_id": url})
                     _, sp = getSoup(url)
                     relatedURLs = getRelatedVideosLink(sp)
                     if storageColl.find_one({"_id": url},
                                             {"_id": 1}) is None:
                         doc_storage = {
                             "_id": url,
                             "title": getTitleFromSoup(sp),
                             "label": getCategories(sp),
                             "rate": getRating(sp),
                             "duration": getTime(sp),
                             "preview": getPreviewImageList(sp),
                             "preview_all": getAllPreviewImageList(sp)
                         }
                         storageColl.insert_one(doc_storage)
                         runningColl.delete_one({"_id": url})
                         print("%s done." % url)
                         for related_url in relatedURLs:
                             condition_doc = {
                                 "_id": related_url.split("?")[0]
                             }
                             if all((coll.find_one(condition_doc,
                                                   {"_id": 1}) is None
                                     for coll in coll_list)):
                                 waitingColl.insert_one(condition_doc)
                     else:
                         print("%s dumped" % url)
                         # Already stored: clear the in-progress marker.
                         runningColl.delete_one({"_id": url})
                     failed_times = 0
                 except Exception:
                     print("%s download error, excepted." % url)
                     waitingColl.insert_one({"_id": url})
                     # Without this the retry would fail again as soon as
                     # it tries to insert the same _id into the running
                     # collection.
                     runningColl.delete_one({"_id": url})
                     failed_times += 1
Ejemplo n.º 9
0
def query_images_list(request):
    """
    Query one page of image sets whose title contains any given keyword.

    Values read from the request:

    * ``request.GET["key_words"]`` -- whitespace separated keywords
      (required)
    * ``n`` -- page size, fetched with default ``"30"``
    * ``p`` -- 1-based page index, fetched with default ``"1"``

    :param request: HTTP request
    :return: list of matching documents sorted by ``_id`` descending
    """
    # Local import keeps this handler self-contained.
    import re

    query_keyword = request.GET["key_words"]
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    assert page_size >= 1
    assert page_index >= 1

    # split() without arguments drops empty tokens, so repeated spaces no
    # longer create an empty alternative that matches every document, and
    # re.escape() keeps user-typed metacharacters literal.
    title_pattern = "(%s)" % "|".join(
        re.escape(word) for word in query_keyword.split())

    with MongoDBDatabase("website_pron") as mgdb:
        query_res = mgdb.get_collection("images_info").find({
            "title": {"$regex": title_pattern},
        }).sort("_id", -1).skip((page_index - 1) * page_size).limit(page_size)
        return list(query_res)
Ejemplo n.º 10
0
def remove_novel(request):
    """
    Delete a novel record and move its text file to the trash location.

    :param request: HTTP request; ``POST["id"]`` is parsed as an int
    :return: ``{"status": "success"}`` when the delete removed at least
        one document, otherwise ``{"status": "error"}``
    """
    novel_id = int(request.POST["id"])
    with MongoDBDatabase("website_pron") as database:
        novels = database.get_collection("novels")
        selector = {"_id": {"$eq": novel_id}}
        record = novels.find_one(selector)
        assert record is not None, "query failed"
        removed = novels.delete_one(selector).deleted_count
        source_path = local_novel_path_gen % novel_id
        # The trash file name is derived from the novel's title.
        shutil.move(source_path, trash_novel_path_gen % record["title"])
        status = "success" if removed > 0 else "error"
        return {"status": status}
Ejemplo n.º 11
0
def set_images_like(request):
    """
    Store the like flag of an image set.

    :param request: HTTP request; ``POST["id"]`` is parsed as an int and
        ``POST["like"]`` counts as liked only when it equals ``"true"``
    :return: ``{"status": "success"}`` when a document with that id was
        matched by the update, otherwise ``{"status": "error"}``
    """
    images_id = int(request.POST['id'])
    is_like = request.POST["like"] == "true"
    with MongoDBDatabase("website_pron") as mongo_conn:
        collection = mongo_conn.get_collection("images_info")
        condition = {"_id": {"$eq": images_id}}
        update_result = collection.update_one(
            condition, {"$set": {"like": is_like}})
        # Use matched_count rather than modified_count: re-applying the
        # value a document already has modifies nothing, yet the request
        # still succeeded.
        if update_result.matched_count > 0:
            return {"status": "success"}
        return {"status": "error"}
Ejemplo n.º 12
0
def migrate():
    """
    Copy every ``images_info`` document into ``images_info_ahash_weed``.

    For each source document the files of its local image directory are
    passed to ``insert_image_to_weed``; every non-``None`` result becomes
    an entry ``<weed_fid with "," replaced by "/">/<_id>.<file_type>`` in
    the ``image_list`` field of the document before it is inserted into
    the target collection.  Progress is printed per document.
    """
    with MongoDBDatabase("website_pron") as database:
        src = database.get_collection("images_info")
        dst = database.get_collection("images_info_ahash_weed")
        doc_total = src.count()
        for position, record in enumerate(src.find({})):
            print("{}/{} is prcoessing.\r".format(position, doc_total))
            page_id = record["_id"]
            folder = local_image_list_path % {"page_index": int(page_id)}
            weed_paths = []
            for entry in os.listdir(folder):
                stored = insert_image_to_weed(os.path.join(folder, entry))
                if stored is None:
                    # Upload failed; leave this file out of the list.
                    continue
                weed_paths.append(
                    stored["weed_fid"].replace(",", "/") + "/" +
                    stored["_id"] + "." + stored["file_type"])
            # The source document itself is extended and re-inserted.
            record["image_list"] = weed_paths
            dst.insert_one(record)
0
def remove_video(request):
    """
    Remove a video from MongoDB and from the hard drive.

    The document is first dumped as JSON to the trash info path, then
    deleted from ``video_info``; the video file is moved to the trash
    location and its shortcut file is removed.

    :param request: HTTP request; ``POST["id"]`` is parsed as an int
    :return: ``{"status": "success"}``
    :raises Exception: when no document with the given id exists
    """
    remove_id = int(request.POST['id'])
    with MongoDBDatabase("website_pron") as mongo_conn:
        collection = mongo_conn.get_collection("video_info")
        condition = {"_id": {"$eq": remove_id}}
        # A single find_one replaces the former Cursor.count() + next()
        # pair: one round-trip instead of two and no reliance on the
        # deprecated Cursor.count().
        doc = collection.find_one(condition)
        if doc is None:
            raise Exception("Given id not found")
        # Keep a copy of the metadata before the record disappears.
        with open(trash_video_info % remove_id, "w") as fp:
            json.dump(doc, fp)
        collection.delete_one(condition)
        shutil.move(video_saving_path % remove_id,
                    trash_video_file % remove_id)
        os.remove(shortcuts_saving_path % remove_id)
        return {"status": "success"}
Ejemplo n.º 14
0
def get_images_info(request):
    """
    Return an image set's record together with the URLs of its files.

    The local directory of the image set is listed, entries rejected by
    ``filter_images`` are dropped, and the remaining file names are turned
    into URLs with ``image_url_template``.

    :param request: HTTP request; ``GET["id"]`` is parsed as an int
    :return: the ``images_info`` document for that id with the URL list
        stored under ``images``
    """
    page_id = int(request.GET["id"])
    folder = local_image_list_path % {"page_index": page_id}
    urls = [
        image_url_template % {"page_index": page_id, "image_filename": entry}
        for entry in os.listdir(folder)
        if filter_images(entry)
    ]
    with MongoDBDatabase("website_pron") as database:
        record = database.get_collection("images_info").find_one(
            {"_id": page_id})
        record["images"] = urls
        return record
Ejemplo n.º 15
0
def remove_images(request):
    """
    Delete an image set record and move its directory to the trash.

    After the move the deleted document is written as ``_info.json`` into
    the trash directory.

    :param request: HTTP request; ``POST["id"]`` is parsed as an int
    :return: ``{"status": "success"}`` when the delete removed at least
        one document, otherwise ``{"status": "error"}``
    """
    set_id = int(request.POST["id"])
    with MongoDBDatabase("website_pron") as database:
        image_sets = database.get_collection("images_info")
        selector = {"_id": {"$eq": set_id}}
        record = image_sets.find_one(selector)
        assert record is not None, "query failed"
        removed = image_sets.delete_one(selector).deleted_count
        path_args = {"page_index": set_id}
        source_dir = local_image_list_path % path_args
        trash_dir = trash_image_path_gen % {"page_index": set_id}
        shutil.move(source_dir, trash_dir)
        info_path = os.path.join(trash_dir, "_info.json")
        with open(info_path, "w") as info_file:
            json.dump(record, info_file)
        status = "success" if removed > 0 else "error"
        return {"status": status}
Ejemplo n.º 16
0
def query_novel_by_title(request):
    """
    Query one page of novels whose title contains any given keyword.

    Values read from the request:

    * ``request.GET["query"]`` -- whitespace separated keywords (required)
    * ``block`` -- JSON list of ``novel_type`` values to exclude,
      fetched with default ``"[]"``
    * ``n`` -- page size, fetched with default ``"30"``
    * ``p`` -- 1-based page index, fetched with default ``"1"``

    :param request: HTTP request
    :return: list of matching documents sorted by ``words_count``
        descending
    """
    # Local import keeps this handler self-contained.
    import re

    query_keyword = request.GET["query"]
    block_settings = json.loads(
        get_request_with_default(request, "block", "[]"))
    page_size = int(get_request_with_default(request, "n", "30"))
    page_index = int(get_request_with_default(request, "p", "1"))
    assert page_size >= 1
    assert page_index >= 1

    # split() without arguments drops empty tokens, so repeated spaces no
    # longer create an empty alternative that matches every document, and
    # re.escape() keeps user-typed metacharacters literal.
    title_pattern = "(%s)" % "|".join(
        re.escape(word) for word in query_keyword.split())

    with MongoDBDatabase("website_pron") as mgdb:
        query_res = mgdb.get_collection("novels").find({
            "title": {"$regex": title_pattern},
            "novel_type": {"$nin": block_settings},
        }).sort("words_count", -1).skip(
            (page_index - 1) * page_size).limit(page_size)
        return list(query_res)
Ejemplo n.º 17
0
def get_nearby_images(request):
    """
    Return the image set adjacent to the given id within a keyword search.

    Values read from the request:

    * ``request.GET["id"]`` -- id of the current image set (int)
    * ``key_words`` -- whitespace separated keywords, fetched with
      default ``""``
    * ``request.GET["dir"]`` -- ``"next"`` (case-insensitive) selects the
      closest larger id, any other value the closest smaller id

    :param request: HTTP request
    :return: the first matching document in the chosen direction; an
        empty result is not handled here
    """
    # Local import keeps this handler self-contained.
    import re

    current_id = int(request.GET["id"])
    keywords = get_request_with_default(request, "key_words", "")
    direction = request.GET["dir"].upper() == "NEXT"
    if direction:
        query_dir_condition = {"$gt": current_id}
        sort_flip = 1
    else:
        query_dir_condition = {"$lt": current_id}
        sort_flip = -1

    # split() without arguments drops empty tokens, so repeated spaces no
    # longer create an empty alternative that matches every document, and
    # re.escape() keeps user-typed metacharacters literal.  No keywords
    # still yields "()", which matches every title as before.
    title_pattern = "(%s)" % "|".join(
        re.escape(word) for word in keywords.split())

    with MongoDBDatabase("website_pron") as mgdb:
        # Only one document is consumed, so cap the cursor at one.
        query_res = mgdb.get_collection("images_info").find({
            "title": {"$regex": title_pattern},
            "_id": query_dir_condition,
        }).sort("_id", sort_flip).limit(1).next()
        return query_res
Ejemplo n.º 18
0
def insert_image_to_weed(file: str,
                         remove_after_insert: bool = False,
                         silence: bool = True):
    """
    Upload an image to WeedFS unless its hash is already registered.

    The file's hash is looked up in the ``image_hash_pool`` collection.
    When it is unknown the file is uploaded and a record holding the
    hash (``_id``), the returned file id (``weed_fid``) and the file type
    is inserted; otherwise the existing record is reused.

    :param file: path of the image file
    :param remove_after_insert: currently not used by this function
    :param silence: currently not used by this function
    :return: the hash-pool record for the file, or ``None`` when any
        error occurred (the traceback is printed to stderr)
    """
    try:
        file_hash = hash_algorithm(file)
        file_type = re_find_tail.findall(file)[0]
        with MongoDBDatabase("website_pron") as mongodb:
            coll = mongodb.get_collection("image_hash_pool")
            return_data = coll.find_one({"_id": file_hash})
            if return_data is not None:
                return return_data
            weed_fs = WeedFS("192.168.1.103")
            file_id = weed_fs.upload_file(file)
            insert_info = {
                "_id": file_hash,
                "weed_fid": file_id,
                "file_type": file_type
            }
            coll.insert_one(insert_info)
            return insert_info
    except Exception:
        # Catch Exception rather than everything, so that Ctrl-C and
        # SystemExit still stop a long running batch that calls this.
        print(traceback.format_exc(), file=sys.stderr)
        return None
Ejemplo n.º 19
0
                                deletedColl.insert_one({"_id": url})
                            except:
                                print("Error to process deleted url.")
                        else:
                            print("Error to process: %s" % url)
                            print(traceback.format_exc())
                            waitingColl.insert_one({"_id": url})
                            runningColl.delete_one({"_id": url})
                            failed_times += 1


if __name__ == '__main__':
    dbName = "spider"
    collPrefix = "xhamster"

    # Move every URL still marked as running back onto the queue, then
    # drop the running collection.
    with MongoDBDatabase(dbName) as mgdb:
        for res in mgdb.get_collection(collPrefix + "_running").find({}):
            try:
                mgdb.get_collection(collPrefix + "_queue").insert(res)
            except Exception:
                # Best effort: presumably the URL is already queued.
                # Narrowed from a bare except so Ctrl-C still works.
                pass
        mgdb.get_collection(collPrefix + "_running").drop()

    # Seed an empty queue with the URLs of the first three top pages.
    with MongoDBCollection(dbName, collPrefix + "_queue") as coll:
        if coll.count() <= 0:
            for i in range(3):
                for url in get_top_urls(i + 1):
                    try:
                        coll.insert({"_id": url})
                    except Exception:
                        print("Error while inserting " + url)