Example #1
def searchPostHtml(request: dict):
    # print(request)
    site_type_re = {
        "nha.chotot.com": {
            "land": r"^.*/mua-ban-dat/.*$",
            "house": r"^.*/mua-ban-nha-dat/.*$",
            "apartment": r"^.*/mua-ban-can-ho-chung-cu/.*$"
        },
        "nhadat247.com.vn": {
            "land": r"^.*nhadat247.com.vn/ban-dat.*$",
            "apartment": r"^.*nhadat247.com.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*nhadat247.com.vn/ban-nha.*$"
        },
        "batdongsan.com.vn": {
            "land": r"^.*batdongsan.com.vn/ban-dat.*$",
            "apartment": r"^.*batdongsan.com.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*batdongsan.com.vn/ban-nha.*$"
        }
    }

    try:
        db = DBObject()

        _site = request.get("site")
        _crawl_date = request.get("crawl_date")
        _post_date = request.get("post_date")
        _type = request.get("type", "all")
        _limit = int(request["limit"]) if request.get("limit") else 0

        list_filter = []

        if _site in site_type_re:
            list_filter.append(
                {"url": {
                    "$regex": "^https://%s/.*$" % (_site)
                }})

            # Only narrow by post type when the site is recognised; indexing
            # site_type_re with an unknown site would raise a KeyError.
            if _type in site_type_re[_site]:
                list_filter.append(
                    {"url": {"$regex": site_type_re[_site][_type]}})
            else:
                list_filter.append({
                    "$or": [{
                        "url": {
                            "$regex": site_type_re[_site][_t]
                        }
                    } for _t in site_type_re[_site]]
                })

        _d_range = d_range(_crawl_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{
                    "date": {
                        "$regex": "^[0-9]{2}/%s/%s$" % (m, y)
                    }
                } for m, y in _d_range]
            })

        _d_range = d_range(_post_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{
                    "post_date": {
                        "$regex": "^[0-9]{2}/%s/%s$" % (m, y)
                    }
                } for m, y in _d_range]
            })

        query_return = []
        for post in db.query_html_db(query_dict={"$and": list_filter},
                                     limit=_limit):
            # Drop the raw HTML and the Mongo _id before returning the metadata.
            post.pop("html", None)
            post.pop("_id", None)

            post["html"] = "content is eliminated"
            query_return.append(post)
        # print(query_return[0])
        return {"code": 200, "message": "successful", "content": query_return}
    except Exception:
        # traceback.print_exc()
        return {"code": 404, "message": "failed", "content": []}
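A minimal sketch of how searchPostHtml might be called, assuming the request values arrive as strings (the way the limit handling expects) and that DBObject and d_range come from the surrounding module; the date format below is an assumption, since d_range is not shown here.

# Hypothetical request; the crawl_date format is assumed, d_range() is not shown above.
sample_request = {
    "site": "batdongsan.com.vn",     # must be a key of site_type_re
    "type": "house",                 # "land", "house", "apartment", or anything else for all types
    "crawl_date": "01/2021-03/2021",
    "post_date": "",
    "limit": "50",                   # string, converted with int()
}

response = searchPostHtml(sample_request)
print(response["code"], len(response["content"]))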
Example #2
import re
import traceback
from itertools import chain
from time import time
from datetime import date

import pandas as pd

from database import DBObject

db = DBObject()


def strip_text(text):
    return text.replace("\t", "").replace("\n", "").strip()
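A quick, self-contained illustration of what strip_text does to typical scraped text:

# Tabs and newlines are removed, outer whitespace is trimmed.
print(strip_text("\t  Gia ban:\n 2,5 ty \n"))   # -> "Gia ban: 2,5 ty"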


def stringify_children(node):
    # Recursively concatenate the text of an lxml element and its descendants,
    # adding a newline after each <div> child so block boundaries are preserved.
    parts = ([node.text] + list(
        chain(*((stringify_children(c) + ("\n" if str(c.tag) == "div" else ""))
                for c in node))) + [node.tail])

    return ''.join(filter(None, parts))
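For reference, a small usage sketch of stringify_children, assuming the nodes come from lxml (which matches the .text/.tail/.tag interface used above):

from lxml import html

fragment = html.fromstring("<div>Gia: 2 ty<div>Dien tich: 80 m2</div>lien he</div>")

# Each <div> contributes a trailing newline, so block-level text keeps its own line.
print(stringify_children(fragment))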


def clean_trash(html):
Example #3
import pandas as pd
from datetime import datetime, date
import time

import hashlib

from ParserObject import ParserObject
from ParserModelSelector import ParserModelSelector
from LibFunc import clean_trash
from database import DBObject
from Settings import Settings

#=============================================================================================
#=============================================================================================

database = DBObject()


def parse(posts_data,
          site=None,
          type=None,
          num=None,
          many: bool = False,
          model_name=None,
          resume=False):

    print("Go to Parsing Data")
    the_status = "parsing"
    __failed_urls = []
    __saved_post = []
    task_id = int(time.time())
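The rest of parse is not shown in this excerpt; based on the signature alone, a call could look like the sketch below (the shape of posts_data is an assumption, as the excerpt ends before it is used).

# Hypothetical invocation; posts_data as a list of post URLs is an assumption.
posts_data = ["https://batdongsan.com.vn/ban-nha-rieng/example-post"]

parse(posts_data,
      site="batdongsan.com.vn",
      type="house",
      many=True,
      model_name="auto",
      resume=False)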
Example #4
    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):

        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nhadat247.com.vn":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]

                try:
                    self.limit = int(info_dict_["limit"])
                except Exception:
                    self.limit = -1

                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except Exception:
                traceback.print_exc()
                return

        self.__str_info = "Site: nhadat247.com.vn, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to, str(self.limit)
            if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = NhaDat247.get_seed_url(post_type)

        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []

        self.file_log_visited_url = "visited_post_log_nhadat247_%s.txt" % (
            self.post_type)
        self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % (
            self.post_type)

        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.html)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+.html")

        self.key_type = NhaDat247.get_key_from_type(self.post_type)

        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from":
                datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to":
                datetime.strptime(
                    str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except Exception:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)

        if not self.resume:
            task_id = int(time.time())

        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }

        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]

        print("Init crawler")
Example #5
    def callback(ch, method, properties, body):
        command = "nothing"
        try:
            body = body.decode('ascii')
            message = message_loads(body)
            command = message["command"]

            if command == "crawl":

                pid = int(open("data.lock", "r").read())
                if not psutil.pid_exists(pid):
                    Popen(['python', 'worker.py', body])
                else:
                    command = "is runing"

            elif command == "parse":

                pid = int(open("data.lock", "r").read())
                if not psutil.pid_exists(pid):
                    file = open("parse_posts.data", "w")
                    file.write(message["posts"])
                    file.close()
                    model = message["model"] if "model" in message else "auto"
                    type = message["type"] if "type" in message else "all"
                    site = message["site"] if "site" in message else "all"

                    Popen([
                        'python', 'worker.py',
                        "command:parse site:%s type:%s model:%s" %
                        (site, type, model)
                    ])
                else:
                    command = "is runing"

            elif command == "stop":

                db = DBObject()
                db.cancel_task(Settings.worker_id)
                try:
                    pid = int(open("data.lock", "r").read())
                    os.kill(pid, signal.SIGTERM)
                except Exception:
                    pass
                subprocess.call("TASKKILL /f  /IM  CHROMEDRIVER.EXE")
                subprocess.call("TASKKILL /f  /IM  CHROME.EXE")

            elif command == "pause":

                db = DBObject()
                pid = int(open("data.lock", "r").read())
                _working, _as = db.workAs(Settings.worker_id)
                if _working:
                    db.pause_task(Settings.worker_id)
                    try:
                        os.kill(pid, signal.SIGTERM)
                    except Exception:
                        pass
                    subprocess.call("TASKKILL /f  /IM  CHROME.EXE")
                    subprocess.call("TASKKILL /f  /IM  CHROMEDRIVER.EXE")
                else:
                    if not psutil.pid_exists(pid):
                        Popen([
                            'python', 'worker.py',
                            "command:%s resume:1" % (_as)
                        ])
                    else:
                        command = "is runing"

            elif command == "shield":
                shield_on = (("shield" in message and int(message["shield"]) == 1)
                             or not Settings.isShieldEnable())
                Settings.enableShield(shield_on)
            else:
                command = "nothing"
                ""
        except Exception:
            traceback.print_exc()

        print(" [x] Received \n    -> Do %s" % (command))