Example #1
  def __init__(self):
    DBObject.__init__(self)

    self.server		= None
    self.localization	= hlib.i18n.Localization(languages = ['cz'])
    self.trumpet	= hlib.database.SimpleMapping()
    self.users		= hlib.database.StringMapping()
Example #2
  def __init__(self, name, password, email):
    DBObject.__init__(self)

    self.name		= unicode(name)
    self.password	= unicode(password)
    self.admin		= False
    self.date_format = '%d/%m/%Y %H:%M:%S'
    self.email		= unicode(email)
    self.maintenance_access	= False

    self.cookies	= hlib.database.SimpleMapping()

    self.events		= hlib.database.IndexedMapping()

    self.api_tokens	= hlib.database.SimpleList()
Example #3
  def __getattr__(self, name):
    if name == 'is_admin':
      return self.admin == True

    if name == 'is_online':
      return self.name in hruntime.app.sessions.online_users

    return DBObject.__getattr__(self, name)
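
A note on Example #3: __getattr__ is only consulted after normal attribute lookup fails, so is_admin and is_online behave like read-only computed properties while every other name falls through to DBObject. A minimal, self-contained sketch of the same pattern (the Account class below is illustrative only, not part of the project):

class Account:
    def __init__(self, admin=False):
        self.admin = admin

    def __getattr__(self, name):
        # only reached for attributes that normal lookup did not find
        if name == 'is_admin':
            return self.admin
        raise AttributeError(name)

print(Account(admin=True).is_admin)   # True
print(Account().is_admin)             # False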
Example #4
def searchPostHtml(request: dict):
    # print(request)
    site_type_re = {
        "nha.chotot.com": {
            "land": r"^.*/mua-ban-dat/.*$",
            "house": r"^.*/mua-ban-nha-dat/.*$",
            "apartment": r"^.*/mua-ban-can-ho-chung-cu/.*$"
        },
        "nhadat247.com.vn": {
            "land": r"^.*nhadat247.com.vn/ban-dat.*$",
            "apartment": r"^.*nhadat247.com.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*nhadat247.com.vn/ban-nha.*$"
        },
        "batdongsan.com.vn": {
            "land": r"^.*batdongsan.com.vn/ban-dat.*$",
            "apartment": r"^.*batdongsan.com.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*batdongsan.com.vn/ban-nha.*$"
        }
    }

    try:
        db = DBObject()

        _site = request.get("site")
        _crawl_date = request.get("crawl_date")
        _post_date = request.get("post_date")
        _type = request.get("type", "all")
        _limit = int(request["limit"]) if request.get("limit") else 0

        list_filter = []

        # Filter by site first; indexing site_type_re[_site] unconditionally
        # (as the original did) raises KeyError when the site is unknown.
        if _site in site_type_re:
            list_filter.append(
                {"url": {
                    "$regex": "^https://%s/.*$" % (_site)
                }})

            # Then filter by type: the requested type if the site defines it,
            # otherwise any of the site's known types.
            if _type in site_type_re[_site]:
                list_filter.append(
                    {"url": {"$regex": site_type_re[_site][_type]}})
            else:
                list_filter.append({
                    "$or": [{
                        "url": {
                            "$regex": site_type_re[_site][_t]
                        }
                    } for _t in site_type_re[_site]]
                })

        _d_range = d_range(_crawl_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{
                    "date": {
                        "$regex": "^[0-9]{2}/%s/%s$" % (m, y)
                    }
                } for m, y in _d_range]
            })

        _d_range = d_range(_post_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{
                    "post_date": {
                        "$regex": "^[0-9]{2}/%s/%s$" % (m, y)
                    }
                } for m, y in _d_range]
            })

        query_return = []
        for post in db.query_html_db(query_dict={"$and": list_filter},
                                     limit=_limit):
            post.pop("html")
            post.pop("_id")

            post["html"] = "content is eliminated"
            query_return.append(post)
        # print(query_return[0])
        return {"code": 200, "message": "successfull", "content": query_return}
    except:
        # traceback.print_exc()
        return {"code": 404, "message": "failed", "content": []}
Example #5
import re
import traceback
from itertools import chain
from datetime import date
from time import time

import pandas as pd

from database import DBObject

db = DBObject()


def strip_text(text):
    return text.replace("\t", "").replace("\n", "").strip()


def stringify_children(node):
    # print(str(node.tag))

    parts = ([node.text] + list(
        chain(*((stringify_children(c) + ("\n" if str(c.tag) == "div" else ""))
                for c in node.getchildren()))) + [node.tail])

    return ''.join(filter(None, parts))
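
# Hypothetical usage (not part of the original snippet): stringify_children expects
# an lxml element, for example one produced by lxml.html.fromstring():
#   from lxml import html
#   node = html.fromstring("<div>title<div>line 1</div><div>line 2</div></div>")
#   print(stringify_children(node))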


def clean_trash(html):
Example #6
    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):

        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nhadat247.com.vn":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]

                try:
                    self.limit = int(info_dict_["limit"])
                except:
                    self.limit = -1

                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except:
                traceback.print_exc()
                return

        self.__str_info = "Site: nhadat247.com.vn, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to, str(self.limit)
            if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = NhaDat247.get_seed_url(post_type)

        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []

        self.file_log_visited_url = "visited_post_log_nhadat247_%s.txt" % (
            self.post_type)
        self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % (
            self.post_type)

        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.html)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+.html")

        self.key_type = NhaDat247.get_key_from_type(self.post_type)

        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from":
                datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to":
                datetime.strptime(
                    str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)

        if not self.resume:
            task_id = int(time.time())

        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }

        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]

        print("Init crawler")
Example #7
class NhaDat247(CrawlerObject):

    BASE_URL = "https://nhadat247.com.vn/"
    SAVE_CHECK_POINT = 5

    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):

        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nhadat247.com.vn":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]

                try:
                    self.limit = int(info_dict_["limit"])
                except:
                    self.limit = -1

                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except:
                traceback.print_exc()
                return

        self.__str_info = "Site: nhadat247.com.vn, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to, str(self.limit)
            if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = NhaDat247.get_seed_url(post_type)

        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []

        self.file_log_visited_url = "visited_post_log_nhadat247_%s.txt" % (
            self.post_type)
        self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % (
            self.post_type)

        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.html)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+.html")

        self.key_type = NhaDat247.get_key_from_type(self.post_type)

        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from":
                datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to":
                datetime.strptime(
                    str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)

        if not self.resume:
            task_id = int(time.time())

        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }

        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]

        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post,
                                                              num_error)
        self.db_object.update_wokers_info(Settings.worker_id,
                                          self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """
        Return Beautifulsoup object
        """
        _soup = None
        _html = None
        for i in range(5):
            try:
                element_present = EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "body > div.footer"))
                _html = self.browser.get_html(url, until_ec=element_present)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception as e:
                traceback.print_exc()
                continue

        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["ban-dat"]
        elif key == "apartment":
            return ["ban-can-ho-chung-cu"]
        elif key == "house":
            return ["ban-nha-mat-pho", "ban-nha-biet-thu", "ban-nha-rieng"]

        return [
            "ban-dat", "ban-can-ho-chung-cu", "ban-nha-rieng",
            "ban-nha-mat-pho", "ban-nha-biet-thu"
        ]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                # print("ok")
                return True

        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date,
                    _html):

        post = {}

        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)

        # post["html"] = "<html>"
        # print("-"*10,"\n",post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []

        if self.resume:
            try:
                local_urls = list(open(self.file_log_new_url, "r").readlines())
            except OSError:
                pass  # no saved URL log yet
            try:
                visited_post = list(
                    open(self.file_log_visited_url, "r").readlines())
            except OSError:
                pass  # no visited-post log yet

        return local_urls, visited_post

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#ContentPlaceHolder1_ProductDetail1_divprice > div").get_text(
                ).split("|")[1]
            str_date = slugify(str_date.strip().lower())
            if "hom-kia" in str_date:
                post_date = date.today() - timedelta(days=2)
            elif "hom-qua" in str_date:
                post_date = date.today() - timedelta(days=1)
            elif "hom-nay" in str_date:
                post_date = date.today()
            else:

                post_date = datetime.strptime(str_date, '%d-%m-%Y').date()

        except Exception as e:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)

        if page_soup:

            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                # Keep the post only if a date was extracted and, when a range
                # is configured, it falls inside that range; calling strftime()
                # on a missing date would otherwise raise AttributeError.
                if isinstance(post_date, date) and (
                        not self.post_date_range
                        or self.post_date_range["from"] <= post_date <= self.post_date_range["to"]):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None

            else:
                page_source = None

            list_href = page_soup.find_all('a')

            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)

                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor)
                        or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)

        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):

        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)

            # Skip URLs that are too short to be valid, already visited, or of
            # the wrong type (the original "and" let visited posts be re-crawled).
            if (len(self.__current_url) < 10
                    or self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue

            print(" > ", self.__current_url)

            page_source, post_date, new_urls_to_visit = self.visit(
                self.__current_url)

            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit

            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(
                                     date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)

            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count,
                                                 len(self.__failed_urls))
                self.update_crawling_log()

                NhaDat247.save_list(local_urls, self.file_log_new_url)
                NhaDat247.save_list(visited_post, self.file_log_visited_url)

            num_visited += 1
            print("  >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break

        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)
        return

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment": ["https://nhadat247.com.vn/ban-can-ho-chung-cu.html"],
            "house": [
                "https://nhadat247.com.vn/ban-nha-rieng.html",
                "https://nhadat247.com.vn/ban-nha-biet-thu-lien-ke.html",
                "https://nhadat247.com.vn/ban-nha-mat-pho.html"
            ],
            "land": [
                "https://nhadat247.com.vn/ban-dat-nen-du-an.html",
                "https://nhadat247.com.vn/ban-dat.html"
            ]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))
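
A hedged sketch of how this crawler appears to be driven, based on the constructor and obtain_data above. The "month/year" strings follow the "1/" + date_from parsing, and Browser, Settings and DBObject must be importable for this to actually run:

crawler = NhaDat247(date_from="01/2021", date_to="03/2021",
                    post_type="house", limit=100)
crawler.obtain_data()  # crawls until the limit is hit, checkpointing every SAVE_CHECK_POINT pages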
Example #8
class ChoTotCrawler(CrawlerObject):

    BASE_URL = "https://nha.chotot.com/"
    SAVE_CHECK_POINT = 5

    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):

        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nha.chotot.com":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]

                try:
                    self.limit = int(info_dict_["limit"])
                except:
                    self.limit = -1

                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except:
                traceback.print_exc()
                return

        self.__str_info = "Site: nha.chotot.com, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to, str(self.limit)
            if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = ChoTotCrawler.get_seed_url(post_type)

        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []

        self.file_log_visited_url = "visited_post_log_chotot_%s.txt" % (
            self.post_type)
        self.file_log_new_url = "local_urls_log_chotot_%s.txt" % (
            self.post_type)

        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.htm)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]+)?[/][a-z][-a-z0-9]+/[-a-z0-9]+.htm")

        self.key_type = ChoTotCrawler.get_key_from_type(self.post_type)

        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from":
                datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to":
                datetime.strptime(
                    str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)

        if not self.resume:
            task_id = int(time.time())

        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }

        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]

        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post,
                                                              num_error)
        self.db_object.update_wokers_info(Settings.worker_id,
                                          self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """
        Return Beautifulsoup object
        """
        _soup = None
        _html = None
        click_phone_script = """
            function getElementByXpath(path) {
                return document.evaluate(path, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            }

            var phone = getElementByXpath("//*[@id='__next']/div[3]/div[1]/div/div[4]/div[3]/div/linkcontact");
            if (phone != null) {
                phone.click();
            }                    
        """

        for i in range(5):
            try:
                is_post = re.search(self.regex_post, url)
                element_present = EC.presence_of_element_located(
                    (By.XPATH, """//html/body/div[1]/footer"""))
                _html = self.browser.get_html(
                    url=url,
                    until_ec=element_present,
                    run_script=click_phone_script if is_post else None)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception as e:
                traceback.print_exc()
                continue

        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["mua-ban-dat"]
        elif key == "apartment":
            return ["mua-ban-can-ho-chung-cu"]
        elif key == "house":
            return ["mua-ban-nha-dat"]

        return ["mua-ban-dat", "mua-ban-nha-dat", "mua-ban-can-ho-chung-cu"]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                # print("ok")
                return True

        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date,
                    _html):

        post = {}

        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)

        # post["html"] = "<html>"
        # print("-"*10,"\n",post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []

        if self.resume:
            try:
                local_urls = list(open(self.file_log_new_url, "r").readlines())
            except OSError:
                pass  # no saved URL log yet
            try:
                visited_post = list(
                    open(self.file_log_visited_url, "r").readlines())
            except OSError:
                pass  # no visited-post log yet

        return local_urls, visited_post

    @staticmethod
    def convert_str2date(date_str):
        _date = None

        date_str = slugify(date_str.lower())
        _l = date_str.split("-")
        if "hom-qua" in date_str:
            _date = date.today() - timedelta(days=1)
        elif "thang" in _l:
            _n = int(_l[_l.index("thang") - 1][0])
            _date = date.today() - timedelta(days=30 * _n)
        elif "tuan" in _l:
            _n = int(_l[_l.index("tuan") - 1][0])
            _date = date.today() - timedelta(days=7 * _n)
        elif "ngay" in _l:
            _n = int(_l[_l.index("ngay") - 1][0])
            _date = date.today() - timedelta(days=1)
        elif "hom-nay" in date_str or "gio" in _l or "phut" in _l:
            _date = date.today()
        else:
            # date_str was slugified above, so "/" separators have become "-"
            _date = datetime.strptime(date_str, '%d-%m-%Y').date()

        return _date
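
    # Illustrative inputs for convert_str2date, inferred from the slugified
    # branches above (assumed examples, not taken from the site):
    #   "Hôm qua"       -> yesterday
    #   "2 tuần trước"  -> 14 days ago
    #   "Hôm nay"       -> today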

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#__next > div > div.ct-detail.adview > div > div.col-md-8 > div.adImageWrapper___KTd-h > div.imageCaption___cMU2J > span"
            ).get_text()
            str_date = str_date.strip()
            post_date = ChoTotCrawler.convert_str2date(str_date)

        except Exception as e:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)

        if page_soup:

            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                # Keep the post only if a date was extracted and, when a range
                # is configured, it falls inside that range; calling strftime()
                # on a missing date would otherwise raise AttributeError.
                if isinstance(post_date, date) and (
                        not self.post_date_range
                        or self.post_date_range["from"] <= post_date <= self.post_date_range["to"]):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None

            else:
                page_source = None

            list_href = page_soup.find_all('a')

            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)

                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor)
                        or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)

        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):

        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)

            # Skip URLs that are too short to be valid, already visited, or of
            # the wrong type (the original "and" let visited posts be re-crawled).
            if (len(self.__current_url) < 10
                    or self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue

            print(" > ", self.__current_url)

            page_source, post_date, new_urls_to_visit = self.visit(
                self.__current_url)

            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit

            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(
                                     date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)

            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count,
                                                 len(self.__failed_urls))
                self.update_crawling_log()

                ChoTotCrawler.save_list(local_urls, self.file_log_new_url)
                ChoTotCrawler.save_list(visited_post,
                                        self.file_log_visited_url)

            num_visited += 1
            print("  >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break

        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)
        return

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment":
            ["https://nha.chotot.com/toan-quoc/mua-ban-can-ho-chung-cu"],
            "house": ["https://nha.chotot.com/toan-quoc/mua-ban-nha-dat"],
            "land": ["https://nha.chotot.com/toan-quoc/mua-ban-dat"]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))
Example #9
import pandas as pd
from datetime import datetime, date
import time

import hashlib

from ParserObject import ParserObject
from ParserModelSelector import ParserModelSelector
from LibFunc import clean_trash
from database import DBObject
from Settings import Settings

#=============================================================================================
#=============================================================================================

database = DBObject()


def parse(posts_data,
          site=None,
          type=None,
          num=None,
          many: bool = False,
          model_name=None,
          resume=False):

    print("Go to Parsing Data")
    the_status = "parsing"
    __failed_urls = []
    __saved_post = []
    task_id = int(time.time())
Example #10
    try:
        row = file.readline()
        if not row or len(row) < 10:  # readline() returns "" at EOF, never None
            continue
        row = json.loads(row)

        row["url"] = row["url"].strip()
        row["url_hash"] = hashlib.md5(row["url"].encode()).hexdigest()

        soup = BeautifulSoup(row["html"], 'html.parser')
        _date = soup.select_one(
            "#product-detail-web > div.detail-product > div.product-config.pad-16 > ul > li:nth-child(1) > span.sp3"
        ).get_text()
        _date = _date.strip()
        _date = datetime.strptime(_date, '%d/%m/%Y').date()
        row["post_date"] = _date.strftime("%d/%m/%Y")

        row.pop('parser', None)

        data.append(row)

        print(i, ". ", _date)
    except:
        print("-" * 20)
        print("ERROR", i, ":")
        traceback.print_exc()
        print("-" * 20)

db = DBObject()
db.insert_html_data(json_row=data, many=True)
Example #11
  def __init__(self, stamp, hidden):
    DBObject.__init__(self)

    self.id		= None
    self.stamp		= stamp
    self.hidden		= hidden
Example #12
  def __getattr__(self, name):
    if name == 'online_users':
      return hruntime.app.sessions.online_users

    return DBObject.__getattr__(self, name)
Example #13
  def __init__(self):
    DBObject.__init__(self)

    self.events			= hlib.database.IndexedMapping()
    self.maintenance_mode	= False
Example #14
    def callback(ch, method, properties, body):
        command = "nothing"
        try:
            body = body.decode('ascii')
            message = message_loads(body)
            command = message["command"]

            if command == "crawl":

                pid = int(open("data.lock", "r").read())
                if not psutil.pid_exists(pid):
                    Popen(['python', 'worker.py', body])
                else:
                    command = "is runing"

            elif command == "parse":

                pid = int(open("data.lock", "r").read())
                if not psutil.pid_exists(pid):
                    file = open("parse_posts.data", "w")
                    file.write(message["posts"])
                    file.close()
                    model = message["model"] if "model" in message else "auto"
                    type = message["type"] if "type" in message else "all"
                    site = message["site"] if "site" in message else "all"

                    Popen([
                        'python', 'worker.py',
                        "command:parse site:%s type:%s model:%s" %
                        (site, type, model)
                    ])
                else:
                    command = "is runing"

            elif command == "stop":

                db = DBObject()
                db.cancel_task(Settings.worker_id)
                try:
                    pid = int(open("data.lock", "r").read())
                    os.kill(pid, signal.SIGTERM)
                except (OSError, ValueError):
                    pass  # no lock file, unreadable pid, or process already gone
                subprocess.call("TASKKILL /f  /IM  CHROMEDRIVER.EXE")
                subprocess.call("TASKKILL /f  /IM  CHROME.EXE")

            elif command == "pause":

                db = DBObject()
                pid = int(open("data.lock", "r").read())
                _working, _as = db.workAs(Settings.worker_id)
                if _working:
                    db.pause_task(Settings.worker_id)
                    try:
                        os.kill(pid, signal.SIGTERM)
                    except OSError:
                        pass  # process already gone
                    subprocess.call("TASKKILL /f  /IM  CHROME.EXE")
                    subprocess.call("TASKKILL /f  /IM  CHROMEDRIVER.EXE")
                else:
                    if not psutil.pid_exists(pid):
                        Popen([
                            'python', 'worker.py',
                            "command:%s resume:1" % (_as)
                        ])
                    else:
                        command = "is runing"

            elif command == "shield":
                shield_on = True if (
                    ("shield" in message and int(message["shield"]) == 1) or
                    (not Settings.isShieldEnable())) else False
                Settings.enableShield(shield_on)
            else:
                command = "nothing"
                ""
        except:
            traceback.print_exc()

        print(" [x] Received \n    -> Do %s" % (command))