Example #1
    def get_last_modified(url):
        try:
            session = requests.Session()
            session.trust_env = False  # Don't read proxy settings from OS
            r_headers = session.head(
                url,
                timeout=Site._RESPONSE_HEADER_TIMEOUT,
                headers={'User-Agent': crawler.Crawler.USERAGENT})
        except (ConnectionError, MissingSchema, Timeout) as err:
            # Expected request failures: connection problems, a bad URL
            # scheme, or a timeout. Log the error and skip this URL.
            get_log().error(err)
            return None
        except Exception as err:
            # Catch-all so one unexpected error cannot stop the crawl.
            get_log().error(err)
            return None

        if 400 <= r_headers.status_code < 600 or 'Last-Modified' not in r_headers.headers:
            get_log().debug(
                'Error status code (4xx/5xx) or no Last-Modified header')
            return None
        return datetime.datetime(
            *eutils.parsedate(r_headers.headers['Last-Modified'])[:6])
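
All of these examples call a module-level get_log() helper that none of the snippets define. As a point of reference, here is a minimal sketch of such a helper on top of the standard logging module; the name, default level, and format are assumptions, not taken from the source project (the format string mirrors the one used in Example #8):

import logging

def get_log(name='crawler'):
    # Hypothetical stand-in for the project's get_log() helper:
    # return a named logger, attaching a handler only once.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger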
Example #2
 def get_crawl_delay(self, useragent):
     try:
         return self._rp.crawl_delay(useragent)
     except AttributeError as e:
         get_log().error("url: {} AttributeError: {}".format(
             self.hostname, e))
         get_log().debug(self._rp.path)  # robots.txt path, for diagnosis
         return None
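
The _rp attribute appears to wrap urllib.robotparser.RobotFileParser, whose crawl_delay() method only exists on Python 3.6+, which may explain the AttributeError guard above. A self-contained sketch of the underlying standard-library call; the URL and user agent are placeholders:

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://example.com/robots.txt')  # placeholder URL
rp.read()
# crawl_delay() returns the Crawl-delay value for the given agent,
# or None if robots.txt does not specify one.
delay = rp.crawl_delay('mycrawler')  # placeholder user agent
print(delay)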
Example #3
    def query_handler(self, websocket, path):
        city_query = yield from websocket.recv()
        city, query = city_query.split('\n')
        print("query:", query)

        try:
            answer = process(query, city, self.lock, self.path_to_checkpoints,
                             self.descr_file)
        except Exception as err:
            get_log().error(err)
            print(err)
            answer = '0\n' + 'Internal error!'
        yield from websocket.send(answer)
        print("send:", answer)
Example #4
 def read_robots_txt(self):
     default_timeout = socket.getdefaulttimeout()
     socket.setdefaulttimeout(self._RESPONSE_ROBOT_TIMEOUT)
     try:
         self._rp.read()
         status = True
     except (URLError, ValueError) as e:
         status = False
         get_log().error(e)
     except socket.timeout as e:
         status = False
         get_log().error(e)
     finally:
         socket.setdefaulttimeout(default_timeout)
     return status
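
The essential pattern here is saving and restoring the process-wide default timeout in a finally block, since socket.setdefaulttimeout() affects every new socket in the interpreter. The same idea can be packaged as a reusable context manager; a sketch, with a made-up helper name:

import socket
from contextlib import contextmanager

@contextmanager
def default_socket_timeout(seconds):
    # Hypothetical helper: temporarily override the process-wide
    # default socket timeout, always restoring the previous value.
    previous = socket.getdefaulttimeout()
    socket.setdefaulttimeout(seconds)
    try:
        yield
    finally:
        socket.setdefaulttimeout(previous)

# usage:
# with default_socket_timeout(10):
#     self._rp.read()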
Example #5
    def __init__(self, driver, **kwargs):
        """启动浏览器参数化,默认启动 Chrom."""
        self.logger = kwargs.get("logger", None)
        if not self.logger:
            self.logger = config.get_log("huilian")

        if driver:
            self.driver = driver
Example #6
 def run(self):
     get_log().debug('begin execute run')
     while not self.frontier.done():
         self.steps_count += 1
         website = self.frontier.next_site()
         current_url = website.next_updated_url()
         get_log().debug(self.steps_count)
         if not current_url:
             continue
         get_log().debug(current_url)
         website.read_robots_txt()
         website_delay = website.get_crawl_delay(get_crawler_name())
         page = Page(current_url, website_delay)
         if not page.retrieve():
             self.frontier.remove_url(current_url)
             continue
         if website.permit_crawl(current_url):
             if page.allow_cache():
                 text = page.get_text()
                 if not any(word in text.lower()
                            for word in self.entertainment_words):
                     if any(word in text.lower()
                            for word in ['map', 'karta', 'kart']):
                         urls = page.extract_urls(current_url)
                         for url in urls:
                             self.frontier.add_url(url)
                     self.frontier.remove_url(current_url)
                     continue
                 self.store_document(current_url, text)
                 self.enities_wrapper.index(
                     self.url_id(current_url), current_url,
                     url_retriever_factory(current_url, text,
                                           page.main_soup).form_db_entry())
                 self.fileattribute_wrapper.index(self.url_id(current_url),
                                                  current_url, page.soup)
             urls = page.extract_urls(current_url)
             for url in urls:
                 self.frontier.add_url(url)
         if self.steps_count % 20 == 0:  # 000 == 0:
             self.create_index()
         if self.steps_count % 10 == 0:  # 00 == 0:
             self.create_checkpoint()
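
The run() loop depends on a frontier object whose class is not shown: done(), next_site(), add_url(), and remove_url() are called above, and the site object supplies next_updated_url(). A deliberately simplified in-memory sketch of that interface; the real implementation is not part of these examples:

from collections import deque

class Frontier:
    # Simplified sketch of the interface run() relies on; the real
    # class presumably also tracks which Site owns each URL.
    def __init__(self):
        self.sites = deque()
        self.known_urls = set()

    def done(self):
        return not self.sites

    def next_site(self):
        self.sites.rotate(-1)  # round-robin over registered sites
        return self.sites[0]

    def add_url(self, url):
        # Deduplicate; assigning the URL to its Site is omitted here.
        self.known_urls.add(url)

    def remove_url(self, url):
        self.known_urls.discard(url)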
Example #7
 def __init__(self, *args, **kwargs):
     self.logger = kwargs.get("logger", None)
     self.user = kwargs.get("user", "")
     if not self.logger:
         self.logger = config.get_log("filetools")
Example #8
# -*- coding: utf-8 -*-
import telebot
import config
import credent
import RM
import logging
import general_except
import sqlite3

ADDRESS = config.get_address()
# logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler(config.get_log())
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
# database
db_conn = sqlite3.connect("users.db", check_same_thread=False)
db_curs = db_conn.cursor()

# base init

pattern_cred = credent.Credent()
pattern_project = credent.Project()
pattern_iss = credent.Issue()

bot = telebot.TeleBot(str(config.get_telegram_key()))
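
Note that check_same_thread=False only disables sqlite3's same-thread check; it does not make the connection itself thread-safe. If the bot's handlers run on multiple threads, access to the shared connection should be serialized, for example with a lock (a sketch, not part of the original code):

import sqlite3
import threading

db_lock = threading.Lock()
db_conn = sqlite3.connect('users.db', check_same_thread=False)

def execute(sql, params=()):
    # Serialize all access to the shared connection.
    with db_lock:
        cursor = db_conn.cursor()
        cursor.execute(sql, params)
        db_conn.commit()
        return cursor.fetchall()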
Example #9
    def __init__(self, *args, **kwargs):

        # kwargs: optional injected logger
        self.logger = kwargs.get("logger", None)
        if not self.logger:
            self.logger = config.get_log("Tools")
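
Examples #5, #7, and #9 all share the same dependency-injection pattern: accept an optional logger through kwargs and fall back to config.get_log(name). Assuming config.get_log() returns a standard logging.Logger, the fallback can be written more compactly:

import logging

class Tools:
    def __init__(self, *args, **kwargs):
        # Use the injected logger if provided, otherwise build a default;
        # logging.getLogger stands in for config.get_log("Tools") here.
        self.logger = kwargs.get('logger') or logging.getLogger('Tools')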