def get_last_modified(url):
    """Return the Last-Modified header of *url* as a datetime, or None on failure."""
    try:
        session = requests.Session()
        session.trust_env = False  # Don't read proxy settings from OS
        r_headers = session.head(
            url,
            timeout=Site._RESPONSE_HEADER_TIMEOUT,
            headers={'User-Agent': crawler.Crawler.USERAGENT})
    except (ConnectionError, MissingSchema, Timeout) as err:
        get_log().error(err)
        return None
    except Exception as err:
        get_log().error(err)
        return None

    if 400 <= r_headers.status_code < 600 or 'Last-Modified' not in r_headers.headers:
        get_log().debug(
            'Status code between 400 and 600 or there is no Last-Modified header')
        return None

    return datetime.datetime(
        *eutils.parsedate(r_headers.headers['Last-Modified'])[:6])
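# A minimal usage sketch (not from the source): a hypothetical helper that uses
# get_last_modified() to decide whether a page should be re-fetched. `stored_time`
# is assumed to be the datetime recorded at the previous crawl (may be None).
def needs_refetch(url, stored_time):
    last_mod = get_last_modified(url)
    if last_mod is None or stored_time is None:
        return True  # no usable header or no crawl history: fetch to be safe
    return last_mod > stored_time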
def get_crawl_delay(self, useragent):
    try:
        return self._rp.crawl_delay(useragent)
    except AttributeError as e:
        get_log().error("url: {} AttributeError: {}".format(self.hostname, e))
        print(self._rp.path)
        return None
def query_handler(self, websocket, path):
    """Receive one 'city' + newline + 'query' message and send back the answer."""
    city_query = yield from websocket.recv()
    city, query = city_query.split('\n')
    print("query:", query)
    try:
        answer = process(query, city, self.lock,
                         self.path_to_checkpoints, self.descr_file)
    except Exception as err:
        get_log().error(err)
        print(err)
        answer = '0\n' + 'Internal error!'
    yield from websocket.send(answer)
    print("send:", answer)
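# A minimal serving sketch (assumption, not from the source): with the legacy
# generator-based websockets/asyncio API that query_handler targets, a handler
# instance would be registered roughly like this; host and port are placeholders.
def run_query_server(handler_instance, host='localhost', port=8765):
    import asyncio
    import websockets
    start_server = websockets.serve(handler_instance.query_handler, host, port)
    asyncio.get_event_loop().run_until_complete(start_server)
    asyncio.get_event_loop().run_forever()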
def read_robots_txt(self):
    """Read robots.txt under a bounded socket timeout; return True on success."""
    default_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(self._RESPONSE_ROBOT_TIMEOUT)
    try:
        self._rp.read()
        status = True
    except (URLError, ValueError) as e:
        status = False
        get_log().error(e)
    except socket.timeout as e:
        status = False
        get_log().error(e)
    finally:
        socket.setdefaulttimeout(default_timeout)
    return status
def __init__(self, driver, **kwargs):
    """Parameterize browser startup; Chrome is launched by default."""
    self.logger = kwargs.get("logger", None)
    if not self.logger:
        self.logger = config.get_log("huilian")
    if driver:
        self.driver = driver
def run(self):
    get_log().debug('begin execute run')
    while not self.frontier.done():
        self.steps_count += 1
        website = self.frontier.next_site()
        current_url = website.next_updated_url()
        get_log().debug(self.steps_count)
        if not current_url:
            continue
        get_log().debug(current_url)
        website.read_robots_txt()
        website_delay = website.get_crawl_delay(get_crawler_name())
        page = Page(current_url, website_delay)
        if not page.retrieve():
            self.frontier.remove_url(current_url)
            continue
        if website.permit_crawl(current_url):
            if page.allow_cache():
                text = page.get_text()
                if not any(word in text.lower() for word in self.entertainment_words):
                    if any(word in text.lower() for word in ['map', 'karta', 'kart']):
                        # Map-related page: enqueue its links, drop the page itself.
                        urls = page.extract_urls(current_url)
                        for url in urls:
                            self.frontier.add_url(url)
                        self.frontier.remove_url(current_url)
                        continue
                    self.store_document(current_url, text)
                    self.enities_wrapper.index(
                        self.url_id(current_url), current_url,
                        url_retriever_factory(current_url, text,
                                              page.main_soup).form_db_entry())
                    self.fileattribute_wrapper.index(
                        self.url_id(current_url), current_url, page.soup)
        urls = page.extract_urls(current_url)
        for url in urls:
            self.frontier.add_url(url)
        if self.steps_count % 20 == 0:  # 000 == 0:
            self.create_index()
        if self.steps_count % 10 == 0:  # 00 == 0:
            self.create_checkpoint()
def __init__(self, *args, **kwargs):
    self.logger = kwargs.get("logger", None)
    self.user = kwargs.get("user", "")
    if not self.logger:
        self.logger = config.get_log("filetools")
# -*- coding: utf-8 -*-
import telebot
import config
import credent
import RM
import logging
import general_except
import sqlite3

ADDRESS = config.get_address()

# logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler(config.get_log())
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# database
db_conn = sqlite3.connect("users.db", check_same_thread=False)
db_curs = db_conn.cursor()

# base init pattern
pattern_cred = credent.Credent()
pattern_project = credent.Project()
pattern_iss = credent.Issue()

bot = telebot.TeleBot(str(config.get_telegram_key()))
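# A minimal handler sketch (assumption, not from the source): with
# pyTelegramBotAPI, command handlers are registered on the `bot` object
# created above; the command name and reply text here are placeholders.
@bot.message_handler(commands=['start'])
def handle_start(message):
    # Reply in the originating chat; real handlers would use db_curs / credent state.
    bot.reply_to(message, 'Bot is up.')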
def __init__(self, *args, **kwargs):
    # kwargs
    self.logger = kwargs.get("logger", None)
    if not self.logger:
        self.logger = config.get_log("Tools")