def __init__(self, _id, env="test"): ''' websources - websources objects dictionary (key - ws name, value - config) skill - search criteria (skill) ''' self._id = _id self.env = env self.logger = self.init_logger() self.db = Data_base('crawler')
def db_setup(self):
    ''' Connect to the MongoDB 'crawler' collection. '''
    db = Data_base("crawler").connect_db()
    collection = db["crawler"]
    return collection
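# Usage sketch (not part of the original module): db_setup() above returns the
# 'crawler' collection through the project-local Data_base wrapper. Assuming that
# wrapper sits on top of pymongo.MongoClient, an equivalent direct connection
# looks like the function below; the URI and the database/collection names are
# illustrative assumptions.
from pymongo import MongoClient

def db_setup_direct(mongo_uri="mongodb://localhost:27017"):
    # connect to the MongoDB server and select the 'crawler' database
    client = MongoClient(mongo_uri)
    db = client["crawler"]
    # return the 'crawler' collection, mirroring what db_setup() returns
    return db["crawler"]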
def __init__(self):
    # connect to the 'crawler' database
    self.data_base = Data_base('crawler').connect_db()
    # load the logging configuration that sits next to this module
    self.log_file_path = path.join(path.dirname(path.abspath(__file__)), 'logging.conf')
    logging.config.fileConfig(self.log_file_path)
    self.logger = logging.getLogger('parserApp')
    # counter of processed vacancies and the list they are collected into
    self.num = 0
    self.array_vacancies = []
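# Sketch of the logging.conf that fileConfig() above expects (assumption: the
# project ships its own file; this only illustrates the sections needed for the
# 'parserApp' logger and the handler/formatter names are hypothetical).
#
#     [loggers]
#     keys=root,parserApp
#
#     [handlers]
#     keys=consoleHandler
#
#     [formatters]
#     keys=simpleFormatter
#
#     [logger_root]
#     level=INFO
#     handlers=consoleHandler
#
#     [logger_parserApp]
#     level=INFO
#     handlers=consoleHandler
#     qualname=parserApp
#     propagate=0
#
#     [handler_consoleHandler]
#     class=StreamHandler
#     level=INFO
#     formatter=simpleFormatter
#     args=(sys.stdout,)
#
#     [formatter_simpleFormatter]
#     format=%(asctime)s - %(name)s - %(levelname)s - %(message)s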
def __init__(self):
    # accumulators for the skill graph nodes and their connections
    self.arr_graph_skill = []
    self.arr_graph_connects = []
    # connect to the 'crawler' database
    self.data_base = Data_base('crawler').connect_db()
    # load the logging configuration that sits next to this module
    self.log_file_path = path.join(path.dirname(path.abspath(__file__)), 'logging.conf')
    logging.config.fileConfig(self.log_file_path)
    self.logger = logging.getLogger('pythonApp')
    # collections with parsed vacancies and the skill graph
    self.data_vacancy = self.data_base['parsed_vacancy']
    self.data_graph_skill = self.data_base['graph_skill']
    # number of vacancies still waiting to be processed and the current position
    self.count_vacancy = self.data_vacancy.find({'status': 'NEW'}).count()
    self.current_num = 0
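# Note (assumption about the driver version): Cursor.count() was deprecated in
# pymongo 3.7 and removed in pymongo 4.x. Against a newer driver the same number
# can be obtained from the collection directly, e.g.:
#
#     self.count_vacancy = self.data_vacancy.count_documents({'status': 'NEW'})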
class Crawler(object):
    '''
    Crawler class, has methods that scrape web sources.
    Give an _id from the crawler collection as an argument.
    '''
    status = INACTIVE      # current status of the crawler instance ("INACTIVE", "IN_PROCESS", "FAILED", "PROCESSED")
    skill = None           # search skill criteria
    websources = {}        # key - name of websource, value - dict with configuration
    page_links_dict = {}   # key - name of websource, value - list of page (pagination) links with vacancies
    vac_links_dict = {}    # key - name of websource, value - list of vacancy links
    f_vac_links_dict = {}  # filtered vac_links_dict
    vacancies_dict = {}    # dict of vacancy dicts with link, title, raw

    def __init__(self, _id, env="test"):
        '''
        websources - websources objects dictionary (key - ws name, value - config)
        skill - search criteria (skill)
        '''
        self._id = _id
        self.env = env
        self.logger = self.init_logger()
        self.db = Data_base('crawler')

    def init_logger(self):
        ''' Initialize log file. '''
        logger = logging.getLogger('crawler_app %s' % self._id)
        logger.setLevel(logging.INFO)
        # create a file handler
        handler = logging.FileHandler('crawler %s %s.log' % (self._id, dt.now()))
        handler.setLevel(logging.INFO)
        # create a logging format
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        # add the handler to the logger
        logger.addHandler(handler)
        return logger

    def get_crawler_dict_from_db(self):
        ''' Get the crawler instance dictionary from the database (contains the search skill). '''
        cursor = self.db.connect_db().crawler.find({'_id': ObjectId(self._id)})
        for crawler in cursor:
            crawler_dict = crawler
        return crawler_dict

    def read_skill_from_db(self):
        ''' Get the search skill from the db. '''
        fname = inspect.stack()[0][3]
        self.logger.info('%s - %s', SETUP, fname)
        skill = self.get_crawler_dict_from_db()['search_condition']
        self.skill = skill

    def read_websourses_from_db(self):
        ''' Get the websources configuration from the db. '''
        fname = inspect.stack()[0][3]
        self.logger.info('%s - %s', SETUP, fname)
        cursor = self.db.connect_db().websource.find()
        for ws in cursor:
            self.websources.update({ws.pop('name'): ws})

    def collect_pages_links(self):
        ''' Collects page links (pagination pages). '''
        fname = inspect.stack()[0][3]
        self.logger.info('%s - %s', IN_PROCESS, fname)
        for ws_name, ws in self.websources.items():
            search_pattern = ws['base_url'] + ws['search_pattern']
            pag_start = ws['pagination_start']
            headers = ws['headers']
            pars_skill = urllib.parse.quote_plus(self.skill)
            url = search_pattern.format(skill=pars_skill, page=pag_start)
            tree = html.fromstring(get_bin(url, headers, self.logger))
            jobs_qty_elem_list = tree.xpath(
                self.websources[ws_name]['jobs_qty_xpath'])
            if jobs_qty_elem_list:
                jobs_qty_elem = jobs_qty_elem_list[0]
                jobs_qty_text = jobs_qty_elem.text_content()
                jobs_qty = extract_1_num(jobs_qty_text)
                if jobs_qty:
                    pages_qty = math.ceil(jobs_qty / ws['pagination'])
                    pages_range = range(pag_start, pages_qty + pag_start)
                    ws_page_list = [
                        search_pattern.format(skill=pars_skill, page=x)
                        for x in pages_range
                    ]
                    self.page_links_dict[ws_name] = ws_page_list
                    self.logger.info("%s - %s - page links collected for %s",
                                     IN_PROCESS, fname, ws_name)

    def get_link(self, item):
        ''' Takes a (ws name, page url) tuple and returns the vacancy links found on that page. '''
        ws_name, page_link = item
        ws = self.websources[ws_name]
        link_xpath = ws['link_xpath']
        link_is_abs = ws['absolute_links']
        base_url = ws['base_url']
        headers = ws['headers']
        page_bin_html = get_bin(page_link, headers, self.logger)
        vac_links = get_vac_links(base_url, page_bin_html,
                                  link_xpath, link_is_abs)
        return vac_links

    def prepare_page_links(self, item):
        '''
        Prepares the page links and creates a pool in which get_link
        collects the vacancy links.
        '''
        fname = inspect.stack()[0][3]
        self.logger.info('%s - %s', IN_PROCESS, fname)
        ws_name, page_links = item
        wsname_and_link_seq = list(map(lambda x: (ws_name, x), page_links))
        pool = Pool(10)
        ws_vac_links = []
        ws_vac_links_2d = pool.map(self.get_link, wsname_and_link_seq)
        for lis in ws_vac_links_2d:
            ws_vac_links.extend(lis)
        pool.close()
        pool.join()
        self.vac_links_dict[ws_name] = ws_vac_links
        self.logger.info("%s - %s - vacancies links collected for %s",
                         IN_PROCESS, fname, ws_name)

    def filter_vac_links(self):
        ''' Cleans up the lists in vac_links_dict (removes duplicate values). '''
        fname = inspect.stack()[0][3]
        self.logger.info('%s - %s', IN_PROCESS, fname)
        filtered_links = dict()
        for ws_name, links in self.vac_links_dict.items():
            link_set = set(links)
            link_list = list(link_set)
            filtered_links[ws_name] = link_list
            self.logger.info("%s - %s - vacancies links filtered for %s",
                             IN_PROCESS, fname, ws_name)
        self.f_vac_links_dict = filtered_links

    def collect_vacancy(self, vac_tuple):
        """
        Collects a vacancy's raw text from a websource. Takes a vac tuple with
        the websource name and link, and writes the result to vacancies_dict.
        """
        ws_name, vac_link = vac_tuple
        fname = inspect.stack()[0][3]
        ws = self.websources[ws_name]
        headers = ws['headers']
        bin_html = get_bin(vac_link, headers, self.logger)
        if bin_html is not None:
            title, raw = get_title_n_raw_from_bin(ws, bin_html)
            self.vacancies_dict[ws_name].append({
                'crawler_id': ObjectId(self._id),
                'link': vac_link,
                'title': title.lower(),
                'raw': raw.lower(),
                'status': NEW,
                'created_date': dt.now(),
                'modified_date': dt.now(),
            })
            self.logger.info(
                "%s - %s - vacancy '%s' (title, raw) collected from %s",
                IN_PROCESS, fname, title, ws_name)

    def process_pool_for_collecting_links(self, item):
        """ Processes a pool for collecting vacancies from the collected links. """
        fname = inspect.stack()[0][3]
        self.logger.info('%s - %s', IN_PROCESS, fname)
        ws_name, vac_links = item
        self.vacancies_dict[ws_name] = []
        vac_links = list(map(lambda x: (ws_name, x), vac_links))
        pool = Pool(10)
        pool.map(self.collect_vacancy, vac_links)
        pool.close()
        pool.join()

    def process_ws_pool(self, func, sequence):
        """
        Takes a function and a sequence, then creates a pool with a
        separate worker for each websource.
        """
        fname = inspect.stack()[0][3]
        self.logger.info('%s - %s', IN_PROCESS, fname)
        ws_qty = len(self.websources)
        pool = Pool(ws_qty)
        pool.map(func, sequence)
        pool.close()
        pool.join()

    def write_vacancies_in_db(self):
        ''' Write the vacancies to the db, packed by websource. '''
        fname = inspect.stack()[0][3]
        self.logger.info('%s - %s', IN_PROCESS, fname)
        for ws_name, vacancies in self.vacancies_dict.items():
            if vacancies:
                self.db.connect_db().vacancy.insert_many(vacancies)
                self.vacancies_dict[ws_name].clear()
                self.logger.info(
                    "%s - %s - vacancies have been written to the database for %s",
                    IN_PROCESS, fname, ws_name)

    def setup(self):
        ''' Setup method; call it every time after you have initialized a new Crawler instance. '''
        try:
            self.logger.info('%s START', SETUP)
            self.status = SETUP
            self.read_skill_from_db()
            self.read_websourses_from_db()
            self.status = INACTIVE
            self.logger.info('%s FINISH', SETUP)
        except Exception:
            self.status = FAILED
            self.logger.exception("crawler setup error")
            raise SystemError(
                'crawler setup error, look at the crawler log for information')

    def run(self):
        ''' After setup you can run this method to collect vacancies and write them to the db. '''
        try:
            self.logger.info("%s START", IN_PROCESS)
            self.status = IN_PROCESS
            self.collect_pages_links()
            self.process_ws_pool(self.prepare_page_links,
                                 self.page_links_dict.items())
            self.filter_vac_links()
            self.process_ws_pool(self.process_pool_for_collecting_links,
                                 self.f_vac_links_dict.items())
            self.write_vacancies_in_db()
            self.status = PROCESSED
            self.logger.info("%s FINISH", IN_PROCESS)
        except Exception:
            self.status = FAILED
            self.logger.exception("crawler runtime error")
            raise SystemError(
                'crawler runtime error, look at the crawler log for information')
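# Usage sketch (hypothetical driver, not part of the original class): a Crawler is
# created with an _id taken from the 'crawler' collection, setup() loads the search
# skill and the websource configs, and run() collects and stores vacancies. The _id
# string below is an illustrative placeholder, not a real document id.
if __name__ == '__main__':
    crawler = Crawler('5b2e8f0c9d3a4c001f6b7e21')  # placeholder ObjectId string
    crawler.setup()    # reads the skill and websources from MongoDB
    try:
        crawler.run()  # collects page links, vacancy links, vacancies, then writes them to the db
    finally:
        print('crawler finished with status: %s' % crawler.status)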
class Crawler(object):
    '''
    Crawler class, has methods that scrape web sources.
    Give an _id from the crawler collection as an argument.
    '''
    status = INACTIVE     # current status of the crawler instance ("INACTIVE", "IN_PROCESS", "FAILED", "PROCESSED")
    skill = None          # search skill criteria
    websources = {}       # key - name of websource, value - dict with configuration
    page_links_dict = {}  # key - name of websource, value - list of page (pagination) links with vacancies
    vac_links_dict = {}   # key - name of websource, value - list of vacancy links
    vacancies_dict = {}   # dict of vacancy dicts with link, title, raw

    def __init__(self, _id, env='test'):
        '''
        websources - websources objects dictionary (key - ws name, value - config)
        skill - search criteria (skill)
        '''
        self._id = _id
        self.env = env
        # self.logger = self.init_logger()
        self.db = Data_base('crawler')

    """
    def init_logger(self):
        ''' Initialize log file. '''
        logger = logging.getLogger('crawler_app {}'.format(self._id))
        logger.setLevel(logging.INFO)
        # create a file handler
        handler = logging.FileHandler('crawler {} {}.log'.format(self._id, dt.now()))
        handler.setLevel(logging.INFO)
        # create a logging format
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        # add the handler to the logger
        logger.addHandler(handler)
        return logger
    """

    def get_crawler_dict_from_db(self):
        ''' Get the crawler instance dictionary from the database (contains the search skill). '''
        cursor = self.db.connect_db().crawler.find({'_id': ObjectId(self._id)})
        for crawler in cursor:
            crawler_dict = crawler
        return crawler_dict

    def read_skill_from_db(self):
        ''' Get the search skill from the db. '''
        fname = inspect.stack()[0][3]
        # self.logger.info('{} - {}'.format(SETUP, fname))
        skill = self.get_crawler_dict_from_db()['search_condition']
        self.skill = skill

    def read_websourses_from_db(self):
        ''' Get the websources configuration from the db. '''
        fname = inspect.stack()[0][3]
        # self.logger.info('{} - {}'.format(SETUP, fname))
        cursor = self.db.connect_db().websource.find()
        for ws in cursor:
            self.websources.update({ws.pop('name'): ws})

    def collect_pages_links(self):
        ''' Collects page links (pagination pages). '''
        fname = inspect.stack()[0][3]
        # self.logger.info('{} - {}'.format(IN_PROCESS, fname))
        for ws_name, ws in self.websources.items():
            search_pattern = ws['base_url'] + ws['search_pattern']
            pag_start = ws['pagination_start']
            headers = ws['headers']
            pars_skill = urllib.parse.quote_plus(self.skill)
            url = search_pattern.format(skill=pars_skill, page=pag_start)
            tree = html.fromstring(get_bin(url, headers))
            jobs_qty_elem = tree.xpath(
                self.websources[ws_name]['jobs_qty_xpath'])[0]
            jobs_qty_text = jobs_qty_elem.text_content()
            jobs_qty = extract_1_num(jobs_qty_text)
            pages_qty = math.ceil(jobs_qty / ws['pagination'])
            pages_range = range(pag_start, pages_qty + pag_start)
            ws_page_list = [
                search_pattern.format(skill=pars_skill, page=x)
                for x in pages_range
            ]
            self.page_links_dict[ws_name] = ws_page_list
            message = "{} - {} - page links collected for {}".format(
                IN_PROCESS, fname, ws_name)
            # self.logger.info(message)

    def collect_vac_links(self):
        ''' Collects vacancy links from the page links. '''
        fname = inspect.stack()[0][3]
        # self.logger.info('{} - {}'.format(IN_PROCESS, fname))
        for ws_name, page_links in self.page_links_dict.items():
            ws = self.websources[ws_name]
            link_xpath = ws['link_xpath']
            link_is_abs = ws['absolute_links']
            base_url = ws['base_url']
            headers = ws['headers']
            ws_vac_links = []
            for page_link in page_links:
                page_bin_html = get_bin(page_link, headers)
                vac_links = get_vac_links(base_url, page_bin_html,
                                          link_xpath, link_is_abs)
                ws_vac_links.extend(vac_links)
            self.vac_links_dict[ws_name] = ws_vac_links
            message = "{} - {} - vacancies links collected for {}".format(
                IN_PROCESS, fname, ws_name)
            # self.logger.info(message)

    def filter_vac_links(self):
        ''' Cleans up the lists in vac_links_dict (removes duplicate values). '''
        fname = inspect.stack()[0][3]
        # self.logger.info('{} - {}'.format(IN_PROCESS, fname))
        filtered_links = dict()
        for ws_name, links in self.vac_links_dict.items():
            link_set = set(links)
            link_list = list(link_set)
            filtered_links[ws_name] = link_list
            message = "{} - {} - vacancies links filtered for {}".format(
                IN_PROCESS, fname, ws_name)
            # self.logger.info(message)
        self.vac_links_dict = filtered_links

    def sub_collect(self, vac_tuple):
        ''' Collects one vacancy (title, raw) and appends it to vacancies_dict. '''
        ws_name, fname, vac_link = vac_tuple
        ws = self.websources[ws_name]
        headers = ws['headers']
        bin_html = get_bin(vac_link, headers)
        title, raw = get_title_n_raw_from_bin(ws, bin_html)
        self.vacancies_dict[ws_name].append({
            'crawler_id': self._id,
            'link': vac_link,
            'title': title.lower(),
            'raw': raw.lower(),
            'status': NEW,
            'created_date': dt.now(),
            'modified_date': dt.now(),
        })
        message = "{} - {} - vacancy '{}' (title, raw) collected from {}".format(
            IN_PROCESS, fname, title, ws_name)
        # self.logger.info(message)

    def collect_vacs_sub(self):
        ''' Collects vacancies (title, raw) into the vacancies list. '''
        fname = inspect.stack()[0][3]
        # self.logger.info('{} - {}'.format(IN_PROCESS, fname))
        for ws_name, vac_links in self.vac_links_dict.items():
            self.vacancies_dict[ws_name] = []
            vac_links = list(map(lambda x: (ws_name, fname, x), vac_links))
            pool = Pool(10)
            pool.map(self.sub_collect, vac_links)
            pool.close()
            pool.join()

    def write_vacancies_in_db(self):
        ''' Write the vacancies to the db, packed by websource. '''
        fname = inspect.stack()[0][3]
        # self.logger.info('{} - {}'.format(IN_PROCESS, fname))
        for ws_name, vacancies in self.vacancies_dict.items():
            if vacancies:
                self.db.connect_db().vacancy.insert_many(vacancies)
                self.vacancies_dict[ws_name].clear()
                message = "{} - {} - vacancies have been written to the database for {}".format(
                    IN_PROCESS, fname, ws_name)
                # self.logger.info(message)

    def setup(self):
        ''' Setup method; call it every time after you have initialized a new Crawler instance. '''
        try:
            # self.logger.info('{} START'.format(SETUP))
            self.status = SETUP
            self.read_skill_from_db()
            self.read_websourses_from_db()
            self.status = INACTIVE
            # self.logger.info('{} FINISH'.format(SETUP))
        except Exception:
            self.status = FAILED
            # self.logger.exception("crawler setup error")
            raise SystemError(
                'crawler setup error, look at the crawler log for information')

    def run(self):
        ''' After setup you can run this method to collect vacancies and write them to the db. '''
        try:
            # self.logger.info("{} START".format(IN_PROCESS))
            self.status = IN_PROCESS
            self.collect_pages_links()
            self.collect_vac_links()
            self.filter_vac_links()
            self.collect_vacs_sub()
            self.write_vacancies_in_db()
            self.status = PROCESSED
            # self.logger.info("{} FINISH".format(IN_PROCESS))
        except Exception:
            self.status = FAILED
            # self.logger.exception("crawler runtime error")
            raise SystemError(
                'crawler runtime error, look at the crawler log for information')
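# Worked example (illustrative values, not from the original code): collect_pages_links
# derives the number of pagination pages from the advertised job count and the per-page
# size, then formats one search URL per page. The search_pattern below is an assumption
# shaped like the '{skill}' / '{page}' placeholders used above.
import math
import urllib.parse

search_pattern = 'https://example-jobs.test/search?q={skill}&page={page}'  # hypothetical websource
jobs_qty = 95          # jobs reported by the site (illustrative)
pagination = 20        # vacancies per page (illustrative)
pagination_start = 1

pages_qty = math.ceil(jobs_qty / pagination)             # -> 5 pages
pars_skill = urllib.parse.quote_plus('python developer')  # -> 'python+developer'
page_links = [search_pattern.format(skill=pars_skill, page=x)
              for x in range(pagination_start, pages_qty + pagination_start)]
# page_links now holds 5 URLs, for page=1 .. page=5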