def process_group(hg, crawl_id, requesters, processed_groups, dbpool):
    """Gevent worker that processes a single hitgroup.

    Writes data into the database; does not return anything important.
    """
    hg['keywords'] = ', '.join(hg['keywords'])
    hg['qualifications'] = ', '.join(hg['qualifications'])

    conn = dbpool.getconn(thread.get_ident())
    db = DB(conn)
    try:
        hit_group_content_id = db.hit_group_content_id(hg['group_id'])
        if hit_group_content_id is None:
            # if a profile exists for the current requester and it has
            # non-public status, mark the current hitgroup content as
            # non-public as well
            profile = requesters.get(hg['requester_id'], None)
            if profile and profile.is_public is False:
                hg['is_public'] = False
            else:
                hg['is_public'] = True

            # fresh hitgroup - create a group content entry, but first add
            # the data required by the hitgroup content table
            hg['occurrence_date'] = datetime.datetime.now()
            hg['first_crawl_id'] = crawl_id
            if not hg['group_id_hashed']:
                hg.update(hits_group_info(hg['group_id']))
            else:
                # group_id is hashed, so we cannot fetch details - we do
                # not know the real group_id
                hg['html'] = ''
            hit_group_content_id = db.insert_hit_group_content(hg)
            log.debug('new hit group content: %s;;%s',
                      hit_group_content_id, hg['group_id'])

        hg['hit_group_content_id'] = hit_group_content_id
        hg['crawl_id'] = crawl_id
        hg['now'] = datetime.datetime.now()
        db.insert_hit_group_status(hg)
        conn.commit()
    except Exception:
        processed_groups.remove(hg['group_id'])
        log.exception('process_group fail - rollback')
        conn.rollback()
    finally:
        db.curr.close()
        dbpool.putconn(conn, thread.get_ident())

    msg = ('This really should not happen: the hitgroup status was processed'
           ' but the group is not in processed_groups - race condition?')
    assert hg['group_id'] in processed_groups, msg
    return True
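# Hedged sketch of the `dbpool` contract used above. psycopg2's
# ThreadedConnectionPool accepts an explicit key on getconn/putconn, and
# keying by thread.get_ident() pins one connection per thread/greenlet.
# The pool sizes and DSN below are placeholders, not taken from the source.
import thread  # Python 2; on Python 3 use threading.get_ident instead

from psycopg2.pool import ThreadedConnectionPool

dbpool = ThreadedConnectionPool(1, 20, dsn='dbname=crawler')  # placeholder DSN

conn = dbpool.getconn(thread.get_ident())
try:
    pass  # run queries on conn here
finally:
    dbpool.putconn(conn, thread.get_ident())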
def close(self):
    tid = get_ident()
    sock = self.in_use.pop(tid, None)
    if sock:
        sock.close()
        return True
    return False
def checkin(self):
    tid = get_ident()
    if tid in self.in_use:
        sock = self.in_use.pop(tid)
        if not sock.is_closed:
            heapq.heappush(self.free, (time.time(), sock))
        return True
    return False
def checkout(self):
    now = time.time()
    tid = get_ident()
    if tid in self.in_use:
        sock = self.in_use[tid]
        if sock.is_closed:
            # this thread's socket died - forget it and fall through
            del self.in_use[tid]
        else:
            return self.in_use[tid]
    while self.free:
        ts, sock = heapq.heappop(self.free)
        if ts < now - self.max_age:
            # socket sat idle too long - discard it
            sock.close()
        else:
            self.in_use[tid] = sock
            return sock
    # no reusable socket available - create a fresh one
    sock = self.create_socket()
    self.in_use[tid] = sock
    return sock
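# Hedged usage sketch for the pool methods above (checkout / checkin / close).
# Assumes they live on a pool class with `free` (heap of (timestamp, sock)
# tuples), `in_use` (dict keyed by thread/greenlet ident), `max_age`, and a
# `create_socket()` factory, and that the pooled socket exposes sendall/recv
# like a standard socket - names inferred from the methods, not confirmed.
def with_pooled_socket(pool, payload):
    sock = pool.checkout()  # reuse this thread's socket, or take/create one
    try:
        sock.sendall(payload)
        return sock.recv(4096)
    finally:
        pool.checkin()  # push the socket back onto the free heap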
def process_group(hg, crawl_id, requesters, processed_groups):
    """Gevent worker that processes a single hitgroup.

    Writes data into the database; does not return anything important.
    """
    hg['keywords'] = ', '.join(hg['keywords'])
    hg['qualifications'] = ', '.join(hg['qualifications'])

    # for hit groups that do not carry a group_id, create one by hashing
    # the group content and set the appropriate flag
    hg['group_id_hashed'] = not bool(hg.get('group_id', None))
    if hg['group_id_hashed']:
        composition = ';'.join(map(str, (
            hg['title'], hg['requester_id'], hg['time_alloted'],
            hg['reward'], hg['description'], hg['keywords'],
            hg['qualifications']))) + ';'
        hg['group_id'] = hashlib.md5(composition).hexdigest()
        log.debug('group_id not found, creating hash: %s %s',
                  hg['group_id'], composition)

    if hg['group_id'] in processed_groups:
        # this hitgroup was already processed
        log.info('duplicated group: %s;;%s', crawl_id, hg['group_id'])
        return False

    conn = dbpool.getconn(thread.get_ident())
    db = DB(conn)
    try:
        hit_group_content_id = db.hit_group_content_id(hg['group_id'])
        if hit_group_content_id is None:
            # if a profile exists for the current requester and it has
            # non-public status, mark the current hitgroup content as
            # non-public as well
            profile = requesters.get(hg['requester_id'], None)
            if profile and profile.is_public is False:
                hg['is_public'] = False
            else:
                hg['is_public'] = True

            # fresh hitgroup - create a group content entry, but first add
            # the data required by the hitgroup content table
            hg['occurrence_date'] = datetime.datetime.now()
            hg['first_crawl_id'] = crawl_id
            if not hg['group_id_hashed']:
                hg.update(hits_group_info(hg['group_id']))
            else:
                # group_id is hashed, so we cannot fetch details - we do
                # not know the real group_id
                hg['html'] = ''
            hit_group_content_id = db.insert_hit_group_content(hg)
            log.debug('new hit group content: %s;;%s',
                      hit_group_content_id, hg['group_id'])

        hg['hit_group_content_id'] = hit_group_content_id
        hg['crawl_id'] = crawl_id
        db.insert_hit_group_status(hg)
        conn.commit()
    except Exception:
        log.exception('process_group fail - rollback')
        conn.rollback()
    finally:
        db.curr.close()
        dbpool.putconn(conn, thread.get_ident())

    processed_groups.add(hg['group_id'])
    return True
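# Side note on the hashing above, as a minimal sketch: hashlib.md5 accepts
# str on Python 2 (which this code targets), but requires bytes on Python 3,
# so the same synthetic group_id would be built there as:
import hashlib

def synthetic_group_id(*fields):
    composition = ';'.join(map(str, fields)) + ';'
    return hashlib.md5(composition.encode('utf-8')).hexdigest()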
def inject_greenlet_id(record):
    """Sets the greenlet ID on `record.extra`.

    :param record: logbook record
    """
    record.extra[GREENLET_ID_KEY] = get_ident()
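# Hedged usage sketch: `inject_greenlet_id` has the shape of a logbook
# processor callback, so it would typically be attached via logbook.Processor.
# GREENLET_ID_KEY is assumed to be a module-level string constant.
import logbook

with logbook.Processor(inject_greenlet_id).applicationbound():
    # every record emitted here carries extra[GREENLET_ID_KEY]
    logbook.Logger('worker').info('greenlet id attached')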
def thread(self):
    """The ident of the thread.

    This is evaluated late and means that if the log record is passed to
    another thread, :meth:`pull_information` was called in the old thread.
    """
    return thread.get_ident()
def get(self, strategy, persist=None, wait=True, request_ident=None,
        **proxy_params):
    if not callable(strategy):
        if isinstance(strategy, str):
            strategy, *strategy_params = strategy.split(':')
            strategy = getattr(self, GET_STRATEGY[strategy].value)
            if strategy_params:
                strategy_ = strategy

                def strategy(proxies):
                    return strategy_(proxies, *strategy_params)
        elif isinstance(strategy, enum.Enum):
            strategy = getattr(self, strategy.value)

    if not len(self.active_proxies) and not self.fetcher:
        raise InsufficientProxies('No proxies and no fetcher {}'.format(
            self._stats_str))

    self.maybe_update()
    ident = get_ident()  # unique integer id for the current greenlet

    while True:
        ready_proxies = self.get_ready_proxies(**proxy_params)
        if ready_proxies:
            break
        elif not wait or ((not self.fetcher or self.fetcher.ready)
                          and not self.in_use):
            # fetcher.ready also returns False while the checker is running
            raise InsufficientProxies('No ready proxies {} {}{}'.format(
                proxy_params, request_ident and request_ident + ' ' or '',
                self._stats_str))
        else:
            # logger.info('Wait proxy (thread %s) %s', ident, self._stats_str)
            self.proxy_ready.clear()
            if ident not in self.waiting:
                # store extra data for superproxy monitoring
                self.waiting[ident] = dict(since=datetime.utcnow(),
                                           request_ident=request_ident,
                                           params=proxy_params)
                delta = 0
            elif wait is not True:
                delta = (datetime.utcnow()
                         - self.waiting[ident]['since']).total_seconds()
                if delta >= wait:
                    del self.waiting[ident]
                    raise InsufficientProxies(
                        'Ready proxies wait timeout({}) {} {}{}'.format(
                            wait, proxy_params,
                            request_ident and request_ident + ' ' or '',
                            self._stats_str))
            try:
                self.proxy_ready.wait(None if wait is True else wait - delta)
            except Timeout:
                continue
            except BaseException:
                del self.waiting[ident]
                raise

    if ident in self.waiting:
        del self.waiting[ident]

    if persist:
        proxy = ready_proxies.get(persist, None)
        if proxy:
            proxy.in_use += 1
            return proxy

    proxy = strategy(ready_proxies)
    if proxy:
        proxy.in_use += 1
        return proxy

    raise InsufficientProxies(
        'No proxies from {} ready with {} strategy {}{}'.format(
            len(ready_proxies), strategy,
            request_ident and request_ident + ' ' or '',
            self._stats_str))
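# Hedged usage sketch for `get` above: check out a proxy, use it, release it.
# The release convention (`proxy.in_use -= 1`) is inferred from the
# `proxy.in_use += 1` bookkeeping; the real pool may expose a dedicated
# release method instead. The strategy name 'random' is a placeholder for
# whatever keys GET_STRATEGY actually defines.
def fetch_with_proxy(pool, do_request):
    proxy = pool.get('random', wait=30)
    try:
        return do_request(proxy)
    finally:
        proxy.in_use -= 1  # assumed release convention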