Code Example #1
File: tasks.py  Project: maciej-gol/Mturk-Tracker
def process_group(hg, crawl_id, requesters, processed_groups, dbpool):
    """Gevent worker that should process single hitgroup.

    This should write some data into database and do not return any important
    data.
    """
    hg['keywords'] = ', '.join(hg['keywords'])
    # for hit groups that do not contain a group hash, create one and
    # set the appropriate flag

    hg['qualifications'] = ', '.join(hg['qualifications'])

    conn = dbpool.getconn(thread.get_ident())
    db = DB(conn)
    try:
        hit_group_content_id = db.hit_group_content_id(hg['group_id'])
        if hit_group_content_id is None:
            # check if there's a profile for the current requester; if one
            # exists with non-public status, set non-public status for the
            # current hitgroup content
            profile = requesters.get(hg['requester_id'], None)
            if profile and profile.is_public is False:
                hg['is_public'] = False
            else:
                hg['is_public'] = True
            # fresh hitgroup - create group content entry, but first add some data
            # required by hitgroup content table
            hg['occurrence_date'] = datetime.datetime.now()
            hg['first_crawl_id'] = crawl_id
            if not hg['group_id_hashed']:
                # if group_id is hashed, we cannot fetch details because we
                # don't know what the real hash is
                hg.update(hits_group_info(hg['group_id']))
            else:
                hg['html'] = ''
            hit_group_content_id = db.insert_hit_group_content(hg)
            log.debug('new hit group content: %s;;%s',
                    hit_group_content_id, hg['group_id'])

        hg['hit_group_content_id'] = hit_group_content_id
        hg['crawl_id'] = crawl_id
        hg['now'] = datetime.datetime.now()
        db.insert_hit_group_status(hg)
        conn.commit()
    except Exception:
        processed_groups.remove(hg['group_id'])
        log.exception('process_group fail - rollback')
        conn.rollback()
    finally:
        db.curr.close()
        dbpool.putconn(conn, thread.get_ident())
        msg = ('This really should not happen: the hitgroup status was '
            'processed but is not on the list. Race condition?')
        assert hg['group_id'] in processed_groups, msg

    return True
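
A minimal sketch of the pooling setup this example appears to assume: dbpool is taken here to be a psycopg2 ThreadedConnectionPool, whose getconn()/putconn() accept an arbitrary hashable key, so keying by thread.get_ident() gives each worker thread (or greenlet, under gevent monkey-patching) its own connection. The pool sizes and DSN below are assumptions, not Mturk-Tracker's configuration.

import thread  # Python 2 module, as used in the example above

from psycopg2.pool import ThreadedConnectionPool

# getconn()/putconn() take an arbitrary key; keying by thread.get_ident()
# gives each worker its own dedicated connection.
dbpool = ThreadedConnectionPool(1, 10, 'dbname=mturk user=mturk')  # assumed DSN

conn = dbpool.getconn(thread.get_ident())
cur = conn.cursor()
try:
    cur.execute('SELECT 1')  # stand-in for the DB(...) helper calls above
    conn.commit()
finally:
    cur.close()
    dbpool.putconn(conn, thread.get_ident())
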
Code Example #2
File: greendb.py  Project: ruvcoindev/greendb
def close(self):
    tid = get_ident()
    sock = self.in_use.pop(tid, None)
    if sock:
        sock.close()
        return True
    return False
Code Example #3
File: greendb.py  Project: ruvcoindev/greendb
def checkin(self):
    tid = get_ident()
    if tid in self.in_use:
        sock = self.in_use.pop(tid)
        if not sock.is_closed:
            heapq.heappush(self.free, (time.time(), sock))
        return True
    return False
Code Example #4
File: greendb.py  Project: ruvcoindev/greendb
def checkout(self):
    now = time.time()
    tid = get_ident()
    if tid in self.in_use:
        sock = self.in_use[tid]
        if sock.is_closed:
            del self.in_use[tid]
        else:
            return self.in_use[tid]

    while self.free:
        ts, sock = heapq.heappop(self.free)
        if ts < now - self.max_age:
            sock.close()
        else:
            self.in_use[tid] = sock
            return sock

    sock = self.create_socket()
    self.in_use[tid] = sock
    return sock
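
Examples #2-#4 together form a per-greenlet socket pool: checkout() hands the calling greenlet its own socket (evicting free sockets idle longer than max_age), checkin() pushes it back onto a heap ordered by check-in time, and close() drops it. Below is a hedged sketch of the state those methods rely on and a typical call bracket; the SocketPool name and with_pooled_socket helper are illustrative, not greendb's API.

import heapq  # needed by the methods above once attached
import time

try:
    from gevent.thread import get_ident  # per-greenlet ident under gevent
except ImportError:
    from threading import get_ident      # plain-thread fallback


class SocketPool(object):
    """Hypothetical container for the three methods shown above."""

    def __init__(self, create_socket, max_age=60):
        self.create_socket = create_socket  # factory for a fresh connection
        self.max_age = max_age              # seconds a free socket may idle
        self.in_use = {}                    # ident -> socket
        self.free = []                      # heap of (checkin_time, socket)

    # close(), checkin() and checkout() from examples #2-#4 attach here.


def with_pooled_socket(pool, do_request):
    """Bracket one request with checkout/checkin, dropping broken sockets."""
    sock = pool.checkout()   # this greenlet's socket, or a fresh one
    try:
        result = do_request(sock)
    except Exception:
        pool.close()         # discard the possibly broken socket
        raise
    pool.checkin()           # healthy socket goes back on the free heap
    return result
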
Code Example #5
def process_group(hg, crawl_id, requesters, processed_groups):
    """Gevent worker that should process single hitgroup.

    This should write some data into database and do not return any important
    data.
    """
    hg['keywords'] = ', '.join(hg['keywords'])
    # for hit groups that do not contain a group hash, create one and
    # set the appropriate flag
    hg['group_id_hashed'] = not bool(hg.get('group_id', None))
    hg['qualifications'] = ', '.join(hg['qualifications'])
    if hg['group_id_hashed']:
        composition = ';'.join(
            map(str, (hg['title'], hg['requester_id'], hg['time_alloted'],
                      hg['reward'], hg['description'], hg['keywords'],
                      hg['qualifications']))) + ';'
        hg['group_id'] = hashlib.md5(composition).hexdigest()
        log.debug('group_id not found, creating hash: %s  %s', hg['group_id'],
                  composition)

    if hg['group_id'] in processed_groups:
        # this hitgroup was already processed
        log.info('duplicated group: %s;;%s', crawl_id, hg['group_id'])
        return False

    conn = dbpool.getconn(thread.get_ident())
    db = DB(conn)
    try:
        hit_group_content_id = db.hit_group_content_id(hg['group_id'])
        if hit_group_content_id is None:
            # check if there's a profile for the current requester; if one
            # exists with non-public status, set non-public status for the
            # current hitgroup content
            profile = requesters.get(hg['requester_id'], None)
            if profile and profile.is_public is False:
                hg['is_public'] = False
            else:
                hg['is_public'] = True
            # fresh hitgroup - create group content entry, but first add some data
            # required by hitgroup content table
            hg['occurrence_date'] = datetime.datetime.now()
            hg['first_crawl_id'] = crawl_id
            if not hg['group_id_hashed']:
                # if group_id is hashed, we cannot fetch details because we
                # don't know what the real hash is
                hg.update(hits_group_info(hg['group_id']))
            else:
                hg['html'] = ''
            hit_group_content_id = db.insert_hit_group_content(hg)
            log.debug('new hit group content: %s;;%s', hit_group_content_id,
                      hg['group_id'])

        hg['hit_group_content_id'] = hit_group_content_id
        hg['crawl_id'] = crawl_id
        db.insert_hit_group_status(hg)
        conn.commit()
    except Exception:
        log.exception('process_group fail - rollback')
        conn.rollback()
    finally:
        db.curr.close()
        dbpool.putconn(conn, thread.get_ident())

    processed_groups.add(hg['group_id'])
    return True
Code Example #6
def inject_greenlet_id(record):
    """
    Sets the greenlet ID on `record.extra`.
    :param record: logbook record
    """
    record.extra[GREENLET_ID_KEY] = get_ident()
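
A sketch of wiring inject_greenlet_id into logbook: a logbook.Processor applies a callback to every record emitted inside its context, so the greenlet ID lands on record.extra. The GREENLET_ID_KEY value and the get_ident import are assumptions here, since the snippet's module-level definitions are not shown.

import logbook

try:
    from gevent.thread import get_ident  # greenlet id under gevent
except ImportError:
    from threading import get_ident

GREENLET_ID_KEY = 'greenlet_id'  # assumed value of the module constant


def inject_greenlet_id(record):
    record.extra[GREENLET_ID_KEY] = get_ident()


log = logbook.Logger('worker')
with logbook.Processor(inject_greenlet_id).applicationbound():
    log.info('this record carries extra["greenlet_id"]')
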
Code Example #7
File: base.py  Project: Infinidat/logbook
def thread(self):
    """The ident of the thread.  This is evaluated late and means that
    if the log record is passed to another thread, :meth:`pull_information`
    was called in the old thread.
    """
    return thread.get_ident()
Code Example #8
    def get(self,
            strategy,
            persist=None,
            wait=True,
            request_ident=None,
            **proxy_params):
        if not callable(strategy):
            if isinstance(strategy, str):
                strategy, *strategy_params = strategy.split(':')
                strategy = getattr(self, GET_STRATEGY[strategy].value)
                if strategy_params:
                    strategy_ = strategy

                    def strategy(proxies):
                        return strategy_(proxies, *strategy_params)
            elif isinstance(strategy, enum.Enum):
                strategy = getattr(self, strategy.value)

        if not len(self.active_proxies) and not self.fetcher:
            raise InsufficientProxies('No proxies and no fetcher {}'.format(
                self._stats_str))
        self.maybe_update()

        ident = get_ident()  # unique integer id for greenlet
        while True:
            ready_proxies = self.get_ready_proxies(**proxy_params)
            if ready_proxies:
                break
            elif not wait or ((not self.fetcher or self.fetcher.ready)
                              and not self.in_use):
                # fetcher.ready also returns false on checker processing
                raise InsufficientProxies('No ready proxies {} {}{}'.format(
                    proxy_params, request_ident and request_ident + ' ' or '',
                    self._stats_str))
            else:
                # logger.info('Wait proxy (thread %s) %s', ident, self._stats_str)
                self.proxy_ready.clear()
                if ident not in self.waiting:
                    # Storing extra data for superproxy monitoring
                    self.waiting[ident] = dict(since=datetime.utcnow(),
                                               request_ident=request_ident,
                                               params=proxy_params)
                    delta = 0
                elif wait is not True:
                    delta = (datetime.utcnow() -
                             self.waiting[ident]['since']).total_seconds()
                    if delta >= wait:
                        del self.waiting[ident]
                        raise InsufficientProxies(
                            'Ready proxies wait timeout({}) {} {}{}'.format(
                                wait, proxy_params,
                                request_ident and request_ident + ' ' or '',
                                self._stats_str))
                try:
                    self.proxy_ready.wait(None if wait is True else wait -
                                          delta)
                except Timeout:
                    continue
                except BaseException:
                    del self.waiting[ident]
                    raise
        if ident in self.waiting:
            del self.waiting[ident]

        if persist:
            proxy = ready_proxies.get(persist, None)
            if proxy:
                proxy.in_use += 1
                return proxy
        proxy = strategy(ready_proxies)
        if proxy:
            proxy.in_use += 1
            return proxy
        raise InsufficientProxies(
            'No proxies from {} ready with {} strategy {}{}'.format(
                len(ready_proxies), strategy,
                request_ident and request_ident + ' ' or '', self._stats_str))
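
The strategy handling at the top of get() is a reusable pattern: a 'name:arg1:arg2' string is split on ':', the name is resolved to a function, and any trailing parts are closed over so the resolved strategy keeps its one-argument signature. A standalone sketch of the same pattern follows; the STRATEGIES table and sample strategies are illustrative, not this library's API.

import random


def fastest(proxies, limit='1'):
    # proxies: dict of name -> latency; return the lowest-latency names
    return sorted(proxies, key=proxies.get)[:int(limit)]


def any_of(proxies):
    return [random.choice(list(proxies))]


STRATEGIES = {'fastest': fastest, 'random': any_of}


def resolve_strategy(strategy):
    if not callable(strategy):
        strategy, *params = strategy.split(':')
        func = STRATEGIES[strategy]
        if params:
            # close over the trailing ':'-separated params, as get() does
            def strategy(proxies):
                return func(proxies, *params)
        else:
            strategy = func
    return strategy


pick = resolve_strategy('fastest:2')
print(pick({'a': 30, 'b': 10, 'c': 20}))  # -> ['b', 'c']
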
Code Example #9
File: tasks.py  Project: faridani/Mturk-Tracker
def process_group(hg, crawl_id, requesters, processed_groups):
    """Gevent worker that should process single hitgroup.

    This should write some data into database and do not return any important
    data.
    """
    hg['keywords'] = ', '.join(hg['keywords'])
    # for hit groups that do not contain a group hash, create one and
    # set the appropriate flag
    hg['group_id_hashed'] = not bool(hg.get('group_id', None))
    hg['qualifications'] = ', '.join(hg['qualifications'])
    if hg['group_id_hashed']:
        composition = ';'.join(map(str, (
            hg['title'], hg['requester_id'], hg['time_alloted'],
            hg['reward'], hg['description'], hg['keywords'],
            hg['qualifications']))) + ';'
        hg['group_id'] = hashlib.md5(composition).hexdigest()
        log.debug('group_id not found, creating hash: %s  %s',
                hg['group_id'], composition)

    if hg['group_id'] in processed_groups:
        # this hitgroup was already processed
        log.info('duplicated group: %s;;%s', crawl_id, hg['group_id'])
        return False

    conn = dbpool.getconn(thread.get_ident())
    db = DB(conn)
    try:
        hit_group_content_id = db.hit_group_content_id(hg['group_id'])
        if hit_group_content_id is None:
            # check if there's a profile for the current requester; if one
            # exists with non-public status, set non-public status for the
            # current hitgroup content
            profile = requesters.get(hg['requester_id'], None)
            if profile and profile.is_public is False:
                hg['is_public'] = False
            else:
                hg['is_public'] = True
            # fresh hitgroup - create group content entry, but first add some data
            # required by hitgroup content table
            hg['occurrence_date'] = datetime.datetime.now()
            hg['first_crawl_id'] = crawl_id
            if not hg['group_id_hashed']:
                # if group_id is hashed, we cannot fetch details because we
                # don't know what the real hash is
                hg.update(hits_group_info(hg['group_id']))
            else:
                hg['html'] = ''
            hit_group_content_id = db.insert_hit_group_content(hg)
            log.debug('new hit group content: %s;;%s',
                    hit_group_content_id, hg['group_id'])

        hg['hit_group_content_id'] = hit_group_content_id
        hg['crawl_id'] = crawl_id
        db.insert_hit_group_status(hg)
        conn.commit()
    except Exception:
        log.exception('process_group fail - rollback')
        conn.rollback()
    finally:
        db.curr.close()
        dbpool.putconn(conn, thread.get_ident())

    processed_groups.add(hg['group_id'])
    return True
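
The docstring calls process_group a gevent worker; here is a hedged sketch of how a crawl loop might fan it out with gevent.pool. The pool size and the hit_groups/crawl_id/requesters stubs are assumptions, not Mturk-Tracker's actual crawler code.

import gevent.pool

crawl_id = 1                 # placeholder crawl row id
requesters = {}              # requester_id -> profile, as process_group expects
processed_groups = set()     # shared dedup set; greenlets switch only on I/O
hit_groups = []              # parsed hitgroup dicts from the crawler (stub)

pool = gevent.pool.Pool(20)  # cap concurrent workers / DB connections
for hg in hit_groups:
    pool.spawn(process_group, hg, crawl_id, requesters, processed_groups)
pool.join()                  # block until every worker finishes
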