Exemple #1
0
    def get_tasks(self):
        """Link the source node to its followings and list uncrawled domains.

        Every entry of ``self.followings`` with a truthy ``'hashid'`` is
        passed to ``self.merge_node``; a ``FOLLOWS`` relationship from
        ``self.src_node`` to the returned node is then created through
        ``self.g.create_unique``. Relationships are written one at a time
        rather than collected and created in a single call.

        Returns:
            list: the UTF-8 encoded ``'domain'`` of each linked following
            for which ``rh.is_user_crawled`` returned a falsy value.
        """
        pending = []
        for following in self.followings:
            # Entries with a falsy hashid are neither linked nor queued.
            if not following['hashid']:
                continue

            target = self.merge_node(following)
            self.g.create_unique(
                py2neo.Relationship(self.src_node, 'FOLLOWS', target))

            domain = following['domain']
            if rh.is_user_crawled(domain):
                continue
            pending.append(domain.encode('utf-8'))

        return pending
Exemple #2
0
    def get_tasks(self):
        """Create FOLLOWS edges for the followings and return crawl tasks.

        Iterates ``self.followings``. An entry whose ``'hashid'`` is truthy
        is handed to ``self.merge_node``, and a ``FOLLOWS`` relationship
        from ``self.src_node`` to the resulting node is created with
        ``self.g.create_unique`` (one call per relationship).

        Returns:
            list: ``'domain'`` values, encoded as UTF-8, of the linked
            entries that ``rh.is_user_crawled`` reports as not crawled.
        """
        new_tasks = []
        linkable = (entry for entry in self.followings if entry['hashid'])
        for entry in linkable:
            followed = self.merge_node(entry)
            edge = py2neo.Relationship(self.src_node, 'FOLLOWS', followed)
            self.g.create_unique(edge)

            already_crawled = rh.is_user_crawled(entry['domain'])
            if not already_crawled:
                new_tasks.append(entry['domain'].encode('utf-8'))

        return new_tasks
Exemple #3
0
def start(instance_id):
    """Run the crawl worker loop for one instance.

    Builds the module-level ``status`` dict (identified by
    ``<hostname>-<instance_id>``), publishes it through
    ``rh.publish_status``, and then loops forever: fetch a task with
    ``rh.get_task_user()``, skip it if ``rh.is_user_crawled`` says it is
    done, otherwise crawl it with ``FollowingsCrawler`` and push the result
    with ``rh.push_result_user``. After each successful crawl the
    ``finished`` counters are incremented and the worker sleeps for a
    random 1-5 seconds.

    The loop only ends through an exception:

    * ``NotFoundException`` is logged and the loop moves on to the next task.
    * ``ResponseException`` / ``NetworkException`` call ``terminate`` with a
      message.
    * ``RedisException`` is logged and the process exits via ``sys.exit``.
    * Any other ``Exception`` has its traceback printed and is passed to
      ``terminate``.

    Args:
        instance_id: identifier of this worker instance; it is logged and
            appended to the hostname to form the status ``'id'``.
    """
    global task
    global status
    task = ''

    try:
        logger.warning('Instance id: %s', instance_id)

        hostname = socket.gethostname()
        ip = socket.gethostbyname(hostname)
        start_time = int(time.time())

        session = SessionHelper()

        status = {
            'id': hostname + '-' + str(instance_id),
            'hostname': hostname,
            'ip': ip,
            'finished': finished,
            'task': '',
            'status': 'init',
            'message': '',
            'account': Account.get_using(),
            'start_time': start_time,
            'update_time': int(time.time())
        }

        rh.publish_status(status)

        while True:
            # Clear the global first so that a failure inside
            # get_task_user() does not leave the previous task name behind.
            # NOTE(review): presumably terminate() reads the global `task`
            # to push it back - confirm.
            task = ''
            task = rh.get_task_user()
            # Lazy %-formatting: concatenating with '+' raised TypeError
            # (and so killed the worker through the generic handler below)
            # whenever the fetched task was not a str, e.g. None or bytes.
            logger.warning('Get task: %s', task)

            if rh.is_user_crawled(task):
                logger.warning("User %s crawled, skip", task)
                continue

            status.update({
                'finished': finished,
                'task': task,
                'status': 'crawling',
                'account': Account.get_using(),
                'update_time': int(time.time())
            })
            rh.publish_status(status)

            try:
                fc = FollowingsCrawler(session, task)
                user = fc.get()
                logger.warning('Push result: %s', task)
                rh.push_result_user(user)

                finished['user'] += 1
                finished['followings'] += len(user['followings'])

                # Random pause between successful crawls.
                time.sleep(random.uniform(1, 5))
            except NotFoundException:
                # A missing user is not fatal: log it and fetch the next
                # task (no sleep on this path).
                logger.error("User %s not found, continue", task)
    except ResponseException:
        message = 'Crawling response error, push back task, quit'
        terminate(message)
    except RedisException:
        logger.error('Redis connection error, quit')
        sys.exit('Redis Error!')
    except NetworkException:
        message = 'Network connection error, quit'
        terminate(message)
    except Exception as e:
        print(traceback.format_exc())
        # NOTE(review): the other handlers pass a str to terminate(); here
        # the exception object itself is passed - confirm terminate()
        # accepts it.
        terminate(e)