Ejemplo n.º 1
0
class SubmissionCrawler(Link):
    """Polls the spider for new Reddit submissions and forwards them.

    Every unseen submission is packed into an Electron and sent to the
    first output topic. A bounded set of recently processed IDs keeps
    duplicates from being emitted twice within the window.
    """

    def setup(self):
        # Spider that provides the raw submission elements
        self.spider_name = rch.get_spider_name('RSC')
        # Dedup window: remembers the last 1000 emitted submission ids
        self.processed_ids = CircularOrderedSet(1000)
        self.wait_seconds = MAX_WAIT_SECONDS  # Max waiting seconds between loops

    def generator(self):
        """Endless crawl loop: fetch, deduplicate, emit, then sleep."""
        while True:  # idiomatic form of the original `while (True)`
            for submission in rch.get_all_submissions_elements(self.spider_name, items_no=100):
                submission_id = rch.get_submission_id(submission)
                if submission_id in self.processed_ids:
                    continue  # already emitted recently
                self.processed_ids.add(submission_id)

                # Falls back to the author's profile when the submission
                # was not posted to a regular subreddit
                subreddit_id = rch.get_subreddit_id(submission, retrieve_user_if_not_subreddit=True)

                user_id = rch.get_user_id(submission)
                timestamp = rch.get_submission_timestamp(submission)
                title = rch.get_submission_title(submission)

                value = {
                    'submission_id': submission_id,
                    'subreddit_id': subreddit_id,
                    'user_id': user_id,
                    'timestamp': timestamp,
                    'title': title
                }
                electron = Electron(None, value, topic=self.output_topics[0])
                self.send(electron)

            # Random backoff so concurrent crawlers do not hit the spider in sync
            time.sleep(random.uniform(0, self.wait_seconds))
Ejemplo n.º 2
0
 def setup(self):
     """Prepare the extractor state and launch the background worker."""
     # Single-slot circular queue: only the most recent user id is kept
     self.users_ranking_queue = ThreadingQueue(size=1, circular=True)
     # Bounded memory of already extracted users
     self.seen_users = CircularOrderedSet(50)
     self.spider_name = rch.get_spider_name('FUC')
     worker = threading.Thread(target=self.extractor)
     worker.start()
     # Schedule a container restart (suicide) after 3500 seconds
     self.loop(self.suicide, interval=3500, wait=True)
Ejemplo n.º 3
0
    def setup(self):
        """Parse the ranking arguments and initialize the chooser state."""
        # Small buffer (Mongo won't update immediately)
        self.seen_users = CircularOrderedSet(20)

        # args: [mode, opt1, opt2, ...]; defaults: 'proba' / ['desc']
        if self.args:
            self.ranking_mode = self.args[0]
            self.ranking_opts = self.args[1:] if len(self.args) >= 2 else ['desc']
        else:
            self.ranking_mode = 'proba'
            self.ranking_opts = ['desc']

        known_modes = [
            'random', 'proba', 'comments', 'points', 'hour', 'hierarchy', 'fusion'
        ]
        if self.ranking_mode not in known_modes:
            raise ValueError('Unknown ranking mode')
        self.logger.log(f'Ranking mode: {self.ranking_mode}')
        # These modes take no options, so there is nothing to log for them
        if self.ranking_mode not in ['random', 'hierarchy', 'fusion']:
            self.logger.log(f'Ranking opts: {list(self.ranking_opts)}')

        self.mongodb.set_defaults('fuc_benchmark', 'users_ranking')

        # Schedule a container restart (suicide) after one hour
        self.loop(self.suicide, interval=3600, wait=True)
Ejemplo n.º 4
0
    def setup(self):
        """Initialize the explorer state, then start the background thread.

        BUGFIX: every attribute the explorer thread reads (queue, seen
        set, spider name, storage defaults) is assigned BEFORE the thread
        starts; the original started the thread first, racing on
        partially initialized state.
        """
        self.subreddits_queue = ThreadingQueue(size=1, circular=True)
        self.seen_subreddits = CircularOrderedSet(50)
        self.spider_name = rch.get_spider_name('FUC')

        self.mongodb.set_defaults('fuc_benchmark', 'subreddits')
        self.aerospike.set_defaults('fuc_benchmark')

        explorer = threading.Thread(target=self.explorer)
        explorer.start()

        # Schedule a container restart (suicide) after 3300 seconds
        self.loop(self.suicide, interval=3300, wait=True)
Ejemplo n.º 5
0
 def init_corpus(self, context, id, c_args):
     """Remote method: lazily create this container's corpus from c_args."""
     # Only act for our own container id, and only once
     if self.id != id or hasattr(self, 'corpus'):
         return
     if len(c_args) == 1:
         # A single argument is the maximum corpus length
         self.corpus = CircularOrderedSet(int(c_args[0]))
         print("Length ->", c_args[0])
     else:
         # Several arguments define a timestamp window instead
         self.corpus = []
         self.window = c_args
         print("Timestamps ->", self.window)
Ejemplo n.º 6
0
class TagCons(Link):
    """Consumes tagged texts and emits word-cloud data.

    Keeps a bounded buffer of incoming messages and, on every new
    message, regenerates a word cloud over the whole buffer.
    """

    # Remote method to add input topics dynamically
    def add_topic(self, context, topic, id, image):
        """RPC: subscribe to `topic` and ask the Consumer to rebalance."""
        if self.id == id:
            self.add_input_topic(topic)
            self.rpc_call('Consumer',
                          'recalculate_hash',
                          args=[self.input_topics, self.id, image])

    # Remote method to remove input topics dynamically
    def remove_topic(self, context, topic, id, image):
        """RPC: unsubscribe from `topic` and ask the Consumer to rebalance."""
        if self.id == id:
            self.remove_input_topic(topic)
            self.rpc_call('Consumer',
                          'recalculate_hash',
                          args=[self.input_topics, self.id, image])

    # Function that generates a word cloud of a given query
    def __generate_word_cloud(self, batch_msgs):
        """Build word-cloud entries from the buffered messages.

        Each buffered message is a frozenset of (key, value) byte pairs
        (see transform); only b'body' and b'submission_title' entries
        contribute words. Returns a list of {'text': word,
        'value': weight * 1000} dicts.
        """
        clean_texts = []
        texts = list(
            batch_msgs
        )  # we need to put it as a list to avoid run time errors related with changing size of the set
        for text in texts:
            for element in text:
                if element[0] == b'body' or element[0] == b'submission_title':
                    # assumes preprocessForTag returns a cleaned token/phrase
                    # — TODO confirm against its definition
                    word = preprocessForTag(str(element[1]), self.languages)
                    print("#####Word", word)
                    # membership test keeps words unique while preserving order
                    if word not in clean_texts:
                        clean_texts.append(word)
        print("###clean_texts", clean_texts)
        output = ' '.join(map(str, clean_texts))
        print("####Output", output)
        wordcloud = WordCloud(background_color="white").generate(output)
        #print("Wordcloud",wordcloud.words_)
        result = []
        for t, v in wordcloud.words_.items():
            # words_ maps word -> relative frequency (per wordcloud's API)
            result.append({"text": t, "value": v * 1000})
        return result

    def setup(self):
        """Initialize NLTK languages and the bounded message buffer."""
        self.id = self.consumer_group
        self.languages = init_nltk(
        )  # Initialize languages for natural language toolkit
        self.batch_msgs = CircularOrderedSet(100)

    def transform(self, electron):
        """Buffer the incoming message and emit a fresh word cloud."""
        dict_items = remove_non_ascii(electron.value.items())
        dict_set = frozenset(dict_items)  # hashable, so it can live in the set
        self.batch_msgs.add(dict_set)
        electron.topic = self.output_topics[0]
        electron.value = self.__generate_word_cloud(self.batch_msgs)
        return electron
Ejemplo n.º 7
0
 def setup(self):
     """Parse container args of the form [corpus_args..., '#', ntopics...].

     Before '#': one value means a corpus length, several values mean a
     timestamp window. After '#': the initial numbers of topics.
     """
     try:
         self.ntopics = []
         self.languages = init_nltk()  # Initialize languages for natural language toolkit
         self.id = self.consumer_group
         print("Id del contenedor", self.id)
         if self.args:
             if "#" in self.args:
                 aux = self.args[:self.args.index("#")]
                 if len(aux) == 1:
                     self.corpus = CircularOrderedSet(int(aux[0]))
                     print("Length->", aux[0])
                 else:
                     self.corpus = []
                     self.window = aux
                     print("Timestamps->", self.window)
                 # BUGFIX: the topics loop is kept inside this branch so
                 # aux2 is always defined — the original dedented loop
                 # raised NameError (silently swallowed below) whenever
                 # the args had no '#' separator.
                 aux2 = self.args[self.args.index('#') + 1:]
                 for arg in aux2:
                     self.ntopics.append(arg)
                 print("Initial number of topics...", self.ntopics)
     except Exception as e:
         print("Exception", e)
Ejemplo n.º 8
0
class UserExtractor(Link):
    """Extracts the texts of users handed over by UserChooser via RPC.

    A background thread repeatedly requests a user, pulls his/her
    submissions and comments from the spider, emits one Electron per
    text and finishes with an End-of-User ('eou') marker.
    """

    def setup(self):
        self.spider_name = rch.get_spider_name('FUC')
        # Single-slot circular queue holding the next user to process
        self.users_ranking_queue = ThreadingQueue(size=1, circular=True)
        # Bounded memory of already extracted users
        self.seen_users = CircularOrderedSet(50)
        extractor = threading.Thread(target=self.extractor)
        extractor.start()
        # Schedule a container restart (suicide) after 3500 seconds
        self.loop(self.suicide, interval=3500, wait=True)

    def _get_user_texts(self, user_id):
        """Yield every text of the user: submissions first, then comments."""
        for text in self._get_user_submissions(user_id):
            yield text
        for text in self._get_user_comments(user_id):
            yield text

    def _get_user_submissions(self, user_id):
        """Yield 'title body' strings for the user's submissions, oldest first."""
        for element in rch.get_user_submissions_elements(self.spider_name,
                                                         user_id,
                                                         items_no=100)[::-1]:
            try:
                subreddit_id = rch.get_subreddit_id(element)
                if subreddit_id:
                    self.send(Electron(value=subreddit_id, topic='subreddits'))
                else:
                    # Submission was posted to the user's own profile
                    subreddit_id = f'u/{user_id}'

                submission_id = rch.get_submission_id(element)
                submission = rch.get_submission_elements(
                    self.spider_name, subreddit_id, submission_id)[0]
                submission_title = rch.get_submission_title(submission)
                submission_body = rch.get_submission_body(submission)
                yield f'{submission_title} {submission_body}'

            except Exception:
                self.logger.log(level='exception')

    def _get_user_comments(self, user_id):
        """Yield the bodies of the user's comments, oldest first."""
        for element in rch.get_user_comments_elements(self.spider_name,
                                                      user_id,
                                                      items_no=100)[::-1]:
            try:
                subreddit_id = rch.get_comment_subreddit_id(element)
                if subreddit_id:
                    self.send(Electron(value=subreddit_id, topic='subreddits'))

                comment_body = rch.get_comment_body(element)
                yield comment_body

            except Exception:
                self.logger.log(level='exception')

    def reject_request(self, context):
        """Back off briefly, then ask the chooser for another user."""
        time.sleep(5)
        self.rpc_notify('request_user', to='UserChooser')

    @rpc
    def put_user(self, context, user_id):
        """RPC: receive a candidate user; enqueue it or request a new one."""
        if user_id not in self.seen_users:
            self.logger.log(f'Received VALID user {user_id} via RPC')
            self.users_ranking_queue.put(user_id)
            return
        self.logger.log(f'Received NOT VALID user {user_id} via RPC')
        self.rpc_notify('request_user', to='UserChooser')

    def _extract(self, user_id):
        """Send every text of the user and, if any, an End-of-User message."""
        self.logger.log(f'Extracting user: {user_id}')
        # Send all extracted texts as they are retrieved
        user_has_texts = False
        texts_counter = 0
        for text in self._get_user_texts(user_id):
            if text:
                electron = Electron(user_id, ('text', text))
                self.send(electron)
                user_has_texts = True
                texts_counter += 1
        self.logger.log(
            f'{texts_counter} texts extracted from the user {user_id}')

        # Send the End-of-User message
        if user_has_texts:
            electron = Electron(user_id, ('eou', None))
            self.send(electron)

    def extractor(self):
        """Background loop: request a user, wait for it, extract it."""
        # The original `while (running)` flag was never modified
        while True:
            self.logger.log('User requested')
            self.rpc_notify('request_user', to='UserChooser')
            try:
                user_id = self.users_ranking_queue.get()
                if user_id not in self.seen_users:
                    self._extract(user_id)
                    self.seen_users.add(user_id)
            except Exception:
                self.logger.log(level='exception')
Ejemplo n.º 9
0
class UserChooser(Link):
    """Chooses the next user to extract according to a ranking over MongoDB.

    Extractors request users through the `request_user` RPC; the chooser
    pops the best unprocessed user using the ranking mode given in the
    container arguments and notifies the requesting extractor.
    """

    def setup(self):
        # Small buffer (Mongo won't update immediately)
        self.seen_users = CircularOrderedSet(20)

        # args: [mode, opt1, opt2, ...]; defaults: 'proba' / ['desc']
        if len(self.args) > 0:
            self.ranking_mode = self.args[0]
            if len(self.args) >= 2:
                self.ranking_opts = self.args[1:]
            else:
                self.ranking_opts = ['desc']
        else:
            self.ranking_mode = 'proba'
            self.ranking_opts = ['desc']

        if self.ranking_mode not in [
                'random', 'proba', 'comments', 'points', 'hour', 'hierarchy', 'fusion'
        ]:
            raise ValueError('Unknown ranking mode')
        self.logger.log(f'Ranking mode: {self.ranking_mode}')
        # These modes take no options, so there is nothing to log for them
        if self.ranking_mode not in ['random', 'hierarchy', 'fusion']:
            self.logger.log(f'Ranking opts: {list(self.ranking_opts)}')

        self.mongodb.set_defaults('fuc_benchmark', 'users_ranking')

        # Schedule a container restart (suicide) after one hour
        self.loop(self.suicide, interval=3600, wait=True)

    def _pop_first_user_id(self):
        """Pop the best-ranked unseen user and mark it as processed."""
        user_id = self._get_first_user(avoid=list(self.seen_users))

        # Mark the user as processed
        self.mongodb.update({'user_id': user_id}, {'processed': True})
        self.seen_users.add(user_id)
        return user_id

    @staticmethod
    def _swap_0_24(central_hour):
        """Swap the equivalent day boundaries 0 <-> 24; other hours pass through."""
        if central_hour == 0:
            return 24
        if central_hour == 24:
            return 0
        return central_hour

    def _get_first_user(self, avoid=None):
        """Return the user_id ranked first by `self.ranking_mode`.

        Params:
            avoid: list of user_ids to exclude (recently chosen users).
        Returns:
            The selected user_id, or None when no candidate exists.
        """
        query = {'processed': {'$ne': True}}
        if avoid:
            query.update({'user_id': {'$nin': avoid}})

        # RANDOM
        if self.ranking_mode == 'random':
            result = self.mongodb.get_random(query=query)
            return next(result)['user_id']

        # FUSION: merge of the probadesc and hour06 rankings
        if self.ranking_mode == 'fusion':
            # user_id -> (rank, merged_flag)
            merged_ranking = {}

            # probadesc top 100
            sort = [('proba.value', -1)]
            self.logger.log(f'sort: {sort}, query: {query}', level='debug')
            result = self.mongodb.get(query=query, sort=sort, limit=100)
            probadesc_items = 0
            for i, user in enumerate(result):
                probadesc_items = i
                merged_ranking[user['user_id']] = (i + 1, False)
            probadesc_items += 1

            # hour06 top 100
            sort = [('hour.dis06', 1)]
            self.logger.log(f'sort: {sort}, query: {query}', level='debug')
            result = self.mongodb.get(query=query, sort=sort, limit=100)
            hour06_items = 0
            for i, user in enumerate(result):
                hour06_items = i
                user_id = user['user_id']
                if user_id in merged_ranking:
                    # Present in both rankings: average the two positions
                    merged_ranking[user_id] = ((merged_ranking[user_id][0] + i + 1) / 2.0, True)
                    continue
                merged_ranking[user_id] = (i + 1, False)
            hour06_items += 1

            # Users present in a single ranking are averaged with the
            # worst position of the shorter ranking
            max_value = min(100, probadesc_items, hour06_items)
            for key, value in merged_ranking.items():
                if not value[1]:
                    # BUGFIX: average THIS entry's rank — the original read
                    # merged_ranking[user_id], a stale loop variable left
                    # over from the hour06 pass
                    merged_ranking[key] = ((value[0] + max_value) / 2.0, True)

            fusion_ranking = [{
                'user_id': key,
                'rank': value[0]
            } for key, value in merged_ranking.items()]
            fusion_ranking = sorted(fusion_ranking, key=lambda k: k['rank'], reverse=False)

            if fusion_ranking:
                return fusion_ranking[0]['user_id']
            return None

        # HIERARCHY: PROBADESC, then HOUR06, then COMMENTSDESC as tie breakers
        if self.ranking_mode == 'hierarchy':
            sort = [('proba.value', -1)]
            self.logger.log(f'sort: {sort}, query: {query}', level='debug')
            result = self.mongodb.get(query=query, sort=sort, limit=2)
            user_list = [user for user in result]

            # Match for top 2 for probadesc, add hour06
            if len(user_list
                   ) == 2 and user_list[0]['proba']['value'] == user_list[1]['proba']['value']:
                self.logger.log('Match for PROBADESC in the top 2')
                electron = Electron(
                    value={
                        'event': 'user_chooser_hierarchy_match',
                        'timestamp': utils.get_timestamp_ms(),
                        'value': {
                            'ranking': 'probadesc'
                        }
                    })
                self.send(electron, topic='stats')

                proba = user_list[0]['proba']['value']

                query.update({'proba.value': proba})
                sort = [('proba.value', -1), ('hour.dis06', 1)]
                self.logger.log(f'sort: {sort}, query: {query}', level='debug')
                result = self.mongodb.get(query=query, sort=sort, limit=2)
                user_list = [user for user in result]

                # Match for top 2 for probadesc-hour06, add commentsdesc
                if len(user_list
                       ) == 2 and user_list[0]['hour']['dis06'] == user_list[1]['hour']['dis06']:
                    self.logger.log('Match for HOUR06 in the top 2')
                    electron = Electron(
                        value={
                            'event': 'user_chooser_hierarchy_match',
                            'timestamp': utils.get_timestamp_ms(),
                            'value': {
                                'ranking': 'hour06'
                            }
                        })
                    self.send(electron, topic='stats')

                    # Sort by comments.avg
                    user_list = sorted(user_list, key=lambda k: k['comments']['avg'],
                                       reverse=True)[:2]
                    self.logger.log('SELECTED by COMMENTSDESC')
                else:
                    self.logger.log('SELECTED by HOUR06')
                return user_list[0]['user_id']

            elif user_list:
                self.logger.log('SELECTED by PROBADESC')
                return user_list[0]['user_id']

            # Void ranking
            return None

        # asc/desc option shared by the remaining modes
        order = -1
        if self.ranking_opts[0] == 'asc':
            order = 1

        # HOUR AVG: user whose average posting hour is closest to the target
        if self.ranking_mode == 'hour':
            central_hour = int(self.ranking_opts[0])

            # Closest candidate at or after the central hour
            query.update({'hour.avg': {'$gte': central_hour}})
            sort = [('hour.avg', 1)]
            self.logger.log(f'sort: {sort}, query: {query}', level='debug')
            result = self.mongodb.get(query=query, sort=sort, limit=1)
            gte_result = next(result)

            # 24 if central_hour was 0 — closest candidate at or before it
            central_hour = self._swap_0_24(central_hour)
            query.update({'hour.avg': {'$lte': central_hour}})
            sort = [('hour.avg', -1)]
            self.logger.log(f'sort: {sort}, query: {query}', level='debug')
            result = self.mongodb.get(query=query, sort=sort, limit=1)
            lte_result = next(result)

            if not gte_result and not lte_result:
                return None
            elif gte_result and not lte_result:
                return gte_result['user_id']
            elif lte_result and not gte_result:
                return lte_result['user_id']

            # 0 if central_hour was 24
            central_hour = self._swap_0_24(central_hour)
            gte_distance = abs(1. * gte_result['hour']['avg'] - central_hour)

            # 24 if central_hour was 0
            central_hour = self._swap_0_24(central_hour)
            lte_distance = abs(1. * lte_result['hour']['avg'] - central_hour)

            if gte_distance >= lte_distance:
                return lte_result['user_id']
            return gte_result['user_id']

        # PROBA, COMMENTS, POINTS: single-property sort
        if self.ranking_mode in ['proba', 'comments', 'points']:
            if self.ranking_mode == 'proba':
                property_ = 'proba.value'
            else:
                property_ = self.ranking_mode + '.avg'
            sort = [(property_, order)]
            self.logger.log(f'sort: {sort}, query: {query}', level='debug')
            result = self.mongodb.get(query=query, sort=sort, limit=1)
            return next(result)['user_id']

    @rpc
    def request_user(self, context, attempts=10):
        """RPC: pick a user for the requesting extractor, retrying up to 10 times.

        On success notifies the extractor's `put_user`; once attempts are
        exhausted it notifies `reject_request` instead.
        """
        user_extractor = context['uid']
        if attempts != 10:
            self.logger.log(f'Attempt {10 - attempts} for extractor {user_extractor}')
        else:
            self.logger.log(f'User request received from extractor {user_extractor}')

        try:
            user_id = self._pop_first_user_id()
            # BUGFIX (idiom): identity comparison with None, not ==
            if user_id is None:
                raise ValueError
            self.logger.log(f'Selected user: {user_id} for extractor {user_extractor}')
            self.rpc_notify('put_user', user_id, to=user_extractor)

        except Exception:
            self.logger.log(level='debug')
            self.logger.log(f'Cannot retrieve an user for extractor {user_extractor}', level='warn')
            if attempts > 0:
                time.sleep(0.5)
                self.request_user(context, attempts - 1)
            else:
                self.rpc_notify('reject_request', to=user_extractor)
Ejemplo n.º 10
0
 def setup(self):
     """Initialize the crawler's per-container state."""
     # Dedup window for recently processed submission ids
     self.processed_ids = CircularOrderedSet(1000)
     # Upper bound for the random sleep between crawl loops
     self.wait_seconds = MAX_WAIT_SECONDS
     self.spider_name = rch.get_spider_name('RSC')
Ejemplo n.º 11
0
 def setup(self):
     """Prepare the NLTK languages and the bounded message buffer."""
     self.batch_msgs = CircularOrderedSet(100)
     self.languages = init_nltk()  # Initialize languages for natural language toolkit
     self.id = self.consumer_group
Ejemplo n.º 12
0
class SubredditExplorer(Link):
    """Explores subreddits handed over by SubredditChooser via RPC.

    A background thread requests a subreddit and forwards every new
    submission and comment in it as Electrons keyed by user_id. Seen
    submissions/comments are deduplicated through Aerospike.
    """

    def setup(self):
        # BUGFIX: initialize everything the explorer thread reads BEFORE
        # starting it, to avoid racing on partially initialized state
        self.subreddits_queue = ThreadingQueue(size=1, circular=True)
        self.seen_subreddits = CircularOrderedSet(50)
        self.spider_name = rch.get_spider_name('FUC')
        self.mongodb.set_defaults('fuc_benchmark', 'subreddits')
        self.aerospike.set_defaults('fuc_benchmark')

        explorer = threading.Thread(target=self.explorer)
        explorer.start()

        # Schedule a container restart (suicide) after 3300 seconds
        self.loop(self.suicide, interval=3300, wait=True)

    def send_seen_user_event(self, user_id):
        """Emit a stats event for a user seen while exploring."""
        electron = Electron(value={
            'event': 'user_explorer_seen_user',
            'timestamp': utils.get_timestamp_ms(),
            'value': {
                'user_id': user_id
            }
        },
                            topic='stats')
        self.logger.log(electron.value)
        self.send(electron)

    def send_seen_subreddit_event(self, subreddit_id):
        """Emit a stats event for a subreddit seen while exploring."""
        electron = Electron(value={
            'event': 'user_explorer_seen_subreddit',
            'timestamp': utils.get_timestamp_ms(),
            'value': {
                'subreddit_id': subreddit_id
            }
        },
                            topic='stats')
        self.logger.log(electron.value)
        self.send(electron)

    def _process_new_submissions(self, subreddit='r/all'):
        """Forward unseen submissions of the subreddit, keyed by user_id."""
        for submission in rch.get_subreddit_submissions_elements(
                subreddit, self.spider_name, 100):
            try:
                # Filter repeated submissions
                submission_id = rch.get_submission_id(submission)

                if not self.aerospike.exists(submission_id,
                                             set_name='seen_submissions'):
                    self.aerospike.put(submission_id,
                                       set_name='seen_submissions')
                else:
                    continue

                # Initializer mode, it only makes sense for r/all
                if subreddit == 'r/all':
                    # The text may be posted to its own profile
                    subreddit_id = rch.get_subreddit_id(submission)
                    if subreddit_id:
                        self.send(
                            Electron(value=subreddit_id, topic='subreddits'))
                        self.send_seen_subreddit_event(subreddit_id)

                user_id = rch.get_user_id(submission)
                if not user_id:
                    continue
                self.logger.log(f'  + [text] {user_id} ({submission_id})',
                                level='debug')

                text = rch.get_submission_title(submission)

                # Number of comments
                comments = int(rch.get_submission_no_comments(submission))

                # Points
                points = int(rch.get_submission_score(submission))

                # Timestamp
                timestamp = int(rch.get_submission_timestamp(submission))

            except Exception:
                self.logger.log(level='exception')
                # BUGFIX: skip this submission — the code below would read
                # variables that may be undefined (or stale from the
                # previous iteration) after a failure
                continue

            # Messages keyed by user_id
            electron = Electron(
                user_id, {
                    'comments': comments,
                    'points': points,
                    'timestamp': timestamp,
                    'text': text
                })
            self.send_seen_user_event(user_id)
            self.send(electron)
            self.logger.log(f'[SUBMISSION] [{electron.key}] {electron.value}',
                            level='debug')

    def _process_new_comments(self, subreddit='r/all'):
        """Forward unseen comments of the subreddit, keyed by user_id."""
        for comment in rch.get_subreddit_comments_elements(
                subreddit, self.spider_name, 100):
            try:
                # Filter repeated comments
                comment_id = rch.get_comment_id(comment)

                if not self.aerospike.exists(comment_id,
                                             set_name='seen_comments'):
                    self.aerospike.put(comment_id, set_name='seen_comments')
                else:
                    continue

                # Initializer mode, it only makes sense for r/all
                if subreddit == 'r/all':
                    # The text may be posted to its own profile
                    subreddit_id = rch.get_comment_subreddit_id(comment)
                    if subreddit_id:
                        self.send(
                            Electron(value=subreddit_id, topic='subreddits'))
                        self.send_seen_subreddit_event(subreddit_id)
                else:
                    subreddit_id = subreddit

                user_id = rch.get_comment_user_id(comment)
                if not user_id:
                    continue
                self.logger.log(f'  + [text] {user_id} ({comment_id})',
                                level='debug')

                text = rch.get_comment_body(comment)

                # Number of comments (fixed value for comments)
                comments = 0

                # Points (fixed value for comments)
                points = 1

                # Timestamp
                timestamp = int(rch.get_comment_timestamp(comment))

            except Exception:
                self.logger.log(level='exception')
                # BUGFIX: skip this comment — the code below would read
                # variables that may be undefined (or stale from the
                # previous iteration) after a failure
                continue

            # Messages keyed by user_id
            electron = Electron(
                user_id, {
                    'comments': comments,
                    'points': points,
                    'timestamp': timestamp,
                    'text': text
                })
            self.send_seen_user_event(user_id)
            self.send(electron)
            self.logger.log(f'[COMMENT] [{electron.key}] {electron.value}',
                            level='debug')

    def reject_request(self, context):
        """Back off briefly, then ask the chooser for another subreddit."""
        self.logger.log('Subreddit request rejected', level='warn')
        time.sleep(5)
        self.rpc_notify('request_subreddit', to='SubredditChooser')

    @rpc
    def put_subreddit(self, context, subreddit_id):
        """RPC: receive a candidate subreddit; enqueue it or ask for another."""
        if subreddit_id not in self.seen_subreddits:
            self.logger.log(f'Received VALID subreddit {subreddit_id} via RPC')
            self.subreddits_queue.put(subreddit_id)
            return
        self.logger.log(f'Received NOT VALID subreddit {subreddit_id} via RPC')
        self.rpc_notify('request_subreddit', to='SubredditChooser')

    def _explore(self, subreddit_id):
        """Process all new submissions and comments of one subreddit."""
        self.logger.log(f'Exploring subreddit: {subreddit_id}')
        self._process_new_submissions(subreddit_id)
        self._process_new_comments(subreddit_id)

    def explorer(self):
        """Background loop: request a subreddit, wait for it, explore it."""
        # The original `while (running)` flag was never modified
        while True:
            self.logger.log('Subreddit requested')
            self.rpc_notify('request_subreddit', to='SubredditChooser')
            try:
                subreddit_id = self.subreddits_queue.get()
                if subreddit_id not in self.seen_subreddits:
                    self.seen_subreddits.add(subreddit_id)
                    self._explore(subreddit_id)
                else:
                    self.logger.log(f'{subreddit_id} is known, skipping',
                                    level='warn')
            except Exception:
                self.logger.log(level='exception')
Ejemplo n.º 13
0
class TopicAnalysis(Link):

    # Remote method to add input topics dynamically
    def add_topic(self, context, topic, id, image):
        """Attach *topic* as an input topic when this instance matches *id*,
        then ask the Consumer to recalculate its hash."""
        if self.id != id:
            return
        self.add_input_topic(topic)
        self.rpc_call('Consumer',
                      'recalculate_hash',
                      args=[self.input_topics, self.id, image])

    # Remote method to remove input topics dynamically
    def remove_topic(self, context, topic, id, image):
        """Detach *topic* from the input topics when this instance matches *id*,
        then ask the Consumer to recalculate its hash."""
        if self.id != id:
            return
        self.remove_input_topic(topic)
        self.rpc_call('Consumer',
                      'recalculate_hash',
                      args=[self.input_topics, self.id, image])

    # Remote callable method that allows us to add ntopics dynamically
    def add_ntopics(self, context, output_topic):
        """Register a new topic count encoded in *output_topic* ("<id>-<n>"),
        skipping duplicates.  Malformed input is caught and printed."""
        try:
            parts = output_topic.split("-")
            # Short-circuit keeps the original order of checks: parts[1] is
            # only touched once the id matches (duplicates are avoided).
            if self.id == parts[0] and parts[1] not in self.ntopics:
                print("Adding new number of topics...")
                self.ntopics.append(parts[1])
                self.output_topics.append(output_topic)
                print("Number of topics", self.ntopics)
                print("Output topics", self.output_topics)
                print("###################################")
        except Exception as e:
            print("Exception", e)

    # Remote method to initialize corpus in case it isn't
    def init_corpus(self, context, id, c_args):
        """Initialize the corpus store if this instance matches *id* and
        has no corpus yet.

        One argument is treated as a CircularOrderedSet capacity; several
        arguments are kept as a timestamp window backed by a plain list.
        """
        if self.id != id or hasattr(self, 'corpus'):
            return
        if len(c_args) == 1:
            self.corpus = CircularOrderedSet(int(c_args[0]))
            print("Length ->", c_args[0])
        else:
            self.corpus = []
            self.window = c_args
            print("Timestamps ->", self.window)

    def __make_bigram_mod(self, doc):
        """Train a bigram Phrases model on *doc* and return its frozen Phraser."""
        phrases = gensim.models.Phrases(doc, min_count=5, threshold=100)
        return gensim.models.phrases.Phraser(phrases)

    def __make_bigrams(self, texts):
        """Apply a per-document bigram model to every document in *texts*.

        NOTE(review): a Phrases model is trained on each single document;
        with min_count=5 a lone document rarely yields any merged bigram —
        confirm whether one corpus-wide model was intended instead.
        """
        return [self.__make_bigram_mod(doc)[doc] for doc in texts]

    def __lemmatization(self,
                        texts,
                        allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
        """Lemmatize *texts*, keeping only tokens whose POS tag is in
        *allowed_postags* (noun, adj, verb, adv by default).

        https://spacy.io/api/annotation

        Fixes: the default argument was a mutable list (shared across
        calls); the spaCy pipeline was reloaded on every call and is now
        loaded lazily once and cached on the instance.
        """
        if not hasattr(self, '_nlp'):
            # Only the POS tagger is needed here; parser/ner are disabled.
            self._nlp = spacy.load('en', disable=['parser', 'ner'])
        texts_out = []
        for sent in texts:
            doc = self._nlp(" ".join(sent))
            texts_out.append([
                token.lemma_ for token in doc if token.pos_ in allowed_postags
            ])
        return texts_out

    def __preprocess_corpus(self, corpus):
        """Turn the raw corpus of frozenset payloads into a gensim
        bag-of-words corpus.

        Returns ``(bow_corpus, id2word)`` where *id2word* maps token ids
        to words.

        Fix: a trailing loop that rebound its loop variable with
        ``word = word.encode(...)`` was a no-op and has been removed.
        """
        texts = []
        # Iterate over a snapshot: the underlying set may change size
        # concurrently while we read it.
        for text in list(corpus):
            for element in text:
                # Only the textual fields matter for topic modelling.
                if element[0] == b'body' or element[0] == b'submission_title':
                    texts.append(preprocessForTopic(str(element[1]), self.languages))

        data_words_bigrams = self.__make_bigrams(texts)
        data_lemmatized = self.__lemmatization(data_words_bigrams)

        # Dictionary: associates an id to every word in a document.
        id2word = corpora.Dictionary(data_lemmatized)
        print("!!!Id2word", id2word)

        # Term-document frequency: pairs (word_id, count) per document,
        # e.g. (0, 1) -> word 0 occurs once.
        corpus = [id2word.doc2bow(text) for text in data_lemmatized]
        print("!!!Corpus", corpus)
        return corpus, id2word

    def __render_model(self, model, corpus, dict, ntopics):
        """Render the LDA *model* with pyLDAvis and return only the
        embeddable LDAvis bootstrap script extracted from the HTML.

        NOTE(review): *ntopics* is unused here; kept for interface
        compatibility with the caller.
        """
        prepared = pyLDAvis.gensim.prepare(model, corpus, dict)
        html = pyLDAvis.prepared_data_to_html(prepared,
                                              template_type="simple",
                                              visid="pyldavis")
        # Pull the `!function(LDAvis){...}(LDAvis);` block out of the page.
        match = re.search(r"\!function\(LDAvis\)\{(.+?)\}\(LDAvis\)\;", html,
                          re.MULTILINE | re.DOTALL)
        return '!function(LDAvis){' + match.group(1) + '}(LDAvis);'

    def __make_topic_analysis(self, corpus, id2word, ntopics):
        """Fit an LDA model with *ntopics* topics over *corpus* and return
        its pyLDAvis rendering (an embeddable JS snippet).

        Fix: a ``top_words_per_topic`` list (top-5 words per topic) was
        built here but never used — its DataFrame return was commented
        out — so the dead computation has been removed.
        """
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=ntopics,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)
        pprint(lda_model.print_topics())
        return self.__render_model(lda_model, corpus, id2word, ntopics)

    def setup(self):
        """Parse ``self.args`` into corpus configuration and initial topic
        counts.

        Expected layout: corpus args, then ``'#'``, then the topic counts,
        e.g. ``['1000', '#', '5', '10']``.  A single corpus arg is a
        CircularOrderedSet capacity; several are a timestamp window.

        Fix: when args were present but contained no ``'#'``, the old code
        referenced the undefined ``aux2`` and raised a NameError (silently
        swallowed by the broad except); the loop now lives inside the
        ``'#'`` branch.
        """
        try:
            self.ntopics = []
            # Initialize languages for the natural language toolkit.
            self.languages = init_nltk()
            self.id = self.consumer_group
            print("Id del contenedor", self.id)
            if self.args and "#" in self.args:
                separator = self.args.index("#")
                corpus_args = self.args[:separator]
                if len(corpus_args) == 1:
                    self.corpus = CircularOrderedSet(int(corpus_args[0]))
                    print("Length->", corpus_args[0])
                else:
                    self.corpus = []
                    self.window = corpus_args
                    print("Timestamps->", self.window)
                for arg in self.args[separator + 1:]:
                    self.ntopics.append(arg)
                print("Initial number of topics...", self.ntopics)
        except Exception as e:
            print("Exception", e)

    def send_electrons(self, electron):
        """Emit one topic analysis per configured topic count, each to its
        own output topic, reusing the given *electron* as the envelope."""
        corpus, id2word = self.__preprocess_corpus(self.corpus)
        print("Ntopics", self.ntopics)
        for ntopic, out_topic in zip(self.ntopics, self.output_topics):
            electron.topic = out_topic
            electron.value = self.__make_topic_analysis(corpus, id2word, ntopic)
            self.send(electron)

    def _as_corpus_item(self, electron):
        # Hashable, ASCII-only snapshot of the message payload.
        return frozenset(remove_non_ascii(electron.value.items()))

    def transform(self, electron):
        """Add the incoming message to the corpus and re-emit the topic
        analyses.

        In window mode (corpus is a list) a message is kept only when its
        timestamp falls inside ``[window[0], window[1]]``; otherwise every
        message is added to the bounded corpus set.

        Fixes: ``type(...) is list`` replaced with ``isinstance``;
        ``except (Exception, UnicodeEncodeError)`` was redundant since
        UnicodeEncodeError already subclasses Exception; the duplicated
        frozenset-build/send code is factored into ``_as_corpus_item``.
        """
        try:
            if isinstance(self.corpus, list):  # timestamp-window mode
                timestamp = int(electron.value['timestamp'])
                if electron.value['src'] == 'twitter':
                    timestamp /= 1000  # tweet timestamps are in milliseconds
                if not (int(self.window[0]) <= timestamp <= int(self.window[1])):
                    return  # outside the requested time range
                self.corpus.append(self._as_corpus_item(electron))
            else:
                self.corpus.add(self._as_corpus_item(electron))
            self.send_electrons(electron)
        except Exception as e:
            print("Exception", e)
            traceback.print_exc()