Example 1
    def coleta_to_dict(self):
        """
        Convert status object to Python dict
        :return:
        """

        return conv.document2dict(self.base, self)
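
All of the *_to_dict helpers in these examples delegate to the same liblightbase converter. A minimal round-trip sketch, assuming the converters live in liblightbase.lbutils.conv as the imports here suggest; coleta_base and its nome field are hypothetical placeholders (metaclass() and dict2document appear in Example 20 below):

from liblightbase.lbutils import conv  # assumed module path for the converters

# coleta_base stands in for any liblightbase base wrapper used in these examples
Coleta = coleta_base.lbbase.metaclass()                # generated document class
doc = Coleta(nome='exemplo')                           # hypothetical field
as_dict = conv.document2dict(coleta_base.lbbase, doc)  # document -> dict
doc_again = conv.dict2document(coleta_base.lbbase, as_dict, Coleta)  # dict -> document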
Example 2
    def process_status(self, status_id_doc):
        """
        Process status
        :param id_doc: id_doc for analytics
        :param status_id_doc: Status id_doc
        :return: True or False
        """
        try:
            result = self.status_base.get_document(status_id_doc)
        except ConnectionError as e:
            log.error("CONNECTION ERROR: Error processing %s\n%s", status_id_doc, e.message)
            time.sleep(1)
            result = self.status_base.get_document(status_id_doc)

        # JSON
        status_dict = conv.document2dict(self.status_base.lbbase, result)
        # Manually add id_doc
        status_dict['_metadata'] = dict()
        status_dict['_metadata']['id_doc'] = status_id_doc

        # Flag the status for analytics when positives or negatives counts are present
        update = False
        if status_dict.get('positives') is not None or status_dict.get('negatives') is not None:
            update = True

        if update:
            return status_dict
        else:
            return None
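
A driver for process_status only needs to test the return value, since statuses without feedback counts come back as None. A sketch, assuming the surrounding class exposes status_base as in the example; the collect_for_analytics name is hypothetical:

    def collect_for_analytics(self):
        # Keep only the statuses that carry positives/negatives counts
        selected = []
        for status_id_doc in self.status_base.get_document_ids():
            status_dict = self.process_status(status_id_doc)
            if status_dict is not None:
                selected.append(status_dict)
        return selected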
Example 3
    def orgao_to_dict(self):
        """
        Convert status object to Python dict
        :return:
        """

        return conv.document2dict(orgao_base.lbbase, self)
Example 4
    def user_to_dict(self):
        """
        Convert status object to Python dict
        :return:
        """

        return conv.document2dict(user_base.lbbase, self)
Example 5
    def atividade_to_dict(self):
        """
        Convert status object to Python dict
        :return:
        """

        return conv.document2dict(atividade_base.lbbase, self)
Example 6
    def twitter_embed(self):
        """
        View for twitter embed

        :return: Twitter HTML code with oEmbed
        """
        status_id = self.request.matchdict.get('status_id')
        if status_id is None:
            log.error("You have to supply status_id")
            raise HTTPError

        status = self.status_base.get_document(status_id)
        status_dict = conv.document2dict(self.status_base.lbbase, status)
        status_dict['_metadata'] = dict()
        status_dict['_metadata']['id_doc'] = status_id

        # Load original source
        source = json.loads(status_dict['source'])
        status_dict['source'] = source[0]

        # Get status oembed
        oembed = self.lbt.api.GetStatusOembed(id=status_dict['source']['_id'],
                                              lang='pt')

        # Get category
        if status_dict.get('events_tokens') is not None:
            status_dict['category'] = utils.get_category(
                status_dict['events_tokens'])
        else:
            status_dict['category'] = utils.get_category(
                [status_dict['search_term']])

        return {'oembed_html': oembed['html'], 'status': status_dict}
Example 7
    def allreports_to_dict(self):
        """
        Convert status object to Python dict
        :return:
        """

        return conv.document2dict(allreports.lbbase, self)
Example 8
    def test_store_location(self):
        """
        Test location base storage
        """
        location_base = location.LocationBase()
        location_lbbase = location_base.create_base()
        self.assertIsInstance(location_lbbase, Base)

        # Load data
        status_list = self.lbt.search(count=10)
        result = self.lbt.store_twitter(status_list=status_list, tokenize=True)
        self.assertTrue(result)

        status_id_list = self.status_base.get_document_ids()
        log.debug("Number of status found: %s", len(status_id_list))
        status = self.status_base.get_document(status_id_list[0])

        status_dict = conv.document2dict(self.status_base.lbbase, status)
        # Manually add id_doc
        status_dict['_metadata'] = dict()
        status_dict['_metadata']['id_doc'] = status._metadata.id_doc
        # Now try to find location
        status_dict = liblocation.get_location(status_dict)

        # Update base
        self.assertIsNotNone(
            self.status_base.documentrest.update(
                status_dict['_metadata']['id_doc'],
                json.dumps(status_dict)
            )
        )

        result = location_base.remove_base()
        self.assertTrue(result)
Example 9
    def notify_to_dict(self):
        """
        Convert status object to Python dict
        :return:
        """

        return conv.document2dict(notify_base.lbbase, self)
Example 10
    def desc_to_dict(self):
        """
        Convert status object to Python dict
        :return:
        """

        return conv.document2dict(desc.lbbase, self)
Example 11
    def process_geo_dict(self, id_doc, max_distance=50000, status_dict=None):
        """
        Get Brasil city distance from document
        :param max_distance: Max distance (Meters) to consider
        :return: Dict with Geo information from LBGeo
        """
        if status_dict is None:
            document = self.get_document(id_doc)
            status_dict = conv.document2dict(self.lbbase, document)

        if status_dict.get('location') is None:
            if status_dict.get('arg_structures') is not None:
                # Now try to find location again
                status_dict['_metadata'] = dict()
                status_dict['_metadata']['id_doc'] = id_doc
                status_dict = location.get_location(status_dict)

                if status_dict.get('location') is None:
                    log.error("Location not available for document id = %s",
                              id_doc)
                    return status_dict
            else:
                log.error("Location not available for document id = %s",
                          id_doc)
                return status_dict

        params = {
            'lat': status_dict['location']['latitude'],
            'lng': status_dict['location']['longitude']
        }

        url = self.geo_url + '/city'
        result = requests.post(url=url, data=json.dumps(params))

        # Check for Exception
        try:
            result.raise_for_status()
        except HTTPError as e:
            log.error("Connection error in id_doc = %s\n%s", id_doc, e.message)
            return status_dict

        try:
            city = result.json()
        except ValueError as e:
            log.error("Error parsing response for id_doc = %s\n%s", id_doc,
                      e.message)
            return status_dict

        # Check for max distance
        if float(city['city_distance']) > float(max_distance):
            # Do not take this distance
            log.debug("Distance = %s bigger than maximum = %s",
                      city['city_distance'], max_distance)
            return status_dict

        # Now update document with city
        status_dict['brasil_city'] = city

        return status_dict
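
For reference, the LBGeo exchange inside process_geo_dict reduces to a single POST with the coordinates as a JSON body. A standalone sketch of that contract, inferred from the example above; the geo_url value is a placeholder and only the city_distance response field is attested:

import json
import requests

def query_city(geo_url, lat, lng):
    """POST coordinates to the LBGeo /city endpoint and return the parsed JSON."""
    params = {'lat': lat, 'lng': lng}
    result = requests.post(url=geo_url + '/city', data=json.dumps(params))
    result.raise_for_status()   # raises requests.HTTPError on 4xx/5xx
    return result.json()        # carries 'city_distance' in meters

# city = query_city('http://lbgeo.example.com', -15.79, -47.88)
# usable = float(city['city_distance']) <= 50000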
Example 12
 def modify_aaData(self, results, base_name):
     model = self.model_base(base_name)
     data = []
     for result in results:
         temp = document2dict(model, result)
         temp['id_doc'] = result._metadata.id_doc
         data.append(temp)
     return data
Example 13
    def process_hashtags(self, id_doc):
        result = self.get_document(id_doc)

        # JSON
        status_dict = conv.document2dict(self.lbbase, result)

        # Manually add id_doc
        status_dict['_metadata'] = dict()
        status_dict['_metadata']['id_doc'] = id_doc

        return self.process_hashtags_dict(status_dict)
Example 14
    def process_tokens(self, id_doc, update=True):
        """
        Process tokens for this id_doc

        :param id_doc: Document to be processed
        :param update: Whether we should update dictionary frequency or not
        :return: True when the document was processed and updated
        """
        result = self.get_document(id_doc)

        # JSON
        status_dict = conv.document2dict(self.lbbase, result)

        # Manually add id_doc
        status_dict['_metadata'] = dict()
        status_dict['_metadata']['id_doc'] = id_doc

        # SRL tokenize
        tokenized = srl.srl_tokenize(status_dict['text'])
        if tokenized.get('arg_structures') is not None:
            status_dict['arg_structures'] = tokenized.get('arg_structures')

        if tokenized.get('tokens') is not None:
            status_dict['tokens'] = tokenized.get('tokens')

        # Now try to find location
        status_dict = location.get_location(status_dict)

        # Process tokens if selected
        dictionary_base = dic.DictionaryBase(
            dic_base=self.dictionary_base
        )
        result = dictionary.process_tokens_dict(status_dict, dictionary_base, update=update)
        log.debug("Corpus da tokenização calculado. id_doc = %s", id_doc)
        status_dict = result['status']

        # Extract hashtags
        status_dict = self.get_hashtags_dict(status_dict)

        # Calculate category
        status_dict = self.get_category(status_dict)

        # Get brasil city information
        status_dict = self.process_geo_dict(
            id_doc=id_doc,
            status_dict=status_dict
        )

        # Now update document back
        self.documentrest.update(id_doc, json.dumps(status_dict))

        return True
Example 15
    def process_status_categories(self, status_id_doc):
        """
        Process status
        :param status_id_doc: Status id_doc
        :return: Status dict stored
        """
        try:
            result = self.status_base.get_document(status_id_doc)
        except ConnectionError as e:
            log.error("CONNECTION ERROR: Error processing %s\n%s", status_id_doc, e.message)

            # Try again in one second (recurses until the fetch succeeds)
            time.sleep(1)
            status_dict = self.process_status_categories(status_id_doc)
            return status_dict

        # JSON
        status_dict = conv.document2dict(self.status_base.lbbase, result)
        # Manually add id_doc
        status_dict['_metadata'] = dict()
        status_dict['_metadata']['id_doc'] = status_id_doc

        return status_dict
Example 16
 def blacklist_to_dict(self):
     """
     Convert status object to Python dict
     :return:
     """
     return conv.document2dict(blacklist_base.lbbase, self)
Example 17
 def crimes_to_dict(self):
     """
     Convert crimes object to Python dict
     :return: dict for crime
     """
     return conv.document2dict(self.crimes_base.lbbase, self)
Example 18
 def analytics_to_dict(self):
     """
     Convert analytics object to Python dict
      :return: dict for analytics
     """
     return conv.document2dict(self.analytics_base.lbbase, self)
Example 19
    def update_status_files(self,
                            lb_intercommunication_obj,
                            key=None,
                            value=None):
        """ Atualiza a estrutura de arquivos

        """
        add_key = key
        list_update_object = []
        value_attr = value
        for obj in self.get_files_doc(lb_intercommunication_obj):
            dict_document = document2dict(
                LbLibLightBaseSetCs().liblightbase_schemes_df(
                    'db_reg_anot_teses_arq'), obj)
            list_update_object = []
            for assunt_vinc in dict_document.get('mg_assunt_vinc'):
                if str(assunt_vinc.get(
                        'int_id_doc_assunt')) in self.list_id_reg_teses:
                    assunt_vinc[add_key] = value_attr
                list_update_object.append(assunt_vinc)
            # dict_lbdoc = {
            #             #"lb_ctrl_sh_slt": "_metadata",
            #             "lb_ctrl_op": "db_search",
            #             "lb_ctrl_db": "db_reg_anot_teses_arq",
            #             "lb_ctrl_qry": 'str_file_hash = ' + obj.str_file_hash,
            #             "lb_ctrl_cookie": lb_intercommunication_obj.user_credentials["lb_cookie"]
            #           }

            if key == 'bool_ativ':
                # the file stays active unless every linked subject is inactive
                bool_ativ_file = not [
                    mg_assunt.get('bool_ativ', True)
                    for mg_assunt in list_update_object
                ].count(False) == len(list_update_object)
                add_path = {
                    "path": "bool_ativ_file",
                    "mode": "update",
                    "fn": None,
                    "args": [bool_ativ_file]
                }

            elif key == 'bool_del':
                # the file is only flagged deleted when every linked subject is
                bool_del_file = [
                    mg_assunt.get('bool_del', False)
                    for mg_assunt in list_update_object
                ].count(True) == len(list_update_object)

                add_path = {
                    "path": "bool_del_file",
                    "mode": "update",
                    "fn": None,
                    "args": [bool_del_file]
                }

            operation_lbdoc = {
                'lb_ctrl_op': 'update_collection',
                'lb_ctrl_db': 'db_reg_anot_teses_arq',
                'search': {
                    'literal': "str_file_hash = '" + obj.str_file_hash + "'",
                    'limit': 1
                },
                'path_operation': [{
                    "path": "mg_assunt_vinc",
                    "mode": "update",
                    "fn": None,
                    "args": [list_update_object]
                }],
                'lb_ctrl_cookie':
                    lb_intercommunication_obj.user_credentials["lb_cookie"]
            }
            operation_lbdoc['path_operation'].append(add_path)
            submit_operations_df_return = LBDOC_OBJ.lbdoc_obj.submit_operations_df(
                None, operation_lbdoc, False)
            last_return = submit_operations_df_return.\
                lbdoc_return_objs.lbdoc_return_objs[-1]
            if last_return.status == "success":
                continue
Example 20
    def test_x(self):
        Pessoa = self.base.metaclass()
        Gmulti = self.base.metaclass('gmulti')
        Dependente = self.base.metaclass('dependente')
        lbbase = self.base

        class Y(Gmulti):
            @property
            def teste(self):
                return Gmulti.teste.__get__(self)

            @teste.setter
            def teste(self, v):
                Gmulti.teste.__set__(self, v)

        class X(Pessoa):
            def __init__(self, **args):
                super(X, self).__init__(**args)

            @property
            def nome(self):
                return Pessoa.nome.__get__(self)

            @nome.setter
            def nome(self, v):
                Pessoa.nome.__set__(self, v)

            @property
            def dependente(self):
                return Pessoa.dependente.__get__(self)

            @dependente.setter
            def dependente(self, v):
                Pessoa.dependente.__set__(self, v)

            def set_dependentes(self):
                g1 = dict(teste='ww',
                          teste2=['dgfkdsgsghslkdghsk', 'dsgjsd.,gjsd.gjs'])
                g1_obj = dict2document(lbbase, g1, Gmulti)
                g2 = dict(teste='ww',
                          teste2=['dgfkdsgsghslkdghsk', 'dsgjsd.,gjsd.gjs'])
                g2_obj = dict2document(lbbase, g2, Gmulti)
                g3 = dict(teste='ww',
                          teste2=['dgfkdsgsghslkdghsk', 'dsgjsd.,gjsd.gjs'])
                g3_obj = dict2document(lbbase, g3, Gmulti)
                g4 = dict(teste='ww',
                          teste2=['dgfkdsgsghslkdghsk', 'dsgjsd.,gjsd.gjs'])
                g4_obj = dict2document(lbbase, g4, Gmulti)
                d1 = dict(nome_dep='xxx', gmulti=[g1, g2])
                d1_obj = dict2document(lbbase, d1, Dependente)
                d2 = dict(nome_dep='xxx', gmulti=[g3, g4])

                d2_obj = dict2document(lbbase, d2, Dependente)

                d1 = [d1_obj, d2_obj]
                self.dependente = d1

        x = X(nome='aa', carros=['d'])
        x.set_dependentes()
        j = document2json(self.base, x, indent=4)
        self.assertIsNotNone(j)
        with open('/tmp/document2.json', 'w+') as fd:
            fd.write(j)

        p = document2dict(self.base, x)
        y = X(**p)
        self.assertIsInstance(y, X)
Example 21
    def store_twitter(self, status_list, tokenize=True):
        """
        Store twitter status in LB Database

        :param status_list: List of status to be stored
        :param tokenize: Whether we should tokenize it directly or not
        :return: id_doc of the last stored status, or None when it could not be stored
        """
        for elm in status_list:
            status_json = self.status_to_json([elm])

            status = Status(
                origin='twitter',
                inclusion_date=datetime.datetime.now(),
                inclusion_datetime=datetime.datetime.now(),
                search_term=self.term,
                text=elm.text,
                source=status_json,
                base=self.status_base
            )

            retorno = status.create_status()

            if retorno is None:
                # log.error("Error inserting status %s on Base" % elm.text)
                log.error("Error inserting status  on Base" % elm.text)

                continue

            status_dict = conv.document2dict(self.status_base.lbbase, status)

            # Manually add id_doc
            status_dict['_metadata'] = dict()
            status_dict['_metadata']['id_doc'] = retorno

            if tokenize:
                # SRL tokenize
                if status_dict.get('text') is not None:
                    tokenized = srl.srl_tokenize(status_dict['text'])
                    if tokenized.get('arg_structures') is not None:
                        status_dict['arg_structures'] = tokenized.get('arg_structures')
                    if tokenized.get('tokens') is not None:
                        status_dict['tokens'] = tokenized.get('tokens')

            # Now try to find location
            status_dict = location.get_location(status_dict)

            # Process tokens if selected
            result = dictionary.process_tokens_dict(status_dict, self.dictionary_base)
            log.info("Corpus da tokenizacao calculado. id_doc = %s", retorno)
            status_dict = result['status']

            # Extract hashtags
            status_dict = self.status_base.get_hashtags_dict(status_dict)

            # Calculate category
            status_dict = status.get_category(status_dict)

            # Get brasil city information
            status_dict = self.status_base.process_geo_dict(
                id_doc=retorno,
                status_dict=status_dict
            )

            # Now update document back
            self.status_base.documentrest.update(retorno, json.dumps(status_dict))

        return retorno
Example 22
def insert_from_status(lbstatus, dictionary_base=None, outfile=None):
    # try:
    #     assert isinstance(lbstatus, StatusBase)
    # except AssertionError as e:
    #     log.error("You have to supply a status instance\n%s", e)
    #     return

    task_queue = Queue()
    done_queue = Queue()
    processes = int(lbstatus.processes)

    # As we are reprocessing tokens, it is necessary to clear frequency
    dic_base = dictionary.DictionaryBase(dic_base=dictionary_base)
    dic_base.remove_base()
    dic_base.create_base()

    # Build the base URL used by the workers' GET requests
    rest_url = lbstatus.documentrest.rest_url
    rest_url += "/" + lbstatus.lbbase._metadata.name + "/doc/"

    id_status_list = lbstatus.get_document_ids()
    if id_status_list is None:
        log.error("No status found. Import some status first")
        return False

    for elm in id_status_list:
        params = dict(status_id=elm, outfile=outfile, rest_url=rest_url + elm)
        task_queue.put(params)

    for i in range(processes):
        # Allow parallel processing of the tokens
        Process(target=worker, args=(task_queue, done_queue)).start()

    # Load dictionary
    dic = corpora.Dictionary()
    if outfile is not None:
        if os.path.exists(outfile):
            # gensim's Dictionary.load() is a classmethod that returns the
            # loaded instance; calling it on `dic` would discard the result
            dic = corpora.Dictionary.load(outfile)

    max_size = lbstatus.max_size
    # Merge results with this dictionary
    log.debug("Processing results from dictionary creation")
    for i in range(len(id_status_list)):
        # Update status after processing
        processed = done_queue.get()
        dic2 = processed['dic']
        result = processed['status']

        # Results arrive in completion order, so take the id from the
        # processed document itself rather than the leftover loop variable
        id_doc = result._metadata.id_doc
        status_dict = conv.document2dict(lbstatus.lbbase, result)
        try:
            retorno = lbstatus.documentrest.update(id_doc,
                                                   json.dumps(status_dict))
        except HTTPError as e:
            log.error("Error updating document id = %s\n%s" %
                      (id_doc, e.message))

        dic.merge_with(dic2)
        log.debug("111111111111111111111: Novo dicionário %s", dic)
        if outfile is not None:
            # Serialize if it grows bigger than the amount of size
            if sys.getsizeof(dic, 0) >= max_size:
                log.info("Serializing dict as it reached max size %s",
                         max_size)
                dic.save(outfile)

    if outfile is not None:
        dic.save(outfile)

    # Tell child processes to stop
    for i in range(processes):
        task_queue.put('STOP')

    return True
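
The worker target used above is not part of the example. A conventional consumer compatible with the 'STOP' sentinel and with the keys read from done_queue would look roughly like this; process_one_status is a hypothetical stand-in for the real per-status processing:

def worker(task_queue, done_queue):
    # Drain params dicts until the 'STOP' sentinel arrives
    for params in iter(task_queue.get, 'STOP'):
        # Hypothetical helper: fetches params['rest_url'], tokenizes the
        # status and builds its partial gensim dictionary
        processed = process_one_status(params)
        # insert_from_status() expects exactly these two keys
        done_queue.put({'dic': processed['dic'],
                        'status': processed['status']})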