Ejemplo n.º 1
0
    def __create_unified_data_file(self, filename_output):
        """Merge the collected data files into ``filename_output``.

        The documents of the posts, comments, post-profiles and
        comment-profiles files are read (in that order) through a
        ``DataHandle`` and each batch is persisted to ``filename_output``
        with ``operation_type="a"``. When at least one document was read,
        one extra document recording the media storage location is
        persisted the same way.

        Any ``Exception`` raised along the way is reported on stdout and the
        process is terminated through ``sys.exit(1)``.
        """
        try:
            document_count = 0
            source_files = (
                self.filename_posts,
                self.filename_comments,
                self.filename_profiles_posts,
                self.filename_profiles_comments,
            )

            handler = DataHandle()
            for source_file in source_files:
                documents = handler.getData(filename_input=source_file)
                document_count += len(documents)
                handler.persistData(filename_output=filename_output,
                                    document_list=documents,
                                    operation_type="a")

            # Nothing was read, so there is no media location to record.
            if document_count <= 0:
                return

            # Create the document that indicates where the media is stored.
            media_location = '{}{}/{}/'.format(
                "/data/jsons/", self.current_timestamp, "medias")
            media_marker = {
                "tipo_documento": "midia",
                "local_armazenamento": media_location,
            }
            handler.persistData(filename_output=filename_output,
                                document_list=[media_marker],
                                operation_type="a")

        except Exception as error:
            failure_type, _, failure_tb = sys.exc_info()
            source_name = os.path.basename(
                failure_tb.tb_frame.f_code.co_filename)
            print('\nErro: ',
                  error,
                  '\tDetalhes: ',
                  failure_type,
                  source_name,
                  failure_tb.tb_lineno,
                  '\tData e hora: ',
                  datetime.now(),
                  flush=True)

            print("Finalizando script...")
            sys.exit(1)
Ejemplo n.º 2
0
    def create_collection_pipeline(self):
        """Run every collection stage and then build the unified data file.

        A "perfil" pass is scheduled when ``self.user_list`` has entries and
        a "hashtag" pass when ``self.hashtag_list`` has entries. Each pass
        collects its posts and then, in order, the media of those posts,
        their comments and the profiles of the commenters. After all passes
        the unified data file is created.

        If neither list has entries, or if any ``Exception`` is raised, an
        error document is written to ``self.filename_unified_data_file`` and
        the process is terminated through ``sys.exit(1)``.
        """

        def report_and_exit(error):
            # Shared failure path: print the details of the exception being
            # handled, persist them as an error document and stop the script.
            failure_type, _, failure_tb = sys.exc_info()
            source_name = os.path.split(
                failure_tb.tb_frame.f_code.co_filename)[1]
            print('\nErro: ',
                  error,
                  '\tDetalhes: ',
                  failure_type,
                  source_name,
                  failure_tb.tb_lineno,
                  '\tData e hora: ',
                  datetime.now(),
                  flush=True)

            error_document = self.__getErrorDocument(exception_obj=error,
                                                     exc_type=failure_type,
                                                     exc_tb=failure_tb)

            self.__create_error_file(
                filename_output=self.filename_unified_data_file,
                error_document=error_document)
            print("Finalizando script.")
            sys.exit(1)

        try:
            raw_start = str(datetime.now())
            handler = DataHandle()

            started_at = str(
                handler.getDateFormatted(string_datetime=raw_start))

            self.alias_data_path_source_files = '{}{}/'.format(
                "/data/jsons/", self.current_timestamp)
            print("Processo de coleta iniciado em {}\tSalvando dados em {}".
                  format(started_at, self.alias_data_path_source_files),
                  flush=True)

            passes = []

            if len(self.user_list) > 0:
                passes.append("perfil")
            if len(self.hashtag_list) > 0:
                passes.append("hashtag")

            if not passes:
                print(
                    "\nNenhum perfil ou hashtag informado para coleta. Finalizando script ",
                    flush=True)
                self.__create_error_file(
                    filename_output=self.filename_unified_data_file,
                    error_document={
                        "erro":
                        "Nenhum perfil ou hashtag informado para coleta.",
                        "detalhes": None,
                        "data_e_hora": str(datetime.now())
                    })
                sys.exit(1)

            for current_pass in passes:

                # Document type of the posts gathered in this pass; it stays
                # None when the profile pass finds no stored profiles.
                # NOTE(review): in that case the later stages still run with
                # document_type=None and label comments "comments_hashtag"
                # -- confirm this is intended.
                post_kind = None

                if current_pass == "perfil":
                    # STAGE 1.1 - PROFILES
                    targets = self.user_list
                    destination = self.filename_profiles_posts

                    self.__execute_data_collection(
                        filename_output=destination,
                        dataHandle=handler,
                        document_input_list=targets,
                        debug_message="Inicio da coleta de perfil de usuarios",
                        document_type="profiles_posts")

                    # STAGE 1.2 - POSTS OF THE PROFILES
                    targets = handler.getData(
                        filename_input=self.filename_profiles_posts,
                        attributes_to_select=['nome_do_usuario'])
                    destination = self.filename_posts

                    if len(targets) == 0:
                        print(
                            "\nAtencao: Nao existem perfis armazenados para coletar posts.",
                            flush=True)
                    else:
                        post_kind = "posts_profile"

                        self.__execute_data_collection(
                            filename_output=destination,
                            dataHandle=handler,
                            document_input_list=targets,
                            debug_message=
                            "Inicio da coleta de posts de usuario",
                            document_type=post_kind)

                elif current_pass == "hashtag":
                    # STAGE 1 - POSTS OF THE HASHTAGS

                    # Check that the login is valid before collecting.
                    try:
                        proxy_settings = self.__get_proxy(
                            does_not_increment=True)
                        session = localinstaloader.Instaloader(
                            proxies=proxy_settings)
                        session.login(user=self.instagram_user,
                                      passwd=self.instagram_passwd)
                    except Exception as login_error:
                        # Never returns: ends in sys.exit(1).
                        report_and_exit(login_error)

                    targets = self.hashtag_list
                    destination = self.filename_posts

                    post_kind = "posts_hashtag"

                    self.__execute_data_collection(
                        filename_output=destination,
                        dataHandle=handler,
                        document_input_list=targets,
                        debug_message=
                        "Inicio da coleta de posts com hashtag",
                        document_type=post_kind)

                # STAGE 2 - MEDIA OF THE POSTS
                media_destination = self.filepath_medias

                candidate_posts = handler.getData(
                    filename_input=self.filename_posts,
                    attributes_to_select=[
                        'identificador', "identificador_midia", "tipo_midia",
                        "identificador_coleta"
                    ],
                    document_type=post_kind)

                if current_pass == "perfil":
                    media_filter = self.users_to_download_media
                else:
                    media_filter = self.hashtags_to_download_media

                # Keep only the posts of the profiles or hashtags selected
                # for media download; an empty filter keeps every post.
                if len(media_filter) > 0:
                    media_posts = [
                        post for post in candidate_posts
                        if post["identificador_coleta"] in media_filter
                    ]
                else:
                    media_posts = candidate_posts

                if len(media_posts) == 0:
                    print(
                        "\nAtencao: Nao existem posts armazenados para coletar midia.",
                        flush=True)
                else:
                    self.__execute_data_collection(
                        filename_output=media_destination,
                        dataHandle=handler,
                        document_input_list=media_posts,
                        debug_message="Inicio da coleta de media dos posts",
                        document_type="media")

                # STAGE 3 - COMMENTS OF THE POSTS
                targets = handler.getData(
                    filename_input=self.filename_posts,
                    attributes_to_select=['identificador'],
                    document_type=post_kind)
                destination = self.filename_comments

                comment_kind = "comments_hashtag"
                if post_kind == "posts_profile":
                    comment_kind = "comments_profile"

                if len(targets) == 0:
                    print(
                        "\nAtencao: Nao existem posts armazenados para coletar comentarios.",
                        flush=True)
                else:
                    self.__execute_data_collection(
                        filename_output=destination,
                        dataHandle=handler,
                        document_input_list=targets,
                        debug_message="Inicio da coleta de comments dos posts",
                        document_type=comment_kind)

                # STAGE 4 - PROFILES OF THE COMMENTERS
                targets = handler.getData(
                    filename_input=self.filename_comments,
                    attributes_to_select=['nome_do_usuario'],
                    document_type=comment_kind)
                destination = self.filename_profiles_comments

                if len(targets) == 0:
                    print(
                        "\nAtencao: Nao existem comentarios armazenados para coletar perfis de comentadores.",
                        flush=True)
                else:
                    self.__execute_data_collection(
                        filename_output=destination,
                        dataHandle=handler,
                        document_input_list=targets,
                        debug_message=
                        "Inicio da coleta de perfil de comentadores",
                        document_type="profiles_comments")

            self.__create_unified_data_file(
                filename_output=self.filename_unified_data_file)

        except Exception as pipeline_error:
            report_and_exit(pipeline_error)