Example 1
    def _get_user_by_id(self, user_id: int, proxy=None, timeout=50):
        session = self._requests
        url = self.__generate_user_name_link_by_id(user_id)
        logger.info("Obteniendo info del usuario con `id`", id=user_id)
        try:
            response = self.__make_request(session, url, proxy, timeout)
            return response

        except (
                requests.exceptions.HTTPError,
                requests.exceptions.Timeout,
                json.JSONDecodeError,
        ) as e:
            logger.error(
                "Request error for `user_id`",
                user_id=user_id,
                error=e,
            )
            return None
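
The `__make_request` helper called throughout these examples is not shown in the excerpts. Below is a minimal sketch of what it plausibly does, inferred from its call sites and from the inlined request in Example 2; the body is an assumption, not the actual implementation.

    def __make_request(self, session, url, proxy, timeout):
        # Hypothetical reconstruction: mirrors the inlined request pattern
        # of Example 2 (per-request proxy, shared HEADERS, error on 4xx/5xx).
        response = session.get(
            url,
            proxies={"http": proxy},
            headers=HEADERS,
            timeout=timeout,
        )
        response.raise_for_status()
        return response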
Example 2
    def scrape_post_comments(
        self,
        post_id: int,
        end_cursor="",
        retry=5,
        timeout=50,
        n_comments=50,
    ):

        session = self._requests

        post_code = InstagramPostsSerializer.get_code_from_id(post_id)

        next_page = True

        counter = 0

        while next_page and (counter < n_comments):
            try:
                variables = {
                    "shortcode": str(post_code),
                    "first": "50",
                    "after": "" if not end_cursor else end_cursor,
                }

                url = self.__generate_post_comments_link(variables)

                proxy = next(self.proxies_pool)

                response = session.get(
                    url,
                    proxies={"http": proxy},
                    headers=HEADERS,
                    timeout=timeout,
                )

                response.raise_for_status()

                response = response.json()

                try:
                    edge_media_to_parent_comment = response["data"][
                        "shortcode_media"]["edge_media_to_parent_comment"]

                except KeyError:
                    return []

                for edge in edge_media_to_parent_comment["edges"]:

                    comment_node = edge["node"]

                    comment = PostComment(comment_node)

                    counter += 1

                    yield comment

                next_page = edge_media_to_parent_comment["page_info"][
                    "has_next_page"]

                end_cursor = edge_media_to_parent_comment["page_info"][
                    "end_cursor"]

            except (
                    requests.exceptions.Timeout,
                    requests.exceptions.HTTPError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    json.JSONDecodeError,
            ) as e:
                logger.error(
                    "Error while requesting comments",
                    error=e,
                )

                if retry > 0:
                    logger.info(
                        "Retrying comment scraping, current count:",
                        cantidad_comentarios=counter,
                    )
                    yield from self.scrape_post_comments(
                        post_id=post_id,
                        n_comments=n_comments,
                        end_cursor=end_cursor,
                        retry=retry - 1,
                        timeout=timeout,
                    )

                return []
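
A usage sketch for the comment generator; `InstagramScraper` and the post id are hypothetical, since the real class name does not appear in these excerpts:

    scraper = InstagramScraper()  # hypothetical class name
    for comment in scraper.scrape_post_comments(post_id=2345678901,
                                                n_comments=20):
        print(comment)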
Example 3
    def scrape_hashtag(
        self,
        hashtag: str,
        end_cursor="",
        *,
        retry=5,
        timeout=50,
        date_limit=None,
    ):
        """Generador con los posts asociados a un hashtag.

        Args:
            hashtag: hashtag a buscar. Puede ser con `#` o sin el.
            end_cursor: Este argumento es para navegar a través de la API
                        de Instagram. Sirve para paginar. Se genera
                        automático en esta API. Primera pagina corresponde
                        a el valor `end_cursor = ""`.
            retry: En caso de errores en la requests, se vuelve a intentar tantas
                   veces como valga este parámetro.
            timeout: Tiempo de espera para la response.
            date_limit: Fecha límite donde buscar los posts.

        Returns:
            Si no hay resultados en la búsqueda, se retorna una lista vacía.

        Yields:
            json con la información de cada post encontrado.

        """
        session = self._requests
        next_page = True
        while next_page:
            try:
                # use a different proxy IP for each request
                proxy = next(self.proxies_pool)
                url = self.__generate_hashtag_api_link(hashtag, end_cursor)
                logger.info(
                    "Requesting `url` through `proxy`",
                    url=url,
                    proxy=proxy,
                )

                response = self.__make_request(session, url, proxy, timeout)

                response = response.json()

                try:
                    edge_hashtag_to_media = response["graphql"]["hashtag"][
                        "edge_hashtag_to_media"]

                except KeyError:
                    return []

                for edge in edge_hashtag_to_media["edges"]:
                    post = OpenUserPost(edge["node"])
                    yield post
                    if date_limit and post.time <= date_limit:  # pylint: disable=no-member
                        break

                next_page = edge_hashtag_to_media["page_info"]["has_next_page"]

                end_cursor = edge_hashtag_to_media["page_info"]["end_cursor"]
            except (
                    requests.exceptions.Timeout,
                    requests.exceptions.HTTPError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    json.JSONDecodeError,
            ) as e:
                logger.error(
                    "Error while requesting the url",
                    url=url,
                    error=e,
                )

                if retry > 0:
                    logger.info("Re intentando para url", url=url)
                    yield from self.scrape_hashtag(
                        hashtag,
                        end_cursor=end_cursor,
                        retry=retry - 1,
                        timeout=timeout,
                        date_limit=date_limit,
                    )

                return []
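
A usage sketch for the hashtag generator; the class name and hashtag are hypothetical, and it assumes `post.time` is a datetime, since that is what the code above compares against `date_limit`:

    from datetime import datetime, timedelta

    scraper = InstagramScraper()  # hypothetical class name
    week_ago = datetime.now() - timedelta(days=7)
    for post in scraper.scrape_hashtag("#python", date_limit=week_ago):
        print(post)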
Example 4
    def scrape_user_posts(
        self,
        user_id: int,
        n_posts=50,
        end_cursor="",
        timeout=50,
        retry=5,
    ):
        """Generador con los posts de un usuario.

        Args:
            user_id: id del usuario
            n_posts: Cantidad de posts que se desean capturar.
            end_cursor: Este argumento es para navegar a través de la API
                        de Instagram. Sirve para paginar. Se genera
                        automático en esta API. Primera pagina corresponde
                        a el valor `end_cursor = ""`.
            timeout: Tiempo de espera para la response.
            date_limit: Fecha límite donde buscar los posts.

        Returns:
            Si no hay resultados en la búsqueda, se retorna una lista vacía.

        Yields:
            json con la información de cada post del usuario.

        """

        session = self._requests
        next_page = True
        post_counter = 0
        while next_page and (post_counter < n_posts):
            try:
                proxy = next(self.proxies_pool)

                variables = {
                    "id": str(user_id),
                    "first": str(n_posts),
                    "after": str(end_cursor),
                }

                url = self.__generate_user_posts_link(variables)
                response = self.__make_request(session, url, proxy, timeout)
                response = response.json()

                try:
                    user = response["data"]["user"]

                    if not user:
                        # TODO: check when this case happens
                        return []

                    edge_owner_to_timeline_media = user[
                        "edge_owner_to_timeline_media"]
                except KeyError:
                    return []

                for edge in edge_owner_to_timeline_media["edges"]:

                    post = UserTimeLinePost(edge["node"])

                    post_counter += 1

                    yield post

                end_cursor = edge_owner_to_timeline_media["page_info"][
                    "end_cursor"]

                next_page = edge_owner_to_timeline_media["page_info"][
                    "has_next_page"]

            except (
                    requests.exceptions.Timeout,
                    requests.exceptions.HTTPError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    json.JSONDecodeError,
            ) as e:
                logger.error(
                    "Error while requesting the url",
                    error=e,
                )

                if retry > 0:
                    logger.info(
                        "Retrying post scraping, current count:",
                        cantidad_posts=post_counter,
                    )
                    yield from self.scrape_user_posts(
                        user_id=user_id,
                        n_posts=n_posts,
                        end_cursor=end_cursor,
                        retry=retry - 1,
                        timeout=timeout,
                    )

                return []
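
All three Instagram generators recover from request errors the same way: they recurse into themselves with `retry - 1` and the last known `end_cursor`, delegating with `yield from` so the caller sees a single uninterrupted stream. A minimal standalone sketch of that pattern (the `fetch_page` helper is hypothetical):

    def paged(cursor="", retry=3):
        while True:
            try:
                items, cursor, has_next = fetch_page(cursor)  # hypothetical
                yield from items
                if not has_next:
                    return
            except IOError:
                if retry > 0:
                    # resume from the last good cursor in a fresh frame
                    yield from paged(cursor, retry - 1)
                return

Example 5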
    def _gen_tweet(self, url, headers, date_limit, retry, params=None):
        """Generador que busca los posts en url de users o hashtags.

        Args:
            url: Links hacia la api de twitter de user o hashtags.
            headers: Headers de request http.
            params: Parámetros adicionales para la requests.
            date_limit: Fecha límite donde buscar posts.
            retry: Si por alguna razón falla una requests, intenta de nuevo
                   tantas veces como sea este valor.

        Returns:
            Lista vacia si es que no hay resultados

        Yields:
            Diccionarios con información de cada posts

        """

        session = self._requests
        proxy = next(self.proxies_pool)

        if not params:
            params = {}

        logger.info(
            "Requesting `url` through `proxy`",
            url=url,
            proxy=proxy,
        )

        response = session.get(url,
                               headers=headers,
                               params=params,
                               proxies={"http": proxy})

        while True:
            try:
                response.raise_for_status()
                response = response.json()
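                # "bunk" is just a throwaway base url for requests_html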
                html = HTML(
                    html=response["items_html"],
                    url="bunk",
                    default_encoding="utf-8",
                )
                tweets = []
                for tweet, profile in zip(
                        html.find(".stream-item"),
                        html.find(".js-profile-popup-actionable"),
                ):
                    try:
                        tweets.append(TwitterPostSerializer(tweet, profile))
                    except IndexError:
                        continue

                last_tweet = html.find(
                    ".stream-item")[-1].attrs["data-item-id"]
                for tweet in tweets:
                    if date_limit and tweet.date <= date_limit:
                        break
                    # re-insert the space before a link glued to the
                    # preceding word
                    tweet.text = re.sub(r"(\S)http", r"\g<1> http",
                                        tweet.text, count=1)
                    tweet.text = re.sub(
                        r"(\S)pic\.twitter",
                        r"\g<1> pic.twitter",
                        tweet.text,
                        count=1,
                    )
                    yield tweet

                params = {"max_position": last_tweet}

                proxy = next(self.proxies_pool)
                logger.info(
                    "Requesting `url` through `proxy` starting at `from_tweet`",
                    url=url,
                    proxy=proxy,
                    from_tweet=params["max_position"],
                )
                response = session.get(
                    url,
                    params=params,
                    headers=headers,
                    proxies={"http": proxy},
                )

            except ParserError:
                break

            except (
                    requests.exceptions.Timeout,
                    requests.exceptions.HTTPError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    json.JSONDecodeError,
            ) as e:
                logger.error(
                    "Error while requesting the url",
                    url=url,
                    error=e,
                )

                if retry > 0:
                    logger.info("Re intentando para url", url=url)
                    yield from self._gen_tweet(
                        url=url,
                        headers=headers,
                        params=params,
                        date_limit=date_limit,
                        retry=retry - 1,
                    )

                return []
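
As a worked example of the two substitutions above (the sample text is made up):

    import re

    text = "great talkhttps://t.co/abc andpic.twitter.com/xyz"
    text = re.sub(r"(\S)http", r"\g<1> http", text, count=1)
    text = re.sub(r"(\S)pic\.twitter", r"\g<1> pic.twitter", text, count=1)
    # text == "great talk https://t.co/abc and pic.twitter.com/xyz"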