Example #1
0
 def message_clean(self):
     """Return `self.message` with its URLs removed, computing it only once.

     The cleaned text is cached on the instance as `_message_clean`; the first
     call builds it through `remove_urls` and later calls reuse the cached value.
     The URLs reported by `remove_urls` are discarded.
     """
     try:
         cleaned = self._message_clean
     except AttributeError:
         # Nothing cached yet: strip the URLs now and remember the result.
         cleaned, _found_urls = remove_urls(self.message)
         self._message_clean = cleaned
     return cleaned
Example #2
0
    def _extract_urls_and_media(self):
        """Set `self._text_clean` to `self.text` with URL and media links cut out.

        The spans to cut are read from the `indices` of every entry found under the
        'urls' and 'media' keys of `self._entities`. When no span is available, or when
        the text ends with an ellipsis, the text is additionally passed through
        `remove_urls` and any truncated 'http...' fragment left at its end is stripped.

        Nothing is returned; the result is stored in `self._text_clean`. The URLs
        reported by `remove_urls` are discarded.
        """
        # Collect the (start, end) span of every url/media entity. A set is used so that
        # entities sharing the same span are cut only once: cutting the same span twice
        # would delete unrelated text that slid into that position after the first cut.
        spans = set()
        for kind in ('urls', 'media'):
            for entity in self._entities.get(kind, ()):
                span = entity.get('indices') or ()
                # An entity without a usable `indices` pair is skipped rather than
                # raising IndexError, so the regex fallback below can handle it.
                if len(span) >= 2:
                    spans.add((span[0], span[1]))

        # Cut from the rightmost span to the leftmost one, so that removing a span never
        # shifts the positions of the spans still to be removed.
        indices = sorted(spans, reverse=True)
        text = self.text
        for start, end in indices:
            text = text[:start] + text[end:]
        self._text_clean = text

        # Oldest tweets have no `urls` list in `entities` dictionary, they are just hard
        # coded in `self.text`. We must use a regex to remove them.
        # How do we know if this is an old tweet? We assume that any tweet with no
        # `indices` is an old-style tweet.
        # Also notice that if a tweet ends with the ellipsis character (u'\u2026'), or
        # with '...' for old-style tweets, this means that it is a retweet and it has been
        # truncated. If this is the case and there is a URL at the end of the status that
        # has been truncated, then `indices` contains only the info to cut out the
        # ellipsis character, like (139, 140), and not the url. So we need to process
        # this case too.
        if not indices or self.text[-1:] == u'\u2026' or self.text[-3:] == '...':
            # Remove all URLs from `_text_clean`; the list of URLs found is not kept.
            self._text_clean, _ = remove_urls(self._text_clean)

            # Clean out any sort of 'http:/ ..' left at the end of the retweet like in:
            # "RT @googlemaps: Thx to the previewers who helped us build the
            # #newGoogleMaps. Beginning today it rolls out to users around the world
            # http:/<ellipsis>"
            # The `\b` keeps ordinary words that merely end in "ht" (e.g. "night...")
            # from being mistaken for a truncated URL. Backslashes are doubled so the
            # literal contains no invalid string escapes; `\u2026` is still the ellipsis.
            regex = u'\\s*\\bht(t|tps?|tps?:|tps?:/|tps?://)?\\s*(\\.|\u2026)*$'
            self._text_clean = re.sub(regex, '', self._text_clean.strip()).strip()