class Twitter(object):
    """Class representing a searchable collection of tweets.

    Args:
      tweets_filename (str): File name containing tweets.
      stop_words_filename (str): File name containing stop words.

    Attributes:
      tweets (list): List of loaded Tweet objects.
      stop_words (dict): Stop words mapped to True for O(1) membership tests.
      indexer (Indexer): Object responsible for indexing tweets.
      searcher (Searcher): Object responsible for searching tweets; created
        by load_tweets_and_load_index (placeholder list until then).

    """

    # Field positions within a preprocessed tweet entry.
    _TWEET_META_TEXT_INDEX = 0
    _TWEET_META_SCREEN_NAME_INDEX = 1

    _NO_RESULTS_MESSAGE = "Sorry, no results."

    def __init__(self, tweets_filename, stop_words_filename):
        self.tweets = []
        self.tweets_filename = tweets_filename
        self.stop_words = self.__load_stop_words(stop_words_filename)
        self.indexer = Indexer(self.stop_words)
        # Placeholder; replaced by a Searcher in load_tweets_and_load_index.
        self.searcher = []

    @timed
    def load_tweets(self):
        """Load tweets from a file name.

        This method leverages the iterable behavior of File objects,
        which automatically uses buffered IO and memory management,
        handling large files effectively.

        """
        processor = TwitterDataPreprocessor()
        with open(self.tweets_filename) as catalog:
            # enumerate supplies the sequential document id.
            for docid, entry in enumerate(catalog):
                p_entry = processor.preprocess(entry)

                text = p_entry[self._TWEET_META_TEXT_INDEX].strip()
                screen_name = ''
                # Screen name is optional; some entries only have text.
                if len(p_entry) > 1:
                    screen_name = p_entry[
                        self._TWEET_META_SCREEN_NAME_INDEX].strip()

                # Index text and screen name together; keep the raw line
                # around as the displayable original.
                indexable_data = text + ' ' + screen_name
                self.tweets.append(Tweet(docid, indexable_data, entry))

    @timed
    def load_tweets_and_build_index(self):
        """Load tweets from a file name, build index, compute ranking and save them all.

        """
        self.load_tweets()
        self.indexer.build_and_save(self.tweets)

    @timed
    def load_tweets_and_load_index(self):
        """Load tweets from a file name and load index from a file name.

        """
        self.load_tweets()
        self.searcher = Searcher(self.tweets, self.stop_words)

    @timed
    def search_tweets(self, query, n_results=10):
        """Search tweets according to provided query of terms.

        The query is executed against the indexed tweets, and a list of
        tweets compatible with the provided terms is returned along with
        their tf-idf score.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          str: Formatted result listing, or a "no results" message.

        """
        # Use a list (not '') so the empty case is type-consistent with
        # the search results.
        results = []
        if query:
            results = self.searcher.search(query, n_results)

        if results:
            return "{:,}".format(self.searcher.search_count()) \
                + " results.\n\n" \
                + "".join(str(indexable) for indexable in results)
        return self._NO_RESULTS_MESSAGE

    def tweets_count(self):
        """Return number of loaded tweets.

        Returns:
          int: Number of loaded tweets.

        """
        return len(self.tweets)

    def __load_stop_words(self, stop_words_filename):
        """Load stop words that will be filtered during docs processing.

        Stop words are words which are filtered out prior to
        processing of natural language data. There is not one definite
        list of stop words but we are using the list in `stop_words.txt` file.

        Returns:
          dict: Stop words mapped to True, for fast membership tests.

        """
        stop_words = {}
        with open(stop_words_filename) as stop_words_file:
            for word in stop_words_file:
                stop_words[word.strip()] = True
        return stop_words
# Example #2
# 0
class Twitter(object):
    """Class representing a searchable collection of tweets.

    Args:
      tweets_filename (str): File name containing tweets.
      stop_words_filename (str): File name containing stop words.

    Attributes:
      tweets (list): List of loaded Tweet objects.
      stop_words (dict): Stop words mapped to True for O(1) membership tests.
      indexer (Indexer): Object responsible for indexing tweets.
      searcher (Searcher): Object responsible for searching tweets; created
        by load_tweets_and_load_index (placeholder list until then).

    """

    # Field positions within a preprocessed tweet entry.
    _TWEET_META_TEXT_INDEX = 0
    _TWEET_META_SCREEN_NAME_INDEX = 1

    _NO_RESULTS_MESSAGE = "Sorry, no results."

    def __init__(self, tweets_filename, stop_words_filename):
        self.tweets = []
        self.tweets_filename = tweets_filename
        self.stop_words = self.__load_stop_words(stop_words_filename)
        self.indexer = Indexer(self.stop_words)
        # Placeholder; replaced by a Searcher in load_tweets_and_load_index.
        self.searcher = []

    @timed
    def load_tweets(self):
        """Load tweets from a file name.

        This method leverages the iterable behavior of File objects,
        which automatically uses buffered IO and memory management,
        handling large files effectively.

        """
        processor = TwitterDataPreprocessor()
        with open(self.tweets_filename) as catalog:
            # enumerate supplies the sequential document id.
            for docid, entry in enumerate(catalog):
                p_entry = processor.preprocess(entry)

                text = p_entry[self._TWEET_META_TEXT_INDEX].strip()
                screen_name = ''
                # Screen name is optional; some entries only have text.
                if len(p_entry) > 1:
                    screen_name = p_entry[
                        self._TWEET_META_SCREEN_NAME_INDEX].strip()

                # Index text and screen name together; keep the raw line
                # around as the displayable original.
                indexable_data = text + ' ' + screen_name
                self.tweets.append(Tweet(docid, indexable_data, entry))

    @timed
    def load_tweets_and_build_index(self):
        """Load tweets from a file name, build index, compute ranking and save them all.

        """
        self.load_tweets()
        self.indexer.build_and_save(self.tweets)

    @timed
    def load_tweets_and_load_index(self):
        """Load tweets from a file name and load index from a file name.

        """
        self.load_tweets()
        self.searcher = Searcher(self.tweets, self.stop_words)

    @timed
    def search_tweets(self, query, n_results=10):
        """Search tweets according to provided query of terms.

        The query is executed against the indexed tweets, and a list of
        tweets compatible with the provided terms is returned along with
        their tf-idf score.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          str: Formatted result listing, or a "no results" message.

        """
        # Use a list (not '') so the empty case is type-consistent with
        # the search results.
        results = []
        if query:
            results = self.searcher.search(query, n_results)

        if results:
            return "{:,}".format(self.searcher.search_count()) \
                + " results.\n\n" \
                + "".join(str(indexable) for indexable in results)
        return self._NO_RESULTS_MESSAGE

    def tweets_count(self):
        """Return number of loaded tweets.

        Returns:
          int: Number of loaded tweets.

        """
        return len(self.tweets)

    def __load_stop_words(self, stop_words_filename):
        """Load stop words that will be filtered during docs processing.

        Stop words are words which are filtered out prior to
        processing of natural language data. There is not one definite
        list of stop words but we are using the list in `stop_words.txt` file.

        Returns:
          dict: Stop words mapped to True, for fast membership tests.

        """
        stop_words = {}
        with open(stop_words_filename) as stop_words_file:
            for word in stop_words_file:
                stop_words[word.strip()] = True
        return stop_words