class Twitter(object):
    """Collection of tweets that can be indexed and searched by tf-idf.

    Args:
        tweets_filename (str): File name containing tweets.
        stop_words_filename (str): File name containing stop words.

    Attributes:
        tweets (list): List of loaded Tweet objects.
        stop_words (dict): Stop word -> True mapping (dict gives O(1)
            membership tests during indexing).
        indexer (Indexer): Object responsible for indexing tweets.
        searcher (Searcher): Object responsible for searching tweets.
    """

    # Positions of the fields produced by TwitterDataPreprocessor.preprocess().
    _TWEET_META_TEXT_INDEX = 0
    _TWEET_META_SCREEN_NAME_INDEX = 1
    _NO_RESULTS_MESSAGE = "Sorry, no results."

    def __init__(self, tweets_filename, stop_words_filename):
        self.tweets = []
        self.tweets_filename = tweets_filename
        self.stop_words = self.__load_stop_words(stop_words_filename)
        self.indexer = Indexer(self.stop_words)
        # Placeholder until load_tweets_and_load_index() installs a Searcher.
        self.searcher = []

    @timed
    def load_tweets(self):
        """Load tweets from a file name.

        This method leverages the iterable behavior of File objects
        that automatically uses buffered IO and memory management,
        handling effectively large files.
        """
        docid = 0
        processor = TwitterDataPreprocessor()
        with open(self.tweets_filename) as catalog:
            for entry in catalog:
                # preprocessing
                p_entry = processor.preprocess(entry)
                text = p_entry[self._TWEET_META_TEXT_INDEX].strip()

                # The screen name field may be absent from a record.
                screen_name = ''
                if len(p_entry) > 1:
                    screen_name = p_entry[
                        self._TWEET_META_SCREEN_NAME_INDEX].strip()

                indexable_data = text + ' ' + screen_name
                original_data = entry

                tweet = Tweet(docid, indexable_data, original_data)
                self.tweets.append(tweet)
                docid += 1

    @timed
    def load_tweets_and_build_index(self):
        """Load tweets from a file name, build index, compute ranking
        and save them all.
        """
        self.load_tweets()
        self.indexer.build_and_save(self.tweets)

    @timed
    def load_tweets_and_load_index(self):
        """Load tweets from a file name and load index from a file name."""
        self.load_tweets()
        self.searcher = Searcher(self.tweets, self.stop_words)

    @timed
    def search_tweets(self, query, n_results=10):
        """Search tweets according to provided query of terms.

        The query is executed against the indexed tweets, and tweets
        compatible with the provided terms are returned along with
        their tf-idf score.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Desired number of results.

        Returns:
            str: Human-readable report with the result count followed by
                the matching tweets, or a "no results" message when the
                query is empty or nothing matched.
        """
        result = ''
        if len(query) > 0:
            result = self.searcher.search(query, n_results)

        if len(result) > 0:
            return "{:,}".format(self.searcher.search_count()) \
                + " results.\n\n" \
                + "".join([str(indexable) for indexable in result])

        return self._NO_RESULTS_MESSAGE

    def tweets_count(self):
        """Return number of loaded tweets.

        Returns:
            int: Number of loaded tweets.
        """
        return len(self.tweets)

    def __load_stop_words(self, stop_words_filename):
        """Load stop words that will be filtered during docs processing.

        Stop words are words which are filtered out prior to processing
        of natural language data. There is not one definite list of stop
        words, but we are using the list in the `stop_words.txt` file.

        Args:
            stop_words_filename (str): File with one stop word per line.

        Returns:
            dict: Mapping of stop word (str) to True, used for fast
                membership tests.
        """
        stop_words = {}
        with open(stop_words_filename) as stop_words_file:
            for word in stop_words_file:
                stop_words[word.strip()] = True
        return stop_words
class Twitter(object):
    """Collection of tweets supporting tf-idf indexing and search.

    Args:
        tweets_filename (str): File name containing tweets.
        stop_words_filename (str): File name containing stop words.

    Attributes:
        tweets (list): List of loaded Tweet objects.
        stop_words (dict): Stop word -> True mapping for O(1) lookups.
        indexer (Indexer): Object responsible for indexing tweets.
        searcher (Searcher): Object responsible for searching tweets.
    """

    # Field positions within a preprocessed tweet record.
    _TWEET_META_TEXT_INDEX = 0
    _TWEET_META_SCREEN_NAME_INDEX = 1
    _NO_RESULTS_MESSAGE = "Sorry, no results."

    def __init__(self, tweets_filename, stop_words_filename):
        self.tweets = []
        self.tweets_filename = tweets_filename
        self.stop_words = self.__load_stop_words(stop_words_filename)
        self.indexer = Indexer(self.stop_words)
        # Replaced with a real Searcher by load_tweets_and_load_index().
        self.searcher = []

    @timed
    def load_tweets(self):
        """Load tweets from a file name.

        This method leverages the iterable behavior of File objects
        that automatically uses buffered IO and memory management,
        handling effectively large files.
        """
        docid = 0
        processor = TwitterDataPreprocessor()
        with open(self.tweets_filename) as catalog:
            for entry in catalog:
                # preprocessing
                p_entry = processor.preprocess(entry)
                text = p_entry[self._TWEET_META_TEXT_INDEX].strip()

                # Not every record carries a screen name.
                screen_name = ''
                if len(p_entry) > 1:
                    screen_name = p_entry[
                        self._TWEET_META_SCREEN_NAME_INDEX].strip()

                indexable_data = text + ' ' + screen_name
                original_data = entry

                tweet = Tweet(docid, indexable_data, original_data)
                self.tweets.append(tweet)
                docid += 1

    @timed
    def load_tweets_and_build_index(self):
        """Load tweets from a file name, build index, compute ranking
        and save them all.
        """
        self.load_tweets()
        self.indexer.build_and_save(self.tweets)

    @timed
    def load_tweets_and_load_index(self):
        """Load tweets from a file name and load index from a file name."""
        self.load_tweets()
        self.searcher = Searcher(self.tweets, self.stop_words)

    @timed
    def search_tweets(self, query, n_results=10):
        """Search tweets according to provided query of terms.

        The query is executed against the indexed tweets, and tweets
        compatible with the provided terms are returned along with
        their tf-idf score.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Desired number of results.

        Returns:
            str: Report containing the match count and the matching
                tweets, or a "no results" message when the query is
                empty or nothing matched.
        """
        result = ''
        if len(query) > 0:
            result = self.searcher.search(query, n_results)

        if len(result) > 0:
            return "{:,}".format(self.searcher.search_count()) \
                + " results.\n\n" \
                + "".join([str(indexable) for indexable in result])

        return self._NO_RESULTS_MESSAGE

    def tweets_count(self):
        """Return number of loaded tweets.

        Returns:
            int: Number of loaded tweets.
        """
        return len(self.tweets)

    def __load_stop_words(self, stop_words_filename):
        """Load stop words that will be filtered during docs processing.

        Stop words are words which are filtered out prior to processing
        of natural language data. There is not one definite list of stop
        words, but we are using the list in the `stop_words.txt` file.

        Args:
            stop_words_filename (str): File with one stop word per line.

        Returns:
            dict: Stop word (str) mapped to True, for fast membership
                tests.
        """
        stop_words = {}
        with open(stop_words_filename) as stop_words_file:
            for word in stop_words_file:
                stop_words[word.strip()] = True
        return stop_words