def main(args):
    """Calculate the number of times each word shows up in the tweet file.

    :param args: parsed CLI arguments; must expose ``input_file`` and
        ``output_file`` attributes.
    :return: None; exits the process with status -1 when no output
        file is defined.
    """
    logging.info('starting words_tweeted')
    if not args.output_file:
        logging.error('no output file defined')
        sys.exit(-1)
    # create the counting structure
    wc = WordsCounter(args.output_file)
    # read file bringing the word list
    tfile = TweetFile(args.input_file)
    for words in tfile.get_words():
        # add each word in the dictionary
        wc.process_tweet(words)
    # save the info to output
    wc.write_results()
    # lazy %-args: the string is only formatted if DEBUG logging is enabled
    logging.debug('Finished writing to %s', args.output_file)
    # log the program is finished
    logging.info('program finished')
def main(args):
    """Calculate the median of the number of unique words per tweet.

    :param args: parsed CLI arguments; must expose ``input_file`` and
        ``output_file`` attributes.
    :return: None; exits the process with status -1 when no output
        file is defined.
    """
    logging.info('starting median_unique')

    # an output destination is mandatory
    if not args.output_file:
        logging.error('no output file defined')
        sys.exit(-1)

    # the finder accumulates the running median and writes it out
    finder = MedianFinder(args.output_file)

    # stream word lists out of the input file, one tweet at a time;
    # the finder reduces each one to its unique-word count
    for word_list in TweetFile(args.input_file).get_words():
        finder.process_tweet(word_list)

    # persist the current median to the output file
    finder.write_results()
    logging.info('program finished')
def on_data(self, data):
    """Handle one incoming item from the stream; called once per tweet.

    :param data: raw JSON payload string for a single tweet.
    :return: True, so the stream connection keeps running.
    """
    # get the tweet text, lower-cased and with line breaks removed
    low = json.loads(data).get('text', '').lower().replace('\n', '').replace('\r', '')
    self.to_flush += 1
    logging.debug(low)
    # evaluate the flush condition once instead of three times
    flush_now = self.to_flush == self.flush_period
    # if raw-JSON caching is enabled, append the untouched payload
    if self.cache:
        self.cache.write(data)
        self.cache.write('\n')
        if flush_now:
            self.cache.flush()
    # if line saving is enabled, append the cleaned text
    # NOTE(review): this writes encoded bytes followed by a str newline to
    # the same handle — consistent only on Python 2 or with a forgiving
    # file object; confirm how self.lines is opened.
    if self.lines:
        self.lines.write(low.encode('utf-8'))
        self.lines.write('\n')
        if flush_now:
            self.lines.flush()
    # reset flush counter and report progress (lazy %-args for logging)
    if flush_now:
        self.to_flush = 0
        logging.info('Nr tweets processed: %d', self.stats.num_processed)
    # tokenize the cleaned text and feed the running statistics
    line = TweetFile.process_tweet_text(low)
    self.stats.median(line)
    self.stats.word_count(line)
    return True
def on_data(self, data):
    """Handle one incoming item from the stream; called once per tweet.

    :param data: raw JSON payload string for a single tweet.
    :return: True, so the stream connection keeps running.
    """
    # get the tweet text, lower-cased and with line breaks removed
    low = json.loads(data).get('text', '').lower().replace('\n', '').replace('\r', '')
    self.to_flush += 1
    logging.debug(low)
    # evaluate the flush condition once instead of three times
    flush_now = self.to_flush == self.flush_period
    # if raw-JSON caching is enabled, append the untouched payload
    if self.cache:
        self.cache.write(data)
        self.cache.write('\n')
        if flush_now:
            self.cache.flush()
    # if line saving is enabled, append the cleaned text
    # NOTE(review): this writes encoded bytes followed by a str newline to
    # the same handle — consistent only on Python 2 or with a forgiving
    # file object; confirm how self.lines is opened.
    if self.lines:
        self.lines.write(low.encode('utf-8'))
        self.lines.write('\n')
        if flush_now:
            self.lines.flush()
    # reset flush counter and report progress (lazy %-args for logging)
    if flush_now:
        self.to_flush = 0
        logging.info('Nr tweets processed: %d', self.stats.num_processed)
    # tokenize the cleaned text and feed the running statistics
    line = TweetFile.process_tweet_text(low)
    self.stats.median(line)
    self.stats.word_count(line)
    return True