Beispiel #1
0
def main(args):
    """
    Calculate the number of times the words show up in the input file.

    :param args: parsed arguments; ``input_file`` and ``output_file`` are read
    :return: None; the process exits with -1 when no output file is defined
    """
    logging.info('starting words_tweeted')

    output_path = args.output_file
    if not output_path:
        logging.error('no output file defined')
        sys.exit(-1)

    # counting structure, bound to the output destination
    counter = WordsCounter(output_path)

    # hand every word list coming from the input file to the counter
    for tweet_words in TweetFile(args.input_file).get_words():
        counter.process_tweet(tweet_words)

    # results are written once, after the whole input has been consumed
    counter.write_results()
    logging.debug('Finished writing to %s' % output_path)

    logging.info('program finished')
Beispiel #2
0
def main(args):
    """
    Calculate the median of the number of unique words, writing the
    results after every tweet read from the input file.

    :param args: parsed arguments; ``input_file`` and ``output_file`` are read
    :return: None; the process exits with -1 when no output file is defined
    """
    logging.info('starting median_unique')
    if not args.output_file:
        logging.error('no output file defined')
        sys.exit(-1)

    # median tracker, bound to the output destination
    finder = MedianFinder(args.output_file)

    # source of the word lists, one per tweet
    tweets = TweetFile(args.input_file)
    for word_list in tweets.get_words():
        # feed the tweet, then immediately save the current median
        finder.process_tweet(word_list)
        finder.write_results()

    logging.info('program finished')
Beispiel #3
0
def main(args):
    """
    Calculate the median of the number of unique words.

    The results are written inside the loop, once per tweet processed.

    :param args: parsed arguments; ``input_file`` and ``output_file`` are read
    :return: None; the process exits with -1 when no output file is defined
    """
    logging.info('starting median_unique')

    destination = args.output_file
    if not destination:
        logging.error('no output file defined')
        sys.exit(-1)

    median_finder = MedianFinder(destination)

    # walk the input file one tweet (word list) at a time
    for tweet_words in TweetFile(args.input_file).get_words():
        median_finder.process_tweet(tweet_words)

        # save the current median before moving on to the next tweet
        median_finder.write_results()

    # nothing left to read
    logging.info('program finished')
Beispiel #4
0
    def on_data(self, data):
        """
        Handle newly arrived data; called once for each tweet.

        The raw data is optionally appended to the cache, the normalised
        text is optionally appended to the lines file, and the text is then
        fed to the statistics.

        :param data: raw JSON string describing the tweet
        :return: always True
        """
        # tweet text ('' when the key is missing), lower-cased, no line breaks
        text = json.loads(data).get('text', '')
        low = text.lower().replace('\n', '').replace('\r', '')
        self.to_flush += 1
        logging.debug(low)

        # buffers are flushed only when the counter hits the flush period
        flush_due = self.to_flush == self.flush_period

        # raw data goes to the cache, when enabled
        if self.cache:
            self.cache.write(data)
            self.cache.write('\n')
            if flush_due:
                self.cache.flush()

        # normalised text goes to the lines file, when enabled
        if self.lines:
            self.lines.write(low.encode('utf-8'))
            self.lines.write('\n')
            if flush_due:
                self.lines.flush()

        # restart the flush counter and report progress; this is logged
        # before the current tweet is added to the stats below
        if flush_due:
            self.to_flush = 0
            logging.info('Nr tweets processed: %d' % self.stats.num_processed)

        # update the statistics with the processed text
        line = TweetFile.process_tweet_text(low)
        self.stats.median(line)
        self.stats.word_count(line)

        return True
Beispiel #5
0
    def on_data(self, data):
        """
        Process one piece of incoming data; invoked once for each tweet.

        :param data: raw JSON string describing the tweet
        :return: always True
        """
        # extract the text ('' when absent), lower-case it, drop line breaks
        payload = json.loads(data)
        low = payload.get('text', '').lower()
        for line_break in ('\n', '\r'):
            low = low.replace(line_break, '')
        self.to_flush += 1
        logging.debug(low)

        # cache sink: stores the raw data, one entry per line
        cache_out = self.cache
        if cache_out:
            cache_out.write(data)
            cache_out.write('\n')

            if self.to_flush == self.flush_period:
                cache_out.flush()

        # lines sink: stores the normalised text, one tweet per line
        lines_out = self.lines
        if lines_out:
            lines_out.write(low.encode('utf-8'))
            lines_out.write('\n')

            if self.to_flush == self.flush_period:
                lines_out.flush()

        # a full flush period has elapsed: reset the counter, log progress
        if self.to_flush == self.flush_period:
            self.to_flush = 0
            logging.info('Nr tweets processed: %d' % self.stats.num_processed)

        # hand the processed text over to the statistics
        processed = TweetFile.process_tweet_text(low)
        self.stats.median(processed)
        self.stats.word_count(processed)

        return True