def get_all_counts_and_lengths(counts_file, lengths_file):
    """Return word counts and per-ad length statistics, using pickle caches.

    If both cache files already exist they are loaded with
    ``datagetter.read_file`` and returned as-is.  Otherwise every entry of
    the ``'FullDescription'`` column of ``datagetter.get_data(DATA_FILE)``
    is sentence- and word-tokenized with NLTK, the results are pickled to
    the two given paths, and then returned.

    Args:
        counts_file: Path of the pickle cache holding the word counts.
        lengths_file: Path of the pickle cache holding the per-ad lengths.

    Returns:
        A ``(counts, lengths)`` tuple.  When freshly computed, ``counts``
        maps each token to its number of occurrences over all descriptions
        and ``lengths`` maps the description's index to a
        ``(number_of_sentences, total_number_of_words)`` tuple.  When read
        from the caches, the values are whatever ``datagetter.read_file``
        returns for those files.
    """
    # Both caches must be present; if either one is missing, recompute both
    # so the two files always describe the same pass over the data.
    if os.path.exists(counts_file) and os.path.exists(lengths_file):
        print('Reading counts and lengths file.')
        counts = datagetter.read_file(counts_file)
        lengths = datagetter.read_file(lengths_file)
        return counts, lengths

    print('Counting all words in all descriptions.')
    counts = defaultdict(int)
    # NOTE(review): values stored here are tuples, so the ``int`` default is
    # never meaningful; kept as a defaultdict so the pickled type is unchanged.
    lengths = defaultdict(int)
    data = datagetter.get_data(DATA_FILE)
    start = time.time()

    for ii, text in enumerate(data['FullDescription']):
        sentences = nltk.tokenize.sent_tokenize(text)
        total_words = 0  # Total number of words in this ad
        for sentence in sentences:
            words = nltk.tokenize.word_tokenize(sentence)
            total_words += len(words)
            for word in words:
                counts[word] += 1  # Number of instances of each word

        # Number of sentences and total number of words per ad
        lengths[ii] = len(sentences), total_words

        if ii % 1000 == 0:
            print('Finished {:d} ads in {:.2f} seconds'.format(
                ii, time.time() - start))

    # Context managers guarantee the cache files are flushed and closed even
    # if pickling raises, instead of leaking the handles.
    with open(counts_file, "wb") as counts_out:
        pickle.dump(counts, counts_out)
    with open(lengths_file, "wb") as lengths_out:
        pickle.dump(lengths, lengths_out)

    return counts, lengths
################################################################################# # Script ################################################################################# if __name__ == '__main__': counts, lengths = get_all_counts_and_lengths('temp/total_word_counts.p', 'temp/sentence_lengths.p') if not counts: sys.stderr.write('Something went wrong reading the counts file.') sorted_counts = sorted(counts.iteritems(), key=operator.itemgetter(1), reverse=True) #for ii, pair in enumerate(sorted_counts[:100]): # print '{0}: {1} - {2}'.format(ii, pair[0], pair[1]) if not lengths: sys.stderr.write('Something went wrong reading the lengths file.') # print lengths data = datagetter.get_data() feats = get_word_count_feats(data, [word for word, val in sorted_counts[:500]], 'top500') print feats ''' data = datagetter.get_data() add_feats(data, sorted_counts) pickle.dump(data, open('annotated_data.p', 'wb')) print data <<<<<<< HEAD # plt.bar([ii for ii in range(100)], [val[1] for val in sorted_counts[:100]])