def words_to_cache():
    """Download the UrbanDictionary word list and cache it, one file per letter.

    For every letter returned by ``calculate_alphabet_todo(alphabet)`` the
    words yielded by ``UrbanDictionary.words_for_character(letter)`` are
    written, one per line, to ``data/words-<letter>`` (UTF-8; an existing
    file is truncated).  Words ending in ``"..."`` are not written.  The
    running total of written words, across all letters, is printed every
    100 words as a progress indicator.
    """
    print("Getting all words from UrbanDictionary...")
    count = 0
    for letter in calculate_alphabet_todo(alphabet):
        print("Letter: %s" % letter)
        # The with-block closes (and so flushes) each letter's file as soon
        # as that letter is finished, even when fetching the words raises.
        with codecs.open('data/words-%s' % letter, 'w', 'UTF-8') as out:
            for word in UrbanDictionary.words_for_character(letter):
                # NOTE(review): a trailing "..." presumably marks a
                # truncated entry -- confirm against the scraper.
                if word.endswith("..."):
                    continue
                out.write(word + '\n')
                count += 1
                if count % 100 == 0:
                    print(count)
def _write_stats_rows(words, stats, log, letter, total_words, rows_done):
    """Append one stats row per not-yet-processed lemma read from *words*.

    *words*, *stats* and *log* are open text handles; the first *rows_done*
    lines of *words* are skipped because they already have a row in *stats*.
    """
    for count, line in enumerate(words, 1):
        if count <= rows_done:
            continue
        word = line.strip()
        try:
            info = UrbanDictionary.statistics_for_lemma(word)
            row = "%s\t%d\t%d\t%d\t%d\n" % (
                word, info['count_defs'], info['total_votes_up'],
                info['total_votes_down'], info['total_votes'])
        except Exception:
            # Deliberately best-effort: a lemma that cannot be fetched or
            # formatted is recorded and the run continues.  ``Exception``
            # (not a bare except) lets Ctrl-C / SystemExit stop the run.
            log.write("FAILED: %s\n" % word)
            log.flush()
            row = "%s\tFAILED\n" % word
        stats.write(row)

        if count % 10 == 0:
            # Flush every 10 lemmas so an aborted run loses little work.
            stats.flush()
            percentage = (float(count) / total_words) * 100
            log.write(
                "Processed %6d out of %6d || letter: %s || %2.2f || %s || %s\n"
                % (count, total_words, letter, percentage,
                   time.strftime('%x %X'), word))
            log.flush()


def stats_for_letter_to_cache(letter):
    """Fetch UrbanDictionary statistics for every cached word of *letter*.

    Lemmas are read from ``data/words-<letter>``; one tab-separated row per
    lemma (``lemma, definitions, votes_up, votes_down, votes_total``, or
    ``lemma, FAILED`` when the lookup raised) is appended to
    ``data/stats-<letter>``.  Progress and failures are appended to
    ``data/logging-<letter>``.  All three files are UTF-8.

    The run is resumable: lemmas that already have a row in the stats file
    are skipped instead of being fetched again.
    """
    words_path = 'data/words-%s' % letter
    stats_path = 'data/stats-%s' % letter
    log_path = 'data/logging-%s' % letter

    total_words = file_lines(words_path)
    print("Letter: %s" % letter)

    with codecs.open(words_path, 'r', 'UTF-8') as words:
        # The stats file is opened (append mode) before its lines are
        # counted, same order as before -- presumably so that it exists
        # when file_lines reads it on a first run.
        with codecs.open(stats_path, 'a', 'UTF-8') as stats:
            # The log receives unicode lemmas, so it is opened with an
            # explicit encoding; a plain byte file raises on non-ASCII
            # words under Python 2.
            with codecs.open(log_path, 'a', 'UTF-8') as log:
                lines_in_stats = file_lines(stats_path)
                if lines_in_stats == 0:
                    # We are just beginning this file, write the header.
                    stats.write(
                        "lemma\tdefinitions\tvotes_up\tvotes_down"
                        "\tvotes_total\n")
                    rows_done = 0
                else:
                    # The first line of the stats file is the header, not a
                    # lemma, so it must not count as a processed word.
                    # Assumes file_lines returns the file's line count.
                    rows_done = lines_in_stats - 1
                    log.write(
                        "Skipped %d lines since they are already processed\n"
                        % rows_done)

                _write_stats_rows(words, stats, log, letter, total_words,
                                  rows_done)

                log.write("Letter %s || DONE\n" % letter)
                log.flush()
def _write_stats_rows(words, stats, log, letter, total_words, rows_done):
    """Append one stats row per not-yet-processed lemma read from *words*.

    *words*, *stats* and *log* are open text handles; the first *rows_done*
    lines of *words* are skipped because they already have a row in *stats*.
    """
    for count, line in enumerate(words, 1):
        if count <= rows_done:
            continue
        word = line.strip()
        try:
            info = UrbanDictionary.statistics_for_lemma(word)
            row = "%s\t%d\t%d\t%d\t%d\n" % (
                word, info['count_defs'], info['total_votes_up'],
                info['total_votes_down'], info['total_votes'])
        except Exception:
            # Deliberately best-effort: a lemma that cannot be fetched or
            # formatted is recorded and the run continues.  ``Exception``
            # (not a bare except) lets Ctrl-C / SystemExit stop the run.
            log.write("FAILED: %s\n" % word)
            log.flush()
            row = "%s\tFAILED\n" % word
        stats.write(row)

        if count % 10 == 0:
            # Flush every 10 lemmas so an aborted run loses little work.
            stats.flush()
            percentage = (float(count) / total_words) * 100
            log.write(
                "Processed %6d out of %6d || letter: %s || %2.2f || %s || %s\n"
                % (count, total_words, letter, percentage,
                   time.strftime('%x %X'), word))
            log.flush()


def stats_for_letter_to_cache(letter):
    """Fetch UrbanDictionary statistics for every cached word of *letter*.

    Lemmas are read from ``data/words-<letter>``; one tab-separated row per
    lemma (``lemma, definitions, votes_up, votes_down, votes_total``, or
    ``lemma, FAILED`` when the lookup raised) is appended to
    ``data/stats-<letter>``.  Progress and failures are appended to
    ``data/logging-<letter>``.  All three files are UTF-8.

    The run is resumable: lemmas that already have a row in the stats file
    are skipped instead of being fetched again.
    """
    words_path = 'data/words-%s' % letter
    stats_path = 'data/stats-%s' % letter
    log_path = 'data/logging-%s' % letter

    total_words = file_lines(words_path)
    print("Letter: %s" % letter)

    with codecs.open(words_path, 'r', 'UTF-8') as words:
        # The stats file is opened (append mode) before its lines are
        # counted, same order as before -- presumably so that it exists
        # when file_lines reads it on a first run.
        with codecs.open(stats_path, 'a', 'UTF-8') as stats:
            # The log receives unicode lemmas, so it is opened with an
            # explicit encoding; a plain byte file raises on non-ASCII
            # words under Python 2.
            with codecs.open(log_path, 'a', 'UTF-8') as log:
                lines_in_stats = file_lines(stats_path)
                if lines_in_stats == 0:
                    # We are just beginning this file, write the header.
                    stats.write(
                        "lemma\tdefinitions\tvotes_up\tvotes_down"
                        "\tvotes_total\n")
                    rows_done = 0
                else:
                    # The first line of the stats file is the header, not a
                    # lemma, so it must not count as a processed word.
                    # Assumes file_lines returns the file's line count.
                    rows_done = lines_in_stats - 1
                    log.write(
                        "Skipped %d lines since they are already processed\n"
                        % rows_done)

                _write_stats_rows(words, stats, log, letter, total_words,
                                  rows_done)

                log.write("Letter %s || DONE\n" % letter)
                log.flush()