Ejemplo n.º 1
0
def words_to_cache():
    """Write the UrbanDictionary word list for each pending letter to disk.

    For every letter returned by ``calculate_alphabet_todo(alphabet)`` the
    words yielded by ``UrbanDictionary.words_for_character(letter)`` are
    written, one per line, to ``data/words-<letter>`` (UTF-8, replacing any
    existing file).  Entries ending in "..." are skipped (presumably
    truncated listings -- TODO confirm against the scraper).

    A running total of written words is printed after every 100 words.
    """
    print("Getting all words from UrbanDictionary...")
    count = 0
    alphabet_todo = calculate_alphabet_todo(alphabet)
    for letter in alphabet_todo:
        print("Letter: %s" % letter)
        # The context manager closes (and therefore flushes) this letter's
        # file before the next one is opened, even if the fetch raises.
        with codecs.open('data/words-%s' % letter, 'w', 'UTF-8') as f:
            for j in UrbanDictionary.words_for_character(letter):
                if j.endswith("..."):
                    continue
                f.write(j + '\n')
                count += 1
                if count % 100 == 0:
                    print(count)
Ejemplo n.º 2
0
def words_to_cache():
    """Cache the UrbanDictionary words of every pending letter on disk.

    Iterates over the letters returned by
    ``calculate_alphabet_todo(alphabet)``.  For each one, the words yielded
    by ``UrbanDictionary.words_for_character(letter)`` are written to
    ``data/words-<letter>`` as UTF-8, one word per line; an existing file is
    overwritten.  Words ending in "..." are left out (presumably because
    they are truncated -- TODO confirm).

    The cumulative number of words written is printed each time it reaches
    a multiple of 100.
    """
    print("Getting all words from UrbanDictionary...")
    count = 0
    alphabet_todo = calculate_alphabet_todo(alphabet)
    for letter in alphabet_todo:
        print("Letter: %s" % letter)
        # Use a context manager so the output file is always closed and its
        # buffered content flushed, instead of leaking one handle per letter.
        with codecs.open('data/words-%s' % letter, 'w', 'UTF-8') as f:
            for j in UrbanDictionary.words_for_character(letter):
                if j.endswith("..."):
                    continue
                f.write(j + '\n')
                count += 1
                if count % 100 == 0:
                    print(count)
Ejemplo n.º 3
0
def stats_for_letter_to_cache(letter):
    """Fetch and cache vote statistics for every cached word of ``letter``.

    Words are read, one per line, from ``data/words-<letter>``.  Each word
    is looked up with ``UrbanDictionary.statistics_for_lemma`` and a
    tab-separated row (lemma, definitions, votes up, votes down, total
    votes) is appended to ``data/stats-<letter>``.  If the lookup or the
    formatting of its result raises, the row ``<word><TAB>FAILED`` is
    written instead and the failure is noted in the log.

    Progress messages are appended to ``data/logging-<letter>`` every 10
    words, at which point the stats file is flushed as well.

    The function is resumable: when the stats file already has content, the
    words that already have a row are skipped; otherwise the header row is
    written first.

    Args:
        letter: the letter whose ``data/words-<letter>`` file is processed.
    """
    words_path = 'data/words-%s' % letter
    stats_path = 'data/stats-%s' % letter
    log_path = 'data/logging-%s' % letter

    total_words = file_lines(words_path)

    print("Letter: %s" % letter)

    # All three files are closed on every exit path.  The log is opened as
    # UTF-8 too, because the words written to it are unicode and may contain
    # non-ASCII characters.
    with codecs.open(words_path, 'r', 'UTF-8') as words, \
            codecs.open(stats_path, 'a', 'UTF-8') as stats, \
            codecs.open(log_path, 'a', 'UTF-8') as log:
        # Assumes file_lines() returns the number of lines in the file
        # (it is used the same way for total_words above).
        already_statted = file_lines(stats_path)

        if already_statted != 0:
            # The stats file holds one header line plus one row per
            # processed word, so the number of words to skip is one less
            # than its line count.
            processed = already_statted - 1
            for _ in xrange(processed):
                words.readline()
            log.write("Skipped %d lines since they are already processed\n" %
                      processed)
        else:
            # We are just beginning this file, write the header
            processed = 0
            stats.write(
                "lemma\tdefinitions\tvotes_up\tvotes_down\tvotes_total\n")

        count = processed

        for word in words:
            word = word.strip()
            count += 1
            try:
                info = UrbanDictionary.statistics_for_lemma(word)
                out_string = "%s\t%d\t%d\t%d\t%d\n" % (
                    word, info['count_defs'], info['total_votes_up'],
                    info['total_votes_down'], info['total_votes'])
            except Exception:
                # Best effort: record the failure and carry on with the next
                # word.  KeyboardInterrupt/SystemExit are deliberately not
                # caught so the run can still be aborted.
                log.write("FAILED: %s\n" % word)
                log.flush()
                stats.write("%s\tFAILED\n" % word)
            else:
                stats.write(out_string)

            if count % 10 == 0:
                stats.flush()
                percentage = (float(count) / total_words) * 100
                log.write(
                    "Processed %6d out of %6d || letter: %s || %2.2f || %s "
                    "|| %s\n"
                    % (count, total_words, letter, percentage,
                       time.strftime('%x %X'), word))
                log.flush()
        log.write("Letter %s || DONE\n" % letter)
        log.flush()
Ejemplo n.º 4
0
def stats_for_letter_to_cache(letter):
    """Look up statistics for each cached word of ``letter`` and store them.

    Reads ``data/words-<letter>`` (one word per line), queries
    ``UrbanDictionary.statistics_for_lemma`` for each word and appends a
    tab-separated row (lemma, definitions, votes up, votes down, total
    votes) to ``data/stats-<letter>``.  When the lookup or the formatting of
    its result raises, ``<word><TAB>FAILED`` is written in place of the row
    and the word is reported in the log.

    ``data/logging-<letter>`` receives a progress line every 10 words (the
    stats file is flushed at the same time) and a final "DONE" line.

    Interrupted runs can be resumed: if the stats file is not empty, the
    words that already have a row are skipped; if it is empty, the header
    row is written first.

    Args:
        letter: the letter whose ``data/words-<letter>`` file is processed.
    """
    words_path = 'data/words-%s' % letter
    stats_path = 'data/stats-%s' % letter
    log_path = 'data/logging-%s' % letter

    total_words = file_lines(words_path)

    print("Letter: %s" % letter)

    # Context managers guarantee the three files get closed.  The log is
    # written through a UTF-8 codec because the words logged are unicode
    # strings that may contain non-ASCII characters.
    with codecs.open(words_path, 'r', 'UTF-8') as words, \
            codecs.open(stats_path, 'a', 'UTF-8') as stats, \
            codecs.open(log_path, 'a', 'UTF-8') as log:
        # Assumes file_lines() returns the line count of the given file
        # (total_words above relies on the same meaning).
        already_statted = file_lines(stats_path)

        if already_statted != 0:
            # One of the existing lines is the header, so only
            # already_statted - 1 words have actually been processed.
            processed = already_statted - 1
            for _ in xrange(processed):
                words.readline()
            log.write("Skipped %d lines since they are already processed\n" % processed)
        else:
            # We are just beginning this file, write the header
            processed = 0
            stats.write("lemma\tdefinitions\tvotes_up\tvotes_down\tvotes_total\n")

        count = processed

        for word in words:
            word = word.strip()
            count += 1
            try:
                info = UrbanDictionary.statistics_for_lemma(word)
                out_string = "%s\t%d\t%d\t%d\t%d\n" % (
                    word, info['count_defs'], info['total_votes_up'],
                    info['total_votes_down'], info['total_votes'])
            except Exception:
                # Best effort: note the failure and move on.  Exception (not
                # a bare except) keeps Ctrl-C and SystemExit working.
                log.write("FAILED: %s\n" % word)
                log.flush()
                stats.write("%s\tFAILED\n" % word)
            else:
                stats.write(out_string)

            if count % 10 == 0:
                stats.flush()
                percentage = (float(count) / total_words) * 100
                log.write("Processed %6d out of %6d || letter: %s || %2.2f || %s || %s\n" % (
                    count, total_words, letter, percentage,
                    time.strftime('%x %X'), word))
                log.flush()
        log.write("Letter %s || DONE\n" % letter)
        log.flush()