Example #1
File: do.py Project: amitdo/pyspell
def wiki_stats(env):
    """
        Basic statistics from a wiki.

        Note: not tested with larger wikis!
    """
    from pyspell._utils import non_sk_words
    from simplewiki import wiki

    wiki_input = os.path.join(env["start_dir"], env["input"]["dir"], env["input"]["wiki_xml"])
    wiki_out_freqs = os.path.join(env["start_dir"], env["output"]["dir"], env["output"]["wiki_freqs"])
    if not os.path.exists(wiki_input):
        raise Exception("Wiki input not found [%s]" % wiki_input)
    w = wiki(wiki_input)
    freqs = defaultdict(int)
    all_words = 0
    log_every_n = env["log_every_n"]
    for pos, (wordorig, word, sentence_start, page_id) in enumerate(w.words(True)):
        if 0 == len(word) or non_sk_words(word):
            continue
        word = word.lower()
        all_words += 1
        freqs[word] += 1
        if 0 == all_words % log_every_n:
            perc = round((100. * float(len(freqs))) / all_words, 3)
            _logger.info(
                    "done [%8d] words ... [%8d][%.2f%%] unique words ... [%5d] pages",
                    all_words, len(freqs), perc, page_id
            )

    print "   # of all words: %6d" % all_words
    print "# of unique words: %6d" % len(freqs)
    import heapq
    nth = 100
    too_few_occurrences = 20
    baseline = heapq.nlargest(nth, freqs.values())[-1]
    d = defaultdict(list)
    min_occurs_cnt = 0
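    # minimum-occurrence threshold indexed by word length; words longer than 7 characters fall back to the last entry (20)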
    min_occurrences = [None, 100, 90, 80, 70, 60, 40, 20]
    min_occurrences_freq = defaultdict(int)
    with open(wiki_out_freqs, "w+") as freqs_file:
        json.dump(freqs, freqs_file, encoding="utf-8")
    for k, v in freqs.iteritems():
        too_few_occurrences = min_occurrences[min(len(min_occurrences) - 1, len(k))]
        if v < too_few_occurrences:
            min_occurrences_freq[len(k)] += 1
            min_occurs_cnt += 1
        if v >= baseline:
            d[v].append(k)
    print "# of unique words that occurred < %d times: %6d" % (
        too_few_occurrences, min_occurs_cnt
    )

    for k, v in min_occurrences_freq.iteritems():
        print "Words with len [%3d] occurred < too_few_occurrences [%4d] times" % (k, v)

    for k in sorted(d.keys(), reverse=True):
        for v in d[k]:
            print "%6s: %4d" % (v, k)
Example #2
File: do.py Project: amitdo/pyspell
def _is_important_valid_word(word, f, non_sk_words):
    dyn_min_occurrences = min_occurrences[min(len(min_occurrences) - 1, len(word))]
    if dyn_min_occurrences <= f:
        if non_sk_words(word):
            # important and non valid
            return True, False
        # important and valid
        return True, True
    # not important for now and we do not know if valid
    return False, None
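A quick usage sketch of the helper: it needs the module-level min_occurrences table and a predicate such as pyspell._utils.non_sk_words. The predicate below is a stub that simply rejects words containing digits, purely for illustration.

# Module-level threshold table the helper indexes by word length.
min_occurrences = [None, 100, 90, 80, 70, 60, 40, 20]

def _is_important_valid_word(word, f, non_sk_words):
    dyn_min_occurrences = min_occurrences[min(len(min_occurrences) - 1, len(word))]
    if dyn_min_occurrences <= f:
        if non_sk_words(word):
            return True, False    # frequent enough, but not a valid word
        return True, True         # frequent enough and valid
    return False, None            # too rare so far; validity unknown

def fake_non_sk_words(word):
    # Stub standing in for pyspell._utils.non_sk_words.
    return any(ch.isdigit() for ch in word)

print(_is_important_valid_word("slovo", 70, fake_non_sk_words))      # (True, True)
print(_is_important_valid_word("ab123", 150, fake_non_sk_words))     # (True, False)
print(_is_important_valid_word("zriedkavy", 5, fake_non_sk_words))   # (False, None)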
Example #3
File: do.py Project: amitdo/pyspell
def wiki_words(env):
    """
        Gather the most frequently used words according to a specific definition.
        Note: not tested with larger wikis!
    """
    from simplewiki import wiki
    from pyspell._utils import non_sk_words

    wiki_input = os.path.join(env["start_dir"], env["input"]["dir"], env["wiki_xml"])
    wiki_words_output = os.path.join(env["start_dir"], env["output"]["dir"], env["output"]["wiki_words"])
    log_every_n = env["log_every_n"]
    if not os.path.exists(wiki_input):
        raise Exception("Wiki input not found [%s]" % wiki_input)
    w = wiki(wiki_input)

    done_occurrence = 1234567
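    # done_occurrence is a sentinel count: once freqs[word] is set to it, the word has already been written out (-1 marks rejected words)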
    freqs = defaultdict(int)
    capital_freqs = defaultdict(int)
    with codecs.open(wiki_words_output, mode="w+", encoding="utf-8") as fout:
        for pos, (wordorig, word, sentence_start, page_id) in enumerate(w.words(True)):
            if 0 == len(word):
                continue
            f = freqs[word]
            # skip already done or strange
            if done_occurrence == f or 0 > f:
                continue

            if sentence_start and word[0].isupper():
                capital_freqs[word] += 1
                continue

            freqs[word] = f + 1
            is_important, is_valid = _is_important_valid_word(word, f, non_sk_words)
            if is_important:
                if not is_valid:
                    # do not bother with that one again
                    freqs[word] = -1
                    continue
                freqs[word] = done_occurrence

                # have we already output the same but lowercase?
                if not word.islower() and done_occurrence == freqs[word.lower()]:
                    _logger.warn(u"Processing non-lower word [%s] but lower has been already processed", word)

                if word.islower() and done_occurrence == freqs[word[0].upper() + word[1:]]:
                    _logger.warn(u"Processing lower word [%s] but non-lower has been already processed", word)

                if not word.islower():
                    iword = word.lower()
                    iwordfreq = freqs[iword]
                    if 0 < iwordfreq and float(f) / float(iwordfreq) <= 2.:
                        _logger.warn(
                                u"Capital first being processed [%s][%d] but non capital is not 0 [%d]",
                                word, f, iwordfreq
                        )
                    if word in capital_freqs:
                        del capital_freqs[word]

                fout.write(word + u"\n")
                if 0 == pos % log_every_n:
                    _logger.info("done [%8d] words ... [%5d] pages", pos, page_id)

    _logger.info("Could not get capitals right:")
    for k, v in capital_freqs.iteritems():
        if v > min_occurrences[-1] / 3:
            if non_sk_words(k):
                continue
            if done_occurrence == freqs[k.lower()]:
                continue
            _logger.info("%10s: %2d", k, v)