Beispiel #1
0
def ngrams_plus_wikipedia():
    """Extend the dictionary n-gram table with Wikipedia article titles.

    Starts from the mapping returned by ``ngrams_plus_dictionary()`` and reads
    the UTF-8 file ``enwiki-titles`` line by line.  Each line is stripped and
    upper-cased; lines containing any non-ASCII character are skipped.  Text
    inside parentheses is dropped and every run of characters other than A-Z
    collapses into a single space (so a trailing space can remain when a
    title ends with punctuation or a parenthesised suffix).

    A cleaned title is stored under ``make_bag(text)`` as
    ``(text, nwords, 1000)`` when that key is new or when the existing entry's
    third field is below 1000.  Every stored title is printed.

    Returns:
        The updated mapping.
    """
    ngrams = ngrams_plus_dictionary()
    # The context manager guarantees the titles file is closed, even if a
    # line fails to decode part-way through.
    with codecs.open("enwiki-titles", encoding="utf-8") as titles:
        for line in titles:
            line = line.strip().upper()
            if not line:
                continue
            if any(ord(c) > 127 for c in line):
                continue
            chars = []
            space = True  # True while the previous emitted char was a gap
            parens = 0    # current nesting depth of "(" ... ")"
            for ch in line:
                if parens:
                    # Inside a parenthetical: only track nesting depth.
                    if ch == "(":
                        parens += 1
                    elif ch == ")":
                        parens -= 1
                # ascii_uppercase is locale-independent (string.uppercase is
                # not, and no longer exists in Python 3).
                elif ch in string.ascii_uppercase:
                    chars.append(ch)
                    space = False
                elif ch == "(":
                    parens += 1
                elif not space:
                    # First non-letter after a letter: emit one separator.
                    chars.append(" ")
                    space = True
            if chars:
                text = "".join(chars)
                nwords = len(text.split())
                bagnum = make_bag(text)
                if (bagnum not in ngrams) or (ngrams[bagnum][2] < 1000):
                    ngrams[bagnum] = (text, nwords, 1000)
                    # Parenthesised single argument prints identically under
                    # Python 2 and Python 3.
                    print(text)
    return ngrams
def ngrams_plus_wikipedia():
    """Extend the pickled anagram table with Wikipedia article titles.

    Loads the mapping stored in ``coanagram_data.pickle`` and reads the UTF-8
    file ``enwiki-titles`` line by line.  Each line is stripped and
    upper-cased; lines containing any non-ASCII character are skipped.  Text
    inside parentheses is dropped and every run of characters other than A-Z
    collapses into a single space.

    Each value in the mapping is treated as a list of
    ``(text, nwords, score)`` tuples.  A cleaned title replaces the whole list
    under ``make_bag(text)`` with ``[(text, nwords, 1000)]`` when the key is
    new or when the first tuple's third field is below 1000.  Every stored
    title is printed.

    Returns:
        The updated mapping.
    """
    print('Loading.')
    # SECURITY: unpickling executes arbitrary code embedded in the file; only
    # load a pickle that this project produced itself.
    # The file mode is left at the default to stay consistent with however
    # the pickle was written.
    with open('coanagram_data.pickle') as pickled:
        ngrams = pickle.load(pickled)
    print('Done loading.')
    # The context manager guarantees the titles file is closed.
    with codecs.open('enwiki-titles', encoding='utf-8') as titles:
        for line in titles:
            line = line.strip().upper()
            if not line:
                continue
            if any(ord(c) > 127 for c in line):
                continue
            chars = []
            space = True  # True while the previous emitted char was a gap
            parens = 0    # current nesting depth of '(' ... ')'
            for ch in line:
                if parens:
                    # Inside a parenthetical: only track nesting depth.
                    if ch == '(':
                        parens += 1
                    elif ch == ')':
                        parens -= 1
                # ascii_uppercase is locale-independent (string.uppercase is
                # not, and no longer exists in Python 3).
                elif ch in string.ascii_uppercase:
                    chars.append(ch)
                    space = False
                elif ch == '(':
                    parens += 1
                elif not space:
                    # First non-letter after a letter: emit one separator.
                    chars.append(' ')
                    space = True
            if chars:
                text = ''.join(chars)
                nwords = len(text.split())
                bagnum = make_bag(text)
                if (bagnum not in ngrams) or (ngrams[bagnum][0][2] < 1000):
                    ngrams[bagnum] = [(text, nwords, 1000)]
                    print(text)
    return ngrams
Beispiel #3
0
def ngrams_plus_dictionary():
    """Extend the n-gram table with words from the ``enable1.txt`` word list.

    Starts from the mapping returned by ``ngram_data()``.  Every non-blank
    line of ``enable1.txt`` is stripped and upper-cased; if its
    ``make_bag`` key is not already present, the word is stored as
    ``(text, 1, 100)`` and printed.  Existing entries are never overwritten.

    Returns:
        The updated mapping.
    """
    ngrams = ngram_data()
    # The context manager guarantees the word list is closed when done.
    with open("enable1.txt") as wordlist:
        for line in wordlist:
            text = line.strip().upper()
            if not text:
                continue
            bagnum = make_bag(text)
            if bagnum not in ngrams:
                ngrams[bagnum] = (text, 1, 100)
                # Parenthesised single argument prints identically under
                # Python 2 and Python 3.
                print(text)
    return ngrams
Beispiel #4
0
def complex_anagram(text):
    """Return the best anagram found for *text* as a 3-tuple.

    If ``simple_anagram_numeric`` yields a truthy result for the bag of
    *text*, the answer is ``(result[0], 1, result[2])``.  Otherwise every
    candidate produced by ``complex_anagram_gen`` is scanned and the first one
    with the strictly highest third field that contains no ``"/"`` wins; the
    answer is then ``(best_text, 2, best_freq)``, which is ``(None, 2, 0)``
    when no candidate qualifies.
    """
    bagnum = make_bag(text)
    exact = simple_anagram_numeric(bagnum)
    if exact:
        return exact[0], 1, exact[2]

    winner = None
    top_score = 0
    for candidate, _nwords, score in complex_anagram_gen(bagnum):
        # Candidates containing "/" are never eligible.
        if "/" in candidate:
            continue
        if score > top_score:
            winner = candidate
            top_score = score
    return winner, 2, top_score
Beispiel #5
0
def ngram_data():
    """Build a table of the most frequent n-gram for each letter bag.

    Reads ``ngrams/1grams.txt``, ``ngrams/2grams.txt`` and
    ``ngrams/3grams.txt`` (UTF-8).  Each non-blank line must evaluate to a
    ``(words, freq)`` pair, where ``words`` is a sequence of strings.  Entries
    with ``freq`` below 10000 are ignored.

    For each remaining entry the words are joined with spaces and keyed by
    ``make_bag(text)``.  The table keeps, per key, the
    ``(text, nwords, freq)`` tuple with the highest ``freq`` seen; whenever an
    existing entry is displaced, the new ``(text, freq, bagnum)`` is printed.

    Returns:
        dict mapping bag key -> ``(text, nwords, freq)``.
    """
    ngrams = {}
    for filename in "1grams.txt", "2grams.txt", "3grams.txt":
        # The context manager closes each data file before the next opens.
        with codecs.open("ngrams/" + filename, encoding="utf-8") as source:
            for line in source:
                if not line.strip():
                    continue
                # SECURITY: eval() runs arbitrary code found in the data
                # file; only use n-gram files from a trusted source.
                words, freq = eval(line)
                nwords = len(words)
                if freq >= 10000:
                    text = " ".join(words)
                    bagnum = make_bag(text)
                    if bagnum not in ngrams:
                        ngrams[bagnum] = (text, nwords, freq)
                    elif freq > ngrams[bagnum][2]:
                        # we found a better anagram, let's see it
                        ngrams[bagnum] = (text, nwords, freq)
                        # Explicit tuple: prints the same tuple repr that the
                        # Python 2 statement did, under either interpreter.
                        print((text, freq, bagnum))
    return ngrams
Beispiel #6
0
def ngram_data():
    """Build a table of up to three n-grams for each letter bag.

    Reads ``ngrams/1grams.txt``, ``ngrams/2grams.txt`` and
    ``ngrams/3grams.txt`` (UTF-8).  Each non-blank line must evaluate to a
    ``(words, freq)`` pair, where ``words`` is a sequence of strings.  Entries
    with ``freq`` below 10000 are ignored.

    For each remaining entry the words are joined with spaces and keyed by
    ``make_bag(text)``.  Each key maps to a list of
    ``(text, nwords, freq)`` tuples.  A new tuple is appended only when its
    ``freq`` exceeds that of the list's last element; the list is then sorted
    with ``bagtuple_compare`` as the sort key and cut to three items.  The
    tuple is printed if it survived the cut.

    Returns:
        dict mapping bag key -> list of at most three tuples.
    """
    ngrams = {}
    for filename in ['1grams.txt', '2grams.txt', '3grams.txt']:
        # The context manager closes each data file before the next opens.
        with codecs.open('ngrams/'+filename, encoding='utf-8') as source:
            for line in source:
                if not line.strip():
                    continue
                # SECURITY: eval() runs arbitrary code found in the data
                # file; only use n-gram files from a trusted source.
                words, freq = eval(line)
                nwords = len(words)
                if freq >= 10000:
                    text = ' '.join(words)
                    bagnum = make_bag(text)
                    bagtuple = (text, nwords, freq)
                    if bagnum not in ngrams:
                        ngrams[bagnum] = [bagtuple]
                    # NOTE(review): a candidate that does not beat the last
                    # stored entry is dropped even when fewer than three are
                    # stored -- confirm this is intended.
                    elif freq > ngrams[bagnum][-1][2]:
                        # we found a better anagram, let's see if it stays
                        # in the top n
                        ngrams[bagnum].append(bagtuple)
                        ngrams[bagnum].sort(key=bagtuple_compare)
                        ngrams[bagnum] = ngrams[bagnum][:3]
                        if bagtuple in ngrams[bagnum]:
                            # Parenthesised single argument prints the same
                            # under Python 2 and Python 3.
                            print(bagtuple)
    return ngrams
Beispiel #7
0
def multi_anagram(text, n=10):
    """Collect distinct anagram candidates for *text*, best first.

    Candidates are ``(priority, freq, text)`` tuples: priority ``-1`` for a
    truthy result of ``simple_anagram_numeric`` (built from its third and
    first fields) and ``-2`` for everything produced by
    ``complex_anagram_gen``.  They are ranked by ordinary tuple comparison,
    highest first.

    Two candidates count as duplicates when they consist of the same words in
    any order.  Scanning stops once *n* distinct candidates have been
    collected (the check happens after each addition).

    Returns:
        list of the selected candidate tuples, in descending rank.
    """
    bagnum = make_bag(text)
    candidates = []
    exact = simple_anagram_numeric(bagnum)
    if exact:
        candidates.append((-1, exact[2], exact[0]))
    candidates.extend(
        (-2, score, phrase)
        for phrase, _nwords, score in complex_anagram_gen(bagnum)
    )
    candidates.sort()

    best = []
    seen = set()
    # Walk from the highest-ranked candidate downwards.
    for entry in reversed(candidates):
        # Word order does not matter when deciding whether two match.
        signature = " ".join(sorted(entry[2].split()))
        if signature in seen:
            continue
        seen.add(signature)
        best.append(entry)
        if len(seen) >= n:
            break
    return best
Beispiel #8
0
def multi_anagram(text, n=10):
    got = []
    bagnum = make_bag(text)
    firsttry = simple_anagram_numeric(bagnum)
    for text, words, freq in firsttry:
        got.append((-1, freq, text))
    for text, words, freq in complex_anagram_gen(bagnum):
        got.append((-2, freq, text))
    got.sort()
    best = []
    used = set()

    for i in range(1, n*2):
        text = got[-i][2]
        while '/' in text:
            # got a reasonable phrase plus a garble of leftover letters
            # try to do something with the rest
            print '\tRe-anagramming:', text
            before, after = text.split('/')
            reanagram = multi_anagram(after, 1)
            if reanagram:
                newtext = before+' '+reanagram[0][2]
                newfreq = min(got[-i][1], reanagram[0][1])
                got[-i] = (-3, newfreq, newtext)
                text = got[-i][2]
            else: break
    got.sort()

    for i in range(1, len(got)+1):
        text = got[-i][2]
        if '/' in text:
            # edge case where we have a garble we didn't expand
            # because it was too far down the list
            continue
        ordered = ' '.join(sorted(text.split()))
        if ordered not in used:
            used.add(ordered)
            best.append(got[-i])
            if len(used) >= n: break
    return best
Beispiel #9
0
def simple_anagram(text):
    """Look up *text* by its bag key.

    Converts *text* with ``make_bag`` and returns whatever
    ``simple_anagram_numeric`` gives for that key.
    """
    return simple_anagram_numeric(make_bag(text))