Example #1
def spellcheck_urls(dict_file,
                    urls,
                    output_file_name,
                    max_weight=9,
                    debug=False):
    # Fetch each URL, keep only the Greek tokens, and spell-check them in
    # parallel against dict_file, writing the results to output_file_name.
    from urllib import urlopen
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from itertools import repeat
    from multiprocessing import Pool
    import codecs
    all_tokens = []
    output_file = codecs.open(output_file_name, 'w', 'utf-8')
    #print "numbre of urls: ", len(urls)
    for url in urls:
        raw = urlopen(url).read().decode('utf-8')
        n = 0
        lines = raw.split("\n")
        #if debug:
        #print 'page:', url
        #for line in lines:
        #    print line
        tokens = Dehyphenate(lines)
        #if tokens[-1][-1] == '-':
        #    tokens = tokens[:-1]

        #if debug:
        #    for token in tokens:
        #        print token
        all_tokens = all_tokens + delete_non_greek_tokens(tokens)
    if debug:
        for token in all_tokens:
            print token
    vocab = sorted(set(all_tokens))
    print "vocab of ", len(vocab), " words"
    #for word in vocab:
    #  print word
    vocab = [word for word in vocab if not is_uc_word(word)]
    vocab = [word.rstrip() for word in vocab]
    vocab = [word for word in vocab if word and not word[-1] == '-']
    vocab = [word for word in vocab if not len(word) == 1]
    print "non-capital words: ", len(vocab)
    if debug:
        print "Are they capitalized?"
        from greek_tools import is_capitalized
        for wordIn in vocab:
            #wordIn = preprocess_word(wordIn)
            print wordIn, is_capitalized(wordIn)
    print "making dicts"
    import time
    start_time = time.time()
    word_dicts = makeDict(dict_file)
    dict_time = time.time() - start_time
    minutes = dict_time / 60.0
    print "dict building took", minutes, " minutes."
    # split the vocabulary into roughly eight chunks for the worker pool;
    # guard against a zero chunk size when the vocabulary is small
    vocab_chunks = list(chunks(vocab, max(1, len(vocab) / 8)))
    print "vocab is ", len(vocab)
    processed_vocab_chunks = zip(vocab_chunks, repeat(word_dicts),
                                 repeat(max_weight))
    print "there are ", len(processed_vocab_chunks), "chunks"
    start_time = time.time()
    # print "dictionary of ", len(dict_words), "words"
    # vocab = [preprocess_word(a_word) for a_word in vocab]
    # why doesn't this trim all the ones that pass spellcheck?
    # vocab = sorted(set(vocab).difference(set(dict_words)))
    # print "vocab trimmed of dictionary words to ", len(vocab)
    # spell-check the chunks in parallel; each worker gets one
    # (words, word_dicts, max_weight) tuple
    p = Pool(processes=10)
    output = p.map(process_vocab, processed_vocab_chunks)
    for output_chunk in output:
        output_file.write(output_chunk)
    pool_time = time.time() - start_time
    minutes = pool_time / 60.0
    print "processing took", minutes, " minutes"
Example #4
    #print "lines: ", len(lines)
    for line in lines:
        #       print n, line.encode('utf-8')
        line_tokens = tokenizer.tokenize(line)
        #for token in line_tokens:
        #       print token.encode('utf-8'), " | "
        #n = n + 1
        text_array.append(line_tokens)

#now try to match hyphenated lines with their
#corresponding beginning lines
n = 0
for line in text_array:
    if len(line) > 0:
        if line[-1][-1] == '-':
            try:
                line[-1] = line[-1][:-1] + text_array[n + 1][0]
                text_array[n + 1] = text_array[n + 1][1:]
            except IndexError as e:
                print e
    n = n + 1
#now flatten the 2d array
tokens = [item for sublist in text_array for item in sublist]
tokens = delete_non_greek_tokens(tokens)
for token in tokens:
    fdist.inc(token)

print "most common: ", fdist.max().encode('utf-8')
for item in fdist.keys():
    print item.encode('utf-8'), fdist.freq(item)
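The hyphen-repair loop above modifies text_array in place: when a line's last token ends in '-', the hyphen is dropped, the first token of the next line is glued onto it, and that token is removed from the following line. A self-contained sketch of the same idea, with illustrative names and data that are not taken from the project:

def merge_hyphenated(text_array):
    # Join tokens that a line break split with a trailing hyphen (sketch).
    for n, line in enumerate(text_array):
        if line and line[-1].endswith('-'):
            try:
                line[-1] = line[-1][:-1] + text_array[n + 1][0]
                text_array[n + 1] = text_array[n + 1][1:]
            except IndexError:
                pass  # last line, or the following line is empty
    return text_array

# ['kalei-'] + ['tai', 'de'] becomes ['kaleitai'] + ['de']
print(merge_hyphenated([['kalei-'], ['tai', 'de']]))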