def Dehyphenate(lines):
    from greek_tools import split_text_token, is_number
    from nltk.tokenize import RegexpTokenizer

    # Split each line on punctuation and whitespace, keeping the tokens.
    tokenizer = RegexpTokenizer(r'[)(;><.\s]+', gaps=True)
    text_array = []
    for line in lines:
        text_array.append(tokenizer.tokenize(line))
    # Now try to match hyphenated lines with their corresponding beginning
    # lines. But first, get rid of numbers at the end of lines, because they
    # often block the dehyphenation process.
    for line in text_array:
        try:
            if is_number(line[-1]):
                del line[-1]
        except IndexError:
            pass
    n = 0
    for line in text_array[:-1]:
        try:
            if line[-1][-1] == '-':
                # Drop the hyphen and glue on the first token of the next
                # non-empty line, removing that token from its own line.
                next_non_empty_line = n + 1
                while len(text_array[next_non_empty_line]) < 1:
                    next_non_empty_line += 1
                line[-1] = line[-1][:-1] + text_array[next_non_empty_line][0]
                text_array[next_non_empty_line] = \
                    text_array[next_non_empty_line][1:]
        except IndexError:
            pass
        n = n + 1
    # Flatten the 2D array, then strip extraneous punctuation.
    tokens = [item for sublist in text_array for item in sublist]
    tokens = [split_text_token(tok)[1] for tok in tokens]
    return tokens
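# Usage sketch (illustrative input, not from the corpus): a token ending in
# '-' is rejoined with the first token of the next non-empty line, so
# u'\u03bb\u03cc-' followed by u'\u03b3\u03bf\u03c2' yields u'\u03bb\u03cc\u03b3\u03bf\u03c2' ("logos").
def _demo_dehyphenate():
    sample_lines = [u'\u1f41 \u03bb\u03cc-', u'\u03b3\u03bf\u03c2.']
    for token in Dehyphenate(sample_lines):
        print token.encode('utf-8')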
def add_word(word_count, word):
    # preprocess_word is assumed to live in greek_tools alongside
    # split_text_token; adjust the import if it is defined elsewhere.
    from greek_tools import split_text_token, preprocess_word
    # Strip surrounding punctuation and normalize before counting.
    word_no_punct = split_text_token(word)[1]
    word_no_punct = preprocess_word(word_no_punct)
    if len(word_no_punct) > 0:
        word = word_no_punct
    # A Greek-only filter was disabled here; every word is counted.
    # if is_greek_string(word):
    if word not in word_count:
        word_count[word] = 1
    else:
        word_count[word] += 1
    return word_count
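# Usage sketch: fold a token stream into a frequency dict. Because add_word
# strips punctuation via split_text_token first, u'word,' and u'word' land in
# the same bucket (assuming preprocess_word leaves these tokens unchanged).
def _demo_add_word():
    word_count = {}
    for token in [u'word', u'word,', u'other']:
        word_count = add_word(word_count, token)
    return word_count  # e.g. {u'word': 2, u'other': 1}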
def in_dict_lower(dictionary, word):
    from greek_tools import split_text_token
    # Strip punctuation, normalize ASCII apostrophes to U+2019, and
    # lower-case before the membership test.
    return split_text_token(word)[1].replace('\'', u'’').lower() in dictionary
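# Usage sketch: the dictionary is assumed to hold lower-cased,
# punctuation-free entries, so a capitalized token with a trailing
# comma still matches.
def _demo_in_dict_lower():
    dictionary = set([u'\u03bb\u03cc\u03b3\u03bf\u03c2'])             # {u'λόγος'}
    return in_dict_lower(dictionary, u'\u039b\u03cc\u03b3\u03bf\u03c2,')  # True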
import os
import codecs
import unicodedata
from lxml import etree
# dump is assumed to live in greek_tools with split_text_token.
from greek_tools import split_text_token, dump

# dir_out, simplified_name, fileIn_name and spellcheck_dict are defined
# earlier in the script.
fileOut_name = os.path.join(dir_out, simplified_name)
fileIn = codecs.open(fileIn_name, 'r', 'utf-8')
fileOut = open(fileOut_name, 'w')
print "checking", fileIn_name, "sending to", fileOut_name
treeIn = etree.parse(fileIn)
root = treeIn.getroot()
# Select hOCR word spans both with and without the XHTML namespace.
hocr_word_elements = treeIn.xpath(
    "//html:span[@class='ocr_word'] | //span[@class='ocr_word']",
    namespaces={'html': "http://www.w3.org/1999/xhtml"})
for word_element in hocr_word_elements:
    # Normalize to NFD so the word matches the spellcheck dict's keys.
    try:
        word = unicodedata.normalize('NFD', word_element.text)
    except TypeError:
        word = unicodedata.normalize('NFD', unicode(word_element.text))
    parts = split_text_token(word)
    try:
        # A KeyError below means the word has no replacement; skip it.
        error_word = parts[1]
        parts = (parts[0], spellcheck_dict[parts[1]], parts[2])
        print "replaced", error_word, "with", parts[1]
        dump(error_word)
        print
        dump(parts[1])
        # Keep the pre-correction form on the element for later inspection.
        word_element.set('data-pre-spellcheck', word)
    except KeyError:
        pass
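# The excerpt opens fileOut but never shows the write step; a plausible
# completion (an assumption, not code from the source) serializes the
# modified tree and closes both handles:
fileOut.write(etree.tostring(treeIn, encoding='utf-8'))
fileOut.close()
fileIn.close()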