def Dehyphenate(lines):
    """Tokenize OCR output lines and rejoin words split by end-of-line hyphens.

    Each input line is split on whitespace and the punctuation set )(;><.
    A token ending in '-' is merged with the first token of the next
    non-empty line (that token is consumed from its own line). Trailing
    number tokens (e.g. marginal line numbers) are stripped first, because
    they sit between a hyphen and its continuation and block the merge.

    Parameters:
        lines -- iterable of unicode strings, one OCR line each.

    Returns:
        Flat list of tokens, each passed through
        greek_tools.split_text_token(tok)[1] to shed surrounding punctuation.
    """
    from greek_tools import split_text_token, is_number
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r'[)(;><.\s]+', gaps=True)
    text_array = [tokenizer.tokenize(line) for line in lines]

    # Strip one trailing number token per line.  NOTE: the original code
    # rebound the loop variable ("line = line[:-2]"), which never touched
    # text_array at all; assign through the index so the strip takes effect,
    # and drop only the single number token.
    for i, line_tokens in enumerate(text_array):
        if line_tokens and is_number(line_tokens[-1]):
            text_array[i] = line_tokens[:-1]

    # Merge hyphenated line endings with the start of the next non-empty
    # line.  Was text_array[:-2], which wrongly excluded the second-to-last
    # line; [:-1] is safe because an out-of-range lookup below is caught.
    for n, line_tokens in enumerate(text_array[:-1]):
        try:
            if line_tokens[-1][-1] == '-':
                next_non_empty = n + 1
                # Skip lines that tokenized to nothing (blank OCR lines).
                while len(text_array[next_non_empty]) < 1:
                    next_non_empty += 1
                # Glue the continuation onto the hyphenated stem and consume
                # it from its own line.
                line_tokens[-1] = (line_tokens[-1][:-1] +
                                   text_array[next_non_empty][0])
                text_array[next_non_empty] = text_array[next_non_empty][1:]
        except IndexError:
            # Empty line, bare '-' token, or no following line: leave as-is.
            pass

    # Flatten the per-line token lists, then trim punctuation per token.
    tokens = [item for sublist in text_array for item in sublist]
    tokens = [split_text_token(tok)[1] for tok in tokens]
    return tokens
def grecify_left(right_lines):
    """Overwrite Latin-script OCR words with their matched Greek-script text.

    right_lines: iterable of line objects; each is expected to carry a
    line_matches attribute of (left_match, right_match) pairs, where both
    sides are sequences of word objects with .text and an lxml-style
    .element -- assumed from usage, TODO confirm against caller.
    Mutates the left-hand elements in place; returns None.
    """
    import unicodedata
    print "doing grecify"
    print 'linematches length: ', len(right_lines)
    from greek_tools import is_greek_string, is_number
    for lines in right_lines:
        try:
            for match in lines.line_matches:
                (left_match, right_match) = match
                # Join each side's word texts into a single test string.
                right_test_word = ""
                right_test_word = ' '.join([a.text for a in right_match])
                left_test_word = ""
                left_test_word = ' '.join([a.text for a in left_match])
                print "test_words: ", left_test_word, right_test_word
                left_is_number = is_number(left_test_word)
                print '\t', left_test_word, "is a number?", left_is_number
                print '\t', right_test_word, "is greek?", is_greek_string(
                    right_test_word)
                # The number test is computed but deliberately not used in
                # the condition (see the commented-out clause).
                if is_greek_string(right_test_word):  # and not left_is_number:
                    print '\t', "replacing left"
                    # Concatenate any pre-spellcheck originals recorded on
                    # the right-hand words (no separator is inserted).
                    right_pre_spellcheck = ""
                    for a_word in right_match:
                        print 'checking', a_word.element.text
                        if a_word.element.get('data-pre-spellcheck'):
                            print '\t adding', a_word.element.get(
                                'data-pre-spellcheck')
                            right_pre_spellcheck += a_word.element.get(
                                'data-pre-spellcheck')
                    #right_pre_spellcheck = ' '.join([a.data-pre-spellcheck for a in right_match])
                    #store the latin script original in a data attribute so that if our identification is bad, manual editing can fix it
                    left_match[0].element.set('data-lat-original',
                                              left_match[0].element.text)
                    left_match[0].element.set('data-pre-spellcheck',
                                              right_pre_spellcheck)
                    # Replace the visible text with NFD-normalized Greek and
                    # tag the element as Greek in both lang conventions.
                    left_match[0].element.text = unicodedata.normalize(
                        'NFD', right_test_word)
                    left_match[0].element.set("lang", "grc")
                    left_match[0].element.set(
                        "{http://www.w3.org/XML/1998/namespace}lang", "grc")
                    #if there are additional elements in the source document that were matched,
                    #we need to remove these
                    # (NOTE: this inner loop reuses the name 'match' from the
                    # enclosing loop; harmless, as the outer 'for' rebinds it.)
                    for match in left_match[1:]:
                        match.element.getparent().remove(match.element)
                #we don't think this is a Greek word. Nonetheless, let's store the Greek output
                else:
                    left_match[0].element.set(
                        'data-rigaudon-output',
                        unicodedata.normalize('NFD', unicode(right_test_word)))
        #maybe there isn't a line_matches attribute. In which case, keep the
        #left value.
        except AttributeError, e:
            print e
            pass
# NOTE(review): this is a token-for-token duplicate of the grecify_left
# defined earlier in this file; being defined later, it silently shadows
# that first definition at import time.  Kept as-is pending confirmation
# that no tooling depends on both copies; one of them should be removed.
def grecify_left(right_lines):
    """Overwrite Latin-script OCR words with their matched Greek-script text.

    right_lines: iterable of line objects; each is expected to carry a
    line_matches attribute of (left_match, right_match) pairs, where both
    sides are sequences of word objects with .text and an lxml-style
    .element -- assumed from usage, TODO confirm against caller.
    Mutates the left-hand elements in place; returns None.
    """
    import unicodedata
    print "doing grecify"
    print 'linematches length: ', len(right_lines)
    from greek_tools import is_greek_string, is_number
    for lines in right_lines:
        try:
            for match in lines.line_matches:
                (left_match, right_match) = match
                # Join each side's word texts into a single test string.
                right_test_word = ""
                right_test_word = ' '.join([a.text for a in right_match])
                left_test_word = ""
                left_test_word = ' '.join([a.text for a in left_match])
                print "test_words: ", left_test_word, right_test_word
                left_is_number = is_number(left_test_word)
                print '\t', left_test_word, "is a number?", left_is_number
                print '\t', right_test_word, "is greek?", is_greek_string(right_test_word)
                # The number test is computed but deliberately not used in
                # the condition (see the commented-out clause).
                if is_greek_string(right_test_word):# and not left_is_number:
                    print '\t', "replacing left"
                    # Concatenate any pre-spellcheck originals recorded on
                    # the right-hand words (no separator is inserted).
                    right_pre_spellcheck = ""
                    for a_word in right_match:
                        print 'checking', a_word.element.text
                        if a_word.element.get('data-pre-spellcheck'):
                            print '\t adding', a_word.element.get('data-pre-spellcheck')
                            right_pre_spellcheck += a_word.element.get('data-pre-spellcheck')
                    #right_pre_spellcheck = ' '.join([a.data-pre-spellcheck for a in right_match])
                    #store the latin script original in a data attribute so that if our identification is bad, manual editing can fix it
                    left_match[0].element.set('data-lat-original',left_match[0].element.text)
                    left_match[0].element.set('data-pre-spellcheck',right_pre_spellcheck)
                    # Replace the visible text with NFD-normalized Greek and
                    # tag the element as Greek in both lang conventions.
                    left_match[0].element.text = unicodedata.normalize('NFD',right_test_word)
                    left_match[0].element.set("lang","grc")
                    left_match[0].element.set("{http://www.w3.org/XML/1998/namespace}lang","grc")
                    #if there are additional elements in the source document that were matched,
                    #we need to remove these
                    # (NOTE: this inner loop reuses the name 'match' from the
                    # enclosing loop; harmless, as the outer 'for' rebinds it.)
                    for match in left_match[1:]:
                        match.element.getparent().remove(match.element)
                #we don't think this is a Greek word. Nonetheless, let's store the Greek output
                else:
                    left_match[0].element.set('data-rigaudon-output',unicodedata.normalize('NFD',unicode(right_test_word)))
        #maybe there isn't a line_matches attribute. In which case, keep the
        #left value.
        except AttributeError, e:
            print e
            pass