Ejemplo n.º 1
0
def Dehyphenate(lines):
    """Tokenize *lines* and rejoin words hyphenated across line breaks.

    Each line is split into tokens on runs of whitespace and the
    characters ``) ( ; > < .``.  If the last token of a line is accepted
    by ``is_number`` it is dropped, because such trailing numbers sit
    between a hyphenated word and its continuation.  Then, for every line
    whose last token ends in ``-``, the hyphen is removed, the first
    token of the next non-empty line is appended to it, and that
    continuation token is removed from the line it came from.

    Args:
        lines: iterable of text lines (strings).

    Returns:
        A flat list holding element ``[1]`` of ``split_text_token(token)``
        for every remaining token, in reading order.
    """
    from greek_tools import split_text_token, is_number
    from nltk.tokenize import RegexpTokenizer

    # Raw string: the backslash-s must reach the regex engine untouched.
    tokenizer = RegexpTokenizer(r'[)(;><.\s]+', gaps=True)
    text_array = [tokenizer.tokenize(line) for line in lines]

    # Get rid of a number at the end of a line, because it blocks the
    # dehyphenation below.  The list is edited in place so the change is
    # actually visible in text_array.
    for line_tokens in text_array:
        if line_tokens and is_number(line_tokens[-1]):
            del line_tokens[-1]

    # Match hyphenated line endings with the beginning of the next
    # non-empty line.  The final line has nothing after it, so stop one
    # short.  Lines are looked up by index (not via a snapshot) so that a
    # line which already gave up its first token is seen in its current
    # state.
    line_count = len(text_array)
    for index in range(line_count - 1):
        line_tokens = text_array[index]
        if not line_tokens or not line_tokens[-1].endswith('-'):
            continue
        following = index + 1
        while following < line_count and not text_array[following]:
            following += 1
        if following == line_count:
            # Only empty lines follow: nothing to join with.
            continue
        continuation = text_array[following].pop(0)
        line_tokens[-1] = line_tokens[-1][:-1] + continuation

    # Flatten the per-line token lists, then strip extraneous punctuation.
    flat_tokens = [tok for line_tokens in text_array for tok in line_tokens]
    return [split_text_token(tok)[1] for tok in flat_tokens]
Ejemplo n.º 2
0
def Dehyphenate(lines):
    """Tokenize *lines* and rejoin words hyphenated across line breaks.

    Each line is split into tokens on runs of whitespace and the
    characters ``) ( ; > < .``.  If the last token of a line is accepted
    by ``is_number`` it is dropped, because such trailing numbers sit
    between a hyphenated word and its continuation.  Then, for every line
    whose last token ends in ``-``, the hyphen is removed, the first
    token of the next non-empty line is appended to it, and that
    continuation token is removed from the line it came from.

    Args:
        lines: iterable of text lines (strings).

    Returns:
        A flat list holding element ``[1]`` of ``split_text_token(token)``
        for every remaining token, in reading order.
    """
    from greek_tools import split_text_token, is_number
    from nltk.tokenize import RegexpTokenizer

    # Raw string: the backslash-s must reach the regex engine untouched.
    tokenizer = RegexpTokenizer(r'[)(;><.\s]+', gaps=True)
    text_array = [tokenizer.tokenize(line) for line in lines]

    # Get rid of a number at the end of a line, because it blocks the
    # dehyphenation below.  The list is edited in place so the change is
    # actually visible in text_array.
    for line_tokens in text_array:
        if line_tokens and is_number(line_tokens[-1]):
            del line_tokens[-1]

    # Match hyphenated line endings with the beginning of the next
    # non-empty line.  The final line has nothing after it, so stop one
    # short.  Lines are looked up by index (not via a snapshot) so that a
    # line which already gave up its first token is seen in its current
    # state.
    line_count = len(text_array)
    for index in range(line_count - 1):
        line_tokens = text_array[index]
        if not line_tokens or not line_tokens[-1].endswith('-'):
            continue
        following = index + 1
        while following < line_count and not text_array[following]:
            following += 1
        if following == line_count:
            # Only empty lines follow: nothing to join with.
            continue
        continuation = text_array[following].pop(0)
        line_tokens[-1] = line_tokens[-1][:-1] + continuation

    # Flatten the per-line token lists, then strip extraneous punctuation.
    flat_tokens = [tok for line_tokens in text_array for tok in line_tokens]
    return [split_text_token(tok)[1] for tok in flat_tokens]
Ejemplo n.º 3
0
def grecify_left(right_lines):
    """Copy Greek text from matched right-hand words onto left-hand elements.

    For every ``(left_match, right_match)`` pair in a line's
    ``line_matches``, the ``text`` values of each side are joined with
    single spaces.  When ``is_greek_string`` accepts the right-hand text,
    the first left element is rewritten in place:

    * ``data-lat-original`` receives its previous text, so a bad
      identification can be fixed by manual editing;
    * ``data-pre-spellcheck`` receives the concatenated
      ``data-pre-spellcheck`` attributes of the right-hand words;
    * its text becomes the NFD-normalized right-hand text;
    * ``lang`` and ``xml:lang`` are set to ``grc``;

    and any further left elements of the match are removed from their
    parents.  Otherwise the NFD-normalized right-hand text is only stored
    in ``data-rigaudon-output`` on the first left element.

    Progress is reported with ``print``.  An ``AttributeError`` raised
    while handling a line (for instance when it has no ``line_matches``
    attribute) is printed and processing moves on to the next line.

    Args:
        right_lines: sized iterable of line objects.

    Returns:
        None.  The left-hand elements are modified in place.
    """
    import unicodedata
    from greek_tools import is_greek_string, is_number

    # unicodedata.normalize needs a text (unicode) object; pick the right
    # constructor for the running interpreter.
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3

    print("doing grecify")
    print("linematches length:  %d" % len(right_lines))

    for lines in right_lines:
        try:
            for left_match, right_match in lines.line_matches:
                if not left_match:
                    # No left element to write to.
                    continue
                target = left_match[0].element
                right_test_word = ' '.join([a.text for a in right_match])
                left_test_word = ' '.join([a.text for a in left_match])
                print("test_words:  %s %s" % (left_test_word, right_test_word))
                left_is_number = is_number(left_test_word)
                print("\t%s is a number? %s" % (left_test_word, left_is_number))
                right_is_greek = is_greek_string(right_test_word)
                print("\t%s is greek? %s" % (right_test_word, right_is_greek))
                normalized_right = unicodedata.normalize(
                    'NFD', text_type(right_test_word))

                if right_is_greek:  # and not left_is_number:
                    print("\treplacing left")
                    pre_spellcheck_parts = []
                    for a_word in right_match:
                        print("checking %s" % a_word.element.text)
                        pre_spellcheck = a_word.element.get(
                            'data-pre-spellcheck')
                        if pre_spellcheck:
                            print("\t adding %s" % pre_spellcheck)
                            pre_spellcheck_parts.append(pre_spellcheck)
                    right_pre_spellcheck = ''.join(pre_spellcheck_parts)

                    # Store the latin script original in a data attribute
                    # so that if our identification is bad, manual editing
                    # can fix it.
                    target.set('data-lat-original', target.text)
                    target.set('data-pre-spellcheck', right_pre_spellcheck)
                    target.text = normalized_right
                    target.set("lang", "grc")
                    target.set(
                        "{http://www.w3.org/XML/1998/namespace}lang", "grc")
                    # If additional elements in the source document were
                    # matched, remove them.  A distinct loop name keeps the
                    # outer match variables intact, and a detached element
                    # (no parent) is skipped instead of aborting the line.
                    for extra in left_match[1:]:
                        parent = extra.element.getparent()
                        if parent is not None:
                            parent.remove(extra.element)
                else:
                    # We don't think this is a Greek word.  Nonetheless,
                    # store the Greek output.
                    target.set('data-rigaudon-output', normalized_right)

        # Maybe there isn't a line_matches attribute, in which case keep
        # the left value.
        except AttributeError as e:
            print(e)
Ejemplo n.º 4
0
def grecify_left(right_lines):
    """Copy Greek text from matched right-hand words onto left-hand elements.

    For every ``(left_match, right_match)`` pair in a line's
    ``line_matches``, the ``text`` values of each side are joined with
    single spaces.  When ``is_greek_string`` accepts the right-hand text,
    the first left element is rewritten in place:

    * ``data-lat-original`` receives its previous text, so a bad
      identification can be fixed by manual editing;
    * ``data-pre-spellcheck`` receives the concatenated
      ``data-pre-spellcheck`` attributes of the right-hand words;
    * its text becomes the NFD-normalized right-hand text;
    * ``lang`` and ``xml:lang`` are set to ``grc``;

    and any further left elements of the match are removed from their
    parents.  Otherwise the NFD-normalized right-hand text is only stored
    in ``data-rigaudon-output`` on the first left element.

    Progress is reported with ``print``.  An ``AttributeError`` raised
    while handling a line (for instance when it has no ``line_matches``
    attribute) is printed and processing moves on to the next line.

    Args:
        right_lines: sized iterable of line objects.

    Returns:
        None.  The left-hand elements are modified in place.
    """
    import unicodedata
    from greek_tools import is_greek_string, is_number

    # unicodedata.normalize needs a text (unicode) object; pick the right
    # constructor for the running interpreter.
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3

    print("doing grecify")
    print("linematches length:  %d" % len(right_lines))

    for lines in right_lines:
        try:
            for left_match, right_match in lines.line_matches:
                if not left_match:
                    # No left element to write to.
                    continue
                target = left_match[0].element
                right_test_word = ' '.join([a.text for a in right_match])
                left_test_word = ' '.join([a.text for a in left_match])
                print("test_words:  %s %s" % (left_test_word, right_test_word))
                left_is_number = is_number(left_test_word)
                print("\t%s is a number? %s" % (left_test_word, left_is_number))
                right_is_greek = is_greek_string(right_test_word)
                print("\t%s is greek? %s" % (right_test_word, right_is_greek))
                normalized_right = unicodedata.normalize(
                    'NFD', text_type(right_test_word))

                if right_is_greek:  # and not left_is_number:
                    print("\treplacing left")
                    pre_spellcheck_parts = []
                    for a_word in right_match:
                        print("checking %s" % a_word.element.text)
                        pre_spellcheck = a_word.element.get(
                            'data-pre-spellcheck')
                        if pre_spellcheck:
                            print("\t adding %s" % pre_spellcheck)
                            pre_spellcheck_parts.append(pre_spellcheck)
                    right_pre_spellcheck = ''.join(pre_spellcheck_parts)

                    # Store the latin script original in a data attribute
                    # so that if our identification is bad, manual editing
                    # can fix it.
                    target.set('data-lat-original', target.text)
                    target.set('data-pre-spellcheck', right_pre_spellcheck)
                    target.text = normalized_right
                    target.set("lang", "grc")
                    target.set(
                        "{http://www.w3.org/XML/1998/namespace}lang", "grc")
                    # If additional elements in the source document were
                    # matched, remove them.  A distinct loop name keeps the
                    # outer match variables intact, and a detached element
                    # (no parent) is skipped instead of aborting the line.
                    for extra in left_match[1:]:
                        parent = extra.element.getparent()
                        if parent is not None:
                            parent.remove(extra.element)
                else:
                    # We don't think this is a Greek word.  Nonetheless,
                    # store the Greek output.
                    target.set('data-rigaudon-output', normalized_right)

        # Maybe there isn't a line_matches attribute, in which case keep
        # the left value.
        except AttributeError as e:
            print(e)