Example 1
def Dehyphenate(lines):
    from greek_tools import split_text_token, is_number
    import string
    import re
    import nltk
    from nltk.tokenize import RegexpTokenizer
    n = 0
    tokenizer = RegexpTokenizer(r'[)(;><.\s]+', gaps=True)
    text_array = []
    for line in lines:
        line_tokens = tokenizer.tokenize(line)
        n = n + 1
        text_array.append(line_tokens)
    # now try to match hyphenated lines with their
    # corresponding beginning lines.
    # But first, get rid of numbers at the end of lines, because
    # they often block the dehyphenation process.
    for i, line in enumerate(text_array):
        try:
            # drop the trailing token when it is a bare number
            if is_number(line[-1]):
                text_array[i] = line[:-1]
        except IndexError:
            pass

    n = 0
    for line in text_array[:-2]:
        try:
            # if the last token on this line ends with a hyphen, join it with
            # the first token of the next non-empty line
            if line[-1][-1] == '-':
                next_non_empty_line = n + 1
                while len(text_array[next_non_empty_line]) < 1:
                    next_non_empty_line += 1
                line[-1] = line[-1][:-1] + text_array[next_non_empty_line][0]
                text_array[next_non_empty_line] = text_array[next_non_empty_line][1:]
        except IndexError:
            pass
        n = n + 1
    # now flatten the 2d array
    tokens = [item for sublist in text_array for item in sublist]
    # now remove extraneous punctuation
    tokens = [split_text_token(tok)[1] for tok in tokens]
    return tokens
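A minimal usage sketch (the sample lines are hypothetical, and it assumes greek_tools and nltk are importable). Note that the loop above iterates over text_array[:-2], so the last two lines are never dehyphenated; the third filler line below is there only so the first line gets processed.

# hypothetical OCR lines where one word is split across the first line break;
# written with unicode escapes so the sketch needs no source-encoding declaration
sample_lines = [u'\u03bb\u03cc\u03b3\u03bf\u03c2 \u03ba\u03b1-',
                u'\u03bb\u03cc\u03c2 \u1f10\u03c3\u03c4\u03b9',
                u'\u03c4\u03ad\u03bb\u03bf\u03c2']
tokens = Dehyphenate(sample_lines)
# the hyphenated halves are rejoined into a single token; the exact strings
# returned depend on what greek_tools.split_text_token does to each token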
Example 3
def add_word(word_count, word):
    # strip punctuation and preprocess the word before counting it
    word_no_punct = split_text_token(word)[1]
    word_no_punct = preprocess_word(word_no_punct)
    if len(word_no_punct) > 0:
        word = word_no_punct
    if True:  # is_greek_string(word):
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1
    return word_count
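A short usage sketch for add_word; it assumes split_text_token and preprocess_word are imported at module level from greek_tools (the snippet itself does not import them), and the sample words are made-up transliterations.

word_count = {}
for w in [u'logos,', u'logos', u'kai']:
    word_count = add_word(word_count, w)
# word_count now maps each normalized form to its frequency, e.g.
# {u'logos': 2, u'kai': 1} if split_text_token strips the comma and
# preprocess_word leaves the words unchanged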
Example 4
 #fileOut = codecs.open(fileOut_name, 'w','utf-8')
 fileOut = open(fileOut_name, 'w')
 print "checking", fileIn_name, "sending to ", fileOut_name
 treeIn = etree.parse(fileIn)
 root = treeIn.getroot()
 hocr_word_elements = treeIn.xpath(
     "//html:span[@class='ocr_word'] | //span[@class='ocr_word']",
     namespaces={'html': "http://www.w3.org/1999/xhtml"})
 for word_element in hocr_word_elements:
     try:
         word = unicodedata.normalize('NFD', word_element.text)
     except TypeError:
         word = unicodedata.normalize('NFD', unicode(word_element.text))
     parts = split_text_token(word)
     try:
         # if the word is in the spell-check dictionary, swap in the corrected
         # form and remember the original in a data attribute on the element
         error_word = parts[1]
         parts = (parts[0], spellcheck_dict[parts[1]], parts[2])
         print "replaced", error_word, "with", parts[1]
         dump(error_word)
         print
         dump(parts[1])
         word_element.set('data-pre-spellcheck', word)
     except KeyError:
         pass
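From the way parts is rebuilt above, split_text_token appears to return a (leading punctuation, word, trailing punctuation) triple; the sketch below spells out that assumed shape against a hypothetical one-entry spellcheck_dict.

# hypothetical spell-check dictionary: OCR error form -> corrected form
spellcheck_dict = {u'tbe': u'the'}
parts = (u'(', u'tbe', u')')   # assumed (prefix, word, suffix) shape
if parts[1] in spellcheck_dict:
    parts = (parts[0], spellcheck_dict[parts[1]], parts[2])
# parts is now (u'(', u'the', u')')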
Example 5
def in_dict_lower(dictionary, word):
    from greek_tools import split_text_token
    # strip punctuation, normalize the apostrophe, and lower-case before the lookup
    return split_text_token(word)[1].replace('\'', u'’').lower() in dictionary
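A minimal call sketch; the dictionary contents are hypothetical, and it assumes split_text_token(word)[1] is the bare word with surrounding punctuation stripped.

# hypothetical lexicon of known lower-case forms (membership also works on a set)
greek_dict = set([u'\u03bb\u03cc\u03b3\u03bf\u03c2'])
found = in_dict_lower(greek_dict, u'\u039b\u03cc\u03b3\u03bf\u03c2.')
# found is True provided split_text_token strips the trailing period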
Example 7
                fileOut_name = os.path.join(dir_out, simplified_name)
                fileIn = codecs.open(fileIn_name, 'r', 'utf-8')
                #fileOut = codecs.open(fileOut_name, 'w','utf-8')
                fileOut = open(fileOut_name, 'w')
                print "checking", fileIn_name, "sending to", fileOut_name
                treeIn = etree.parse(fileIn)
                root = treeIn.getroot()
                hocr_word_elements = treeIn.xpath(
                    "//html:span[@class='ocr_word'] | //span[@class='ocr_word']",
                    namespaces={'html': "http://www.w3.org/1999/xhtml"})
                for word_element in hocr_word_elements:
                   try:
                      word = unicodedata.normalize('NFD', word_element.text)
                   except TypeError:
                      word = unicodedata.normalize('NFD', unicode(word_element.text))
                   parts = split_text_token(word)
                   try:
                      # if the word is in the spell-check dictionary, swap in the
                      # corrected form and remember the original in a data attribute
                      error_word = parts[1]
                      parts = (parts[0], spellcheck_dict[parts[1]], parts[2])
                      print "replaced", error_word, "with", parts[1]
                      dump(error_word)
                      print
                      dump(parts[1])
                      word_element.set('data-pre-spellcheck', word)
                   except KeyError:
                      pass