def check_match(self):
    """
    Ensure that word equals word_diac once diacritics are stripped

    On a mismatch a warning is written to stderr and unnormalize() is
    called. If the two still disagree, a second warning is written and word
    is replaced by the undiacritized word_diac.
    """

    if REGEX_DIACS.sub("", self.word_diac) == self.word:
        # already consistent, nothing to repair
        return

    first_warning = (
        "Warning: word " + self.word + " != word_diac " + self.word_diac
        + " after removing diacritics. Attempting to correct\n"
    )
    sys.stderr.write(first_warning)
    self.unnormalize()
    if REGEX_DIACS.sub("", self.word_diac) == self.word:
        # unnormalize() resolved the mismatch
        return

    second_warning = (
        "Warning: could not correct, word " + self.word + " != word_diac "
        + self.word_diac + ". Using undiacritized word_diac as word.\n"
    )
    sys.stderr.write(second_warning)
    # fall back to deriving word directly from word_diac
    self.word = REGEX_DIACS.sub("", self.word_diac)

    # final sanity check after the fallback assignment
    if REGEX_DIACS.sub("", self.word_diac) != self.word:
        sys.stderr.write("Warning: still word " + self.word + " != word_diac " + self.word_diac + "\n")
def __init__(self, input_string, comment, index, gold_solution=None, lookup_word=None):
    """
    Store the raw fields and derive word / word_diac from them

    :param input_string: stored as is; used for word and word_diac when no lookup_word is given
    :param comment: stored as is
    :param index: stored as is
    :param gold_solution: optional string searched with REGEX_SOLUTION_DIAC for the diacritized form
    :param lookup_word: optional word; when given, word is lookup_word with diacritics removed
    """

    self.input_string = input_string
    self.comment = comment
    self.index = index
    self.gold_solution = gold_solution
    self.lookup_word = lookup_word

    # no lookup word: this is a non-Arabic script word
    if not lookup_word:
        # TODO consider marking as Latin words (and exclude later)
        self.word = input_string
        self.word_diac = input_string
        return

    # Arabic script word: the undiacritized form always comes from the lookup word
    self.word = REGEX_DIACS.sub("", lookup_word)

    # there may be no solution if the word is unknown, so just write the lookup word
    if not gold_solution:
        self.word_diac = lookup_word
        return

    match = REGEX_SOLUTION_DIAC.match(gold_solution)
    if match:
        self.word_diac = match.groups()[0]
        # the extracted solution may disagree with word; reconcile them
        self.check_match()
    else:
        sys.stderr.write(
            "Warning: could not find diacritized solution in: " + gold_solution + ". "
            + "Writing lookup word as is: " + lookup_word + "\n"
        )
        self.word_diac = lookup_word
def check_match(self):
    """
    Compare word against word_diac with diacritics removed and repair a mismatch

    Repair is attempted first through unnormalize(); if that is not enough,
    word is overwritten with the undiacritized word_diac. Every step that
    finds a mismatch reports it on stderr.
    """

    stripped = REGEX_DIACS.sub('', self.word_diac)
    if stripped != self.word:
        mismatch_msg = 'Warning: word ' + self.word + ' != word_diac ' + self.word_diac + \
                       ' after removing diacritics. Attempting to correct\n'
        sys.stderr.write(mismatch_msg)
        self.unnormalize()

        # unnormalize() may have changed either attribute, so recompute
        stripped = REGEX_DIACS.sub('', self.word_diac)
        if stripped != self.word:
            fallback_msg = 'Warning: could not correct, word ' + self.word + ' != word_diac ' + \
                           self.word_diac + '. Using undiacritized word_diac as word.\n'
            sys.stderr.write(fallback_msg)
            self.word = REGEX_DIACS.sub('', self.word_diac)

            # last check after forcing word from word_diac
            if REGEX_DIACS.sub('', self.word_diac) != self.word:
                sys.stderr.write('Warning: still word ' + self.word + ' != word_diac ' + self.word_diac + '\n')
def __init__(self, input_string, comment, index, gold_solution=None, lookup_word=None):
    """
    Initialize the instance and work out its word and word_diac attributes

    :param input_string: kept on the instance; doubles as word and word_diac if lookup_word is missing
    :param comment: kept on the instance
    :param index: kept on the instance
    :param gold_solution: optional string from which REGEX_SOLUTION_DIAC extracts the diacritized word
    :param lookup_word: optional word whose undiacritized form becomes word
    """

    self.input_string = input_string
    self.comment = comment
    self.index = index
    self.gold_solution = gold_solution
    self.lookup_word = lookup_word

    if not lookup_word:
        # this is a non-Arabic script word
        # TODO consider marking as Latin words (and exclude later)
        self.word = input_string
        self.word_diac = input_string
    else:
        # this is an Arabic script word
        self.word = REGEX_DIACS.sub('', lookup_word)
        if not gold_solution:
            # there may be no solution if the word is unknown, so just write the lookup word
            self.word_diac = lookup_word
        else:
            solution_match = REGEX_SOLUTION_DIAC.match(gold_solution)
            if solution_match:
                self.word_diac = solution_match.groups()[0]
                # make sure the extracted solution agrees with word
                self.check_match()
            else:
                warning = 'Warning: could not find diacritized solution in: ' + gold_solution + '. ' + \
                          'Writing lookup word as is: ' + lookup_word + '\n'
                sys.stderr.write(warning)
                self.word_diac = lookup_word
def extract_data(rdi_bw_filename, output_word_filename, output_word_diac_filename):
    """
    Extract data from an RDI file

    Every whitespace-separated token of the input is written on its own line
    to both output files: stripped of diacritics in the word file and
    unchanged in the word_diac file. An empty line is written to both files
    after each input line. Tokens starting with MADA_LATIN_TAG, and tokens
    that are empty once diacritics are removed, are skipped with a warning
    on stderr.

    :param rdi_bw_filename: file containing raw Arabic text, preprocessed by MADA preprocessor (keeping diacritics)
    :param output_word_filename: file to write words without diacritics
    :param output_word_diac_filename: file to write words with diacritics
    :return: None
    """

    # single-argument print(...) emits the same text on Python 2 and Python 3
    print('extracting data from: ' + rdi_bw_filename)

    # context managers guarantee all three files are closed even if reading
    # or writing raises part-way through
    with open(output_word_filename, 'w') as g_word, \
            open(output_word_diac_filename, 'w') as g_word_diac, \
            open(rdi_bw_filename) as f:
        for line in f:
            for token in line.strip().split():
                if token.startswith(MADA_LATIN_TAG):
                    sys.stderr.write('Warning: found Latin word: ' + token + '. skipping word.\n')
                    continue
                word_str = REGEX_DIACS.sub('', token)
                word_diac_str = token
                # a token made only of diacritics leaves nothing to write
                if word_str == '' or word_diac_str == '':
                    sys.stderr.write('Warning: empty word_str ' + word_str + ' or word_diac_str ' +
                                     word_diac_str + '. skipping word.\n')
                    continue
                g_word.write(word_str + '\n')
                g_word_diac.write(word_diac_str + '\n')
            # an empty line separates the tokens of consecutive input lines
            g_word.write('\n')
            g_word_diac.write('\n')

    print('written words to file: ' + output_word_filename)
    print('written words diac to file: ' + output_word_diac_filename)