Ejemplo n.º 1
0
    def check_match(self):
        """
        Check match between word and word_diac

        The two agree when word_diac, with everything matched by REGEX_DIACS
        removed, equals word. On a mismatch a warning is written to stderr and
        unnormalize() is called as a first repair attempt. If the mismatch
        persists, a second warning is written and word is overwritten with the
        stripped word_diac.
        """

        # Nothing to do when the attributes already agree.
        if REGEX_DIACS.sub("", self.word_diac) == self.word:
            return

        sys.stderr.write(
            "Warning: word "
            + self.word
            + " != word_diac "
            + self.word_diac
            + " after removing diacritics. Attempting to correct\n"
        )
        self.unnormalize()

        # Strip again after the repair attempt: unnormalize() is defined
        # elsewhere and presumably updates word and/or word_diac.
        stripped = REGEX_DIACS.sub("", self.word_diac)
        if stripped == self.word:
            return

        sys.stderr.write(
            "Warning: could not correct, word "
            + self.word
            + " != word_diac "
            + self.word_diac
            + ". Using undiacritized word_diac as word.\n"
        )
        # After this assignment word equals the stripped word_diac by
        # construction, so no further comparison is needed.
        self.word = stripped
Ejemplo n.º 2
0
    def __init__(self, input_string, comment, index, gold_solution=None, lookup_word=None):
        """
        Store the raw fields and derive word / word_diac from them.

        :param input_string: stored as-is; also used for word and word_diac when lookup_word is falsy
        :param comment: stored as-is
        :param index: stored as-is
        :param gold_solution: optional string matched against REGEX_SOLUTION_DIAC to get the diacritized form
        :param lookup_word: optional string; when truthy the token is handled as an Arabic script word
        """

        self.input_string = input_string
        self.comment = comment
        self.index = index
        self.gold_solution = gold_solution
        self.lookup_word = lookup_word

        # this is a non-Arabic script word
        if not lookup_word:
            # TODO consider marking as Latin words (and exclude later)
            self.word = input_string
            self.word_diac = input_string
            return

        # Arabic script word: the plain form is the lookup word minus diacritics
        self.word = REGEX_DIACS.sub("", lookup_word)

        # there may be no solution if the word is unknown, so just write the lookup word
        if not gold_solution:
            self.word_diac = lookup_word
            return

        found = REGEX_SOLUTION_DIAC.match(gold_solution)
        if found:
            # first captured group is taken as the diacritized word
            self.word_diac = found.groups()[0]
            self.check_match()
            return

        warning = (
            "Warning: could not find diacritized solution in: "
            + gold_solution
            + ". "
            + "Writing lookup word as is: "
            + lookup_word
            + "\n"
        )
        sys.stderr.write(warning)
        self.word_diac = lookup_word
Ejemplo n.º 3
0
    def check_match(self):
        """
        Check match between word and word_diac

        The two agree when word_diac, with everything matched by REGEX_DIACS
        removed, equals word. On a mismatch a warning is written to stderr and
        unnormalize() is called as a first repair attempt. If the mismatch
        persists, a second warning is written and word is overwritten with the
        stripped word_diac.
        """

        # Nothing to do when the attributes already agree.
        if REGEX_DIACS.sub('', self.word_diac) == self.word:
            return

        sys.stderr.write('Warning: word ' + self.word + ' != word_diac ' + self.word_diac +
                         ' after removing diacritics. Attempting to correct\n')
        self.unnormalize()

        # Strip again after the repair attempt: unnormalize() is defined
        # elsewhere and presumably updates word and/or word_diac.
        stripped = REGEX_DIACS.sub('', self.word_diac)
        if stripped == self.word:
            return

        sys.stderr.write('Warning: could not correct, word ' + self.word + ' != word_diac ' +
                         self.word_diac + '. Using undiacritized word_diac as word.\n')
        # After this assignment word equals the stripped word_diac by
        # construction, so no further comparison is needed.
        self.word = stripped
Ejemplo n.º 4
0
    def __init__(self,
                 input_string,
                 comment,
                 index,
                 gold_solution=None,
                 lookup_word=None):
        """
        Store the raw fields and derive word / word_diac from them.

        :param input_string: stored as-is; also used for word and word_diac when lookup_word is falsy
        :param comment: stored as-is
        :param index: stored as-is
        :param gold_solution: optional string matched against REGEX_SOLUTION_DIAC to get the diacritized form
        :param lookup_word: optional string; when truthy the token is handled as an Arabic script word
        """

        self.input_string = input_string
        self.comment = comment
        self.index = index
        self.gold_solution = gold_solution
        self.lookup_word = lookup_word

        # this is a non-Arabic script word
        if not lookup_word:
            # TODO consider marking as Latin words (and exclude later)
            self.word = input_string
            self.word_diac = input_string
            return

        # Arabic script word: the plain form is the lookup word minus diacritics
        self.word = REGEX_DIACS.sub('', lookup_word)

        # there may be no solution if the word is unknown, so just write the lookup word
        if not gold_solution:
            self.word_diac = lookup_word
            return

        found = REGEX_SOLUTION_DIAC.match(gold_solution)
        if found:
            # first captured group is taken as the diacritized word
            self.word_diac = found.groups()[0]
            self.check_match()
            return

        warning = 'Warning: could not find diacritized solution in: ' + gold_solution + '. ' + \
                  'Writing lookup word as is: ' + lookup_word + '\n'
        sys.stderr.write(warning)
        self.word_diac = lookup_word
Ejemplo n.º 5
0
def extract_data(rdi_bw_filename, output_word_filename, output_word_diac_filename):
    """
    Extract data from an RDI file

    Each whitespace-separated token of the input is written on its own line,
    once with everything matched by REGEX_DIACS removed and once unchanged.
    Tokens starting with MADA_LATIN_TAG, and tokens that are empty after
    stripping, are skipped with a warning on stderr. A blank line is written
    to both outputs after every input line.

    :param rdi_bw_filename: file containing raw Arabic text, preprocessed by MADA preprocessor (keeping diacritics)
    :param output_word_filename: file to write words without diacritics
    :param output_word_diac_filename: file to write words with diacritics
    :return: None
    """

    # Single-argument print(...) emits the same text under Python 2 and 3.
    print('extracting data from: %s' % rdi_bw_filename)
    # Context managers guarantee that both outputs are flushed and closed even
    # if reading or writing fails part-way. The outputs are still opened
    # before the input, as before.
    with open(output_word_filename, 'w') as g_word, \
            open(output_word_diac_filename, 'w') as g_word_diac, \
            open(rdi_bw_filename) as f:
        for line in f:
            # split() with no argument already discards surrounding whitespace
            for token in line.split():
                if token.startswith(MADA_LATIN_TAG):
                    sys.stderr.write('Warning: found Latin word: ' + token + '. skipping word.\n')
                    continue
                word_str = REGEX_DIACS.sub('', token)
                word_diac_str = token
                if not word_str or not word_diac_str:
                    sys.stderr.write('Warning: empty word_str ' + word_str + ' or word_diac_str ' + word_diac_str +
                                     '. skipping word.\n')
                    continue
                g_word.write(word_str + '\n')
                g_word_diac.write(word_diac_str + '\n')
            # blank line marks the end of the input line in both outputs
            g_word.write('\n')
            g_word_diac.write('\n')

    print('written words to file: %s' % output_word_filename)
    print('written words diac to file: %s' % output_word_diac_filename)
def extract_data(rdi_bw_filename, output_word_filename,
                 output_word_diac_filename):
    """
    Extract data from an RDI file

    Each whitespace-separated token of the input is written on its own line,
    once with everything matched by REGEX_DIACS removed and once unchanged.
    Tokens starting with MADA_LATIN_TAG, and tokens that are empty after
    stripping, are skipped with a warning on stderr. A blank line is written
    to both outputs after every input line.

    :param rdi_bw_filename: file containing raw Arabic text, preprocessed by MADA preprocessor (keeping diacritics)
    :param output_word_filename: file to write words without diacritics
    :param output_word_diac_filename: file to write words with diacritics
    :return: None
    """

    # Single-argument print(...) emits the same text under Python 2 and 3.
    print('extracting data from: %s' % rdi_bw_filename)
    # Context managers guarantee that both outputs are flushed and closed even
    # if reading or writing fails part-way. The outputs are still opened
    # before the input, as before.
    with open(output_word_filename, 'w') as g_word, \
            open(output_word_diac_filename, 'w') as g_word_diac, \
            open(rdi_bw_filename) as f:
        for line in f:
            # split() with no argument already discards surrounding whitespace
            for token in line.split():
                if token.startswith(MADA_LATIN_TAG):
                    sys.stderr.write('Warning: found Latin word: ' + token +
                                     '. skipping word.\n')
                    continue
                word_str = REGEX_DIACS.sub('', token)
                word_diac_str = token
                if not word_str or not word_diac_str:
                    sys.stderr.write('Warning: empty word_str ' + word_str + ' or word_diac_str ' + word_diac_str +
                                     '. skipping word.\n')
                    continue
                g_word.write(word_str + '\n')
                g_word_diac.write(word_diac_str + '\n')
            # blank line marks the end of the input line in both outputs
            g_word.write('\n')
            g_word_diac.write('\n')

    print('written words to file: %s' % output_word_filename)
    print('written words diac to file: %s' % output_word_diac_filename)