Esempio n. 1
0
def remove_nikud(text):
    """Return ``text`` without the characters classified as nikud.

    Each character of ``text`` is checked with ``is_nikud`` on the instance
    returned by ``CharIdsSingleton.get_instance()``; characters for which the
    check is true are dropped and the rest are kept in their original order.

    Args:
        text: the string (or iterable of single-character strings) to filter.

    Returns:
        A new string containing only the characters that are not nikud.
        An empty input yields an empty string.
    """
    char_ids = CharIdsSingleton.get_instance()
    # str.join builds the result in one pass instead of repeated "+=" copies.
    return "".join(ch for ch in text if not char_ids.is_nikud(ch))
Esempio n. 2
0
def get_prediction_for_text(text, config_path="C:/nikud/model/resources/config.xlsx"):
    """Run the model over ``text`` and return the decoded sentence(s) as one string.

    The text is split into model inputs by ``TextProcessor.prepare_text``;
    each input is passed to ``predict`` and the resulting labels are decoded
    back to text with ``ModelInputToSentence.input_and_labels_to_sentence``.
    The decoded pieces are concatenated in input order.

    Args:
        text: the raw text to process.
        config_path: path handed to ``CharIdsSingleton``; defaults to the
            location that used to be hard-coded here.

    Returns:
        The concatenation of the decoded pieces ("" when there are no inputs).
    """
    # TODO: think where to put this (the singleton is constructed on every call).
    CharIdsSingleton(config_path)

    text_processor = TextProcessor()
    model2text = ModelInputToSentence()

    # The second value returned by prepare_text is not needed for prediction.
    all_inputs, _ = text_processor.prepare_text(text)
    return "".join(
        model2text.input_and_labels_to_sentence(inputs, predict(inputs))
        for inputs in all_inputs
    )
Esempio n. 3
0
 def setUpClass(cls):
     """Construct ``CharIdsSingleton`` from the letters workbook before the tests run."""
     CharIdsSingleton("../resources/letters.xlsx")
Esempio n. 4
0
 def __init__(self):
     """Store the instance returned by ``CharIdsSingleton.get_instance()`` as ``self.cid``."""
     self.cid = CharIdsSingleton.get_instance()
Esempio n. 5
0
 def setUpClass(cls):
     """Construct ``CharIdsSingleton`` from the test char-ids workbook before the tests run."""
     CharIdsSingleton("../resources/test_charIdsConfig.xlsx")
Esempio n. 6
0
 def setUp(self):
     """Fetch the ``CharIdsSingleton`` instance and keep it on ``self.cid`` for the test."""
     self.cid = CharIdsSingleton.get_instance()
Esempio n. 7
0
 def setUpClass(cls):
     config_path = "../resources\letters.xlsx"
     CharIdsSingleton(config_path)
Esempio n. 8
0
 def __init__(self):
     """Set up the two collaborators kept on the instance.

     ``self.cid`` is the object returned by ``CharIdsSingleton.get_instance()``
     and ``self.cp`` is a freshly constructed ``ChunkToInputLabel``.
     """
     self.cid = CharIdsSingleton.get_instance()
     self.cp = ChunkToInputLabel()
Esempio n. 9
0
    def get_text(self, path):
        """Return the text content of the file at ``path``.

        Only ``.docx`` files are supported. The extension is compared
        case-insensitively and extraction is delegated to
        ``get_text_from_docx_file``.

        Args:
            path: path of the file to read, as a string.

        Returns:
            Whatever ``get_text_from_docx_file`` returns for ``path``.

        Raises:
            ValueError: if the file extension is not ``.docx``.
        """
        ext = os.path.splitext(path)[-1].lower()
        if ext != ".docx":
            # The message used to run the path into the parenthesis and
            # never closed it; keep the path and the hint clearly separated.
            raise ValueError("This file's extension is not supported: " +
                             path + " (supported extensions: .docx)")
        return self.get_text_from_docx_file(path)

    def collect_from_file(self, path):
        """Write the words of ``path`` accepted by ``isArabicInHebrewLetters`` to a text file.

        The output file is named after the stem of ``path`` with an
        ``_arabic.txt`` suffix and is written (UTF-8, overwriting any existing
        file) into the fixed transcripts directory below. Every accepted word
        is written preceded by a single space.

        Args:
            path: path of the source document; it must be readable by
                ``get_text``.

        Raises:
            ValueError: propagated from ``get_text`` for unsupported files.
        """
        # Read the source first so that a file get_text rejects does not
        # create or truncate an output file.
        text = self.get_text(path)

        file_name = Path(path).stem
        new_file_path = "C:/nikud/model/data/transcipts/" + file_name + "_arabic.txt"
        # The context manager closes the handle even if a write or the word
        # check raises part-way through.
        with open(new_file_path, "w", encoding="utf8") as new_file:
            for word in text.split():
                if self.isArabicInHebrewLetters(word):
                    new_file.write(" " + word)

    def collect_from_dir(self, dir_path):
        """Call ``collect_from_file`` for every entry listed in ``dir_path``.

        Entries are taken from ``os.listdir`` as-is (no filtering, no
        recursion) and each one is passed on as ``dir_path + "/" + name``.
        """
        encoded_dir = os.fsencode(dir_path)
        entry_names = map(os.fsdecode, os.listdir(encoded_dir))
        for entry_name in entry_names:
            self.collect_from_file(dir_path + "/" + entry_name)


# Script section: runs at import time.
# Construct CharIdsSingleton with the config workbook first (presumably so
# that later get_instance() calls succeed - confirm against CharIdsSingleton).
path = "../resources/config.xlsx"
CharIdsSingleton(path)
# Process every file in the given source directory; collect_from_dir hands
# each entry to collect_from_file, which writes one "_arabic.txt" file per input.
abuGal = ExtractArabicFromFiles()
abuGal.collect_from_dir("sources/חיאת עילתנא/docx")
Esempio n. 10
0
 def setUp(self):
     """Prepare the per-test fixtures.

     ``self.cid`` is the object returned by ``CharIdsSingleton.get_instance()``
     and ``self.sentence2model`` is a freshly constructed ``SentenceToModelInput``.
     """
     self.cid = CharIdsSingleton.get_instance()
     self.sentence2model = SentenceToModelInput()
Esempio n. 11
0
 def __init__(self, str):
     """Store the given value and compute its letter indices.

     Args:
         str: the value kept on ``self.str`` (presumably the word text -
             confirm against callers).
     """
     # NOTE(review): the parameter name shadows the builtin ``str`` inside
     # this method; it is part of the signature, so it is left unchanged here.
     self.str = str
     self.cid = CharIdsSingleton.get_instance()
     ##self.verify_word(str)
     # Called last; presumably it reads self.str and self.cid set above - confirm.
     self.letter_indices = self.find_letter_indices_in_word()