def remove_nikud(text):
    """Return *text* with every nikud character removed.

    A character is dropped when ``CharIdsSingleton.get_instance().is_nikud``
    reports it as nikud; all other characters are kept in their original
    order.

    Args:
        text: The string (iterable of characters) to strip.

    Returns:
        A new string holding only the characters for which ``is_nikud``
        is falsy. An empty *text* yields an empty string.
    """
    cid = CharIdsSingleton.get_instance()
    # Join once instead of growing a string with += inside the loop.
    return "".join(c for c in text if not cid.is_nikud(c))
def get_prediction_for_text(text, config_path="C:/nikud/model/resources/config.xlsx"):
    """Run the model over *text* and return the decoded prediction string.

    The text is split into model inputs by ``TextProcessor.prepare_text``;
    each input is passed to ``predict`` and the resulting labels are decoded
    back to text with ``ModelInputToSentence.input_and_labels_to_sentence``.
    The decoded pieces are concatenated in order.

    Args:
        text: The raw text to predict for.
        config_path: Path handed to ``CharIdsSingleton``. Defaults to the
            location that was previously hard-coded in this function.

    Returns:
        The concatenation of the decoded pieces; an empty string when
        ``prepare_text`` yields no inputs.
    """
    # TODO: think where to put this -- the default is still a
    # machine-specific absolute path.
    CharIdsSingleton(config_path)
    text_processor = TextProcessor()
    model2text = ModelInputToSentence()

    # The labels returned by prepare_text are not needed for prediction.
    all_inputs, _ = text_processor.prepare_text(text)
    return "".join(
        model2text.input_and_labels_to_sentence(inputs, predict(inputs))
        for inputs in all_inputs
    )
def setUpClass(cls):
    """Construct ``CharIdsSingleton`` from the ``letters.xlsx`` resource."""
    # Relative path: resolved against the current working directory.
    CharIdsSingleton("../resources/letters.xlsx")
def __init__(self) -> None:
    """Keep a reference to the shared ``CharIdsSingleton`` instance."""
    self.cid = CharIdsSingleton.get_instance()
def setUpClass(cls):
    """Construct ``CharIdsSingleton`` from the test char-ids workbook."""
    # Relative path: resolved against the current working directory.
    CharIdsSingleton("../resources/test_charIdsConfig.xlsx")
def setUp(self) -> None:
    """Fetch the shared ``CharIdsSingleton`` instance into ``self.cid``."""
    self.cid = CharIdsSingleton.get_instance()
def setUpClass(cls):
    """Construct ``CharIdsSingleton`` from the ``letters.xlsx`` resource."""
    # Forward slash instead of the original "\l": that sequence is an invalid
    # string escape (a SyntaxWarning on recent Python versions) and a
    # backslash is not a path separator outside Windows. Forward slashes
    # work on every platform.
    config_path = "../resources/letters.xlsx"
    CharIdsSingleton(config_path)
def __init__(self) -> None:
    """Set up the collaborators this object works with.

    ``self.cid`` is the shared ``CharIdsSingleton`` instance and ``self.cp``
    is a freshly created ``ChunkToInputLabel``.
    """
    # The singleton is fetched first, exactly as before, in case
    # ChunkToInputLabel relies on it already being available.
    self.cid = CharIdsSingleton.get_instance()
    self.cp = ChunkToInputLabel()
def get_text(self, path):
    """Return the text content of the file at *path*.

    Only ``.docx`` files are supported; the extension check is
    case-insensitive.

    Args:
        path: Path (string) of the file to read.

    Returns:
        Whatever ``self.get_text_from_docx_file(path)`` returns.

    Raises:
        ValueError: If the extension of *path* is not ``.docx``.
    """
    ext = os.path.splitext(path)[-1].lower()
    if ext != ".docx":
        # Message fixed: the original lacked the separating space and the
        # closing parenthesis.
        raise ValueError(
            "This file's extension is not supported: " + path
            + " (supported extensions: .docx)"
        )
    return self.get_text_from_docx_file(path)


def collect_from_file(self, path):
    """Write the words of *path* accepted by ``isArabicInHebrewLetters``.

    The output goes to ``<stem>_arabic.txt`` inside the hard-coded
    transcripts directory, where ``<stem>`` is the file name of *path*
    without its extension. Each accepted word is written preceded by a
    single space. An existing output file is overwritten.

    Args:
        path: Path (string) of the source document.

    Raises:
        ValueError: Propagated from ``get_text`` for unsupported extensions.
    """
    file_name = Path(path).stem
    new_file_path = "C:/nikud/model/data/transcipts/" + file_name + "_arabic.txt"
    # The context manager closes the output file even when get_text raises;
    # the original leaked the handle in that case. The file is still opened
    # before the text is read, as in the original.
    with open(new_file_path, "w", encoding="utf8") as new_file:
        text = self.get_text(path)
        for word in text.split():
            if self.isArabicInHebrewLetters(word):
                new_file.write(" " + word)


def collect_from_dir(self, dir_path):
    """Run ``collect_from_file`` on every entry of the directory *dir_path*.

    Entries are not filtered, so an entry with an unsupported extension
    makes ``collect_from_file`` raise ``ValueError`` and stops the scan.

    Args:
        dir_path: Directory path (string) whose entries are processed.
    """
    # os.listdir on a str path already returns str names, so the former
    # fsencode/fsdecode round trip is unnecessary.
    for filename in os.listdir(dir_path):
        self.collect_from_file(dir_path + "/" + filename)


# Script section: build the char-id singleton, then extract from the source
# directory.
# NOTE(review): the functions above take ``self``, so they are presumably
# methods of ExtractArabicFromFiles, whose class header is outside this view.
path = "../resources/config.xlsx"
CharIdsSingleton(path)
abuGal = ExtractArabicFromFiles()
abuGal.collect_from_dir("sources/חיאת עילתנא/docx")
def setUp(self) -> None:
    """Prepare the objects used by each test.

    ``self.cid`` is the shared ``CharIdsSingleton`` instance and
    ``self.sentence2model`` is a new ``SentenceToModelInput``.
    """
    # The singleton is fetched first, exactly as before, in case
    # SentenceToModelInput relies on it already being available.
    self.cid = CharIdsSingleton.get_instance()
    self.sentence2model = SentenceToModelInput()
def __init__(self, str):
    """Store the given string and compute its letter indices.

    Args:
        str: The text to wrap (presumably a single word, judging by the
            helper names -- confirm against callers). The name shadows the
            builtin ``str`` inside this method; it is kept because it is
            part of the caller-visible signature.
    """
    self.str = str
    self.cid = CharIdsSingleton.get_instance()
    # Validation through self.verify_word(str) is currently switched off.
    # find_letter_indices_in_word takes no arguments, so it presumably reads
    # the attributes assigned above; keep this call last.
    self.letter_indices = self.find_letter_indices_in_word()