def init_GUI(self, parent):
    """Build the action-selection window: a listbox of recent actions plus
    a button that opens the highlighted configuration.

    :param parent: Tk root/Toplevel to draw into
    """
    parent.title(Lang.get('Processing'))
    # style the ttk widgets to match the white window background
    s = ttk.Style(parent)
    # NOTE: ttk styles take 'background', not the classic Tk option 'bg';
    # the original bg='white' keyword was silently ignored by ttk
    s.configure('TLabel', background='white')
    s.configure('TNotebook', background='#ffffff')
    s.configure('TNotebook.Tab', background='#ffffff')
    parent.configure(background='white')

    self.selectedActionList = tk.Listbox(self.parent)
    self.selectedActionList.grid(row=1, rowspan=6, column=0, columnspan=6,
                                 sticky=tk.W + tk.E)
    self.selectedActionList.config(width=self.actionsListWidth)
    self.populateActionsList()

    # keep the widget reference: grid() returns None, so the original
    # chained Button(...).grid(...) stored None instead of the button
    self.openSelectedActionsButton = tk.Button(
        self.parent,
        text=Lang.get('Open selected Configuration'),
        command=self.openSelectedAction,
        **self.getItemStyle())
    self.openSelectedActionsButton.grid(row=7, column=0, sticky=tk.E + tk.W)
def readLang(path, params, Lang1='eng', Lang2='fra', reverse=False):
    """Read tab-separated training data and build one Lang per language.

    :param path: UTF-8 text file, one 'sentence<TAB>sentence' pair per line
    :param params: options dict; each Lang receives its own copy
    :param Lang1: source-language name (default English)
    :param Lang2: target-language name (default French)
    :param reverse: if True (e.g. French -> English), swap each pair and
        swap which language is input vs. output
    :return: (input_lang, output_lang, pairs)
    """
    print('reading training data...')
    # 'with' closes the handle (the original leaked the open file object)
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # shuffle the data by visiting the lines in a random order
    dataLen = len(lines)
    randomSortSeq = random.sample(range(dataLen), dataLen)
    pairs = [[normalizeString(s) for s in lines[index].split('\t')]
             for index in randomSortSeq]
    if reverse:
        print('reversing language...')
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(Lang2, params.copy())
        output_lang = Lang(Lang1, params.copy())
    else:
        input_lang = Lang(Lang1, params.copy())
        output_lang = Lang(Lang2, params.copy())
    return input_lang, output_lang, pairs
def handleExecutionError(self, exception, taskQueue, debuggingResultQueue,
                         errorQueue):
    """
    Inquires whether operators would like to continue or to skip all remaining files

    :param exception: error raised by a worker; a CalledProcessError carries
        the program's output in its 'output' attribute
    :param taskQueue: pending tasks; drained here if the operator cancels
    :param debuggingResultQueue: (not used in this handler)
    :param errorQueue: (not used in this handler)
    :return:
    """
    spssExecutionError = ''
    # CalledProcessError features the program's output in 'output'
    if hasattr(exception, 'output'):
        spssExecutionError = str(exception.output)
    # NOTE(review): the two concatenated sentences form one single Lang key;
    # confirm the translation table actually contains the combined string
    errMsg = Lang.get('The following error occured, please check the log for details' + \
        '\n Would you like to continue with the next file (Yes) or cancel processing (No) for all remaining files?')
    # surface the raw error on the console for the log
    print(spssExecutionError + '\n' + str(exception))
    wantsToContinue = tk.messagebox.askyesno(
        Lang.get("Error occured, execution halted"), errMsg)
    if not (wantsToContinue):
        # clear the queue
        while not taskQueue.empty():
            taskQueue.get_nowait(
            )  # as docs say: Remove and return an item from the queue.
        print(
            Lang.get(
                'Execution of remaining files has been aborted as by user\'s choice.'
            ))
def __getitem__(self, i):
    """Return the i-th (feature, caption) pair converted to training tensors."""
    vocab = Lang(name="EnglishCaptions")
    feature_caption = self.getFeatureVocabPair(self.path, i)
    # register the caption's words before tensorizing the pair
    vocab.addSentence(feature_caption[1])
    return tensorsFromPair(vocab, feature_caption)
def populateTaskQueue(self):
    """Reset the execution log, apply the simulate-processing file limits,
    optionally prepare the accumulation file, and enqueue one task per
    input file (input path, output path, JSON-serialized config)."""
    # populate execution log as well
    self.executionLog = []
    self.executionLog.append(
        Lang.get('Execution log on {}').format(datetime.datetime.now()))
    self.executionLog.append(
        Lang.get('Configuration dump: ') + os.linesep + self.config.toJSON())
    self.executionLog.append(os.linesep + Lang.get('Execution log:'))
    # keep track of files output by SPSS
    # used to spot aberrations in filesize
    self.outputFilePaths = []
    self.inputFilePaths = []
    # for debugging, only very first file will be processed
    if (self.config.opt['simulateProcessing']):
        # if we do not accumulate, choose only one
        if not (self.config.opt['accumulateData']):
            self.config.opt['inputFiles'] = self.config.opt['inputFiles'][
                0:1]
        # otherwise (we are accumulating), we need at least two
        else:
            self.config.opt['inputFiles'] = self.config.opt['inputFiles'][
                0:2]
    # we need copy here: we may reload the configuration file and do not want to
    # remove files permanently from the configuration after having processed them once
    inputFilesToUse = self.config.opt['inputFiles'][:]
    self.totalFileNum = 0
    # if we accumulate data, move and rename the very last entry, but do not touch the others
    if (self.config.opt['accumulateData']):
        self.moveRenameAccumulationFile(inputFilesToUse)
    # start of processing; trackProgress/runProcessing read this for timing
    self.start_time = time.time()
    # fill up queue of tasks/files
    for filePath in inputFilesToUse:
        self.defineDefaultPlaceholders(filePath)
        outputFilePath = self.getOutputFilePath(filePath)
        # attention: pickling in Python is seriously broken. passing self.config will mess up the configuration
        # (there are literally values missing)
        # parsing it to JSON and converting back works just fine.
        configStr = self.config.toJSON()
        self.queue.put([filePath, outputFilePath, configStr])
        # reconstruct file output path
        # used to spot aberrations in file size after processing
        self.outputFilePaths.append(outputFilePath)
        self.inputFilePaths.append(filePath)
        self.totalFileNum += 1
def getOutputFilePath(self, oldFilePath):
    """
    Construct the output file path for *oldFilePath* using
    'inputRegexPattern' and 'outputFilePattern'; also populates
    self.inputFileNameMatch.

    Placeholders of the form <name> in the output pattern are replaced
    either by a predefined value from config 'placeholders' or by the
    like-named regex group captured from the input file name.

    :param oldFilePath: full path of the input file
    :return: output directory joined with the substituted file name
    """
    file = os.path.basename(oldFilePath)
    # extract information from input file path using regex
    m = re.match(self.config.opt['inputRegexPattern'], file)
    self.inputFileNameMatch = m
    if m is None:
        msg = Lang.get(
            "Could not match input filename with pattern. Please check defined and used placeholders. Affected file: "
        ) + oldFilePath
        self.err(msg)
    # find all spots in the output file name to replace
    # they have the form <name1>, <name2> ...
    namedGroupPattern = r'<([\w]*)>'
    placeholders = re.findall(namedGroupPattern,
                              self.config.opt['outputFilePattern'])
    outputFileName = self.config.opt['outputFilePattern']
    # replace the placeholders
    for placeholder in placeholders:
        try:
            # check whether it is a predefined placeholder
            if (placeholder in self.config.opt['placeholders']):
                # predefined placeholders may NOT be overwritten: it is a
                # serious error if the input regex ALSO captures a group of
                # this name. (The original raised unconditionally here,
                # which made every predefined placeholder unusable.)
                if placeholder in m.groupdict():
                    msg = Lang.get(
                        'Predefined placeholders must not be redefined: attempted to define placeholder, which already exists: '
                    )
                    raise RuntimeError(msg + ' ' + placeholder)
                replacement = self.config.opt['placeholders'][placeholder]
            else:
                replacement = m.group(placeholder)
        except IndexError:
            BatchProcessor.err(
                Lang.get(
                    "Placeholder in ouput file pattern refers to a placeholder which has not been defined"
                ))
        # perhaps the placeholder is not in use after all (defined but not populated)
        if replacement is not None:
            outputFileName = outputFileName.replace(
                '<' + placeholder + '>', replacement)
    return self.config.opt['outputDir'] + '/' + outputFileName
def runPreprocessingChecks(self):
    """Validate preconditions before processing starts.

    Reports the problem to the operator and returns False when no input
    files were selected; returns True otherwise.
    """
    inputFiles = self.config.opt['inputFiles']
    if not inputFiles:
        self.err(Lang.get("You did not select any files"))
        return False
    # @todo: check whether REGEX matches all files
    return True
def loadRawCommandsFromFile(cls, config):
    """
    Returns list of commands from file (which must be UTF-8 encoded;
    an optional BOM is stripped).

    :param config: key 'spssFile' will be used
    :return: list of merged commands with comments removed
    """
    # binary mode: decoding (including BOM handling) is done explicitly below.
    # NOTE: the original checked 'f == None' inside the with-block, but
    # open() raises on failure and never yields None — dead branch removed.
    with io.open(config.opt['spssFile'], "r+b") as f:
        spssCommands = f.read()
    # remove BOM byte, should it be there
    spssCommands = spssCommands.decode("utf-8-sig")
    # by definition, commands end with "." and a newline
    usesWindowsNewlines = "\r\n" in spssCommands
    if usesWindowsNewlines:
        spssCommands = spssCommands.split("\r\n")
    else:
        spssCommands = spssCommands.split("\n")
    spssCommands = BatchProcessor.removeCommentsFromCommands(spssCommands)
    return BatchProcessor.mergeLinesIntoCommands(spssCommands)
def read_languages(lang1, lang2, reverse=False):
    """Read 'data/<lang1>-<lang2>.txt' and build a Lang per language.

    :param reverse: if True, swap each pair and the input/output langs
    :return: (input_lang, output_lang, pairs)
    """
    print("reading files...")
    # 'with' closes the handle (the original leaked the open file object)
    with open("data/{}-{}.txt".format(lang1, lang2), encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[normalize_string(l) for l in line.split('\t')]
             for line in lines]
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs
def read_langs(lang1, lang2, dataset, mode):
    """Read 'dataset/<dataset>_<mode>.en-it.txt' into normalized pairs.

    :return: (input_lang, output_lang, pairs) when both lang names are None,
        otherwise just the pairs
    """
    # Reading the file and splitting into lines
    print("Reading lines...")
    # 'with' closes the handle (the original leaked the open file object).
    # NOTE(review): no encoding is given, so the platform default applies —
    # the sibling readers open UTF-8 explicitly; confirm and align if needed
    with open('dataset/%s_%s.en-it.txt' % (dataset, mode)) as f:
        lines = f.read().strip().split('\n')
    # Splitting every line into sentences pair and normalizing
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    # Returning sentences pairs (and possibly language structures)
    if lang1 is None and lang2 is None:
        input_lang = Lang('en')
        output_lang = Lang('it')
        return input_lang, output_lang, pairs
    else:
        return pairs
def loadFromFile(self, f):
    """Populate self.opt from the JSON configuration in file object *f*.

    Warns the operator when the file was written by a newer program
    version, since settings might then be ignored.
    """
    self.opt = json.load(f)
    # check config file version
    configIsNewer = self.opt['programVersion'] > self.getCurrentVersion()
    if configIsNewer:
        BatchProcessor.err(
            Lang.get(
                "The config file was created using a newer program version. Settings might be ignored and behavior may change. To avoid surprises, please updated the BatchProcessor."
            ))
def runProcessing(self):
    """Run the whole batch: sync GUI to config, enqueue tasks, track
    progress, then either show debugging info (simulation) or verify the
    produced files and report completion.

    :return: False when preprocessing checks fail, otherwise None
    """
    print(Lang.get('Started processing...'))
    self.gui.GUIToConfig()
    if not self.runPreprocessingChecks():
        return False
    self.config.opt['simulateProcessing'] = (
        self.gui.simulateProcessingVar.get() == 1)
    self.populateTaskQueue()
    self.trackProgress()
    # show debugging information upon completion
    if self.config.opt['simulateProcessing']:
        # need try/catch here; depending on error, queue may be empty
        try:
            debuggingInfo = self.debuggingResultQueue.get_nowait()
            self.showDebuggingInformation(debuggingInfo)
        except queue.Empty:
            pass
    else:
        # give SPSS a few seconds to complete last file ...
        time.sleep(3)
        filesToRedo = self.spotOutputFileSizeAberrations()
        if len(filesToRedo) == 0:
            # transfer information from queue to log (i.e. workers => backend)
            self.transferLogQueue()
            # BUGFIX: the elapsed time was a hard-coded 0.0, so the dialog
            # always reported '0.00 seconds'; measure from the start time
            # recorded in populateTaskQueue instead
            totalUsedTime = time.time() - self.start_time
            completedMsg = Lang.get(
                'Processing for {} files completed in {:.2f} seconds'
            ).format(len(self.config.opt['inputFiles']), totalUsedTime)
            self.executionLog.append(completedMsg)
            tk.messagebox.showinfo(Lang.get('Processing completed'),
                                   completedMsg)
        else:
            tk.messagebox.showinfo(
                Lang.get('Incomplete Files detected'),
                Lang.get(
                    'Detected filesize aberration. Reprocessing incomplete files ...'
                ))
            self.redoIncompleteFiles(filesToRedo)
def readLangs(path, lang1, lang2, reverse=False):
    """Read *path* (UTF-8, one 'lang1<TAB>lang2' pair per line) and build a
    Lang instance per language.

    :param reverse: if True, swap each pair and the input/output langs
    :return: (input_lang, output_lang, pairs)
    """
    print("Reading lines...")
    # Read the file and split into lines; 'with' closes the handle
    # (the original leaked the open file object)
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs
def getInputFileOptions(self):
    """Return the keyword options for the 'add input files' open dialog."""
    return {
        'defaultextension': '.txt',
        'filetypes': [('all files', '.*'),
                      ('text files', '*.txt'),
                      ('csv files', '*.csv'),
                      ('sav files', '*.sav'),
                      ('custom pattern', self.inputSearchPattern.get())],
        'title': Lang.get('Add input files'),
        'multiple': True,
        'initialdir': self.conf('defaultInputDir'),
    }
def saveConfig(self):
    """
    prompts the user for a filename and saves the configuration there
    """
    # push current widget values into the configuration before serializing
    self.GUIToConfig()
    f = tk.filedialog.asksaveasfile(mode='w',
                                    filetypes=[('JSON files', '*.json')],
                                    defaultextension='.json')
    if (f):
        f.write(self.backend.config.toJSON())
        f.close()
        tk.messagebox.showinfo(
            Lang.get("Configuration file saved"),
            Lang.get("The configuration file has been successfully saved"))
    else:
        # NOTE(review): 'desetination' is misspelled, but the string doubles
        # as the Lang lookup key — fix it together with the translation table
        self.err(
            Lang.get(
                'You did not select a desetination file or the file could not be saved'
            ))
def saveProcessingLog(self):
    """
    Transfers events from the worker log queue into the main log instance,
    asks the operator for a file name and exports the log there.
    """
    self.backend.transferLogQueue()
    fileName = tk.filedialog.asksaveasfilename(
        initialdir=self.conf('defaultOutDir'),
        title=Lang.get('Select Logfile'),
        filetypes=[("Log files", "*.txt"), ("all files", "*.*")])
    # BUGFIX: asksaveasfilename returns '' (not None) when the dialog is
    # cancelled; the old 'is not None' test then tried to open a file named ''
    if fileName:
        # 'with' closes the handle; the explicit close() inside the
        # with-block was redundant and has been dropped
        with open(fileName, "w") as logFile:
            logFile.write(os.linesep.join(self.backend.executionLog))
        tk.messagebox.showinfo(
            Lang.get("Logfile saved"),
            Lang.get(
                "The Logfile has been saved at the specified destination"
            ))
def openSelectedAction(self):
    """Open the configuration currently highlighted in the recent-actions
    list, closing this window and switching to the batch processor."""
    selection = self.selectedActionList.curselection()
    # exactly one entry must be highlighted
    if len(selection) != 1:
        self.mainWindow.err(Lang.get('You must select exactly one entry.'))
        return
    recentActions = self.mainWindow.state['actions']['recentActions']
    actionPath = recentActions[selection[0]]
    self.parent.destroy()
    self.mainWindow.showBatchProcessor()
    self.mainWindow.gui.loadConfigFromFile(actionPath)
def readLangs(lang1, lang2, reverse=False):
    """Read './NLP Tutorial/data/<lang1>-<lang2>_pairs.txt' into pairs and
    build a Lang instance per language.

    :param reverse: if True, swap each pair and the input/output langs
    :return: (input_lang, output_lang, pairs)
    """
    print("Reading lines...")
    # Read the file and split into lines; 'with' closes the handle
    # (the original leaked the open file object)
    with open('./NLP Tutorial/data/%s-%s_pairs.txt' % (lang1, lang2),
              encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # NOTE(review): normalization is deliberately skipped here (a dropped
    # variant applied normalizeString per field) — confirm the *_pairs.txt
    # data is already normalized
    pairs = [line.split('\t') for line in lines]
    # reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(pair)) for pair in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs
def showDebuggingInformation(self, debuggingInfo):
    """
    Spawn a new window allowing the operator to inspect the parameters and
    the generated code for a single file.

    :param debuggingInfo: dictionary with keys 'placeholders' and 'commands'
    :return:
    """
    window = tk.Toplevel(self.gui.parent)
    window.wm_title(Lang.get("Debugging information"))
    # organize placeholders and resulting source code into notebook panes
    notebook = ttk.Notebook(window)
    panes = [('Placeholders', debuggingInfo['placeholders']),
             ('Sourcecode', debuggingInfo['commands'])]
    for paneTitle, paneContent in panes:
        frame = self.gui.createFrameWithText(notebook, paneContent)
        notebook.add(frame, text=Lang.get(paneTitle))
    notebook.pack(expand=1, fill="both")
    # propagate changes to GUI
    self.gui.parent.update()
def read_langs(en_file, ja_file, n_processes=4):
    """
    Reads corpuses and returns a Lang object for each language and all
    normalized sentence pairs.

    Normalization is fanned out over *n_processes* worker processes.

    :param en_file: path to the English corpus (UTF-8, one line per sentence)
    :param ja_file: path to the Japanese corpus (parallel to en_file)
    :param n_processes: size of the worker pool
    :return: (en Lang, ja Lang, list of pairs)
    """
    # 'with' closes the handles (the original leaked both file objects)
    with open(en_file, encoding="utf8", errors="ignore") as f:
        en_lines = f.read().split("\n")
    with open(ja_file, encoding="utf8", errors="ignore") as f:
        ja_lines = f.read().split("\n")
    interval = len(en_lines) // n_processes
    # chunk boundaries; the last chunk runs to the end of the corpus so the
    # remainder lines are kept (the original silently dropped
    # len(en_lines) % n_processes trailing lines)
    bounds = [(i * interval,
               len(en_lines) if i == n_processes - 1 else (i + 1) * interval)
              for i in range(n_processes)]
    pool = mp.Pool(processes=n_processes)
    try:
        results = [
            pool.apply_async(normalize,
                             args=(en_lines[lo:hi], ja_lines[lo:hi]))
            for lo, hi in bounds
        ]
        pairs = []
        for p in results:
            pairs += p.get()
    finally:
        # the original never closed the pool, leaking worker processes
        pool.close()
        pool.join()
    en = Lang("en")
    ja = Lang("ja")
    return en, ja, pairs
def readLangs(training_set):
    """Read *training_set* (UTF-8, one tab-separated sentence pair per line)
    and build a single English Lang plus the normalized pairs.

    :return: (lang, pairs)
    """
    print("Reading lines...")
    # Read the file and split into lines; 'with' closes the handle
    # (the original leaked the open file object)
    with open('%s' % (training_set), encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    lang = Lang('english')
    return lang, pairs
def loadConfig(self):
    """
    Inquires about configuration file, loads specified file
    :return:
    """
    filePath = tk.filedialog.askopenfilename(
        filetypes=[('JSON files', '*.json')],
        initialdir=self.conf('defaultConfigDir'))
    # askopenfilename yields a falsy value when the dialog is cancelled
    if not filePath:
        self.err(
            Lang.get(
                'You did not select a config file or the file could not be opened'
            ))
        return
    self.loadConfigFromFile(filePath)
def readLangs(train_x, train_y):
    """Read parallel source/target files, join them line-by-line into
    normalized pairs and build a single shared Lang.

    :param train_x: path to source sentences (UTF-8, one per line)
    :param train_y: path to target sentences (parallel to train_x)
    :return: (lang, pairs)
    """
    print("Reading lines...")
    # 'with' closes the handles (the original leaked both file objects)
    with open('%s' % (train_x), encoding='utf-8') as f:
        lines_x = f.read().strip().split('\n')
    with open('%s' % (train_y), encoding='utf-8') as f:
        lines_y = f.read().strip().split('\n')
    # pair source and target line-by-line and normalize; zip also guards
    # against the IndexError the original raised when train_y had more
    # lines than train_x (surplus lines are dropped)
    pairs = [[normalizeString(x), normalizeString(y)]
             for x, y in zip(lines_x, lines_y)]
    lang = Lang('english')
    return lang, pairs
def __init__(self, gui, parent, workerProcess, logQueue, taskQueue,
             debuggingResultQueue, errorQueue):
    """
    Wire up the backend with its worker process and IPC queues.

    TK handles _cannot_ be pickled, so all multiprocessing structures must
    be created _before_ initializing the GUI (i.e. this class).

    :param gui: BatchProcessorGUI instance
    :param parent: TKinter frame instance (to draw into)
    :param workerProcess: process that executes the queued tasks
    :param logQueue: worker -> backend log messages
    :param taskQueue: backend -> worker tasks
    :param debuggingResultQueue: worker -> backend debugging dumps
    :param errorQueue: worker -> backend errors
    """
    self.gui = gui
    self.p = workerProcess
    self.queue = taskQueue
    self.logQueue = logQueue
    self.debuggingResultQueue = debuggingResultQueue
    self.errorQueue = errorQueue
    self.config = Configuration()
    self.executionLog = []
    parent.title(Lang.get("BatchProcessing"))
def trackProgress(self):
    """
    Indicates computation progress using progress bar in main window.
    Relies on time needed for already processed files; performs linear
    extrapolation.
    """
    alreadyProcessedFiles = 0
    # keep updating the progress bar
    # at least one of debugging result queue OR errorQueue is expected to fill as well
    # queue may be empty immediately, as worker may "snatch" task before this function is even called.
    while (not (self.queue.empty()) or
           (self.debuggingResultQueue.empty() and self.errorQueue.empty())):
        # number of files finished since the previous poll iteration
        processedAsOfNow = (self.totalFileNum - self.queue.qsize())
        processedJustNow = processedAsOfNow - alreadyProcessedFiles
        alreadyProcessedFiles = processedAsOfNow
        # advance progressbar
        self.gui.pb.step(processedJustNow / float(self.totalFileNum) * 100.0)
        # elapsed time since populateTaskQueue recorded start_time
        totalUsedTime = (time.time() - self.start_time)
        # update estimated time
        estimate = self.estimateRemainingTime(self.totalFileNum,
                                              alreadyProcessedFiles,
                                              totalUsedTime)
        self.gui.remainingTimeLabel.config(
            text=Lang.get('Remaining time: %.2f seconds ') % estimate)
        # propagate changes to GUI
        self.gui.parent.update()
        # poll twice a second: responsive GUI without busy-waiting
        time.sleep(0.5)
    # when processing is finished, reset progress bar
    # by default, maximum progress bar can reach is 100 - thus, decrease by just that amount.
    self.gui.pb.step(-100.0)
    # propagate changes to GUI
    self.gui.parent.update()
def initGUI(self):
    """Build the main toolbox window: a title label plus the action buttons
    (recent/redo, load/new configuration, help), then adapt widget state."""
    self.mainWindow.configure(background='white')
    self.mainWindow.title("SPSS Toolbox")
    self.centerFrame = tkinter.Frame(self.mainWindow)
    self.centerFrame.configure(padx = 50, pady = 50, height=200, background='white')
    spssToolboxLabel = tk.Label(self.centerFrame, text=Lang.get("SPSS Toolbox"),
                                font = ('Times', 25, 'bold'), **self.getItemStyle())
    spssToolboxLabel.grid(row=0, column=0, sticky=tk.W + tk.E);
    # sub-frame holding the two recent-action buttons side by side
    self.actionsFrame = tkinter.Frame(self.centerFrame)
    self.recentActionsButton = tk.Button(self.actionsFrame, text=Lang.get("Last Actions"),
                                         command=self.selectAmongLastActions, **self.getItemStyle())
    self.recentActionsButton.grid(row=0, column=0, sticky=tk.W + tk.E)
    self.redoLastActionButton = tk.Button(self.actionsFrame, text=Lang.get("Redo Last"),
                                          command=self.redoAction, **self.getItemStyle())
    self.redoLastActionButton.grid(row=0, column=1, sticky=tk.W+ tk.E)
    # assign non-zero weights (1 for both columns) to allow buttons to take up extra space
    self.actionsFrame.grid(row = 5, column = 0, sticky=tk.W+ tk.E)
    self.actionsFrame.grid_columnconfigure(0, weight=1)
    self.actionsFrame.grid_columnconfigure(1, weight=1)
    self.loadConfigurationButton = tk.Button(self.centerFrame, text=Lang.get("Load Configuration"),
                                             command=self.loadConfiguration, **self.getItemStyle())
    self.loadConfigurationButton.grid(row=6, column=0, sticky=tk.W + tk.E)
    self.newConfigurationButton = tk.Button(self.centerFrame, text=Lang.get("New Configuration"),
                                            command=self.spawnNewConfiguration, **self.getItemStyle())
    self.newConfigurationButton.grid(row=7, column=0, sticky=tk.W+ tk.E)
    self.helpButton = tk.Button(self.centerFrame, text=Lang.get("Help"),
                                command=self.showHelp, **self.getItemStyle())
    self.helpButton.grid(row=8, column=0, sticky=tk.W + tk.E)
    # pad all children, then give the title label extra bottom spacing
    self.pad(self.centerFrame, 5)
    spssToolboxLabel.grid(pady = (0, 30))
    self.centerFrame.pack()
    # enable/disable widgets according to the persisted application state
    self.adaptGUIToState()
def PrepareData(file):
    """Build the caption vocabulary from every (feature, caption) pair in
    *file* and return it together with the pairs."""
    pairs = GetfeatureVocabPair(file)
    capLang = Lang(name="EnglishCaptions")
    # feed each caption (index 1 of every pair) into the vocabulary
    for caption in (pair[1] for pair in pairs):
        capLang.addSentence(caption)
    return capLang, pairs
def runSPSSProcessOnFile(cls, inputFilePath, outputFilePath, config, logQueue, debuggingResultQueue, errorQueue): """ process single given file with SPSS template and save to output File returns the time it used up (in seconds) """ #create dedicated TK instance; tk _always_ requires a window, however we just want message Boxes # create and hide main window root = tk.Tk() root.withdraw() start_time = time.time() logMsg = Lang.get("Processing ") + inputFilePath + "..." print(logMsg) logQueue.put(logMsg) # read in commands spssCommands = BatchProcessor.loadRawCommandsFromFile(config) allCommands = [] BatchProcessor.instantiatePlaceholders(config, inputFilePath, outputFilePath) #execute file command by command for command in spssCommands: #ignore encoding line if (command.find('Encoding') != -1): continue command = BatchProcessor.applyPlaceholders(command, config) allCommands.append(command) print(Lang.get("Executing: "), command) # redirect output # redirecting here will also capture SPSS's errors f = io.StringIO() outDir = config.opt['defaultCaptureOutputOutDir'] if outDir != 'none': redirect_stdout(f) #try: pointPlusNewline = '.' + os.linesep debuggingResultQueue.put({ 'placeholders': config.ObjToJSON(config.opt['placeholders']), 'commands': pointPlusNewline.join(allCommands) }) if (not (config.opt['simulateProcessing'])): BatchProcessor.executor.execute(allCommands) #except subprocess.CalledProcessError as e: #halt processing #errorQueue.put(e) #logQueue.put(Lang.get('Error occurred; execution incomplete')) #cls.saveOutputToFile(config, f) #raise cls.saveOutputToFile(config, f) cls.saveCommandsToSyntaxFile(config, allCommands) usedTime = (time.time() - start_time) logQueue.put( Lang.get('Processing finished in {:.2f}s').format(usedTime)) return usedTime
def err(errMsg):
    """
    Shows given error message in Messagebox

    :param errMsg: text presented to the operator in a modal error dialog
    """
    tk.messagebox.showerror(Lang.get("Error"), errMsg)
print("WARNNING!! PDTB_explicit_data length is zero, please check") #data_explicit_training, maxlength_ex = generate_standard_data([parsed_explicit_data, PDTB_explicit_data], label_type) data_implicit_training, maxlength_im_training = generate_standard_data( [PDTB_implicit_data_training], label_type) data_implicit_testing, maxlength_im_testing = generate_standard_data( [PDTB_implicit_data_testing], label_type) data = data_implicit_training #data_explicit_training + data_implicit_training maxlength = maxlength_im_training #maxlength_ex if maxlength_ex > maxlength_im_training else maxlength_im_training data_test = data_implicit_testing maxlength_test = maxlength_im_testing #data = data[0:1000] #data_test = data_test[0:1000] print("training data size:", len(data)) print("testing data size:", len(data_test)) lang = Lang('corpus') label = Label("label") for num, one_data in enumerate(data): lang.index_words(one_data[0]) lang.index_words(one_data[1]) label.index_labels(one_data[2], label_type=label_type, label_2_pridict=label_2_pridict) if label_type.lower() == "conn": label.record_conn_sense( one_data[2], one_data[3] ) #one_data[2] is always label, if label_type != conn, one_data[2] is sense, one_data[3] is conn else: label.record_conn_sense( one_data[3], one_data[4]