def init_GUI(self, parent):
        """
        Builds the window that lists the available actions.

        Sets the window title, switches the ttk styles and the window itself to
        a white background, creates the listbox of actions and the button that
        opens the selected entry.

        :param parent: Tk window whose title and background are configured;
                       the widgets themselves are attached to self.parent
        """
        parent.title(Lang.get('Processing'))

        style = ttk.Style(parent)
        # ttk style options use the long name 'background'; the classic-tk
        # shorthand 'bg' is not a ttk option and had no visible effect
        style.configure('TLabel', background='white')
        style.configure('TNotebook', background='#ffffff')
        style.configure('TNotebook.Tab', background='#ffffff')
        parent.configure(background='white')

        self.selectedActionList = tk.Listbox(self.parent)
        self.selectedActionList.grid(row=1,
                                     rowspan=6,
                                     column=0,
                                     columnspan=6,
                                     sticky=tk.W + tk.E)
        self.selectedActionList.config(width=self.actionsListWidth)
        self.populateActionsList()

        self.openSelectedActionsButton = tk.Button(
            self.parent,
            text=Lang.get('Open selected Configuration'),
            command=self.openSelectedAction,
            **self.getItemStyle())
        # grid() returns None; chaining it onto the constructor stored None in
        # the attribute instead of the button
        self.openSelectedActionsButton.grid(row=7,
                                            column=0,
                                            sticky=tk.E + tk.W)
Exemple #2
0
def readLang(path, params, Lang1='eng', Lang2='fra', reverse=False):
    """
    Read the training data and create the two vocabulary objects.

    Every line of the file is split on tabs and each part is passed through
    normalizeString. The resulting pairs are returned in random order.

    params:
        path: UTF-8 encoded text file with one tab-separated pair per line
        params: settings handed (as a copy) to each Lang constructor
        Lang1: name of the language in the first column
        Lang2: name of the language in the second column
        reverse: set to True for the Lang2 --> Lang1 direction (e.g. French
                 --> English); every pair is reversed and the roles of the
                 two Lang objects are swapped
    returns:
        tuple (input_lang, output_lang, pairs)
    """

    print('reading training data...')
    # the with-block closes the file; the original left the handle open
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    dataLen = len(lines)
    randomSortSeq = random.sample(range(dataLen), dataLen)

    # shuffle the data by visiting the lines in the sampled order
    pairs = [[normalizeString(s) for s in lines[index].split('\t')]
             for index in randomSortSeq]

    # for the Lang2 --> Lang1 direction the pairs have to be reversed
    if reverse:
        print('reversing language...')
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(Lang2, params.copy())
        output_lang = Lang(Lang1, params.copy())
    else:
        input_lang = Lang(Lang1, params.copy())
        output_lang = Lang(Lang2, params.copy())

    return input_lang, output_lang, pairs
Exemple #3
0
    def handleExecutionError(self, exception, taskQueue, debuggingResultQueue,
                             errorQueue):
        """
        Inquires whether operators would like to continue or to skip all remaining files
        :param exception: the error that occurred; if it has an 'output'
                          attribute (as CalledProcessError does), that output
                          is printed together with the exception
        :param taskQueue: queue of pending files; emptied if the operator
                          chooses to cancel
        :param debuggingResultQueue: not used by this method
        :param errorQueue: not used by this method
        :return: None
        """
        # local import: only the Empty exception is needed, for the drain loop
        import queue

        spssExecutionError = ''
        # CalledProcessError features the program's output in 'output'
        if hasattr(exception, 'output'):
            spssExecutionError = str(exception.output)
        errMsg = Lang.get(
            'The following error occured, please check the log for details'
            '\n Would you like to continue with the next file (Yes) or cancel processing (No) for all remaining files?'
        )
        print(spssExecutionError + '\n' + str(exception))

        wantsToContinue = tk.messagebox.askyesno(
            Lang.get("Error occured, execution halted"), errMsg)
        if not wantsToContinue:
            # clear the queue
            while not taskQueue.empty():
                try:
                    # as docs say: Remove and return an item from the queue.
                    taskQueue.get_nowait()
                except queue.Empty:
                    # another consumer took the item between empty() and
                    # get_nowait(); re-check instead of crashing
                    continue
            print(
                Lang.get(
                    'Execution of remaining files has been aborted as by user\'s choice.'
                ))
    def __getitem__(self, i):
        """
        Returns the tensor pair for sample i.

        A fresh "EnglishCaptions" Lang is created on every call and is fed
        only pair[1] of the requested sample before the pair is converted
        with tensorsFromPair.
        """
        vocabulary = Lang(name="EnglishCaptions")
        sample = self.getFeatureVocabPair(self.path, i)
        vocabulary.addSentence(sample[1])
        return tensorsFromPair(vocabulary, sample)
    def populateTaskQueue(self):
        """
        Resets the execution log and the bookkeeping lists, then queues one
        task per input file.

        Each task is a list [input path, output path, configuration as JSON].
        Also sets self.start_time and counts the queued files in
        self.totalFileNum.
        """
        # the execution log starts with a header and a dump of the configuration
        self.executionLog = []
        self.executionLog.extend([
            Lang.get('Execution log on {}').format(datetime.datetime.now()),
            Lang.get('Configuration dump: ') + os.linesep +
            self.config.toJSON(),
            os.linesep + Lang.get('Execution log:'),
        ])

        # paths of the files read and written by SPSS; used afterwards to spot
        # aberrations in file size
        self.outputFilePaths = []
        self.inputFilePaths = []

        # when simulating (debugging), only the first file is processed - or
        # the first two when accumulating, which needs at least two files
        if self.config.opt['simulateProcessing']:
            numFilesToKeep = 2 if self.config.opt['accumulateData'] else 1
            self.config.opt['inputFiles'] = self.config.opt['inputFiles'][
                0:numFilesToKeep]

        # work on a copy: the configuration may be reloaded later and files
        # must not vanish from it just because they were processed once
        inputFilesToUse = self.config.opt['inputFiles'][:]
        self.totalFileNum = 0

        # when accumulating, the very last entry is moved and renamed; the
        # other entries are left untouched
        if self.config.opt['accumulateData']:
            self.moveRenameAccumulationFile(inputFilesToUse)

        self.start_time = time.time()
        for position, filePath in enumerate(inputFilesToUse, start=1):
            self.defineDefaultPlaceholders(filePath)
            outputFilePath = self.getOutputFilePath(filePath)
            # the configuration travels as JSON: pickling self.config directly
            # loses values, while a JSON round trip works fine
            task = [filePath, outputFilePath, self.config.toJSON()]
            self.queue.put(task)

            self.outputFilePaths.append(outputFilePath)
            self.inputFilePaths.append(filePath)

            self.totalFileNum = position
    def getOutputFilePath(self, oldFilePath):
        """
        constructs output file path from given path and inputRegexPattern; also populates inputFileNameMatch

        Placeholders of the form <name> in the configured output file pattern
        are replaced either by the predefined value in
        config.opt['placeholders'] or, failing that, by the named group of the
        same name captured from the input file name. A placeholder without a
        value (None) is left in the file name unchanged.

        :param oldFilePath: path of the input file
        :return: config.opt['outputDir'] + '/' + resulting file name
        """
        file = os.path.basename(oldFilePath)
        # extract information from input file path using regex
        m = re.match(self.config.opt['inputRegexPattern'], file)
        self.inputFileNameMatch = m

        if m is None:
            msg = Lang.get(
                "Could not match input filename with pattern. Please check defined and used placeholders. Affected file: "
            ) + oldFilePath
            self.err(msg)

        # find all spots in the output file name to replace
        # they have the form <name1>, <name2> ...
        # (raw string: '\w' in a plain literal is an invalid escape sequence)
        namedGroupPattern = r'<([\w]*)>'
        outputFileName = self.config.opt['outputFilePattern']
        placeholders = re.findall(namedGroupPattern, outputFileName)
        predefined = self.config.opt['placeholders']

        for placeholder in placeholders:
            # reset for every placeholder: previously a failed lookup reused
            # the value of the preceding placeholder (or hit an unbound name)
            replacement = None
            if placeholder in predefined:
                # predefined placeholders take precedence over regex groups
                replacement = predefined[placeholder]
            elif m is not None:
                # without a match there are no groups to query; that problem
                # has already been reported above
                try:
                    replacement = m.group(placeholder)
                except IndexError:
                    # the input pattern has no group of that name
                    BatchProcessor.err(
                        Lang.get(
                            "Placeholder in ouput file pattern refers to a placeholder which has not been defined"
                        ))

            # perhaps the placeholder is not in use after all (defined but not populated)
            if replacement is not None:
                outputFileName = outputFileName.replace(
                    '<' + placeholder + '>', replacement)
        return self.config.opt['outputDir'] + '/' + outputFileName
    def runPreprocessingChecks(self):
        """
        Verifies that at least one input file is configured.

        :return: True if config.opt['inputFiles'] is non-empty; otherwise an
                 error message is shown and False is returned
        """
        #@todo:check whether REGEX matches all files
        hasInputFiles = len(self.config.opt['inputFiles']) != 0
        if not hasInputFiles:
            self.err(Lang.get("You did not select any files"))
        return hasInputFiles
    def loadRawCommandsFromFile(cls, config):
        """
        Returns list of commands from file (which must be UTF-8 encoded)
        :param config: key 'spssFile' will be used
        :return: list of commands
        :raises OSError: if the file cannot be opened
        :raises UnicodeDecodeError: if the file is not valid UTF-8
        """
        # read-only binary mode suffices; "r+b" additionally demanded write
        # access and failed on read-only files. io.open raises on failure and
        # never yields None, so no separate handle check is needed.
        with io.open(config.opt['spssFile'], "rb") as f:
            rawContent = f.read()

        # remove BOM byte, should it be there
        spssCommands = rawContent.decode("utf-8-sig")

        # by definition, commands end with "." and a newline; split on
        # whichever newline convention the file uses
        lineSeparator = "\r\n" if "\r\n" in spssCommands else "\n"
        lines = spssCommands.split(lineSeparator)

        lines = BatchProcessor.removeCommentsFromCommands(lines)
        return BatchProcessor.mergeLinesIntoCommands(lines)
Exemple #9
0
def read_languages(lang1, lang2, reverse=False):
    """
    Read the sentence pairs for a language pair and create two Lang objects.

    The pairs are read from "data/<lang1>-<lang2>.txt" (UTF-8, one
    tab-separated pair per line); every part is passed through
    normalize_string.

    :param lang1: name of the language in the first column
    :param lang2: name of the language in the second column
    :param reverse: if True, every pair is reversed and lang2 becomes the
                    input language
    :return: tuple (input_lang, output_lang, pairs)
    """
    print("reading files...")

    # the with-block closes the file; the original left the handle open
    with open("data/{}-{}.txt".format(lang1, lang2), encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[normalize_string(l) for l in line.split('\t')] for line in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
Exemple #10
0
def read_langs(lang1, lang2, dataset, mode):
    """
    Read the sentence pairs of 'dataset/<dataset>_<mode>.en-it.txt'.

    Every line is split on tabs and each part is passed through
    normalize_string.

    :param lang1: only checked for None, see return value
    :param lang2: only checked for None, see return value
    :param dataset: first part of the file name
    :param mode: second part of the file name
    :return: (input_lang, output_lang, pairs) with fresh 'en' and 'it' Lang
             objects when both lang1 and lang2 are None; otherwise just pairs
    """
    # Reading the file and splitting into lines
    print("Reading lines...")
    # the with-block closes the file; the original left the handle open
    with open('dataset/%s_%s.en-it.txt' % (dataset, mode)) as f:
        lines = f.read().strip().split('\n')

    # Splitting every line into sentences pair and normalizing
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    # Returning sentences pairs (and possibly language structures)
    if lang1 is None and lang2 is None:
        input_lang = Lang('en')
        output_lang = Lang('it')
        return input_lang, output_lang, pairs
    else:
        return pairs
Exemple #11
0
 def loadFromFile(self, f):
     """
     Replaces self.opt with the JSON content read from f.

     Shows an error message if the 'programVersion' stored in the file is
     greater than the value returned by getCurrentVersion(); the loaded
     options are kept either way.

     :param f: file-like object containing the JSON configuration
     """
     self.opt = json.load(f)
     # check config file version
     createdWithVersion = self.opt['programVersion']
     if not (createdWithVersion > self.getCurrentVersion()):
         return
     warning = Lang.get(
         "The config file was created using a newer program version. Settings might be ignored and behavior may change. To avoid surprises, please updated the BatchProcessor."
     )
     BatchProcessor.err(warning)
    def runProcessing(self):
        """
        Runs one complete processing pass.

        Transfers the GUI state to the configuration, runs the preprocessing
        checks, fills the task queue and tracks progress until it is done.
        In simulation mode the debugging information is shown afterwards;
        otherwise the output files are checked for size aberrations and
        either a completion message is shown or the incomplete files are
        processed again.

        :return: False if the preprocessing checks fail, None otherwise
        """
        print(Lang.get('Started processing...'))
        self.gui.GUIToConfig()

        if not self.runPreprocessingChecks():
            return False

        self.config.opt['simulateProcessing'] = (
            self.gui.simulateProcessingVar.get() == 1)

        self.populateTaskQueue()
        self.trackProgress()
        # populateTaskQueue() sets self.start_time just before queueing the
        # files; previously the duration was never measured and the
        # completion message always reported 0.00 seconds
        totalUsedTime = time.time() - self.start_time

        # show debugging information upon completion
        if self.config.opt['simulateProcessing']:
            # need try/catch here; depending on error, queue may be empty
            try:
                debuggingInfo = self.debuggingResultQueue.get_nowait()
                self.showDebuggingInformation(debuggingInfo)
            except queue.Empty:
                # deliberate best effort: nothing to show in that case
                pass
        else:
            # give SPSS a few seconds to complete last file ...
            time.sleep(3)
            filesToRedo = self.spotOutputFileSizeAberrations()

            if len(filesToRedo) == 0:
                # transfer information from queue to log (i.e. workers => backend)
                self.transferLogQueue()
                completedMsg = Lang.get(
                    'Processing for {} files completed in {:.2f} seconds'
                ).format(len(self.config.opt['inputFiles']), totalUsedTime)
                self.executionLog.append(completedMsg)
                tk.messagebox.showinfo(Lang.get('Processing completed'),
                                       completedMsg)
            else:
                tk.messagebox.showinfo(
                    Lang.get('Incomplete Files detected'),
                    Lang.get(
                        'Detected filesize aberration. Reprocessing incomplete files ...'
                    ))
                self.redoIncompleteFiles(filesToRedo)
Exemple #13
0
def readLangs(path, lang1, lang2, reverse=False):
    """
    Read tab-separated sentence pairs from path and create two Lang objects.

    :param path: UTF-8 encoded text file with one pair per line
    :param lang1: name of the language in the first column
    :param lang2: name of the language in the second column
    :param reverse: if True, every pair is reversed and lang2 becomes the
                    input language
    :return: tuple (input_lang, output_lang, pairs)
    """
    print("Reading lines...")

    # Read the file and split into lines; the with-block closes the file,
    # which the original left open
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
Exemple #14
0
 def getInputFileOptions(self):
     """
     Assembles the options dictionary for the "add input files" dialog.

     The file type list ends with a custom entry taken from
     self.inputSearchPattern; the initial directory comes from the
     'defaultInputDir' setting.

     :return: dict with the keys 'defaultextension', 'filetypes', 'title',
              'multiple' and 'initialdir'
     """
     fileTypes = [
         ('all files', '.*'),
         ('text files', '*.txt'),
         ('csv files', '*.csv'),
         ('sav files', '*.sav'),
         ('custom pattern', self.inputSearchPattern.get()),
     ]
     return {
         'defaultextension': '.txt',
         'filetypes': fileTypes,
         'title': Lang.get('Add input files'),
         'multiple': True,
         'initialdir': self.conf('defaultInputDir'),
     }
Exemple #15
0
 def saveConfig(self):
     """
     prompts the user for a filename and saves the configuration there

     The GUI state is transferred to the configuration first. If no file is
     obtained from the dialog, an error message is shown instead.
     """
     self.GUIToConfig()
     f = tk.filedialog.asksaveasfile(mode='w',
                                     filetypes=[('JSON files', '*.json')],
                                     defaultextension='.json')
     if not f:
         self.err(
             Lang.get(
                 'You did not select a desetination file or the file could not be saved'
             ))
         return

     # the with-block closes the handle even if serialising or writing
     # fails; the original leaked it in that case
     with f:
         f.write(self.backend.config.toJSON())
     tk.messagebox.showinfo(
         Lang.get("Configuration file saved"),
         Lang.get("The configuration file has been successfully saved"))
Exemple #16
0
    def saveProcessingLog(self):
        """
        transfers events from log queue to main log instance, asks operator for file name and exports log

        Nothing is written if the operator cancels the file dialog.
        """
        self.backend.transferLogQueue()
        fileName = tk.filedialog.asksaveasfilename(
            initialdir=self.conf('defaultOutDir'),
            title=Lang.get('Select Logfile'),
            filetypes=[("Log files", "*.txt"), ("all files", "*.*")])

        # a cancelled dialog yields an empty value rather than None; the
        # former "is not None" check let it through to open()
        if not fileName:
            return

        # newline='' switches off newline translation: the entries are joined
        # with os.linesep already, and translating again would turn "\r\n"
        # into "\r\r\n" on Windows
        with open(fileName, "w", newline='') as logFile:
            logFile.write(os.linesep.join(self.backend.executionLog))

        tk.messagebox.showinfo(
            Lang.get("Logfile saved"),
            Lang.get(
                "The Logfile has been saved at the specified destination"
            ))
 def openSelectedAction(self):
     """
     Opens the configuration belonging to the selected list entry.

     Exactly one entry has to be selected, otherwise an error message is
     shown. On success this window is destroyed, the batch processor is
     shown and the configuration is loaded from the path stored in
     state['actions']['recentActions'] at the selected index.
     """
     chosen = self.selectedActionList.curselection()
     if len(chosen) != 1:
         self.mainWindow.err(Lang.get('You must select exactly one entry.'))
         return

     recentActions = self.mainWindow.state['actions']['recentActions']
     actionPath = recentActions[chosen[0]]
     self.parent.destroy()
     self.mainWindow.showBatchProcessor()
     self.mainWindow.gui.loadConfigFromFile(actionPath)
Exemple #18
0
def readLangs(lang1, lang2, reverse=False):
    """
    Read the sentence pairs of a language pair and create two Lang objects.

    The pairs are read from './NLP Tutorial/data/<lang1>-<lang2>_pairs.txt'
    (UTF-8). Lines are only split on tabs; no normalisation is applied here.

    :param lang1: name of the language in the first column
    :param lang2: name of the language in the second column
    :param reverse: if True, every pair is reversed and lang2 becomes the
                    input language
    :return: tuple (input_lang, output_lang, pairs)
    """
    print("Reading lines...")

    # Read the file and split into lines; the with-block closes the file,
    # which the original left open
    with open('./NLP Tutorial/data/%s-%s_pairs.txt' % (lang1, lang2),
              encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [line.split('\t') for line in lines]

    # reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(pair)) for pair in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
    def showDebuggingInformation(self, debuggingInfo):
        """
        Opens a separate window with two tabs: one showing the placeholders,
        one showing the generated source code of a single file.
        :param debuggingInfo: dictionary with keys 'placeholders' and 'commands'
        :return:
        """
        window = tk.Toplevel(self.gui.parent)
        window.wm_title(Lang.get("Debugging information"))

        # one notebook tab per kind of information
        notebook = ttk.Notebook(window)

        placeholdersFrame = self.gui.createFrameWithText(
            notebook, debuggingInfo['placeholders'])
        notebook.add(placeholdersFrame, text=Lang.get('Placeholders'))

        commandsFrame = self.gui.createFrameWithText(
            notebook, debuggingInfo['commands'])
        notebook.add(commandsFrame, text=Lang.get('Sourcecode'))

        notebook.pack(expand=1, fill="both")

        # make the GUI pick up the changes
        self.gui.parent.update()
def read_langs(en_file, ja_file, n_processes=4):
    """ Reads corpuses and returns a Lang object for each language and all normalized sentence pairs.

    The lines of both files are cut into chunks at identical positions and
    every chunk pair is handed to normalize in a worker process; the results
    are concatenated in file order.

    :param en_file: path of the English corpus (UTF-8, undecodable bytes are
                    ignored)
    :param ja_file: path of the Japanese corpus (same encoding handling)
    :param n_processes: number of worker processes
    :return: tuple (en, ja, pairs)
    """
    en_lines = _read_corpus_lines(en_file)
    ja_lines = _read_corpus_lines(ja_file)

    # the with-block shuts the pool down; the original never closed it
    with mp.Pool(processes=n_processes) as pool:
        total = len(en_lines)
        # ceiling division so that the chunks cover every line; the former
        # floor division dropped the last len % n_processes lines (and all
        # lines when there were fewer lines than processes)
        chunk = max(1, -(-total // n_processes))
        results = [
            pool.apply_async(
                normalize,
                args=(en_lines[start:start + chunk],
                      ja_lines[start:start + chunk]))
            for start in range(0, total, chunk)
        ]
        # results have to be collected while the pool is still alive
        pairs = []
        for result in results:
            pairs += result.get()

    en = Lang("en")
    ja = Lang("ja")

    return en, ja, pairs


def _read_corpus_lines(path):
    """Returns the lines of a corpus file, closing the file afterwards."""
    with open(path, encoding="utf8", errors="ignore") as f:
        lines = f.read().split("\n")
    # a final newline leaves an empty string behind that is not a sentence
    if lines and lines[-1] == "":
        lines.pop()
    return lines
def readLangs(training_set):
    """
    Read tab-separated, normalized sentence pairs from a training file.

    :param training_set: path of the UTF-8 encoded training file
    :return: tuple (lang, pairs) where lang is a fresh Lang('english')
    """
    print("Reading lines...")

    # Read the file and split into lines; the with-block closes the file,
    # which the original left open
    with open(str(training_set), encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    lang = Lang('english')

    return lang, pairs
Exemple #22
0
 def loadConfig(self):
     """
     Asks the operator for a JSON configuration file and loads it; shows an
     error message if the dialog does not yield a path.
     :return:
     """
     chosenPath = tk.filedialog.askopenfilename(
         filetypes=[('JSON files', '*.json')],
         initialdir=self.conf('defaultConfigDir'))
     if not chosenPath:
         self.err(
             Lang.get(
                 'You did not select a config file or the file could not be opened'
             ))
         return
     self.loadConfigFromFile(chosenPath)
Exemple #23
0
def readLangs(train_x, train_y):
    """
    Read two parallel files and join them line by line into sentence pairs.

    Line i of train_x is joined with line i of train_y by a tab; the joined
    line is then split on tabs again and every part is passed through
    normalizeString. Lines of train_x without a counterpart in train_y are
    kept on their own.

    :param train_x: path of the UTF-8 encoded source file
    :param train_y: path of the UTF-8 encoded target file
    :return: tuple (lang, pairs) where lang is a fresh Lang('english')
    :raises IndexError: if train_y has more lines than train_x
    """
    print("Reading lines...")

    # Read the files and split into lines; the with-blocks close the files,
    # which the original left open
    with open(str(train_x), encoding='utf-8') as f:
        lines_x = f.read().strip().split('\n')
    with open(str(train_y), encoding='utf-8') as f:
        lines_y = f.read().strip().split('\n')

    # same exception type as the former index loop, but raised up front and
    # with a message that names the actual problem
    if len(lines_y) > len(lines_x):
        raise IndexError(
            "target file has more lines (%d) than source file (%d)" %
            (len(lines_y), len(lines_x)))

    lines = [x + "\t" + y for x, y in zip(lines_x, lines_y)]
    # source lines beyond the end of the target file stay unpaired
    lines += lines_x[len(lines_y):]

    # get pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    lang = Lang('english')

    return lang, pairs
    def __init__(self, gui, parent, workerProcess, logQueue, taskQueue,
                 debuggingResultQueue, errorQueue):
        """
        Stores the worker process and the queues, creates an empty
        configuration and sets the window title.

        Please note: TK handles _cannot_ be pickled, so everything related to
        multiprocessing has to be created _before_ the GUI (i.e. this class)
        is initialized.
        :param gui: BatchProcessorGUI instance
        :param parent: TKinter frame instance (to draw into)
        :param workerProcess: stored as self.p
        :param logQueue: stored as self.logQueue
        :param taskQueue: stored as self.queue
        :param debuggingResultQueue: stored as self.debuggingResultQueue
        :param errorQueue: stored as self.errorQueue
        """
        self.p = workerProcess

        self.queue = taskQueue
        self.logQueue = logQueue
        self.debuggingResultQueue = debuggingResultQueue
        self.errorQueue = errorQueue

        self.config = Configuration()

        parent.title(Lang.get("BatchProcessing"))
        self.gui = gui

        self.executionLog = []
    def trackProgress(self):
        """
        Keeps the progress bar and the remaining-time label of the main window
        up to date while files are being processed. The remaining time is
        obtained from estimateRemainingTime, based on the time used so far.
        """

        def workPending():
            # The task queue may be empty right away because a worker can
            # "snatch" a task before this method is even called. In that case
            # keep waiting until the debugging result queue OR the error
            # queue has received something.
            if not self.queue.empty():
                return True
            return (self.debuggingResultQueue.empty()
                    and self.errorQueue.empty())

        alreadyProcessedFiles = 0
        while workPending():
            doneSoFar = self.totalFileNum - self.queue.qsize()
            newlyDone = doneSoFar - alreadyProcessedFiles
            alreadyProcessedFiles = doneSoFar

            # move the progress bar forward by the share of newly finished files
            self.gui.pb.step(newlyDone / float(self.totalFileNum) * 100.0)
            totalUsedTime = time.time() - self.start_time

            # refresh the estimate shown to the operator
            estimate = self.estimateRemainingTime(self.totalFileNum,
                                                  alreadyProcessedFiles,
                                                  totalUsedTime)
            remainingText = Lang.get(
                'Remaining time: %.2f seconds ') % estimate
            self.gui.remainingTimeLabel.config(text=remainingText)

            # let the GUI redraw, then wait before polling again
            self.gui.parent.update()
            time.sleep(0.5)

        # processing is finished: step the bar back by 100, the maximum it
        # can reach by default
        self.gui.pb.step(-100.0)
        self.gui.parent.update()
Exemple #26
0
    def initGUI(self):
        """
        Builds the start window: a heading, a row with the "Last Actions" and
        "Redo Last" buttons, and below it the buttons for loading a
        configuration, creating a new one and showing the help. Finishes by
        calling adaptGUIToState().
        """
        fillHorizontally = tk.W + tk.E

        self.mainWindow.configure(background='white')
        self.mainWindow.title("SPSS Toolbox")

        self.centerFrame = tkinter.Frame(self.mainWindow)
        self.centerFrame.configure(padx=50,
                                   pady=50,
                                   height=200,
                                   background='white')

        spssToolboxLabel = tk.Label(self.centerFrame,
                                    text=Lang.get("SPSS Toolbox"),
                                    font=('Times', 25, 'bold'),
                                    **self.getItemStyle())
        spssToolboxLabel.grid(row=0, column=0, sticky=fillHorizontally)

        # the two action buttons share one row inside their own frame
        self.actionsFrame = tkinter.Frame(self.centerFrame)

        self.recentActionsButton = tk.Button(
            self.actionsFrame,
            text=Lang.get("Last Actions"),
            command=self.selectAmongLastActions,
            **self.getItemStyle())
        self.recentActionsButton.grid(row=0,
                                      column=0,
                                      sticky=fillHorizontally)

        self.redoLastActionButton = tk.Button(
            self.actionsFrame,
            text=Lang.get("Redo Last"),
            command=self.redoAction,
            **self.getItemStyle())
        self.redoLastActionButton.grid(row=0,
                                       column=1,
                                       sticky=fillHorizontally)

        # both columns get weight 1 so that the buttons can take up extra space
        self.actionsFrame.grid(row=5, column=0, sticky=fillHorizontally)
        self.actionsFrame.grid_columnconfigure(0, weight=1)
        self.actionsFrame.grid_columnconfigure(1, weight=1)

        self.loadConfigurationButton = tk.Button(
            self.centerFrame,
            text=Lang.get("Load Configuration"),
            command=self.loadConfiguration,
            **self.getItemStyle())
        self.loadConfigurationButton.grid(row=6,
                                          column=0,
                                          sticky=fillHorizontally)

        self.newConfigurationButton = tk.Button(
            self.centerFrame,
            text=Lang.get("New Configuration"),
            command=self.spawnNewConfiguration,
            **self.getItemStyle())
        self.newConfigurationButton.grid(row=7,
                                         column=0,
                                         sticky=fillHorizontally)

        self.helpButton = tk.Button(self.centerFrame,
                                    text=Lang.get("Help"),
                                    command=self.showHelp,
                                    **self.getItemStyle())
        self.helpButton.grid(row=8, column=0, sticky=fillHorizontally)

        self.pad(self.centerFrame, 5)
        # set after pad(): the heading keeps extra space below it
        spssToolboxLabel.grid(pady=(0, 30))
        self.centerFrame.pack()
        self.adaptGUIToState()
def PrepareData(file):
    """
    Loads the pairs from file and builds the caption vocabulary.

    Element 1 of every pair is added as a sentence to a Lang named
    "EnglishCaptions".

    :param file: passed on to GetfeatureVocabPair
    :return: tuple (capLang, pairs)
    """
    pairs = GetfeatureVocabPair(file)
    capLang = Lang(name="EnglishCaptions")
    for caption in (pair[1] for pair in pairs):
        capLang.addSentence(caption)
    return capLang, pairs
    def runSPSSProcessOnFile(cls, inputFilePath, outputFilePath, config,
                             logQueue, debuggingResultQueue, errorQueue):
        """
        process single given file with SPSS template and save to output File
        returns the time it used up (in seconds)

        :param inputFilePath: file to process
        :param outputFilePath: file the result is to be written to
        :param config: configuration; provides the SPSS file, the
                       placeholders and the processing options
        :param logQueue: receives a message at start and at the end
        :param debuggingResultQueue: receives a dictionary with the keys
                                     'placeholders' and 'commands'
        :param errorQueue: not used by this method
        :return: used time in seconds
        """
        #create dedicated TK instance; tk _always_ requires a window, however we just want message Boxes
        # create and hide main window
        # NOTE(review): this root is never destroyed - confirm whether a
        # destroy() at the end is safe for callers
        root = tk.Tk()
        root.withdraw()

        start_time = time.time()

        logMsg = Lang.get("Processing ") + inputFilePath + "..."
        print(logMsg)
        logQueue.put(logMsg)

        # read in commands
        spssCommands = BatchProcessor.loadRawCommandsFromFile(config)
        allCommands = []
        BatchProcessor.instantiatePlaceholders(config, inputFilePath,
                                               outputFilePath)

        # prepare file command by command
        for command in spssCommands:
            #ignore encoding line
            if 'Encoding' in command:
                continue

            command = BatchProcessor.applyPlaceholders(command, config)
            allCommands.append(command)
            print(Lang.get("Executing: "), command)

        f = io.StringIO()
        captureOutput = config.opt['defaultCaptureOutputOutDir'] != 'none'

        pointPlusNewline = '.' + os.linesep
        debuggingResultQueue.put({
            'placeholders':
            config.ObjToJSON(config.opt['placeholders']),
            'commands':
            pointPlusNewline.join(allCommands)
        })

        if not config.opt['simulateProcessing']:
            if captureOutput:
                # redirect output
                # redirecting here will also capture SPSS's errors
                # redirect_stdout only takes effect as a context manager;
                # the former bare call redirected nothing
                with redirect_stdout(f):
                    BatchProcessor.executor.execute(allCommands)
            else:
                BatchProcessor.executor.execute(allCommands)

        cls.saveOutputToFile(config, f)
        cls.saveCommandsToSyntaxFile(config, allCommands)

        usedTime = (time.time() - start_time)
        logQueue.put(
            Lang.get('Processing finished in {:.2f}s').format(usedTime))

        return usedTime
 def err(errMsg):
     """
     Displays errMsg in an error message box whose title is the translation
     of "Error".
     """
     title = Lang.get("Error")
     tk.messagebox.showerror(title, errMsg)
Exemple #30
0
        print("WARNNING!! PDTB_explicit_data length is zero, please check")

    #data_explicit_training, maxlength_ex = generate_standard_data([parsed_explicit_data, PDTB_explicit_data], label_type)
    data_implicit_training, maxlength_im_training = generate_standard_data(
        [PDTB_implicit_data_training], label_type)
    data_implicit_testing, maxlength_im_testing = generate_standard_data(
        [PDTB_implicit_data_testing], label_type)
    data = data_implicit_training  #data_explicit_training + data_implicit_training
    maxlength = maxlength_im_training  #maxlength_ex if maxlength_ex > maxlength_im_training else maxlength_im_training
    data_test = data_implicit_testing
    maxlength_test = maxlength_im_testing
    #data = data[0:1000]
    #data_test = data_test[0:1000]
    print("training data size:", len(data))
    print("testing data size:", len(data_test))
    lang = Lang('corpus')
    label = Label("label")

    for num, one_data in enumerate(data):
        lang.index_words(one_data[0])
        lang.index_words(one_data[1])
        label.index_labels(one_data[2],
                           label_type=label_type,
                           label_2_pridict=label_2_pridict)
        if label_type.lower() == "conn":
            label.record_conn_sense(
                one_data[2], one_data[3]
            )  #one_data[2] is always label, if label_type != conn, one_data[2] is sense, one_data[3] is conn
        else:
            label.record_conn_sense(
                one_data[3], one_data[4]