def synthesize(configMap, anaFile, synFile, report=None):
    error_list = []

    targetProject = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
    clean = ReadConfig.getConfigVal(configMap, 'CleanUpUnknownTargetWords', report)
    if not (targetProject and clean):
        error_list.append(('Configuration file problem.', 2))
        return error_list

    cleanUpText = clean[0].lower() == 'y'

    # Create other files we need for STAMP.
    partPath = os.path.join(tempfile.gettempdir(), targetProject)
    cmdFileName = create_synthesis_files(partPath)

    # Synthesize the target text.
    error_list.append(('Synthesizing the target text...', 0))

    # Run STAMP to synthesize the results.
    # E.g. stamp32.exe -f Gilaki-Thesis_ctrl_files.txt -i pes_verbs.ana -o pes_verbs.syn
    # This assumes stamp32.exe is in the current working directory.
    call(['stamp32.exe', '-f', cmdFileName, '-i', anaFile, '-o', synFile])

    error_list.append(('Fixing up the target text...', 0))

    # Replace underscores with spaces in the synthesized file.
    # Underscores were added for multiword entries that contained a space.
    fix_up_text(synFile, cleanUpText)

    return error_list
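# A minimal sketch of the fix_up_text helper called above, assuming it
# rewrites the synthesized file in place: underscores (added for multiword
# entries) become spaces and, when cleanUpText is set, words that failed to
# synthesize are stripped. The '%...%' failure-marker pattern is an
# assumption for illustration, not confirmed from the source.
import io
import re

def fix_up_text(synFile, cleanUpText):
    with io.open(synFile, 'r', encoding='utf-8') as f:
        text = f.read()
    # Underscores stood in for spaces in multiword lexical entries.
    text = text.replace('_', ' ')
    if cleanUpText:
        # Hypothetical cleanup of synthesis-failure markers like %0%word%.
        text = re.sub(r'%\d+%[^%]*%', '', text)
    with io.open(synFile, 'w', encoding='utf-8') as f:
        f.write(text)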
def MainFunction(DB, report, modify=True):
    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # Get the path to the source file.
    srcFile = ReadConfig.getConfigVal(configMap, 'AnalyzedTextOutputFile', report)
    if not srcFile:
        return

    # Get the path to the target file.
    tgtFile = ReadConfig.getConfigVal(configMap, 'TargetTranferResultsFile', report)
    if not tgtFile:
        return

    # See if we have advanced transfer going on by checking whether the .t3x file is present.
    advanced = False
    postchunk_rules_file = Utils.OUTPUT_FOLDER + '\\transfer_rules.t3x'

    # Check if the file exists.
    if os.path.isfile(postchunk_rules_file):
        advanced = True

    # Get a temporary file name for the HTML results.
    htmlFile = os.path.join(tempfile.gettempdir(), 'FlexTransFileViewer.html')

    # Show the window.
    app = QtGui.QApplication(sys.argv)
    window = Main(srcFile, tgtFile, htmlFile, advanced)
    window.show()
    app.exec_()
def MainFunction(DB, report, modifyAllowed):
    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # Build an output path using the system temp directory.
    outFileVal = ReadConfig.getConfigVal(configMap, 'TargetPrefixGlossListFile', report)
    if not outFileVal:
        return

    myPath = os.path.join(tempfile.gettempdir(), outFileVal)
    try:
        f_out = open(myPath, 'w')
    except IOError:
        report.Error('There was a problem creating the Target Prefix Gloss List File: '+myPath+'. Please check the configuration file setting.')
        return

    TargetDB = FLExDBAccess()
    try:
        # Open the target database.
        targetProj = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
        if not targetProj:
            return
        TargetDB.OpenDatabase(targetProj, verbose=True)
    except FDA_DatabaseError as e:
        report.Error(e.message)
        print "FDO Cache Create failed!"
        print e.message
        return
def extract_target_lex(DB, configMap, report=None):
    error_list = []

    TargetDB = FLExDBAccess()

    # Open the target database.
    targetProj = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
    if not targetProj:
        error_list.append(('Configuration file problem with TargetProject.', 2))
        return error_list

    # See if the target project is a valid database name.
    if targetProj not in DB.GetDatabaseNames():
        error_list.append(('The Target Database does not exist. Please check the configuration file.', 2))
        return error_list

    try:
        # Open the target database.
        TargetDB.OpenDatabase(targetProj, verbose=True)
    except FDA_DatabaseError as e:
        error_list.append((e.message, 2))
        error_list.append(('There was an error opening target database: '+targetProj+'.', 2))
        return error_list
def MainFunction(DB, report, modifyAllowed):
    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # Build an output path using the system temp directory.
    outFileVal = ReadConfig.getConfigVal(configMap, 'TargetPrefixGlossListFile', report)
    if not outFileVal:
        return

    error_list = catalog_affixes(DB, configMap, outFileVal, report)

    # Output info, warnings, and errors. Each msg is a pair: string & code.
    for msg in error_list:
        if msg[1] == 0:
            report.Info(msg[0])
        elif msg[1] == 1:
            report.Warning(msg[0])
        else:  # error = 2
            report.Error(msg[0])
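# The (message, severity-code) reporting loop above recurs in several of
# these modules, so a shared helper is a natural refactor. This is a sketch,
# not part of the original code ('process_error_list' is a hypothetical
# name); it also handles the optional third URL element used further below.
def process_error_list(report, error_list):
    for item in error_list:
        msg, code = item[0], item[1]
        url = item[2] if len(item) == 3 else None  # optional goto-URL
        if code == 0:
            report.Info(msg, url)
        elif code == 1:
            report.Warning(msg, url)
        else:  # code 2 = error
            report.Error(msg, url)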
def MainFunction(DB, report, modifyAllowed):
    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    targetANA = ReadConfig.getConfigVal(configMap, 'TargetOutputANAFile', report)
    prefixFile = ReadConfig.getConfigVal(configMap, 'TargetPrefixGlossListFile', report)
    complexForms1st = ReadConfig.getConfigVal(configMap, 'TargetComplexFormsWithInflectionOn1stElement', report)
    complexForms2nd = ReadConfig.getConfigVal(configMap, 'TargetComplexFormsWithInflectionOn2ndElement', report)
    transferResults = ReadConfig.getConfigVal(configMap, 'TargetTranferResultsFile', report)
    sentPunct = ReadConfig.getConfigVal(configMap, 'SentencePunctuation', report)
    if not (targetANA and prefixFile and transferResults and sentPunct):
        return

    # Check the validity of the complex forms lists.
    if complexForms1st and not ReadConfig.configValIsList(configMap, 'TargetComplexFormsWithInflectionOn1stElement', report):
        return
    if complexForms2nd and not ReadConfig.configValIsList(configMap, 'TargetComplexFormsWithInflectionOn2ndElement', report):
        return

    TargetDB = FLExDBAccess()
    try:
        # Open the target database.
        targetProj = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
        if not targetProj:
            return
        TargetDB.OpenDatabase(targetProj, verbose=True)
    except FDA_DatabaseError as e:
        report.Error(e.message)
        print "FDO Cache Create failed!"
        print e.message
        return
def MainFunction(DB, report, modifyAllowed):
    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # Allow the synthesis and ana files to be outside the temp folder if a slash is present.
    targetANA = ReadConfig.getConfigVal(configMap, 'TargetOutputANAFile', report)
    targetSynthesis = ReadConfig.getConfigVal(configMap, 'TargetOutputSynthesisFile', report)
    if not (targetANA and targetSynthesis):
        return

    anaFile = Utils.build_path_default_to_temp(targetANA)
    synFile = Utils.build_path_default_to_temp(targetSynthesis)

    # Extract the target lexicon.
    error_list = extract_target_lex(DB, configMap, report)

    # Synthesize the new target text.
    err_list = synthesize(configMap, anaFile, synFile, report)
    error_list.extend(err_list)

    # Output info, warnings, and errors.
    for triplet in error_list:
        msg = triplet[0]
        code = triplet[1]

        # Sometimes we'll have a URL to output with the error/warning.
        url = triplet[2] if len(triplet) == 3 else None

        if code == 0:
            report.Info(msg, url)
        elif code == 1:
            report.Warning(msg, url)
        else:  # error = 2
            report.Error(msg, url)
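# A sketch of the Utils.build_path_default_to_temp helper used above, based
# on the comment that a slash in the config value keeps the file out of the
# temp folder; the exact separator handling is an assumption.
import os
import tempfile

def build_path_default_to_temp(fileName):
    # If the configured name already carries a path (forward or back slash),
    # honor it as given; otherwise default to the system temp directory.
    if '/' in fileName or '\\' in fileName:
        return fileName
    return os.path.join(tempfile.gettempdir(), fileName)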
def catalog_affixes(DB, configMap, filePath, report=None):
    error_list = []

    morphNames = ReadConfig.getConfigVal(configMap, 'TargetMorphNamesCountedAsRoots', report)
    if not morphNames:
        error_list.append(('Problem reading the configuration file for the property: TargetMorphNamesCountedAsRoots', 2))
        return error_list

    TargetDB = FLExDBAccess()
    try:
        # Open the target database.
        targetProj = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
        if not targetProj:
            error_list.append(('Problem accessing the target project.', 2))
            return error_list
        TargetDB.OpenDatabase(targetProj, verbose=True)
    except FDA_DatabaseError as e:
        error_list.append(('There was an error opening target database: '+targetProj+'.', 2))
        error_list.append((e.message, 2))
        return error_list
def MainFunction(DB, report, modify=True):
    if not modify:
        report.Error('You need to run this module in "modify mode."')
        return

    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # Get the needed configuration file properties.
    text_desired_eng = ReadConfig.getConfigVal(configMap, 'SourceTextName', report)
    sourceMorphNames = ReadConfig.getConfigVal(configMap, 'SourceMorphNamesCountedAsRoots', report)
    linkField = ReadConfig.getConfigVal(configMap, 'SourceCustomFieldForEntryLink', report)
    numField = ReadConfig.getConfigVal(configMap, 'SourceCustomFieldForSenseNum', report)
    targetMorphNames = ReadConfig.getConfigVal(configMap, 'TargetMorphNamesCountedAsRoots', report)
    if not (text_desired_eng and linkField and numField and sourceMorphNames and targetMorphNames):
        return

    # Find the desired text.
    foundText = False
    for text in DB.ObjectsIn(ITextRepository):
        if text_desired_eng == ITsString(text.Name.BestAnalysisAlternative).Text:
            foundText = True
            break
    if not foundText:
        report.Error('The text named: '+text_desired_eng+' was not found.')
        return

    # Set objects for the two custom fields. Report errors if they don't exist.
    senseEquivField = DB.LexiconGetSenseCustomFieldNamed(linkField)
    senseNumField = DB.LexiconGetSenseCustomFieldNamed(numField)
    if not senseEquivField:
        report.Error(linkField + " field doesn't exist. Please read the instructions.")
    if not senseNumField:
        report.Error(numField + " field doesn't exist. Please read the instructions.")
    if not (senseEquivField and senseNumField):
        return

    TargetDB = FLExDBAccess()
    try:
        # Open the target database.
        targetProj = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
        if not targetProj:
            return
        TargetDB.OpenDatabase(targetProj, modify, verbose=True)
    except FDA_DatabaseError as e:
        report.Error(e.message)
        print "FDO Cache Create failed!"
        print e.message
        return
def MainFunction(DB, report, modifyAllowed):
    # Constants for building the output lines in the dictionary file.
    s1 = ' <e><p><l>'
    s1i = ' <e><i>'
    s2 = '<s n="'
    s3 = '"/></l><r>'
    s4 = '"/></r></p></e>'
    s4a = '"/>'
    s4b = '</r></p></e>'
    s4i = '"/></i></e>'

    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    catSub = ReadConfig.getConfigVal(configMap, 'CategoryAbbrevSubstitutionList', report)
    linkField = ReadConfig.getConfigVal(configMap, 'SourceCustomFieldForEntryLink', report)
    senseNumField = ReadConfig.getConfigVal(configMap, 'SourceCustomFieldForSenseNum', report)
    sourceMorphNames = ReadConfig.getConfigVal(configMap, 'SourceMorphNamesCountedAsRoots', report)
    sentPunct = ReadConfig.getConfigVal(configMap, 'SentencePunctuation', report)
    if not (linkField and senseNumField and sourceMorphNames and sentPunct):
        return

    # Transform the straight list of category abbreviations into a list of tuples.
    catSubList = []
    if catSub:
        try:
            for i in range(0, len(catSub), 2):
                catSubList.append((catSub[i], catSub[i+1]))
        except IndexError:
            report.Error('Ill-formed property: "CategoryAbbrevSubstitutionList". Expected pairs of categories.')
            return

    TargetDB = FLExDBAccess()

    # Open the target database.
    targetProj = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
    if not targetProj:
        return

    # See if the target project is a valid database name.
    if targetProj not in DB.GetDatabaseNames():
        report.Error('The Target Database does not exist. Please check the configuration file.')
        return

    try:
        TargetDB.OpenDatabase(targetProj, verbose=True)
    except FDA_DatabaseError as e:
        report.Error(e.message)
        print "FDO Cache Create failed!"
        print e.message
        return
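# For illustration: with the constants above, a typical entry line is
# assembled as s1 + lemma + s2 + pos + s3 + targetLemma + s2 + pos + s4,
# giving (made-up lemma/POS values):
#
#   <e><p><l>house1.1<s n="n"/></l><r>casa1.1<s n="n"/></r></p></e>
#
# The s1i/s4i variants wrap a single form in <i>...</i> for identity entries.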
def MainFunction(DB, report, modifyAllowed):
    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    TargetDB = FLExDBAccess()
    try:
        # Open the target database.
        targetProj = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
        if not targetProj:
            return
        TargetDB.OpenDatabase(targetProj, verbose=True)
    except FDA_DatabaseError as e:
        report.Error(e.message)
        print "FDO Cache Create failed!"
        print e.message
        return
try:
    # Open the target database.
    targetProj = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
    if not targetProj:
        return
    TargetDB.OpenDatabase(targetProj, verbose=True)
except FDA_DatabaseError as e:
    report.Error(e.message)
    print "FDO Cache Create failed!"
    print e.message
    return

report.Info('Using: '+targetProj+' as the target database.')

targetProject = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
targetANA = ReadConfig.getConfigVal(configMap, 'TargetOutputANAFile', report)
targetSynthesis = ReadConfig.getConfigVal(configMap, 'TargetOutputSynthesisFile', report)
morphNames = ReadConfig.getConfigVal(configMap, 'TargetMorphNamesCountedAsRoots', report)
clean = ReadConfig.getConfigVal(configMap, 'CleanUpUnknownTargetWords', report)
if not (targetProject and targetANA and targetSynthesis and morphNames and clean):
    return

cleanUpText = clean[0].lower() == 'y'

partPath = os.path.join(tempfile.gettempdir(), targetProject)
anaFile = os.path.join(tempfile.gettempdir(), targetANA)
def MainFunction(DB, report, modifyAllowed):
    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # Build an output path using the system temp directory.
    outFileVal = ReadConfig.getConfigVal(configMap, 'AnalyzedTextOutputFile', report)
    if not outFileVal:
        return

    #fullPathTextOutputFile = os.path.join(tempfile.gettempdir(), outFileVal)
    fullPathTextOutputFile = outFileVal
    try:
        f_out = open(fullPathTextOutputFile, 'w')
    except IOError:
        report.Error('There is a problem with the Analyzed Text Output File path: '+fullPathTextOutputFile+'. Please check the configuration file setting.')
        return

    # Find the desired text.
    text_desired_eng = ReadConfig.getConfigVal(configMap, 'SourceTextName', report)
    if not text_desired_eng:
        return

    foundText = False
    for text in DB.ObjectsIn(ITextRepository):
        if text_desired_eng == ITsString(text.Name.BestAnalysisAlternative).Text:
            foundText = True
            contents = text.ContentsOA
            break

    if not foundText:
        # Check if it's a scripture text.
        for section in DB.ObjectsIn(IScrSectionRepository):
            if text_desired_eng == ITsString(section.ContentOA.Title.BestAnalysisAlternative).Text:
                contents = section.ContentOA
                foundText = True
                break

    # Pattern not found.
    if not foundText:
        report.Error('The text named: '+text_desired_eng+' was not found.')
        return

    # Get the punctuation string.
    sent_punct = unicode(ReadConfig.getConfigVal(configMap, 'SentencePunctuation', report), "utf-8")
    if not sent_punct:
        return

    # Process the text.
    report.Info("Exporting analyses...")

    typesList = ReadConfig.getConfigVal(configMap, 'SourceComplexTypes', report)
    if not typesList:
        typesList = []
    elif not ReadConfig.configValIsList(configMap, 'SourceComplexTypes', report):
        return

    getSurfaceForm = False
    outputStrList = Utils.get_interlin_data(DB, report, sent_punct, contents, typesList, getSurfaceForm)

    report.Info("Export of " + text_desired_eng + " complete.")

    # Write out all the words.
    for outStr in outputStrList:
        # Split compound words.
        outStr = Utils.split_compounds(outStr)
        f_out.write(outStr.encode('utf-8'))

    f_out.close()
def MainFunction(DB, report, modify=True):
    transFile = 'Output\\transfer_rules.t1x'

    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # Get the path to the bilingual file.
    if 'BilingualDictOutputFile' not in configMap or configMap['BilingualDictOutputFile'] == '':
        report.Error('Did not find the entry BilingualDictOutputFile in the configuration file')
        return
    bilingFile = ReadConfig.getConfigVal(configMap, 'BilingualDictOutputFile', report)

    # Make a backup copy of the transfer rule file.
    shutil.copy2(transFile, transFile+'.old')

    # Read in the bilingual lexicon XML file.
    try:
        bilingEtree = ET.parse(bilingFile)
    except IOError:
        report.Error('Could not open the Bilingual Dictionary File: '+bilingFile+'. Make sure you run the Extract Bilingual Lexicon module first.')
        return
    bilingRoot = bilingEtree.getroot()

    # Read in the transfer rule file.
    try:
        transEtree = ET.parse(transFile)
    except IOError:
        report.Error('There is a problem with the Transfer Rule File: '+transFile+'.')
        return
    transRoot = transEtree.getroot()

    # Find the sdefs (symbol definitions) element in the bilingual file.
    sdefs = bilingRoot.find('sdefs')

    # Find the section-def-attrs (attribute definition section) in the transfer rules file.
    section_def_attrs = transRoot.find("section-def-attrs")

    # See if a def-attr (attribute definition) element called a_gram_cat exists.
    def_attr = transRoot.find(".//*[@n='a_gram_cat']")

    # If it doesn't exist, create it and add it to section-def-attrs.
    if def_attr is None:
        def_attr = ET.Element('def-attr')
        def_attr.attrib['n'] = 'a_gram_cat'
        section_def_attrs.append(def_attr)

    # Loop through all of the symbol definition (sdef) elements in the bilingual file.
    for my_sdef in sdefs:
        # Create an attr-item element and copy the c (comment) and n (value)
        # attributes of the current sdef onto it.
        new_attr_item = ET.Element('attr-item')
        new_attr_item.attrib['c'] = my_sdef.attrib['c']
        new_attr_item.attrib['tags'] = my_sdef.attrib['n']

        # Append the attr-item element to the gram cat def-attr.
        def_attr.append(new_attr_item)

    # Write the transfer rule file.
    ff = codecs.open(transFile, 'w', 'utf-8')
    ff.write('<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE transfer PUBLIC "-//XMLmind//DTD transfer//EN"\n"transfer.dtd">\n')
    transEtree.write(ff, 'utf-8')
    ff.close()
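# For illustration, the loop above copies each bilingual sdef into the
# transfer file's a_gram_cat attribute definition. Sketch of the shapes
# involved (sample values are made up):
#
#   bilingual file:  <sdefs>
#                      <sdef n="n" c="noun"/>
#                      <sdef n="vblex" c="verb"/>
#                    </sdefs>
#
#   transfer file:   <section-def-attrs>
#                      <def-attr n="a_gram_cat">
#                        <attr-item c="noun" tags="n"/>
#                        <attr-item c="verb" tags="vblex"/>
#                      </def-attr>
#                    </section-def-attrs>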
def MainFunction(DB, report, modifyAllowed):
    # Read the configuration file, which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # Build an output path using the system temp directory.
    outFileVal = ReadConfig.getConfigVal(configMap, 'AnalyzedTextOutputFile', report)
    if not outFileVal:
        return

    #fullPathTextOutputFile = os.path.join(tempfile.gettempdir(), outFileVal)
    fullPathTextOutputFile = outFileVal
    try:
        f_out = open(fullPathTextOutputFile, 'w')
    except IOError:
        report.Error('There is a problem with the Analyzed Text Output File path: '+fullPathTextOutputFile+'. Please check the configuration file setting.')
        return

    # Find the desired text.
    text_desired_eng = ReadConfig.getConfigVal(configMap, 'SourceTextName', report)
    if not text_desired_eng:
        return

    foundText = False
    for text in DB.ObjectsIn(ITextRepository):
        if text_desired_eng == ITsString(text.Name.BestAnalysisAlternative).Text:
            foundText = True
            break

    if not foundText:
        report.Error('The text named: '+text_desired_eng+' was not found.')
        return

    # Get the punctuation string.
    sent_punct = ReadConfig.getConfigVal(configMap, 'SentencePunctuation', report)
    if not sent_punct:
        return

    prev_pv_list = []
    prev_e = None
    outputStrList = []
    ccc = 0  # current complex-form component

    # Process the text.
    report.Info("Exporting analyses...")

    typesList = ReadConfig.getConfigVal(configMap, 'SourceComplexTypes', report)
    if not typesList:
        typesList = []
    elif not ReadConfig.configValIsList(configMap, 'SourceComplexTypes', report):
        return

    prevEndOffset = 0

    # Count the analysis objects so progress reporting can be accurate.
    ss = SegmentServices.StTextAnnotationNavigator(text.ContentsOA)
    for obj_cnt, analysisOccurance in enumerate(ss.GetAnalysisOccurrencesAdvancingInStText()):
        pass
    report.ProgressStart(obj_cnt+1)

    ss = SegmentServices.StTextAnnotationNavigator(text.ContentsOA)
    for prog_cnt, analysisOccurance in enumerate(ss.GetAnalysisOccurrencesAdvancingInStText()):
        report.ProgressUpdate(prog_cnt)
        outStr = affixStr = ''

        if prevEndOffset > 0:
            numSpaces = analysisOccurance.GetMyBeginOffsetInPara() - prevEndOffset
            if numSpaces > 0:
                outputStrList.append(' '*numSpaces)
            elif numSpaces < 0:  # new paragraph
                outputStrList.append('\n')

        prevEndOffset = analysisOccurance.GetMyEndOffsetInPara()

        if analysisOccurance.Analysis.ClassName == "PunctuationForm":
            text_punct = ITsString(analysisOccurance.Analysis.Form).Text

            # See if one or more symbols is part of the user-defined sentence
            # punctuation. If so, output the punctuation as part of a data stream
            # along with the symbol/tag <sent>.
            # Convert to lists and take the set intersection.
            if set(list(text_punct)).intersection(set(list(sent_punct))):
                outStr = "^"+text_punct+"<sent>$"
            # If not, assume this is non-sentence punctuation and just output the
            # punctuation without a "symbol", e.g. <xxx>.
            else:
                outStr = text_punct

            outputStrList.append(outStr)
            continue

        if analysisOccurance.Analysis.ClassName == "WfiGloss":
            wfiAnalysis = analysisOccurance.Analysis.Analysis  # same as Owner
        elif analysisOccurance.Analysis.ClassName == "WfiAnalysis":
            wfiAnalysis = analysisOccurance.Analysis
        # We get into this block if there are no analyses for the word or an
        # analysis suggestion hasn't been accepted.
        elif analysisOccurance.Analysis.ClassName == "WfiWordform":
            outStr = ITsString(analysisOccurance.Analysis.Form.BestVernacularAlternative).Text
            report.Warning('No analysis found for the word: '+outStr+'. Treating this as an unknown word.')
            outStr += '<UNK>'
            outputStrList.append('^'+outStr+'$')
            continue
        else:
            wfiAnalysis = None

        # Go through each morpheme in the word (i.e. each bundle).
        for bundle in wfiAnalysis.MorphBundlesOS:
            if bundle.SenseRA:
                if bundle.MsaRA:
                    # Get the LexEntry object.
                    e = bundleEntry = bundle.MorphRA.Owner

                    # For a stem we just want the headword and its POS.
                    if bundle.MsaRA.ClassName == 'MoStemMsa':
                        # Check for a valid POS.
                        if not bundle.MsaRA.PartOfSpeechRA:
                            outStr = ITsString(wfiAnalysis.Owner.Form.BestVernacularAlternative).Text
                            report.Warning('No POS found for the word: '+outStr+'. Treating this as an unknown word.', DB.BuildGotoURL(e))
                            outStr += '<UNK>'
                            break

                        if bundle.MorphRA:
                            # Go from variant(s) to the entry/variant that has a sense.
                            # We are only dealing with senses, so we have to get to one.
                            # Along the way, collect inflection features associated with
                            # irregularly inflected variant forms so they can be output.
                            inflFeatAbbrevs = []
                            e = GetEntryWithSense(e, inflFeatAbbrevs)

                            # See if we have an enclitic or proclitic.
                            if ITsString(e.LexemeFormOA.MorphTypeRA.Name.BestAnalysisAlternative).Text in ('proclitic', 'enclitic'):
                                # Get the clitic gloss. Substitute periods with underscores
                                # to make it easier in Apertium.
                                affixStr += '<' + re.sub(r'\.', r'_', ITsString(bundle.SenseRA.Gloss.BestAnalysisAlternative).Text) + '>'
                            # Otherwise we have a root, stem or phrase.
                            else:
                                pv_list = []
                                shared_complex_e = None

                                # Check for adjacent words that point to the same complex form.
                                # If the form is a phrasal verb, use it as the headword to output.
                                if e.ComplexFormEntries.Count > 0:
                                    # Each word could be part of multiple complex forms
                                    # (e.g. ra -> char ra, ra raftan).
                                    for complex_e in e.ComplexFormEntries:
                                        if complex_e.EntryRefsOS:
                                            # Find the complex entry ref (there could be one or more
                                            # variant entry refs listed alongside the complex entry).
                                            for entryRef in complex_e.EntryRefsOS:
                                                if entryRef.RefType == 1:  # 1=complex form, 0=variant
                                                    if entryRef.ComplexEntryTypesRS:
                                                        # There could be multiple types assigned to a complex form
                                                        # (e.g. Phrasal Verb, Derivative); just see if one of them
                                                        # is in the types list.
                                                        for complexType in entryRef.ComplexEntryTypesRS:
                                                            if ITsString(complexType.Name.BestAnalysisAlternative).Text in typesList:
                                                                pos_in_list = get_position_in_component_list(e, complex_e)

                                                                # The entry we are on has to be at the right position
                                                                # in the complex form's component list.
                                                                if pos_in_list == ccc:
                                                                    pv_list.append(complex_e)
                                                                    break

                                    # See if we ended up with any phrasal verbs.
                                    if len(pv_list) == 0:  # no phrasal verbs
                                        prev_pv_list = []
                                        ccc = 0
                                    else:  # yes, we have phrasal verbs
                                        if ccc == 0:
                                            saved1stbaselineWord = ITsString(analysisOccurance.BaselineText).Text
                                        ccc += 1

                                        # First make sure that the entry of the last word isn't the same
                                        # as this word. In that case, of course, there are going to be
                                        # shared complex forms, but we are only interested in different
                                        # entries forming a phrasal verb.
                                        # See if the previous word had a link to a complex phrasal verb.
                                        if prev_e != e and len(prev_pv_list) > 0:
                                            found = False

                                            # See if there is a match between something on the list for
                                            # the previous word and this word.
                                            for i in range(0, len(prev_pv_list)):
                                                for j in range(0, len(pv_list)):
                                                    if prev_pv_list[i].Guid == pv_list[j].Guid:
                                                        shared_complex_e = pv_list[j]
                                                        found = True
                                                        break
                                                if found:
                                                    break

                                            # If we found a match, we remove the previous word from the
                                            # output and use the complex form.
                                            if found:
                                                component_count = get_component_count(shared_complex_e)
                                                if ccc == component_count:
                                                    ccc = 0

                                                savedTags = ''
                                                pv_list = []

                                                # Remove the n/adj/... and its tag from being output.
                                                saveStr = outputStrList.pop()

                                                # The first pop may have just popped punctuation or spacing.
                                                if len(outputStrList) > 0:
                                                    saveStr = outputStrList.pop()

                                                # The first component(s) could have tags (from affixes or
                                                # inflection info). Save these tags so they can be put on the
                                                # end of the complex form. This kind of assumes that inflection
                                                # isn't happening on multiple components, because that might
                                                # give a mess when it's all duplicated on the complex form.
                                                g = re.search(r'.+?<\w+>(<.+>)', saveStr)
                                                if g:
                                                    savedTags += g.group(1)

                                        prev_pv_list = copy.copy(pv_list)
                                        prev_e = e
                                else:
                                    ccc = 0

                                if shared_complex_e:
                                    if shared_complex_e.SensesOS:
                                        senseNum = 0  # require only one sense for a complex form

                                        # Get the headword and set the homograph #, if necessary.
                                        headWord = ITsString(shared_complex_e.HeadWord).Text
                                        headWord = Utils.do_capitalization(headWord, saved1stbaselineWord)
                                        headWord = Utils.add_one(headWord)
                                        outStr += headWord + '.' + str(senseNum+1)

                                        senseOne = shared_complex_e.SensesOS.ToArray()[0]

                                        # Get the POS.
                                        if senseOne.MorphoSyntaxAnalysisRA.PartOfSpeechRA:
                                            outStr += '<' + ITsString(senseOne.MorphoSyntaxAnalysisRA.PartOfSpeechRA.Abbreviation.BestAnalysisAlternative).Text + '>'
                                        else:
                                            report.Warning("PartOfSpeech object is null.")

                                        # Get the inflection class abbreviation.
                                        if senseOne.MorphoSyntaxAnalysisRA.InflectionClassRA:
                                            outStr += '<' + ITsString(senseOne.MorphoSyntaxAnalysisRA.InflectionClassRA.Abbreviation.BestAnalysisAlternative).Text + '>'

                                        # Get any features the stem or root might have.
                                        if senseOne.MorphoSyntaxAnalysisRA.MsFeaturesOA:
                                            feat_abbr_list = []
                                            # The features might be complex; make a recursive function call to find all features.
                                            get_feat_abbr_list(senseOne.MorphoSyntaxAnalysisRA.MsFeaturesOA.FeatureSpecsOC, feat_abbr_list)
                                            # This sort will keep the groups in order, e.g. 'gender' features will come before 'number' features.
                                            for grpName, abb in sorted(feat_abbr_list, key=lambda x: x[0]):
                                                outStr += '<' + abb + '>'

                                        # Get any features that come from irregularly inflected forms.
                                        # This sort will keep the groups in order, e.g. 'gender' features will come before 'number' features.
                                        for grpName, abb in sorted(inflFeatAbbrevs, key=lambda x: x[0]):
                                            outStr += '<' + abb + '>'

                                        # Add the saved tags from a previous complex form component.
                                        outStr += savedTags
                                    else:
                                        report.Warning("No senses found for the complex form.")
                                else:
                                    # Go through each sense and identify which sense number we have.
                                    foundSense = False
                                    senseNum = 0
                                    for i, mySense in enumerate(e.SensesOS):
                                        if mySense.Guid == bundle.SenseRA.Guid:
                                            foundSense = True
                                            break
                                    if foundSense:
                                        senseNum = i
                                    else:
                                        report.Warning("Couldn't find the sense for headword: "+ITsString(e.HeadWord).Text)

                                    # Get the headword and set the homograph #, if necessary.
                                    headWord = ITsString(e.HeadWord).Text
                                    headWord = Utils.do_capitalization(headWord, ITsString(analysisOccurance.BaselineText).Text)
                                    headWord = Utils.add_one(headWord)
                                    outStr += headWord + '.' + str(senseNum+1)

                                    # Get the POS.
                                    if bundle.MsaRA.PartOfSpeechRA:
                                        outStr += '<' + ITsString(bundle.MsaRA.PartOfSpeechRA.Abbreviation.BestAnalysisAlternative).Text + '>'
                                    else:
                                        report.Warning("PartOfSpeech object is null.")

                                    # Get the inflection class abbreviation.
                                    if bundle.MsaRA.InflectionClassRA:
                                        outStr += '<' + ITsString(bundle.MsaRA.InflectionClassRA.Abbreviation.BestAnalysisAlternative).Text + '>'

                                    # Get any features the stem or root might have.
                                    if bundle.MsaRA.MsFeaturesOA:
                                        feat_abbr_list = []
                                        # The features might be complex; make a recursive function call to find all features.
                                        get_feat_abbr_list(bundle.MsaRA.MsFeaturesOA.FeatureSpecsOC, feat_abbr_list)
                                        # This sort will keep the groups in order, e.g. 'gender' features will come before 'number' features.
                                        for grpName, abb in sorted(feat_abbr_list, key=lambda x: x[0]):
                                            outStr += '<' + abb + '>'

                                    # Get any features that come from irregularly inflected forms.
                                    # This sort will keep the groups in order, e.g. 'gender' features will come before 'number' features.
                                    for grpName, abb in sorted(inflFeatAbbrevs, key=lambda x: x[0]):
                                        outStr += '<' + abb + '>'
                        else:
                            report.Warning("Morph object is null.")
                    # We have an affix.
                    else:
                        if bundle.SenseRA:
                            # Get the affix gloss. Substitute periods with underscores to make it easier in Apertium.
                            affixStr += '<' + re.sub(r'\.', r'_', ITsString(bundle.SenseRA.Gloss.BestAnalysisAlternative).Text) + '>'
                        else:
                            #e = GetEntryWithSense(e)
                            report.Warning("Sense object for affix is null.")
                else:
                    outStr = ITsString(wfiAnalysis.Owner.Form.BestVernacularAlternative).Text
                    report.Warning('No morphosyntactic analysis found for some part of the word: '+outStr+'. Treating this as an unknown word.')
                    outStr += '<UNK>'
                    break  # go on to the next word
            else:
                # This part of the word has not been tied to a lexical entry-sense.
                outStr = ITsString(wfiAnalysis.Owner.Form.BestVernacularAlternative).Text
                report.Warning('No sense found for some part of the word: '+outStr+'. Treating this as an unknown word.')
                outStr += '<UNK>'
                break  # go on to the next word

        outStr += affixStr
        outputStrList.append('^'+outStr+'$')

    # Write out all the words.
    for outStr in outputStrList:
        # Split compound words.
        outStr = split_compounds(outStr)
        f_out.write(outStr.encode('utf-8'))
    f_out.close()

    report.Info('Export of '+str(obj_cnt+1)+' analyses complete to the file: '+fullPathTextOutputFile+'.')
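# For illustration, each word in the stream written above is wrapped in
# ^...$ with the lemma, sense number, and angle-bracket tags, in the style
# of an Apertium input stream. Sample output (values are made up):
#
#   ^house1.1<n><pl>$ ^sleep1.2<vblex><past>$ ^.<sent>$
#
# Unknown or unanalyzed words carry <UNK> instead of a POS tag:
#
#   ^blarg<UNK>$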
return error_list

try:
    # Open the target database.
    TargetDB.OpenDatabase(targetProj, verbose=True)
except FDA_DatabaseError as e:
    error_list.append((e.message, 2))
    error_list.append(('There was an error opening target database: '+targetProj+'.', 2))
    return error_list

error_list.append(('Using: '+targetProj+' as the target database.', 0))

targetProject = ReadConfig.getConfigVal(configMap, 'TargetProject', report)
morphNames = ReadConfig.getConfigVal(configMap, 'TargetMorphNamesCountedAsRoots', report)
if not (targetProject and morphNames):
    error_list.append(('Configuration file problem.', 2))
    return error_list

# Create a path to the temporary folder + project name.
partPath = os.path.join(tempfile.gettempdir(), targetProject)

# If the target database hasn't changed since we created the root database file, don't do anything.
if not is_root_file_out_of_date(TargetDB, partPath+'_rt.dic'):
    error_list.append(('Target lexicon files are up to date.', 0))
    return error_list

# Create the dictionary files in a temp folder.
(f_pf, f_if, f_sf, f_rt, f_dec) = create_dictionary_files(partPath)
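# A sketch of the is_root_file_out_of_date helper used above, assuming it
# compares the target project's last-modified time against the root
# dictionary file's mtime. GetDateLastModified is a hypothetical accessor
# name; the real FLExDBAccess API may expose this differently.
import os

def is_root_file_out_of_date(TargetDB, rootFilePath):
    # A missing root file always counts as out of date.
    if not os.path.exists(rootFilePath):
        return True
    # Hypothetical: both times expressed as seconds since the epoch.
    dbModTime = TargetDB.GetDateLastModified()
    return dbModTime > os.path.getmtime(rootFilePath)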
report.Info('Using: '+targetProj+' as the target database.')

# Set objects for the two custom fields. Report errors if they don't exist
# in the source project.
senseEquivField = DB.LexiconGetSenseCustomFieldNamed(linkField)
senseSenseNumField = DB.LexiconGetSenseCustomFieldNamed(senseNumField)

if not senseEquivField:
    report.Error(linkField + " field doesn't exist. Please read the instructions.")
    return
if not senseSenseNumField:
    report.Error(senseNumField + " field doesn't exist. Please read the instructions.")
    return

bilingFile = ReadConfig.getConfigVal(configMap, 'BilingualDictOutputFile', report)
if not bilingFile:
    return

fullPathBilingFile = bilingFile
#fullPathBilingFile = os.path.join(tempfile.gettempdir(), bilingFile)
#f_out = open(fullPathBilingFile, 'w')
try:
    f_out = open(fullPathBilingFile, 'w')
except IOError:
    report.Error('There was a problem creating the Bilingual Dictionary Output File: '+fullPathBilingFile+'. Please check the configuration file setting.')
    return

report.Info("Outputting category information...")
f_out.write('<dictionary>\n')
def do_replacements(configMap, report):
    # See if we need to do replacements.
    # See if the config setting is there and has valid info.
    if 'BilingualDictOutputFile' not in configMap or configMap['BilingualDictOutputFile'] == '':
        return

    biling = os.path.join(tempfile.gettempdir(), configMap['BilingualDictOutputFile'])
    replFile = ReadConfig.getConfigVal(configMap, 'BilingualDictReplacementFile', report)
    if not replFile:
        return

    shutil.copy2(biling, biling+'.old')
    f_a = open(biling+'.old')
    f_b = open(biling, 'w')
    try:
        f_r = open(replFile)
    except IOError:
        report.Error('There is a problem with the Bilingual Dictionary Replacement File: '+replFile+'. Please check the configuration file setting.')
        return

    replMap = {}
    s_lines = []
    append_lines = []
    insertion_not_done = True
    do_append = False

    # First read the replacement file. Comment lines are ignored.
    # Read the additional sdef lines into a list.
    # Read the replacement lines into a map with the lemma as the key.
    for line_r in f_r:
        line_r = unicode(line_r, 'utf-8')
        g = re.search(r'lines to be appended', line_r)
        if g:
            do_append = True
        g = re.search(r'<sdef ', line_r)
        if g:
            s_lines.append(line_r)
            continue
        # Get the lemma, which is between <l> or <i> and <s...>.
        g = re.search(r'<[li]>(.+?)<s', line_r)
        if g:
            if do_append:
                append_lines.append(line_r)
            else:  # replacement lines
                replMap[g.group(1)] = line_r

    # Read through the bilingual dictionary.
    for line in f_a:
        line = unicode(line, 'utf-8')

        # If we find the first sdef line, insert the ones from the replacement file here.
        g = re.search(r'<sdef ', line)
        if insertion_not_done and g:
            insertion_not_done = False
            # Leave comments before and after the inserted lines.
            f_b.write('<!-- Inserted sdef lines from replace file -->\n')
            for sdef_line in s_lines:
                f_b.write(sdef_line.encode('utf-8'))
            f_b.write('<!-- end of insertion -->\n')

        # Get the current lemma.
        g = re.search(r'<[li]>(.+?)<s', line)
        if g:
            # When we match on the current lemma, do the replacement.
            if g.group(1) in replMap:
                # Leave a comment before the old line.
                f_b.write('<!-- This line replaced with the one below it from the file ' + replFile + ' -->\n')
                line = line.rstrip()
                # Comment out the old line.
                f_b.write('<!-- '+line.encode('utf-8')+'-->\n')
                f_b.write(replMap[g.group(1)].encode('utf-8'))
                continue

        # Find the end of the section.
        g = re.search(r'/section', line)
        if g:
            # Append the new lines now.
            f_b.write('<!-- Custom lines appended below. -->\n')
            for new_line in append_lines:
                f_b.write(new_line.encode('utf-8'))

        f_b.write(line.encode('utf-8'))
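# For illustration, a line-oriented replacement file that the parser above
# would accept might look like the following (lemmas and symbols are made
# up; any line without an <sdef> or <l>/<i> marker is effectively ignored,
# so plain-text comment lines are safe):
#
#   Extra symbol definitions:
#   <sdef n="archaic" c="archaic form"/>
#   Replacement entries, keyed by the lemma between <l> (or <i>) and <s...>:
#   <e><p><l>house1.1<s n="n"/></l><r>hogar1.1<s n="n"/></r></p></e>
#   lines to be appended
#   <e><p><l>new1.1<s n="adj"/></l><r>nuevo1.1<s n="adj"/></r></p></e>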
def readLexicalInfo(self):
    configMap = ReadConfig.readConfig(self.report)
    morphNames = ReadConfig.getConfigVal(configMap, 'TargetMorphNamesCountedAsRoots', self.report)
    if not morphNames:
        self.report.Warning('Configuration File Problem. Morphnames not found.')
        return

    # Loop through all the entries.
    for i, e in enumerate(self.db.LexiconAllEntries()):
        morphType = ITsString(e.LexemeFormOA.MorphTypeRA.Name.BestAnalysisAlternative).Text

        # If there are no senses, skip the entry.
        if e.SensesOS.Count == 0:
            continue
        else:  # entry with senses
            # Loop through the senses.
            for i, mySense in enumerate(e.SensesOS):
                gloss = ITsString(mySense.Gloss.BestAnalysisAlternative).Text

                # Process roots.
                # Don't process clitics in this block.
                if e.LexemeFormOA and \
                   e.LexemeFormOA.ClassName == 'MoStemAllomorph' and \
                   e.LexemeFormOA.MorphTypeRA and morphType in morphNames:

                    # Set the headword value and the homograph #, if necessary.
                    headWord = ITsString(e.HeadWord).Text
                    headWord = Utils.add_one(headWord)

                    # Only take word senses that have a grammatical category set.
                    if mySense.MorphoSyntaxAnalysisRA.ClassName == 'MoStemMsa':
                        if mySense.MorphoSyntaxAnalysisRA.PartOfSpeechRA:
                            # Build the word sense and add it to the map.
                            wordSense = headWord+'.'+str(i+1)
                            wordSense = re.sub(' ', '_', wordSense)  # change spaces to underscores
                            self.mapWordSenses[wordSense] = 7  # dummy value

                # Now process non-roots.
                else:
                    if gloss is None:
                        continue
                    elif e.LexemeFormOA is None:
                        continue
                    elif e.LexemeFormOA.MorphTypeRA is None:
                        continue
                    elif e.LexemeFormOA.ClassName != 'MoStemAllomorph':
                        if e.LexemeFormOA.ClassName == 'MoAffixAllomorph':
                            gloss = re.sub(r'\.', '_', gloss)
                            self.__saveAffixGloss(gloss)
                        else:
                            continue  # err_list.append(('Skipping entry since the lexeme is of type: '+e.LexemeFormOA.ClassName, 1, TargetDB.BuildGotoURL(e)))
                    elif morphType not in morphNames:
                        if morphType == 'proclitic' or morphType == 'enclitic':
                            gloss = re.sub(r'\.', '_', gloss)
                            self.__saveAffixGloss(gloss)
                        else:
                            continue  # err_list.append(('Skipping entry because the morph type is: ' + morphType, 1, TargetDB.BuildGotoURL(e)))
def do_replacements(configMap, report, fullPathBilingFile):
    # See if we need to do replacements.
    # See if the config setting is there and has valid info.
    if 'BilingualDictOutputFile' not in configMap or configMap['BilingualDictOutputFile'] == '':
        return

    #biling = os.path.join(tempfile.gettempdir(), configMap['BilingualDictOutputFile'])
    replFile = ReadConfig.getConfigVal(configMap, 'BilingualDictReplacementFile', report)
    if not replFile:
        return

    # Save a copy of the bilingual dictionary.
    shutil.copy2(fullPathBilingFile, fullPathBilingFile+'.old')

    # Parse the replacement file as XML.
    try:
        replEtree = ET.parse(replFile)
    except IOError:
        report.Error('There is a problem with the Bilingual Dictionary Replacement File: '+replFile+'. Please check the configuration file setting.')
        return

    replMap = {}
    replRoot = replEtree.getroot()

    ## Put the replacement entries into a map.
    # Get the replacement entries section.
    repl_sec = replRoot.find(".//*[@id='replacement']")

    # Loop through the entries in this section.
    for entry in repl_sec:
        # Get the <l> text, which is under the <p>, which is under the <e>.
        left = entry.find('p/l')
        replMap[left.text] = entry

    # Read in the bilingual XML file.
    try:
        bilingEtree = ET.parse(fullPathBilingFile)
    except IOError:
        report.Error('There is a problem reading the Bilingual Dictionary File: '+fullPathBilingFile+'.')
        return

    ## Add in new symbol definitions from the replacement file.
    bilingRoot = bilingEtree.getroot()

    # Get the symbol definitions element (sdefs).
    bilingSdefs = bilingRoot.find('sdefs')
    replSdefs = replRoot.find('sdefs')

    # Create a map of all the symbol abbreviations in the bilingual dictionary.
    sdfMap = {}
    for mySdef in bilingSdefs:
        sdfMap[mySdef.attrib['n']] = 1

    # Add a comment before the new sdefs get added.
    comment = ET.Comment('Inserted symbol definitions from replacement file')
    bilingSdefs.append(comment)

    # Loop through the replacement sdefs.
    for symbol_def in replSdefs:
        # If the symbol abbreviation doesn't already exist, add it.
        if symbol_def.attrib['n'] not in sdfMap:
            # Add the sdef element from the replacement file to the end of the bilingual sdefs list.
            bilingSdefs.append(symbol_def)

    ## Find entries that match replacement entries, comment out the old and insert the new.
    # Get the section element.
    biling_section = bilingRoot.find('section')

    # Create a new section element to replace the old one.
    new_biling_section = ET.Element('section')
    new_biling_section.attrib = biling_section.attrib

    # Loop through all the bilingual entries.
    for entry in biling_section:
        # Get the left lemma text.
        left = entry.find('p/l')

        # If we can't find it, use the identity text. An <e> should have either
        # <l> (and <r>) or <i>.
        if left is None:
            left = entry.find('i')

        # See if we have a match for replacing the entry.
        if left.text in replMap:
            # Create a comment containing the old entry plus a note, and insert
            # them into the entry list.
            comment1 = ET.Comment('This entry was replaced with the one below it from the file ' + replFile + '.\n')

            # Create a string with the old contents of the entry. Using
            # tostring() didn't work, because &#... escapes come out for
            # non-ASCII text.
            if left.tag == 'i':
                s = 'identity: ' + left.text + ' (' + left.find('s').attrib['n'] + ')'
            else:
                s = 'left: ' + left.text + ' (' + left.find('s').attrib['n'] + ')'
                s += ', right: ' + entry.find('p/r').text + ' (' + entry.find('p/r/s').attrib['n'] + ')'
            comment2 = ET.Comment(s+'\n')

            new_biling_section.append(comment1)
            new_biling_section.append(comment2)

            # Insert the new entry from the replacement file map.
            new_biling_section.append(replMap[left.text])
        else:
            # Copy the old entry to the new section.
            new_biling_section.append(entry)

    ## Add the entries from the replacement file marked as 'append'.
    # Get the append entries section.
    append_sec = replRoot.find(".//*[@id='append']")

    # Make a comment and add it.
    comment = ET.Comment('Custom entries appended below from the file ' + replFile + '.\n')
    new_biling_section.append(comment)

    # Loop through these entries.
    for entry in append_sec:
        # Add them to the list of bilingual entries.
        new_biling_section.append(entry)

    # Remove the old entries list and add the new one.
    bilingRoot.remove(biling_section)
    bilingRoot.append(new_biling_section)

    bilingEtree.write(fullPathBilingFile, 'utf-8', True)

    # Insert the DOCTYPE as the 2nd line of the file.
    f = open(fullPathBilingFile, "r")
    contents = f.readlines()
    f.close()
    contents.insert(1, '<!DOCTYPE dictionary PUBLIC "-//XMLmind//DTD dictionary//EN" "dix.dtd">\n')
    f = open(fullPathBilingFile, 'w')
    contents = "".join(contents)
    f.write(contents)
    f.close()
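# For illustration, a minimal XML replacement file matching what the code
# above expects: an sdefs element plus two sections whose id attributes are
# 'replacement' and 'append'. The root tag and entry contents are made-up
# samples; only the structure queried by the code is significant.
REPLACEMENT_FILE_EXAMPLE = """<?xml version="1.0" encoding="utf-8"?>
<dictionary>
  <sdefs>
    <sdef n="archaic" c="archaic form"/>
  </sdefs>
  <section id="replacement" type="standard">
    <e><p><l>house1.1<s n="n"/></l><r>hogar1.1<s n="n"/></r></p></e>
  </section>
  <section id="append" type="standard">
    <e><p><l>new1.1<s n="adj"/></l><r>nuevo1.1<s n="adj"/></r></p></e>
  </section>
</dictionary>
"""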