def MainFunction(DB, report, modifyAllowed):
    """Validate the target-side configuration and open the target project.

    Reads the configuration through ReadConfig, checks that the required
    target settings are present, checks that the two optional complex-form
    settings are lists when they are given, and then opens the database
    named by the 'TargetProject' setting.  Returns None on every path;
    problems are reported through the report object.
    """
    # Read the configuration file which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    def setting(key):
        # Look up one configuration value, reporting problems via report.
        return ReadConfig.getConfigVal(configMap, key, report)

    first_elem_key = 'TargetComplexFormsWithInflectionOn1stElement'
    second_elem_key = 'TargetComplexFormsWithInflectionOn2ndElement'

    # The settings are read in a fixed order so that any messages produced
    # while reading them come out in that same order.
    ana_file = setting('TargetOutputANAFile')
    prefix_file = setting('TargetPrefixGlossListFile')
    forms_1st = setting(first_elem_key)
    forms_2nd = setting(second_elem_key)
    transfer_results = setting('TargetTranferResultsFile')
    sentence_punct = setting('SentencePunctuation')

    # All four of these are mandatory.
    if not all((ana_file, prefix_file, transfer_results, sentence_punct)):
        return

    # The complex forms settings are optional, but when present they must
    # be lists.
    for key, value in ((first_elem_key, forms_1st),
                       (second_elem_key, forms_2nd)):
        if value and not ReadConfig.configValIsList(configMap, key, report):
            return

    TargetDB = FLExDBAccess()

    try:
        # Open the target database
        project_name = setting('TargetProject')
        if not project_name:
            return
        TargetDB.OpenDatabase(project_name, verbose=True)
    except FDA_DatabaseError as e:
        report.Error(e.message)
        print("FDO Cache Create failed!")
        print(e.message)
        return
def MainFunction(DB, report, modifyAllowed):
    """Export the analyzed source text named in the configuration file.

    Reads the configuration through ReadConfig, opens the file given by
    'AnalyzedTextOutputFile', finds the text (or scripture section) whose
    title equals 'SourceTextName', asks Utils.get_interlin_data for the
    output strings and writes them, UTF-8 encoded, to the output file.

    Returns None on every path; problems are reported through report.
    The output file is closed on every path once it has been opened.
    """
    # Read the configuration file which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # The configured value is used as the output path as-is.
    outFileVal = ReadConfig.getConfigVal(configMap, 'AnalyzedTextOutputFile', report)
    if not outFileVal:
        return

    fullPathTextOutputFile = outFileVal
    try:
        f_out = open(fullPathTextOutputFile, 'w')
    except IOError:
        report.Error('There is a problem with the Analyzed Text Output File path: '+fullPathTextOutputFile+'. Please check the configuration file setting.')
        return

    # From here on the file is open; the finally clause guarantees it is
    # closed even when one of the validation steps below returns early.
    try:
        # Find the desired text
        text_desired_eng = ReadConfig.getConfigVal(configMap, 'SourceTextName', report)
        if not text_desired_eng:
            return

        foundText = False
        contents = None
        for text in DB.ObjectsIn(ITextRepository):
            if text_desired_eng == ITsString(text.Name.BestAnalysisAlternative).Text:
                foundText = True
                contents = text.ContentsOA
                break

        if not foundText:
            # check if it's scripture text
            for section in DB.ObjectsIn(IScrSectionRepository):
                if text_desired_eng == ITsString(section.ContentOA.Title.BestAnalysisAlternative).Text:
                    contents = section.ContentOA
                    foundText = True
                    break

            # Pattern not found
            if not foundText:
                report.Error('The text named: '+text_desired_eng+' not found.')
                return

        # Get punctuation string.  Check for a missing value BEFORE decoding;
        # decoding a missing (None) value would raise instead of returning.
        sent_punct = ReadConfig.getConfigVal(configMap, 'SentencePunctuation', report)
        if not sent_punct:
            return
        if isinstance(sent_punct, bytes):
            sent_punct = sent_punct.decode('utf-8')

        # Process the text
        report.Info("Exporting analyses...")

        typesList = ReadConfig.getConfigVal(configMap, 'SourceComplexTypes', report)
        if not typesList:
            typesList = []
        elif not ReadConfig.configValIsList(configMap, 'SourceComplexTypes', report):
            return

        getSurfaceForm = False
        outputStrList = Utils.get_interlin_data(DB, report, sent_punct, contents, typesList, getSurfaceForm)

        report.Info("Export of " + text_desired_eng + " complete.")

        # Write out all the words
        for outStr in outputStrList:
            # Split compound words
            outStr = Utils.split_compounds(outStr)
            f_out.write(outStr.encode('utf-8'))
    finally:
        f_out.close()
def _msa_tags(msa, inflFeatAbbrevs, report):
    """Return the '<...>' tag string for a stem's morphosyntactic analysis.

    Tags are emitted in this order: part of speech abbreviation, inflection
    class abbreviation, features of the stem/root, then the features
    collected from irregularly inflected variant forms (inflFeatAbbrevs, a
    list of (group name, abbreviation) pairs).  A warning is reported when
    the part of speech is missing.
    """
    tags = ''

    # Get the POS
    if msa.PartOfSpeechRA:
        tags += '<' + ITsString(msa.PartOfSpeechRA.Abbreviation.BestAnalysisAlternative).Text + '>'
    else:
        report.Warning("PartOfSpeech object is null.")

    # Get inflection class abbreviation
    if msa.InflectionClassRA:
        tags += '<' + ITsString(msa.InflectionClassRA.Abbreviation.BestAnalysisAlternative).Text + '>'

    # Get any features the stem or root might have
    if msa.MsFeaturesOA:
        feat_abbr_list = []
        # The features might be complex, make a recursive function call to find all features
        get_feat_abbr_list(msa.MsFeaturesOA.FeatureSpecsOC, feat_abbr_list)

        # This sort will keep the groups in order e.g. 'gender' features will come before 'number' features
        for grpName, abb in sorted(feat_abbr_list, key=lambda x: x[0]):
            tags += '<' + abb + '>'

    # Get any features that come from irregularly inflected forms
    # This sort will keep the groups in order e.g. 'gender' features will come before 'number' features
    for grpName, abb in sorted(inflFeatAbbrevs, key=lambda x: x[0]):
        tags += '<' + abb + '>'

    return tags


def _extract_interlinear_strings(DB, report, contents, sent_punct, typesList):
    """Walk the analyses of a text and build the list of output strings.

    contents is the text object handed to StTextAnnotationNavigator,
    sent_punct the sentence punctuation characters and typesList the names
    of the complex form types whose components are merged into one lexical
    unit.  Returns (outputStrList, count) where count is the number of
    analysis occurrences in the text.
    """
    prev_pv_list = []
    prev_e = None
    outputStrList = []
    ccc = 0  # current_complex_component
    # Initialized up front: for a complex form with more than two components
    # the tags are appended to before the component-count branch resets them.
    savedTags = ''
    saved1stbaselineWord = ''
    prevEndOffset = 0
    sent_punct_chars = set(sent_punct)

    # count analysis objects
    ss = SegmentServices.StTextAnnotationNavigator(contents)
    obj_count = sum(1 for _ in ss.GetAnalysisOccurrencesAdvancingInStText())
    report.ProgressStart(obj_count)

    ss = SegmentServices.StTextAnnotationNavigator(contents)
    for prog_cnt, analysisOccurance in enumerate(ss.GetAnalysisOccurrencesAdvancingInStText()):
        report.ProgressUpdate(prog_cnt)
        outStr = affixStr = ''

        # Reproduce the spacing between this occurrence and the previous one.
        if prevEndOffset > 0:
            numSpaces = analysisOccurance.GetMyBeginOffsetInPara() - prevEndOffset
            if numSpaces > 0:
                outputStrList.append(' ' * numSpaces)
            elif numSpaces < 0:  # new paragraph
                outputStrList.append('\n')

        prevEndOffset = analysisOccurance.GetMyEndOffsetInPara()

        className = analysisOccurance.Analysis.ClassName

        if className == "PunctuationForm":
            text_punct = ITsString(analysisOccurance.Analysis.Form).Text

            # See if one or more symbols is part of the user-defined sentence punctuation. If so output the
            # punctuation as part of a data stream along with the symbol/tag <sent>
            if sent_punct_chars.intersection(text_punct):
                outStr = "^" + text_punct + "<sent>$"
            # If not, assume this is non-sentence punctuation and just output the punctuation without a "symbol" e.g. <xxx>
            else:
                outStr = text_punct

            outputStrList.append(outStr)
            continue

        if className == "WfiGloss":
            wfiAnalysis = analysisOccurance.Analysis.Analysis  # Same as Owner
        elif className == "WfiAnalysis":
            wfiAnalysis = analysisOccurance.Analysis
        # We get into this block if there are no analyses for the word or a analysis suggestion hasn't been accepted.
        elif className == "WfiWordform":
            outStr = ITsString(analysisOccurance.Analysis.Form.BestVernacularAlternative).Text
            report.Warning('No analysis found for the word: '+ outStr + ' Treating this is an unknown word.')
            outStr += '<UNK>'
            outputStrList.append('^' + outStr + '$')
            continue
        else:
            # There is no analysis object to walk for any other class, so
            # output the baseline text as an unknown word.
            outStr = ITsString(analysisOccurance.BaselineText).Text
            report.Warning('Unrecognized analysis type for the word: '+ outStr + ' Treating this is an unknown word.')
            outStr += '<UNK>'
            outputStrList.append('^' + outStr + '$')
            continue

        # Go through each morpheme in the word (i.e. bundle)
        for bundle in wfiAnalysis.MorphBundlesOS:
            if not bundle.SenseRA:
                # Part of the word has not been tied to a lexical entry-sense
                outStr = ITsString(wfiAnalysis.Owner.Form.BestVernacularAlternative).Text
                report.Warning('No sense found for some part of the word: '+ outStr + ' Treating this is an unknown word.')
                outStr += '<UNK>'
                break  # go on to the next word

            if not bundle.MsaRA:
                outStr = ITsString(wfiAnalysis.Owner.Form.BestVernacularAlternative).Text
                report.Warning('No morphosyntactic analysis found for some part of the word: '+ outStr + ' Treating this is an unknown word.')
                outStr += '<UNK>'
                break  # go on to the next word

            # We have an affix
            if bundle.MsaRA.ClassName != 'MoStemMsa':
                # Get the affix gloss. Substitute periods with underscores to make it easier in Apertium.
                affixStr += '<' + re.sub(r'\.', r'_', ITsString(bundle.SenseRA.Gloss.BestAnalysisAlternative).Text) + '>'
                continue

            # For a stem we just want the headword and it's POS

            # Check for valid POS
            if not bundle.MsaRA.PartOfSpeechRA:
                outStr = ITsString(wfiAnalysis.Owner.Form.BestVernacularAlternative).Text
                msg = 'No POS found for the word: '+ outStr + ' Treating this is an unknown word.'
                if bundle.MorphRA:
                    report.Warning(msg, DB.BuildGotoURL(bundle.MorphRA.Owner))
                else:
                    report.Warning(msg)
                outStr += '<UNK>'
                break

            # The morph must be checked before its owner is dereferenced.
            if not bundle.MorphRA:
                report.Warning("Morph object is null.")
                continue

            # Get the LexEntry object
            e = bundle.MorphRA.Owner

            # Go from variant(s) to entry/variant that has a sense
            # We are only dealing with senses, so we have to get to one.
            # Along the way collect inflection features associated with
            # irregularly inflected variant forms so they can be outputted
            inflFeatAbbrevs = []
            e = GetEntryWithSense(e, inflFeatAbbrevs)

            # See if we have an enclitic or proclitic
            if ITsString(e.LexemeFormOA.MorphTypeRA.Name.BestAnalysisAlternative).Text in ('proclitic', 'enclitic'):
                # Get the clitic gloss. Substitute periods with underscores to make it easier in Apertium.
                affixStr += '<' + re.sub(r'\.', r'_', ITsString(bundle.SenseRA.Gloss.BestAnalysisAlternative).Text) + '>'
                continue

            # Otherwise we have a root or stem or phrase
            pv_list = []
            shared_complex_e = None

            # Check for adjacent words that point to the same complex form
            # If the form is a phrasal verb use it as the headword to output
            if e.ComplexFormEntries.Count > 0:
                # each word could be part of multiple complex forms (e.g. ra -> char ra, ra raftan
                for complex_e in e.ComplexFormEntries:
                    if not complex_e.EntryRefsOS:
                        continue
                    # find the complex entry ref (there could be one or more variant entry refs listed along side the complex entry)
                    for entryRef in complex_e.EntryRefsOS:
                        if entryRef.RefType != 1:  # 1=complex form, 0=variant
                            continue
                        if not entryRef.ComplexEntryTypesRS:
                            continue
                        # there could be multiple types assigned to a complex form (e.g. Phrasal Verb, Derivative)
                        # just see if one of them is Phrasal Verb
                        for complexType in entryRef.ComplexEntryTypesRS:
                            if ITsString(complexType.Name.BestAnalysisAlternative).Text in typesList:
                                pos_in_list = get_position_in_component_list(e, complex_e)
                                # The entry we are on has to be at the right postion in the complex form's component list
                                if pos_in_list == ccc:
                                    pv_list.append(complex_e)
                                    break

                # See if we ended up with any phrasal verbs
                if len(pv_list) == 0:  # no phrasal verbs
                    prev_pv_list = []
                    ccc = 0
                else:  # yes, we have phrasal verbs
                    if ccc == 0:
                        saved1stbaselineWord = ITsString(analysisOccurance.BaselineText).Text
                    ccc += 1

                    # First make sure that the entry of the last word isn't the same as this word. In that case, of course
                    # there are going to be shared complex forms, but we are only interested in different entries forming
                    # a phrasal verb.
                    # See if the previous word had a link to a complex phrasal verb
                    if prev_e != e and len(prev_pv_list) > 0:
                        found = False
                        # See if there is a match between something on the list for the
                        # previous word and this word.
                        for prev_candidate in prev_pv_list:
                            for candidate in pv_list:
                                if prev_candidate.Guid == candidate.Guid:
                                    shared_complex_e = candidate
                                    found = True
                                    break
                            if found:
                                break

                        # If we found a match, we remove the previous word from the output and use the complex form
                        if found:
                            component_count = get_component_count(shared_complex_e)
                            if ccc == component_count:
                                ccc = 0
                                savedTags = ''
                                pv_list = []

                            # remove n/adj/... and it's tag from being output
                            saveStr = outputStrList.pop()
                            # first pop may have just popped punctuation of spacing
                            if len(outputStrList) > 0:
                                saveStr = outputStrList.pop()

                            # The first component(s) could have tags (from affixes or inflection info.)
                            # Save these tags so they can be put on the end of the complex form.
                            # This kind of assumes that inflection isn't happening on multiple components
                            # because that might give a mess when it's all duplicated on the complex form.
                            g = re.search(r'.+?<\w+>(<.+>)', saveStr)
                            if g:
                                savedTags += g.group(1)

                    prev_pv_list = copy.copy(pv_list)
                    prev_e = e
            else:
                ccc = 0

            if shared_complex_e:
                if shared_complex_e.SensesOS:
                    # Get headword and set homograph # if necessary
                    headWord = ITsString(shared_complex_e.HeadWord).Text
                    headWord = Utils.do_capitalization(headWord, saved1stbaselineWord)
                    headWord = Utils.add_one(headWord)

                    # require only one sense for a complex form, so the sense number is always 1
                    outStr += headWord + '.1'

                    senseOne = shared_complex_e.SensesOS.ToArray()[0]
                    outStr += _msa_tags(senseOne.MorphoSyntaxAnalysisRA, inflFeatAbbrevs, report)

                    # Add the saved tags from a previous complex form component
                    outStr += savedTags
                else:
                    report.Warning("No senses found for the complex form.")
            else:
                # Go through each sense and identify which sense number we have
                senseNum = 0
                for i, mySense in enumerate(e.SensesOS):
                    if mySense.Guid == bundle.SenseRA.Guid:
                        senseNum = i
                        break
                else:
                    report.Warning("Couldn't find the sense for headword: "+ITsString(e.HeadWord).Text)

                # Get headword and set homograph # if necessary
                headWord = ITsString(e.HeadWord).Text
                headWord = Utils.do_capitalization(headWord, ITsString(analysisOccurance.BaselineText).Text)
                headWord = Utils.add_one(headWord)
                outStr += headWord + '.' + str(senseNum + 1)

                outStr += _msa_tags(bundle.MsaRA, inflFeatAbbrevs, report)

        outStr += affixStr
        outputStrList.append('^' + outStr + '$')

    return outputStrList, obj_count


def MainFunction(DB, report, modifyAllowed):
    """Export the analyses of the configured source text to a file.

    Reads the configuration through ReadConfig, opens the file given by
    'AnalyzedTextOutputFile', finds the text whose name equals
    'SourceTextName', converts each analysis occurrence of that text into
    an output string and writes the strings, UTF-8 encoded, to the file.

    Returns None on every path; problems are reported through report.
    The output file is closed on every path once it has been opened.
    """
    # Read the configuration file which we assume is in the current directory.
    configMap = ReadConfig.readConfig(report)
    if not configMap:
        return

    # The configured value is used as the output path as-is.
    outFileVal = ReadConfig.getConfigVal(configMap, 'AnalyzedTextOutputFile', report)
    if not outFileVal:
        return

    fullPathTextOutputFile = outFileVal
    try:
        f_out = open(fullPathTextOutputFile, 'w')
    except IOError:
        report.Error('There is a problem with the Analyzed Text Output File path: '+fullPathTextOutputFile+'. Please check the configuration file setting.')
        return

    # From here on the file is open; the finally clause guarantees it is
    # closed even when one of the validation steps below returns early.
    try:
        # Find the desired text
        text_desired_eng = ReadConfig.getConfigVal(configMap, 'SourceTextName', report)
        if not text_desired_eng:
            return

        foundText = False
        contents = None
        for text in DB.ObjectsIn(ITextRepository):
            if text_desired_eng == ITsString(text.Name.BestAnalysisAlternative).Text:
                foundText = True
                contents = text.ContentsOA
                break

        if not foundText:
            report.Error('The text named: '+text_desired_eng+' not found.')
            return

        # Get punctuation string
        sent_punct = ReadConfig.getConfigVal(configMap, 'SentencePunctuation', report)
        if not sent_punct:
            return
        # The punctuation read from the text is unicode, so compare against
        # decoded characters rather than raw UTF-8 bytes.
        if isinstance(sent_punct, bytes):
            sent_punct = sent_punct.decode('utf-8')

        # Process the text
        report.Info("Exporting analyses...")

        typesList = ReadConfig.getConfigVal(configMap, 'SourceComplexTypes', report)
        if not typesList:
            typesList = []
        elif not ReadConfig.configValIsList(configMap, 'SourceComplexTypes', report):
            return

        outputStrList, obj_count = _extract_interlinear_strings(DB, report, contents, sent_punct, typesList)

        # Write out all the words
        for outStr in outputStrList:
            # Split compound words
            outStr = split_compounds(outStr)
            f_out.write(outStr.encode('utf-8'))
    finally:
        f_out.close()

    report.Info('Export of '+str(obj_count)+' analyses complete to the file: '+fullPathTextOutputFile+'.')