def add_morph(original, new):
    """
    Adds the original morphological analysis to the replacing words.

    >>> add_morph([(u'les', u'DET'), (u'petites', u'JJ'), (u'filles', u'NN')],\\
    ... [(u'les', u'DET'), (u'mignon', u'JJ'), (u'chat', u'NN')])
    ([u'^le<det><def><mf><pl>$', u'^mignon<adj><f><pl>$', \
    u'^chat<n><m><pl>$'], \
    [(u'mignon', u'JJ'), (u'chat', u'NN')])

    Both word lists are joined into space-separated strings and run through
    ``morpha_fr.analyse_morphologically``. The two analyses are then walked
    in parallel and ``find_correct_analysis`` picks the analysis to emit for
    each position.

    Two empty lists are returned when ``original`` is empty, when
    ``original`` and ``new`` differ in length, or when the analyser yields a
    different number of analyses for the two sentences.

    :param original: list of the original words with their POS tags as tuples
    :param new: tuple list of the words of the new poem with their POS tags
    :return: a tuple with a list of the lemmas of the new poem with the
        morphological rules for the word form generation
        ('^word<tag1><tag2>...$'), and a tuple list of the words used in the
        replacement with their POS tags
    """
    # Positions whose original analysis mentions one of these verbs are never
    # replaced. Defined once here instead of on every loop iteration.
    # NOTE(review): the check below is a plain substring test, so an analysis
    # containing e.g. u'savoir' also matches u'avoir' -- confirm this is
    # intended.
    block_list = (
        u'avoir',    # have
        u'être',     # be
        u'aller',    # go
        u'venir',    # come
        u'faire',    # do
        u'vouloir',  # want
    )

    analysis = []
    new_words = []

    # Nothing to align: skip the analyser call entirely.
    if not original or len(original) != len(new):
        return (analysis, new_words)

    # Each word is followed by a space (including the last one), exactly as
    # the analyser has always been fed.
    string_orig = ''.join(pair[0] + ' ' for pair in original)
    string_new = ''.join(pair[0] + ' ' for pair in new)

    morph_orig = morpha_fr.analyse_morphologically(string_orig, output='list')
    morph_new = morpha_fr.analyse_morphologically(string_new, output='list')

    if len(morph_orig) != len(morph_new):
        return (analysis, new_words)

    # NOTE(review): original[i] / new[i] assume the analyser produced at most
    # one analysis per input word; more analyses than words raises IndexError.
    for i, (orig_analysis, new_analysis) in enumerate(zip(morph_orig,
                                                          morph_new)):
        pos = original[i][1]
        lowered = orig_analysis.lower()

        if any(verb in lowered for verb in block_list):
            # Keep only the text between '^' and the first '/' of the
            # original analysis (presumably the surface form -- confirm).
            # The pattern is spelled with an escaped backslash rather than
            # the ``ur`` prefix so that it parses on Python 2 and Python 3;
            # its value is unchanged.
            new_word = re.sub(u'\\^(.*?)/.*', r'\g<1>', orig_analysis)
        elif orig_analysis != new_analysis:
            new_word, is_new = find_correct_analysis(new_analysis,
                                                     orig_analysis, pos)
            if is_new:
                new_words.append(new[i])
        else:
            # Identical analyses: resolve the original against itself.
            new_word, _ = find_correct_analysis(orig_analysis,
                                                orig_analysis, pos)

        analysis.append(new_word)

    return (analysis, new_words)
def tag_with_apertium(text, tagger='pos_tag_melt'):
    """
    Uses Apertium to tokenize the words and tagger to tag the tokens.

    >>> tag_with_apertium(u'ses beaux cheveux')
    [u'ses_DET', u'beaux_JJ', u'cheveux_NN']

    The text is analysed with ``morpha_fr.analyse_morphologically`` and the
    resulting tokens are joined with ``' $ '`` separators before tagging, so
    that the tagged output can be split back on ``'$'`` afterwards.

    :param text: string of words
    :param tagger: name of the function in ``tag_fr`` to use for POS-tagging;
        ``pos_tag_stanford`` is used instead if that function raises OSError
    :return: list of tagged words with an underscore separating the word and
        the tag
    """
    analysed = morpha_fr.analyse_morphologically(text, output='list')

    # Keep only the text between '^' and the first '/' of every analysis and
    # follow each token with a ' $ ' separator. The pattern is spelled with an
    # escaped backslash rather than the ``ur`` prefix so that it parses on
    # Python 2 and Python 3; its value is unchanged.
    string = ''.join(
        re.sub(u'\\^(.*?)/.*', r'\g<1>', a) + ' $ ' for a in analysed
    )

    # Use the function given in the tagger variable
    tag = getattr(tag_fr, tagger)
    try:
        tagged = tag(string)
    # Use the Stanford tagger if MElt is not installed.
    except OSError:
        tagged = pos_tag_stanford(string)

    # The separators get tagged as well ('$_TAG '); drop that tag so the
    # output can be split on the bare '$'.
    tagged = re.sub(u'\\$_.*? ', '$ ', tagged)

    # Strip surrounding whitespace and discard the empty pieces left around
    # the separators.
    stripped = (piece.strip() for piece in tagged.split('$'))
    return [piece for piece in stripped if piece != '']