Ejemplo n.º 1
0
def add_morph(original, new):
    """
    Transfer the morphology of the original words onto the replacing words.

    >>> add_morph([(u'les', u'DET'), (u'petites', u'JJ'), (u'filles', u'NN')],\\
    ... [(u'les', u'DET'), (u'mignon', u'JJ'), (u'chat', u'NN')])
    ([u'^le<det><def><mf><pl>$', u'^mignon<adj><f><pl>$', \
u'^chat<n><m><pl>$'], \
[(u'mignon', u'JJ'), (u'chat', u'NN')])

    Both word lists are analysed with ``morpha_fr`` and the analyses are
    compared position by position:

    * if the original analysis contains one of the blocked verb lemmas,
      the original word is kept and only its surface form is emitted;
    * if the two analyses differ, ``find_correct_analysis`` picks the
      entry for the new word, and the new ``(word, POS)`` tuple is
      recorded when that helper reports it as new;
    * if the two analyses are equal, ``find_correct_analysis`` is run on
      the original analysis alone.

    :param original: list of the original words with their POS tags as
                     ``(word, tag)`` tuples
    :param new: tuple list of the words of the new poem with their POS
                tags, same length as ``original``
    :return: a tuple ``(analysis, new_words)`` where ``analysis`` is the
             list of entries used for word form generation
             ('^word<tag1><tag2>...$' in the example above) and
             ``new_words`` is the tuple list of the words actually used
             in the replacement with their POS tags. Both lists are
             empty when ``original`` and ``new`` differ in length or
             when the analyser returns a different number of analyses
             for the two strings.
    """
    analysis = []
    new_words = []

    # Positions must correspond one to one; otherwise give up early.
    if len(original) != len(new):
        return (analysis, new_words)

    # The analyser receives every word followed by a single space
    # (including the last one), exactly as before.
    string_orig = ''.join(item[0] + ' ' for item in original)
    string_new = ''.join(item[0] + ' ' for item in new)
    morph_orig = morpha_fr.analyse_morphologically(string_orig,
                                                   output='list')
    morph_new = morpha_fr.analyse_morphologically(string_new,
                                                  output='list')

    if len(morph_orig) != len(morph_new):
        return (analysis, new_words)

    # Lemmas of verbs whose original form must never be replaced.
    # NOTE(review): matching is by substring on the whole analysis, so
    # an analysis that merely contains these letters (e.g. "affaire"
    # contains "faire") is blocked too -- confirm this is intended.
    block_list = (u'avoir',    # have
                  u'être',     # be
                  u'aller',    # go
                  u'venir',    # come
                  u'faire',    # do
                  u'vouloir')  # want

    # NOTE(review): ``original`` and ``new`` are indexed by analysis
    # position; this presumes the analyser returns one analysis per
    # input word -- verify against morpha_fr.
    for i, (orig_entry, new_entry) in enumerate(zip(morph_orig, morph_new)):
        pos = original[i][1]
        lowered = orig_entry.lower()
        if any(lemma in lowered for lemma in block_list):
            # Keep the original word: reduce the analysis to the text
            # between '^' and the first '/'. A u'' literal with doubled
            # backslashes is used because the ur'' prefix is a syntax
            # error on Python 3; the replacement is a raw string so
            # that \g<1> is not read as a string escape.
            new_word = re.sub(u'\\^(.*?)/.*', r'\g<1>', orig_entry)
        elif orig_entry != new_entry:
            new_word, is_new = find_correct_analysis(new_entry,
                                                     orig_entry,
                                                     pos)
            if is_new:
                new_words.append(new[i])
        else:
            # Same analysis on both sides: nothing is recorded as new.
            new_word, is_new = find_correct_analysis(orig_entry,
                                                     orig_entry,
                                                     pos)
        analysis.append(new_word)
    return (analysis, new_words)
Ejemplo n.º 2
0
def tag_with_apertium(text, tagger='pos_tag_melt'):
    """
    Uses Apertium to tokenize the words and tagger to tag the tokens.

    >>> tag_with_apertium(u'ses beaux cheveux')
    [u'ses_DET', u'beaux_JJ', u'cheveux_NN']

    The text is analysed with ``morpha_fr``; each analysis is reduced to
    the text between its ``^`` marker and the first ``/``, and the
    results are handed to the tagger as one string in which every token
    is followed by a ``$`` separator. After tagging, the tag attached to
    each separator is removed and the string is split on ``$``.

    :param text: string of words
    :param tagger: name of the function in ``tag_fr`` to use for
                   POS-tagging (looked up with ``getattr``)
    :return: list of tagged words with an underscore separating
             the word and the tag
    """
    analysed = morpha_fr.analyse_morphologically(text, output='list')
    # u'' literals with doubled backslashes replace the former ur''
    # literals, which are a syntax error on Python 3; the replacement is
    # a raw string so that \g<1> is not read as a string escape.
    tokens = (re.sub(u'\\^(.*?)/.*', r'\g<1>', entry) for entry in analysed)
    # ' $ ' after every token marks the token boundaries for the split
    # performed once the tagger has run.
    string = ''.join(token + ' $ ' for token in tokens)
    # Use the function given in the tagger variable
    tag = getattr(tag_fr, tagger)
    try:
        tagged = tag(string)
    # Use the Stanford tagger if MElt is not installed.
    except OSError:
        tagged = pos_tag_stanford(string)
    # The separators get tagged as well ('$_TAG '); drop those tags.
    # NOTE(review): the pattern requires a space after the tag, so this
    # presumes the tagger output keeps one after the last separator --
    # confirm against the taggers.
    tagged = re.sub(u'\\$_.*? ', '$ ', tagged)
    pieces = (piece.strip() for piece in tagged.split('$'))
    # Discard the empty pieces left around the separators.
    return [piece for piece in pieces if piece]
Ejemplo n.º 3
0
def tag_with_apertium(text, tagger='pos_tag_melt'):
    """
    Uses Apertium to tokenize the words and tagger to tag the tokens.

    >>> tag_with_apertium(u'ses beaux cheveux')
    [u'ses_DET', u'beaux_JJ', u'cheveux_NN']

    The text is analysed with ``morpha_fr``; each analysis is reduced to
    the text between its ``^`` marker and the first ``/``, and the
    results are handed to the tagger as one string in which every token
    is followed by a ``$`` separator. After tagging, the tag attached to
    each separator is removed and the string is split on ``$``.

    :param text: string of words
    :param tagger: name of the function in ``tag_fr`` to use for
                   POS-tagging (looked up with ``getattr``)
    :return: list of tagged words with an underscore separating
             the word and the tag
    """
    analysed = morpha_fr.analyse_morphologically(text, output='list')
    # u'' literals with doubled backslashes replace the former ur''
    # literals, which are a syntax error on Python 3; the replacement is
    # a raw string so that \g<1> is not read as a string escape.
    tokens = (re.sub(u'\\^(.*?)/.*', r'\g<1>', entry) for entry in analysed)
    # ' $ ' after every token marks the token boundaries for the split
    # performed once the tagger has run.
    string = ''.join(token + ' $ ' for token in tokens)
    # Use the function given in the tagger variable
    tag = getattr(tag_fr, tagger)
    try:
        tagged = tag(string)
    # Use the Stanford tagger if MElt is not installed.
    except OSError:
        tagged = pos_tag_stanford(string)
    # The separators get tagged as well ('$_TAG '); drop those tags.
    # NOTE(review): the pattern requires a space after the tag, so this
    # presumes the tagger output keeps one after the last separator --
    # confirm against the taggers.
    tagged = re.sub(u'\\$_.*? ', '$ ', tagged)
    pieces = (piece.strip() for piece in tagged.split('$'))
    # Discard the empty pieces left around the separators.
    return [piece for piece in pieces if piece]