Example #1
0
def pos_tag_stanford(text, output='str', input_file='temp.txt'):
    """
    Tag text with parts of speech by running the Stanford tagger script.

    The text is written to ``input_file``, the tagger shell script is run
    on that file with the French model, and the script's standard output
    is decoded as UTF-8, split into word-tag pairs on ``_`` and passed
    through ``correct_tags_stanford``.

    >>> pos_tag_stanford(u'ses beaux cheveux')
    u'ses_D beaux_JJ cheveux_NN '
    >>> pos_tag_stanford(u'ses beaux cheveux', output='list')
    [u'ses_D', u'beaux_JJ', u'cheveux_NN']
    >>> pos_tag_stanford(u'ses beaux cheveux', output='tuple')
    [(u'ses', u'D'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]

    :param text: string or list of words (a list is joined with spaces)
    :param output: the output format: 'str', 'list' or 'tuple'; any other
                   value is treated like 'str'
    :param input_file: name of the file that stores the input for the tagger
    :return: string (default) with an underscore separating
             the word and the tag, list, or list of word-tag tuples
    """
    joined = ' '.join(text) if isinstance(text, list) else text
    helpers.write_to_file(joined, input_file)
    command = ['../apparatus/stanford-postagger.sh',
               '../stanford/models/french.tagger',
               input_file]
    process = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
    raw = process.communicate()[0]
    # The tagger emits one line per sentence; flatten them into one line.
    flattened = unicode(raw, 'utf-8').replace('\n', ' ')
    pairs = correct_tags_stanford(helpers.strings_to_tuples(flattened, '_'))
    if output == 'tuple':
        return pairs
    fmt = 'list' if output == 'list' else 'str'
    return helpers.tuples_to_strings(pairs, output=fmt)
Example #2
0
def pos_tag(text, output='tuple', tagger='pos_tag_melt'):
    """
    Part-of-speech tagging. Use this to tag poems.

    The text is preprocessed and tagged through ``tag_with_apertium``.
    Every tagged item is split into word-tag pairs; an item that yields
    several pairs is merged into a single entry whose word is the
    space-joined words and whose tag is the tag of the last pair.

    >>> pos_tag(u'ses beaux cheveux')
    [(u'ses', u'DET'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]
    >>> pos_tag(u'ses beaux cheveux', output='str')
    u'ses_DET beaux_JJ cheveux_NN '

    :param text: string of raw text
    :param output: 'tuple' (default) or 'str'
    :param tagger: the method to use for tagging
    :return: list of analysed words as word-tag tuples (default),
             or string with an underscore separating the word and the tag
    """
    tagged_items = tag_with_apertium(preprocess(text), tagger)
    collected = []
    for item in tagged_items:
        pairs = helpers.strings_to_tuples(item)
        count = len(pairs)
        if count == 1:
            single = pairs[0]
            # Skip entries whose word part is empty.
            if single[0] != '':
                collected.append(single)
        elif count > 1:
            phrase = ' '.join([pair[0] for pair in pairs])
            collected.append((phrase, pairs[-1][1]))
    if output == 'str':
        return helpers.tuples_to_strings(collected)
    return collected
Example #3
0
def analyse_morphologically(tagged_text, output='str'):
    """
    Morphological analysis of English using the morpha analyser.

    The tagged text is fed to ``../morph/morpha -act`` through an ``echo``
    child process, and whatever morpha writes to standard output is
    returned.

    >>> analyse_morphologically([('O', 'UH'), ('Helen', 'NNP'), \\
    ... ('fair', 'JJ'), ('!', '.'), ('O', 'UH'), ('Helen', 'NNP'), \\
    ... ('chaste', 'JJ'), ('!', '.'), ('&', 'CC'), ('If', 'IN'), \\
    ... ('I', 'PRP'), ('were', 'VBDR'), ('with', 'IN'), ('thee', 'PRP'), \\
    ... (',', ','), ('I', 'PRP'), ('were', 'VBDR'), ('blest', 'VB'), \\
    ... ('.', '.')])
    'O_UH Helen_NNP fair_JJ !_. O_UH Helen_NNP chaste_JJ !_. \
&_CC If_IN I_PRP be+ed_VBDR with_IN thee_PRP ,_, \
I_PRP be+ed_VBDR bless+ed_VB ._. \\n'

    :param tagged_text: POS-tagged text with the tags separated by
                        underscore, or a list of word-tag tuples (a list
                        is first converted with
                        ``helpers.tuples_to_strings``)
    :param output: the type of the output: 'str' (default) or 'tuple'
    :return: a string of the form lemma+affix_tag or
             a list of tuples of the form ('lemma+affix', 'tag')
    """
    if isinstance(tagged_text, list):
        tagged_text = helpers.tuples_to_strings(tagged_text)
    path = '../morph/morpha'
    echo = subprocess.Popen(['echo', tagged_text],
                            shell=False,
                            stdout=subprocess.PIPE)
    try:
        morpha = subprocess.Popen([path, '-act'],
                                  shell=False,
                                  stdin=echo.stdout,
                                  stdout=subprocess.PIPE)
        # Drop the parent's copy of the pipe so that echo receives SIGPIPE
        # if morpha exits before reading all of its input.
        echo.stdout.close()
        result = morpha.communicate()[0]
    finally:
        # Closing an already closed file object is harmless; this also
        # covers the case where morpha could not be started. Waiting reaps
        # the echo child so it does not linger as a zombie process.
        echo.stdout.close()
        echo.wait()
    if output == 'tuple':
        return helpers.strings_to_tuples(result)
    return result
Example #4
0
def pos_tag_stanford(text, input_file='temp.txt'):
    """
    Tag text with parts of speech by running the Stanford tagger script.

    The text is written to ``input_file``, the tagger shell script is run
    on that file with the ``wsj-0-18-left3words-distsim`` model, and the
    script's standard output is split into word-tag pairs and passed
    through ``correct_tags``.

    >>> pos_tag_stanford('O Helen fair! O Helen chaste!\\n\\
    ... If I were with thee, I were blest.')
    [('O', 'UH'), ('Helen', 'NNP'), ('fair', 'JJ'), ('!', '.'), \
('O', 'UH'), ('Helen', 'NNP'), ('chaste', 'JJ'), ('!', '.'), \
('If', 'IN'), ('I', 'PRP'), ('were', 'VBDR'), ('with', 'IN'), \
('thee', 'PRP'), (',', ','), ('I', 'PRP'), ('were', 'VBDR'), \
('blest', 'VB'), ('.', '.')]

    :param text: string or list of words (a list is joined with spaces)
    :param input_file: name of the file that stores the input for the tagger
    :return: list of word-tag tuples
    """
    joined = ' '.join(text) if isinstance(text, list) else text
    helpers.write_to_file(joined, input_file)
    command = ['../apparatus/stanford-postagger.sh',
               '../stanford/models/wsj-0-18-left3words-distsim.tagger',
               input_file]
    process = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
    raw = process.communicate()[0]
    # Flatten the tagger's line-per-sentence output into a single line.
    flattened = raw.replace('\n', ' ')
    return correct_tags(helpers.strings_to_tuples(flattened))
Example #5
0
def pos_tag_stanford(text, input_file='temp.txt'):
    """
    Tag text with parts of speech by running the Stanford tagger script.

    The text is written to ``input_file``, the tagger shell script is run
    on that file with the ``wsj-0-18-left3words-distsim`` model, and the
    script's standard output is split into word-tag pairs and passed
    through ``correct_tags``.

    >>> pos_tag_stanford('O Helen fair! O Helen chaste!\\n\\
    ... If I were with thee, I were blest.')
    [('O', 'UH'), ('Helen', 'NNP'), ('fair', 'JJ'), ('!', '.'), \
('O', 'UH'), ('Helen', 'NNP'), ('chaste', 'JJ'), ('!', '.'), \
('If', 'IN'), ('I', 'PRP'), ('were', 'VBDR'), ('with', 'IN'), \
('thee', 'PRP'), (',', ','), ('I', 'PRP'), ('were', 'VBDR'), \
('blest', 'VB'), ('.', '.')]

    :param text: string or list of words (a list is joined with spaces)
    :param input_file: name of the file that stores the input for the tagger
    :return: list of word-tag tuples
    """
    joined = ' '.join(text) if isinstance(text, list) else text
    helpers.write_to_file(joined, input_file)
    command = ['../apparatus/stanford-postagger.sh',
               '../stanford/models/wsj-0-18-left3words-distsim.tagger',
               input_file]
    process = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
    raw = process.communicate()[0]
    # Flatten the tagger's line-per-sentence output into a single line.
    flattened = raw.replace('\n', ' ')
    return correct_tags(helpers.strings_to_tuples(flattened))
Example #6
0
def pos_tag(text, output='tuple', tagger='pos_tag_melt'):
    """
    Part-of-speech tagging. Use this to tag poems.

    The text is preprocessed and tagged through ``tag_with_apertium``.
    Every tagged item is split into word-tag pairs; an item that yields
    several pairs is merged into a single entry whose word is the
    space-joined words and whose tag is the tag of the last pair.

    >>> pos_tag(u'ses beaux cheveux')
    [(u'ses', u'DET'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]
    >>> pos_tag(u'ses beaux cheveux', output='str')
    u'ses_DET beaux_JJ cheveux_NN '

    :param text: string of raw text
    :param output: 'tuple' (default) or 'str'
    :param tagger: the method to use for tagging
    :return: list of analysed words as word-tag tuples (default),
             or string with an underscore separating the word and the tag
    """
    tagged_items = tag_with_apertium(preprocess(text), tagger)
    collected = []
    for item in tagged_items:
        pairs = helpers.strings_to_tuples(item)
        count = len(pairs)
        if count == 1:
            single = pairs[0]
            # Skip entries whose word part is empty.
            if single[0] != '':
                collected.append(single)
        elif count > 1:
            phrase = ' '.join([pair[0] for pair in pairs])
            collected.append((phrase, pairs[-1][1]))
    if output == 'str':
        return helpers.tuples_to_strings(collected)
    return collected
Example #7
0
def pos_tag_stanford(text, output='str', input_file='temp.txt'):
    """
    Tag text with parts of speech by running the Stanford tagger script.

    The text is written to ``input_file``, the tagger shell script is run
    on that file with the French model, and the script's standard output
    is decoded as UTF-8, split into word-tag pairs on ``_`` and passed
    through ``correct_tags_stanford``.

    >>> pos_tag_stanford(u'ses beaux cheveux')
    u'ses_D beaux_JJ cheveux_NN '
    >>> pos_tag_stanford(u'ses beaux cheveux', output='list')
    [u'ses_D', u'beaux_JJ', u'cheveux_NN']
    >>> pos_tag_stanford(u'ses beaux cheveux', output='tuple')
    [(u'ses', u'D'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]

    :param text: string or list of words (a list is joined with spaces)
    :param output: the output format: 'str', 'list' or 'tuple'; any other
                   value is treated like 'str'
    :param input_file: name of the file that stores the input for the tagger
    :return: string (default) with an underscore separating
             the word and the tag, list, or list of word-tag tuples
    """
    joined = ' '.join(text) if isinstance(text, list) else text
    helpers.write_to_file(joined, input_file)
    command = ['../apparatus/stanford-postagger.sh',
               '../stanford/models/french.tagger',
               input_file]
    process = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
    raw = process.communicate()[0]
    # The tagger emits one line per sentence; flatten them into one line.
    flattened = unicode(raw, 'utf-8').replace('\n', ' ')
    pairs = correct_tags_stanford(helpers.strings_to_tuples(flattened, '_'))
    if output == 'tuple':
        return pairs
    fmt = 'list' if output == 'list' else 'str'
    return helpers.tuples_to_strings(pairs, output=fmt)
Example #8
0
def pos_tag_melt(text, output='str', input_file='temp.txt'):
    """
    Part-of-speech tagging using the MElt tagger.

    The text is written to ``input_file``, which is then piped through
    ``cat`` into the ``MElt`` command. MElt's standard output is decoded
    as UTF-8, split into word-tag pairs on ``/`` and passed through
    ``correct_tags_melt``.

    >>> pos_tag_melt(u'ses beaux cheveux')
    u'ses_DET beaux_JJ cheveux_NN '
    >>> pos_tag_melt(u'ses beaux cheveux', output='list')
    [u'ses_DET', u'beaux_JJ', u'cheveux_NN']
    >>> pos_tag_melt(u'ses beaux cheveux', output='tuple')
    [(u'ses', u'DET'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]

    :param text: string or list of words (a list is joined with spaces)
    :param output: the output format: 'str', 'list' or 'tuple'; any other
                   value is treated like 'str'
    :param input_file: name of the file that stores the input for the tagger
    :return: string (default) with an underscore separating
             the word and the tag, list, or list of word-tag tuples
    """
    if isinstance(text, list):
        text = ' '.join(text)
    helpers.write_to_file(text, input_file)
    cat = subprocess.Popen(['cat', input_file],
                           shell=False,
                           stdout=subprocess.PIPE)
    try:
        melt = subprocess.Popen(['MElt'],
                                shell=False,
                                stdin=cat.stdout,
                                stdout=subprocess.PIPE)
        # Drop the parent's copy of the pipe so that cat receives SIGPIPE
        # if MElt exits before reading all of its input.
        cat.stdout.close()
        tagged = melt.communicate()[0]
    finally:
        # Closing an already closed file object is harmless; this also
        # covers the case where MElt could not be started. Waiting reaps
        # the cat child so it does not linger as a zombie process.
        cat.stdout.close()
        cat.wait()
    tagged = unicode(tagged, 'utf-8')
    tuples = helpers.strings_to_tuples(tagged, '/')
    tuples = correct_tags_melt(tuples)
    if output == 'list':
        return helpers.tuples_to_strings(tuples, output='list')
    if output == 'tuple':
        return tuples
    return helpers.tuples_to_strings(tuples, output='str')
Example #9
0
def get_words_with_tag(language, words, pos_tag, tagger, separator='_'):
    """
    Returns the words that are tagged with the given tag.

    Whether the words are already tagged is decided from the first word
    only: if it contains ``separator`` the whole list is split into
    word-tag tuples, otherwise the list is tagged with the quick tagger
    of the given language.

    >>> get_words_with_tag('en', ['cat', 'little'], 'NN', 'pos_tag_stanford')
    [('cat', 'NN')]
    >>> get_words_with_tag('fr', [(u'jouer_VB'), (u'aimer_VB'), \\
    ... (u'peut-être_RB')], 'VB', 'pos_tag_melt')
    [(u'jouer', u'VB'), (u'aimer', u'VB')]

    :param language: language id: en, fr or fi
    :param words: list of words, optionally tagged <word>_<tag>
    :param pos_tag: POS-tag
    :param tagger: method to use for tagging (not passed on for fi)
    :param separator: the character that separates the word and the tag
                      in the word list
    :return: list of word-tag tuples of the given part of speech
    :raises ValueError: if the words are untagged and the language is
                        not one of en, fr, fi
    """
    if not words:
        return []
    if separator not in words[0]:
        # Untagged input: pick the tagger module for the language.
        if language == 'en':
            tagged = tag_en.quick_pos_tag(words, tagger=tagger)
        elif language == 'fr':
            tagged = tag_fr.quick_pos_tag(words, tagger=tagger)
        elif language == 'fi':
            tagged = tag_fi.quick_pos_tag(words)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on ``tagged``.
            raise ValueError('unsupported language: %r' % (language,))
    else:
        tagged = helpers.strings_to_tuples(words, separator)
    tagged = helpers.capitalize_nnp(tagged)
    # Tokens without a tag component (length < 2) are ignored.
    return [token for token in tagged
            if len(token) > 1 and token[1] == pos_tag]
Example #10
0
def get_words_with_tag(language, words, pos_tag, tagger, separator='_'):
    """
    Returns the words that are tagged with the given tag.

    Whether the words are already tagged is decided from the first word
    only: if it contains ``separator`` the whole list is split into
    word-tag tuples, otherwise the list is tagged with the quick tagger
    of the given language.

    >>> get_words_with_tag('en', ['cat', 'little'], 'NN', 'pos_tag_stanford')
    [('cat', 'NN')]
    >>> get_words_with_tag('fr', [(u'jouer_VB'), (u'aimer_VB'), \\
    ... (u'peut-être_RB')], 'VB', 'pos_tag_melt')
    [(u'jouer', u'VB'), (u'aimer', u'VB')]

    :param language: language id: en, fr or fi
    :param words: list of words, optionally tagged <word>_<tag>
    :param pos_tag: POS-tag
    :param tagger: method to use for tagging (not passed on for fi)
    :param separator: the character that separates the word and the tag
                      in the word list
    :return: list of word-tag tuples of the given part of speech
    :raises ValueError: if the words are untagged and the language is
                        not one of en, fr, fi
    """
    if not words:
        return []
    if separator not in words[0]:
        # Untagged input: pick the tagger module for the language.
        if language == 'en':
            tagged = tag_en.quick_pos_tag(words, tagger=tagger)
        elif language == 'fr':
            tagged = tag_fr.quick_pos_tag(words, tagger=tagger)
        elif language == 'fi':
            tagged = tag_fi.quick_pos_tag(words)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on ``tagged``.
            raise ValueError('unsupported language: %r' % (language,))
    else:
        tagged = helpers.strings_to_tuples(words, separator)
    tagged = helpers.capitalize_nnp(tagged)
    # Tokens without a tag component (length < 2) are ignored.
    return [token for token in tagged
            if len(token) > 1 and token[1] == pos_tag]
Example #11
0
def pos_tag_melt(text, output='str', input_file='temp.txt'):
    """
    Part-of-speech tagging using the MElt tagger.

    The text is written to ``input_file``, which is then piped through
    ``cat`` into the ``MElt`` command. MElt's standard output is decoded
    as UTF-8, split into word-tag pairs on ``/`` and passed through
    ``correct_tags_melt``.

    >>> pos_tag_melt(u'ses beaux cheveux')
    u'ses_DET beaux_JJ cheveux_NN '
    >>> pos_tag_melt(u'ses beaux cheveux', output='list')
    [u'ses_DET', u'beaux_JJ', u'cheveux_NN']
    >>> pos_tag_melt(u'ses beaux cheveux', output='tuple')
    [(u'ses', u'DET'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]

    :param text: string or list of words (a list is joined with spaces)
    :param output: the output format: 'str', 'list' or 'tuple'; any other
                   value is treated like 'str'
    :param input_file: name of the file that stores the input for the tagger
    :return: string (default) with an underscore separating
             the word and the tag, list, or list of word-tag tuples
    """
    if isinstance(text, list):
        text = ' '.join(text)
    helpers.write_to_file(text, input_file)
    cat = subprocess.Popen(['cat', input_file],
                           shell=False,
                           stdout=subprocess.PIPE)
    try:
        melt = subprocess.Popen(['MElt'],
                                shell=False,
                                stdin=cat.stdout,
                                stdout=subprocess.PIPE)
        # Drop the parent's copy of the pipe so that cat receives SIGPIPE
        # if MElt exits before reading all of its input.
        cat.stdout.close()
        tagged = melt.communicate()[0]
    finally:
        # Closing an already closed file object is harmless; this also
        # covers the case where MElt could not be started. Waiting reaps
        # the cat child so it does not linger as a zombie process.
        cat.stdout.close()
        cat.wait()
    tagged = unicode(tagged, 'utf-8')
    tuples = helpers.strings_to_tuples(tagged, '/')
    tuples = correct_tags_melt(tuples)
    if output == 'list':
        return helpers.tuples_to_strings(tuples, output='list')
    if output == 'tuple':
        return tuples
    return helpers.tuples_to_strings(tuples, output='str')